From 316f81bb1dfca9f109bf3edf77f4da5821d0ec99 Mon Sep 17 00:00:00 2001 From: "David G. Andersen" Date: Wed, 25 Jan 2012 15:04:16 -0800 Subject: [PATCH] math/big: assembly versions of bitLen for x86-64, 386, and ARM. Roughly 2x speedup for the internal bitLen function in arith.go. Added TestWordBitLen test. Performance differences against the new version of bitLen generic: x86-64 Macbook pro (current tip): benchmark old ns/op new ns/op delta big.BenchmarkBitLen0 6 4 -37.40% big.BenchmarkBitLen1 6 2 -51.79% big.BenchmarkBitLen2 6 2 -65.04% big.BenchmarkBitLen3 6 2 -66.10% big.BenchmarkBitLen4 6 2 -60.96% big.BenchmarkBitLen5 6 2 -55.80% big.BenchmarkBitLen8 6 2 -56.19% big.BenchmarkBitLen9 6 2 -64.73% big.BenchmarkBitLen16 7 2 -68.84% big.BenchmarkBitLen17 6 2 -67.11% big.BenchmarkBitLen31 7 2 -61.57% 386 Intel Atom (current tip): benchmark old ns/op new ns/op delta big.BenchmarkBitLen0 23 20 -13.04% big.BenchmarkBitLen1 23 20 -14.77% big.BenchmarkBitLen2 24 20 -19.28% big.BenchmarkBitLen3 25 20 -21.57% big.BenchmarkBitLen4 24 20 -16.94% big.BenchmarkBitLen5 25 20 -20.78% big.BenchmarkBitLen8 24 20 -19.28% big.BenchmarkBitLen9 25 20 -20.47% big.BenchmarkBitLen16 26 20 -23.37% big.BenchmarkBitLen17 26 20 -25.09% big.BenchmarkBitLen31 32 20 -35.51% ARM v5 SheevaPlug, previous weekly patched with bitLen: benchmark old ns/op new ns/op delta big.BenchmarkBitLen0 50 29 -41.73% big.BenchmarkBitLen1 51 29 -42.75% big.BenchmarkBitLen2 59 29 -50.08% big.BenchmarkBitLen3 60 29 -50.75% big.BenchmarkBitLen4 59 29 -50.08% big.BenchmarkBitLen5 60 29 -50.75% big.BenchmarkBitLen8 59 29 -50.08% big.BenchmarkBitLen9 60 29 -50.75% big.BenchmarkBitLen16 69 29 -57.35% big.BenchmarkBitLen17 70 29 -57.89% big.BenchmarkBitLen31 95 29 -69.07% R=golang-dev, minux.ma, gri CC=golang-dev https://golang.org/cl/5574054 --- src/pkg/math/big/arith.go | 2 +- src/pkg/math/big/arith_386.s | 13 ++++++++++++- src/pkg/math/big/arith_amd64.s | 13 ++++++++++++- src/pkg/math/big/arith_arm.s | 11 ++++++++++- src/pkg/math/big/arith_decl.go | 1 + src/pkg/math/big/arith_test.go | 23 +++++++++++++++++++++++ 6 files changed, 59 insertions(+), 4 deletions(-) diff --git a/src/pkg/math/big/arith.go b/src/pkg/math/big/arith.go index f30951ef0f6..5a30d3cf3a7 100644 --- a/src/pkg/math/big/arith.go +++ b/src/pkg/math/big/arith.go @@ -79,7 +79,7 @@ func mulAddWWW_g(x, y, c Word) (z1, z0 Word) { } // Length of x in bits. -func bitLen(x Word) (n int) { +func bitLen_g(x Word) (n int) { for ; x >= 0x8000; x >>= 16 { n += 16 } diff --git a/src/pkg/math/big/arith_386.s b/src/pkg/math/big/arith_386.s index 07c07b02ccf..f1262c6514c 100644 --- a/src/pkg/math/big/arith_386.s +++ b/src/pkg/math/big/arith_386.s @@ -245,7 +245,7 @@ E6: CMPL BX, $0 // i < 0 RET -// divWVW(z* Word, xn Word, x []Word, y Word) (r Word) +// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word) TEXT ·divWVW(SB),7,$0 MOVL z+0(FP), DI MOVL xn+12(FP), DX // r = xn @@ -263,3 +263,14 @@ E7: SUBL $1, BX // i-- MOVL DX, r+32(FP) RET + +// func bitLen(x Word) (n int) +TEXT ·bitLen(SB),7,$0 + BSRL x+0(FP), AX + JZ Z1 + INCL AX + MOVL AX, n+4(FP) + RET + +Z1: MOVL $0, n+4(FP) + RET diff --git a/src/pkg/math/big/arith_amd64.s b/src/pkg/math/big/arith_amd64.s index 89b65f38a11..088f724704a 100644 --- a/src/pkg/math/big/arith_amd64.s +++ b/src/pkg/math/big/arith_amd64.s @@ -243,7 +243,7 @@ E6: CMPQ BX, R11 // i < n RET -// divWVW(z []Word, xn Word, x []Word, y Word) (r Word) +// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) TEXT ·divWVW(SB),7,$0 MOVQ z+0(FP), R10 MOVQ xn+16(FP), DX // r = xn @@ -261,3 +261,14 @@ E7: SUBL $1, BX // i-- MOVQ DX, r+48(FP) RET + +// func bitLen(x Word) (n int) +TEXT ·bitLen(SB),7,$0 + BSRQ x+0(FP), AX + JZ Z1 + INCQ AX + MOVQ AX, n+8(FP) + RET + +Z1: MOVQ $0, n+8(FP) + RET diff --git a/src/pkg/math/big/arith_arm.s b/src/pkg/math/big/arith_arm.s index 60abe6eaa95..dbf3360b58e 100644 --- a/src/pkg/math/big/arith_arm.s +++ b/src/pkg/math/big/arith_arm.s @@ -290,7 +290,7 @@ E9: RET -// divWVW(z* Word, xn Word, x []Word, y Word) (r Word) +// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word) TEXT ·divWVW(SB),7,$0 // ARM has no multiword division, so use portable code. B ·divWVW_g(SB) @@ -310,3 +310,12 @@ TEXT ·mulWW(SB),7,$0 MOVW R4, z1+8(FP) MOVW R3, z0+12(FP) RET + +// func bitLen(x Word) (n int) +TEXT ·bitLen(SB),7,$0 + MOVW x+0(FP), R0 + WORD $0xe16f0f10 // CLZ R0, R0 (count leading zeros) + MOVW $32, R1 + SUB.S R0, R1 + MOVW R1, n+4(FP) + RET diff --git a/src/pkg/math/big/arith_decl.go b/src/pkg/math/big/arith_decl.go index 95fcd8b94be..068cc8d9388 100644 --- a/src/pkg/math/big/arith_decl.go +++ b/src/pkg/math/big/arith_decl.go @@ -16,3 +16,4 @@ func shrVU(z, x []Word, s uint) (c Word) func mulAddVWW(z, x []Word, y, r Word) (c Word) func addMulVVW(z, x []Word, y Word) (c Word) func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) +func bitLen(x Word) (n int) diff --git a/src/pkg/math/big/arith_test.go b/src/pkg/math/big/arith_test.go index 106cd92d856..cd02ba36749 100644 --- a/src/pkg/math/big/arith_test.go +++ b/src/pkg/math/big/arith_test.go @@ -334,6 +334,29 @@ func TestMulAddWWW(t *testing.T) { } } +func TestWordBitLen(t *testing.T) { + // Test every possible output of bitLen with the high bit set + // and then with all bits below max set + z := bitLen(0) + if z != 0 { + t.Errorf("0 got %d want 0", z) + } + x := Word(1) // Will be ...00010000... + y := Word(1) // Will be ...00011111... + for i := 1; i <= _W; i++ { + z = bitLen(x) + if z != i { + t.Errorf("%x got %d want %d", x, z, i) + } + z = bitLen(y) + if z != i { + t.Errorf("%x got %d want %d", y, z, i) + } + x <<= 1 + y = (y << 1) | 0x1 + } +} + // runs b.N iterations of bitLen called on a Word containing (1 << nbits)-1. func benchmarkBitLenN(b *testing.B, nbits uint) { testword := Word((uint64(1) << nbits) - 1)