From d54a9a9c42e751a020308cae296add426b56d0f0 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Tue, 25 Aug 2020 16:33:50 +0800 Subject: [PATCH] math/big: replace division with multiplication by reciprocal word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Division is much slower than multiplication. And the method of using multiplication by multiplying reciprocal and replacing division with it can increase the speed of divWVW algorithm by three times,and at the same time increase the speed of nats division. The benchmark test on arm64 is as follows: name old time/op new time/op delta DivWVW/1-4 13.1ns ± 4% 13.3ns ± 4% ~ (p=0.444 n=5+5) DivWVW/2-4 48.6ns ± 1% 51.2ns ± 2% +5.39% (p=0.008 n=5+5) DivWVW/3-4 82.0ns ± 1% 69.7ns ± 1% -15.03% (p=0.008 n=5+5) DivWVW/4-4 116ns ± 1% 71ns ± 2% -38.88% (p=0.008 n=5+5) DivWVW/5-4 152ns ± 1% 84ns ± 4% -44.70% (p=0.008 n=5+5) DivWVW/10-4 319ns ± 1% 155ns ± 4% -51.50% (p=0.008 n=5+5) DivWVW/100-4 3.44µs ± 3% 1.30µs ± 8% -62.30% (p=0.008 n=5+5) DivWVW/1000-4 33.8µs ± 0% 10.9µs ± 1% -67.74% (p=0.008 n=5+5) DivWVW/10000-4 343µs ± 4% 111µs ± 5% -67.63% (p=0.008 n=5+5) DivWVW/100000-4 3.35ms ± 1% 1.25ms ± 3% -62.79% (p=0.008 n=5+5) QuoRem-4 3.08µs ± 2% 2.21µs ± 4% -28.40% (p=0.008 n=5+5) ModSqrt225_Tonelli-4 444µs ± 2% 457µs ± 3% ~ (p=0.095 n=5+5) ModSqrt225_3Mod4-4 136µs ± 1% 138µs ± 3% ~ (p=0.151 n=5+5) ModSqrt231_Tonelli-4 473µs ± 3% 483µs ± 4% ~ (p=0.548 n=5+5) ModSqrt231_5Mod8-4 164µs ± 9% 169µs ±12% ~ (p=0.421 n=5+5) Sqrt-4 36.8µs ± 1% 28.6µs ± 0% -22.17% (p=0.016 n=5+4) Div/20/10-4 50.0ns ± 3% 51.3ns ± 6% ~ (p=0.238 n=5+5) Div/40/20-4 49.8ns ± 2% 51.3ns ± 6% ~ (p=0.222 n=5+5) Div/100/50-4 85.8ns ± 4% 86.5ns ± 5% ~ (p=0.246 n=5+5) Div/200/100-4 335ns ± 3% 296ns ± 2% -11.60% (p=0.008 n=5+5) Div/400/200-4 442ns ± 2% 359ns ± 5% -18.81% (p=0.008 n=5+5) Div/1000/500-4 858ns ± 3% 643ns ± 6% -25.06% (p=0.008 n=5+5) Div/2000/1000-4 1.70µs ± 3% 1.28µs ± 4% -24.80% (p=0.008 n=5+5) Div/20000/10000-4 45.0µs ± 5% 41.8µs ± 4% -7.17% (p=0.016 n=5+5) Div/200000/100000-4 1.51ms ± 7% 1.43ms ± 3% -5.42% (p=0.016 n=5+5) Div/2000000/1000000-4 57.6ms ± 4% 57.5ms ± 3% ~ (p=1.000 n=5+5) Div/20000000/10000000-4 2.08s ± 3% 2.04s ± 1% ~ (p=0.095 n=5+5) name old speed new speed delta DivWVW/1-4 4.87GB/s ± 4% 4.80GB/s ± 4% ~ (p=0.310 n=5+5) DivWVW/2-4 2.63GB/s ± 1% 2.50GB/s ± 2% -5.07% (p=0.008 n=5+5) DivWVW/3-4 2.34GB/s ± 1% 2.76GB/s ± 1% +17.70% (p=0.008 n=5+5) DivWVW/4-4 2.21GB/s ± 1% 3.61GB/s ± 2% +63.42% (p=0.008 n=5+5) DivWVW/5-4 2.10GB/s ± 2% 3.81GB/s ± 4% +80.89% (p=0.008 n=5+5) DivWVW/10-4 2.01GB/s ± 0% 4.13GB/s ± 4% +105.91% (p=0.008 n=5+5) DivWVW/100-4 1.86GB/s ± 2% 4.95GB/s ± 7% +165.63% (p=0.008 n=5+5) DivWVW/1000-4 1.89GB/s ± 0% 5.86GB/s ± 1% +209.96% (p=0.008 n=5+5) DivWVW/10000-4 1.87GB/s ± 4% 5.76GB/s ± 5% +208.96% (p=0.008 n=5+5) DivWVW/100000-4 1.91GB/s ± 1% 5.14GB/s ± 3% +168.85% (p=0.008 n=5+5) Change-Id: I049f1196562b20800e6ef8a6493fd147f93ad830 Reviewed-on: https://go-review.googlesource.com/c/go/+/250417 Trust: Giovanni Bajo Trust: Keith Randall Run-TryBot: Giovanni Bajo TryBot-Result: Go Bot Reviewed-by: Keith Randall --- src/cmd/compile/internal/gc/ssa.go | 5 -- src/math/big/arith.go | 93 ++++++++++++++++++++++++++---- src/math/big/arith_386.s | 27 --------- src/math/big/arith_amd64.s | 26 --------- src/math/big/arith_arm.s | 11 ---- src/math/big/arith_arm64.s | 9 +-- src/math/big/arith_decl.go | 2 - src/math/big/arith_decl_pure.go | 8 --- src/math/big/arith_mips64x.s | 5 -- src/math/big/arith_mipsx.s | 5 -- src/math/big/arith_ppc64x.s | 40 ------------- src/math/big/arith_riscv64.s | 5 -- src/math/big/arith_s390x.s | 33 ----------- src/math/big/arith_test.go | 54 ++++++++++++++++- src/math/big/arith_wasm.s | 5 -- src/math/big/nat.go | 3 +- 16 files changed, 138 insertions(+), 193 deletions(-) diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index d0b3e8df94e..815ff7f99fb 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -4022,11 +4022,6 @@ func init() { return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1]) }, sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64LE, sys.ArchPPC64, sys.ArchS390X) - add("math/big", "divWW", - func(s *state, n *Node, args []*ssa.Value) *ssa.Value { - return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2]) - }, - sys.ArchAMD64) } // findIntrinsic returns a function which builds the SSA equivalent of the diff --git a/src/math/big/arith.go b/src/math/big/arith.go index b0885f261fe..750ce8aa398 100644 --- a/src/math/big/arith.go +++ b/src/math/big/arith.go @@ -60,12 +60,6 @@ func nlz(x Word) uint { return uint(bits.LeadingZeros(uint(x))) } -// q = (u1<<_W + u0 - r)/v -func divWW_g(u1, u0, v Word) (q, r Word) { - qq, rr := bits.Div(uint(u1), uint(u0), uint(v)) - return Word(qq), Word(rr) -} - // The resulting carry c is either 0 or 1. func addVV_g(z, x, y []Word) (c Word) { // The comment near the top of this file discusses this for loop condition. @@ -207,10 +201,87 @@ func addMulVVW_g(z, x []Word, y Word) (c Word) { return } -func divWVW_g(z []Word, xn Word, x []Word, y Word) (r Word) { - r = xn - for i := len(z) - 1; i >= 0; i-- { - z[i], r = divWW_g(r, x[i], y) +// q = ( x1 << _W + x0 - r)/y. m = floor(( _B^2 - 1 ) / d - _B). Requiring x1>(_W-s) + x0 <<= s + y <<= s } - return + d := uint(y) + // We know that + // m = ⎣(B^2-1)/d⎦-B + // ⎣(B^2-1)/d⎦ = m+B + // (B^2-1)/d = m+B+delta1 0 <= delta1 <= (d-1)/d + // B^2/d = m+B+delta2 0 <= delta2 <= 1 + // The quotient we're trying to compute is + // quotient = ⎣(x1*B+x0)/d⎦ + // = ⎣(x1*B*(B^2/d)+x0*(B^2/d))/B^2⎦ + // = ⎣(x1*B*(m+B+delta2)+x0*(m+B+delta2))/B^2⎦ + // = ⎣(x1*m+x1*B+x0)/B + x0*m/B^2 + delta2*(x1*B+x0)/B^2⎦ + // The latter two terms of this three-term sum are between 0 and 1. + // So we can compute just the first term, and we will be low by at most 2. + t1, t0 := bits.Mul(uint(m), uint(x1)) + _, c := bits.Add(t0, uint(x0), 0) + t1, _ = bits.Add(t1, uint(x1), c) + // The quotient is either t1, t1+1, or t1+2. + // We'll try t1 and adjust if needed. + qq := t1 + // compute remainder r=x-d*q. + dq1, dq0 := bits.Mul(d, qq) + r0, b := bits.Sub(uint(x0), dq0, 0) + r1, _ := bits.Sub(uint(x1), dq1, b) + // The remainder we just computed is bounded above by B+d: + // r = x1*B + x0 - d*q. + // = x1*B + x0 - d*⎣(x1*m+x1*B+x0)/B⎦ + // = x1*B + x0 - d*((x1*m+x1*B+x0)/B-alpha) 0 <= alpha < 1 + // = x1*B + x0 - x1*d/B*m - x1*d - x0*d/B + d*alpha + // = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦ - x1*d - x0*d/B + d*alpha + // = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦ - x1*d - x0*d/B + d*alpha + // = x1*B + x0 - x1*d/B*((B^2-1)/d-B-beta) - x1*d - x0*d/B + d*alpha 0 <= beta < 1 + // = x1*B + x0 - x1*B + x1/B + x1*d + x1*d/B*beta - x1*d - x0*d/B + d*alpha + // = x0 + x1/B + x1*d/B*beta - x0*d/B + d*alpha + // = x0*(1-d/B) + x1*(1+d*beta)/B + d*alpha + // < B*(1-d/B) + d*B/B + d because x00), x1= d { + qq++ + r0 -= d + } + return Word(qq), Word(r0 >> s) +} + +func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) { + r = xn + if len(x) == 1 { + qq, rr := bits.Div(uint(r), uint(x[0]), uint(y)) + z[0] = Word(qq) + return Word(rr) + } + rec := reciprocalWord(y) + for i := len(z) - 1; i >= 0; i-- { + z[i], r = divWW(r, x[i], y, rec) + } + return r +} + +// reciprocalWord return the reciprocal of the divisor. rec = floor(( _B^2 - 1 ) / u - _B). u = d1 << nlz(d1). +func reciprocalWord(d1 Word) Word { + u := uint(d1 << nlz(d1)) + x1 := ^u + x0 := uint(_M) + rec, _ := bits.Div(x1, x0, u) // (_B^2-1)/U-_B = (_B*(_M-C)+_M)/U + return Word(rec) } diff --git a/src/math/big/arith_386.s b/src/math/big/arith_386.s index f61da2aba72..d0ea949fe66 100644 --- a/src/math/big/arith_386.s +++ b/src/math/big/arith_386.s @@ -18,16 +18,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - MOVL x1+0(FP), DX - MOVL x0+4(FP), AX - DIVL y+8(FP) - MOVL AX, q+12(FP) - MOVL DX, r+16(FP) - RET - - // func addVV(z, x, y []Word) (c Word) TEXT ·addVV(SB),NOSPLIT,$0 MOVL z+0(FP), DI @@ -251,21 +241,4 @@ E6: CMPL BX, $0 // i < 0 RET -// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - MOVL z+0(FP), DI - MOVL xn+12(FP), DX // r = xn - MOVL x+16(FP), SI - MOVL y+28(FP), CX - MOVL z_len+4(FP), BX // i = z - JMP E7 -L7: MOVL (SI)(BX*4), AX - DIVL CX - MOVL AX, (DI)(BX*4) - -E7: SUBL $1, BX // i-- - JGE L7 // i >= 0 - - MOVL DX, r+32(FP) - RET diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s index b75639f5406..61043ca2d97 100644 --- a/src/math/big/arith_amd64.s +++ b/src/math/big/arith_amd64.s @@ -18,14 +18,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - MOVQ x1+0(FP), DX - MOVQ x0+8(FP), AX - DIVQ y+16(FP) - MOVQ AX, q+24(FP) - MOVQ DX, r+32(FP) - RET // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0. // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared. @@ -531,21 +523,3 @@ adx_short: -// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - MOVQ z+0(FP), R10 - MOVQ xn+24(FP), DX // r = xn - MOVQ x+32(FP), R8 - MOVQ y+56(FP), R9 - MOVQ z_len+8(FP), BX // i = z - JMP E7 - -L7: MOVQ (R8)(BX*8), AX - DIVQ R9 - MOVQ AX, (R10)(BX*8) - -E7: SUBQ $1, BX // i-- - JGE L7 // i >= 0 - - MOVQ DX, r+64(FP) - RET diff --git a/src/math/big/arith_arm.s b/src/math/big/arith_arm.s index 33aa36f7090..cbf7445e7ab 100644 --- a/src/math/big/arith_arm.s +++ b/src/math/big/arith_arm.s @@ -272,17 +272,6 @@ E9: RET -// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - // ARM has no multiword division, so use portable code. - B ·divWVW_g(SB) - - -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - // ARM has no multiword division, so use portable code. - B ·divWW_g(SB) - // func mulWW(x, y Word) (z1, z0 Word) TEXT ·mulWW(SB),NOSPLIT,$0 diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s index da6e408e19f..22357d088e4 100644 --- a/src/math/big/arith_arm64.s +++ b/src/math/big/arith_arm64.s @@ -23,11 +23,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - B ·divWW_g(SB) // ARM64 has no multiword division - - // func addVV(z, x, y []Word) (c Word) TEXT ·addVV(SB),NOSPLIT,$0 MOVD z_len+8(FP), R0 @@ -585,6 +580,4 @@ done: MOVD R4, c+56(FP) RET -// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - B ·divWVW_g(SB) + diff --git a/src/math/big/arith_decl.go b/src/math/big/arith_decl.go index 41e592334c3..d519bdc87b6 100644 --- a/src/math/big/arith_decl.go +++ b/src/math/big/arith_decl.go @@ -8,7 +8,6 @@ package big // implemented in arith_$GOARCH.s func mulWW(x, y Word) (z1, z0 Word) -func divWW(x1, x0, y Word) (q, r Word) func addVV(z, x, y []Word) (c Word) func subVV(z, x, y []Word) (c Word) func addVW(z, x []Word, y Word) (c Word) @@ -17,4 +16,3 @@ func shlVU(z, x []Word, s uint) (c Word) func shrVU(z, x []Word, s uint) (c Word) func mulAddVWW(z, x []Word, y, r Word) (c Word) func addMulVVW(z, x []Word, y Word) (c Word) -func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) diff --git a/src/math/big/arith_decl_pure.go b/src/math/big/arith_decl_pure.go index 305f7ee03b4..5faa3bd2819 100644 --- a/src/math/big/arith_decl_pure.go +++ b/src/math/big/arith_decl_pure.go @@ -10,10 +10,6 @@ func mulWW(x, y Word) (z1, z0 Word) { return mulWW_g(x, y) } -func divWW(x1, x0, y Word) (q, r Word) { - return divWW_g(x1, x0, y) -} - func addVV(z, x, y []Word) (c Word) { return addVV_g(z, x, y) } @@ -55,7 +51,3 @@ func mulAddVWW(z, x []Word, y, r Word) (c Word) { func addMulVVW(z, x []Word, y Word) (c Word) { return addMulVVW_g(z, x, y) } - -func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) { - return divWVW_g(z, xn, x, y) -} diff --git a/src/math/big/arith_mips64x.s b/src/math/big/arith_mips64x.s index 983510ee3d4..804b9fe06ed 100644 --- a/src/math/big/arith_mips64x.s +++ b/src/math/big/arith_mips64x.s @@ -12,9 +12,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 JMP ·mulWW_g(SB) -TEXT ·divWW(SB),NOSPLIT,$0 - JMP ·divWW_g(SB) - TEXT ·addVV(SB),NOSPLIT,$0 JMP ·addVV_g(SB) @@ -39,5 +36,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0 TEXT ·addMulVVW(SB),NOSPLIT,$0 JMP ·addMulVVW_g(SB) -TEXT ·divWVW(SB),NOSPLIT,$0 - JMP ·divWVW_g(SB) diff --git a/src/math/big/arith_mipsx.s b/src/math/big/arith_mipsx.s index 54cafbd9c0c..efdecb80f32 100644 --- a/src/math/big/arith_mipsx.s +++ b/src/math/big/arith_mipsx.s @@ -12,9 +12,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 JMP ·mulWW_g(SB) -TEXT ·divWW(SB),NOSPLIT,$0 - JMP ·divWW_g(SB) - TEXT ·addVV(SB),NOSPLIT,$0 JMP ·addVV_g(SB) @@ -39,5 +36,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0 TEXT ·addMulVVW(SB),NOSPLIT,$0 JMP ·addMulVVW_g(SB) -TEXT ·divWVW(SB),NOSPLIT,$0 - JMP ·divWVW_g(SB) diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s index 409e10ab48b..b299ccc2fb8 100644 --- a/src/math/big/arith_ppc64x.s +++ b/src/math/big/arith_ppc64x.s @@ -478,44 +478,4 @@ done: MOVD R4, c+56(FP) RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB), NOSPLIT, $0 - MOVD x1+0(FP), R4 - MOVD x0+8(FP), R5 - MOVD y+16(FP), R6 - CMPU R4, R6 - BGE divbigger - - // from the programmer's note in ch. 3 of the ISA manual, p.74 - DIVDEU R6, R4, R3 - DIVDU R6, R5, R7 - MULLD R6, R3, R8 - MULLD R6, R7, R20 - SUB R20, R5, R10 - ADD R7, R3, R3 - SUB R8, R10, R4 - CMPU R4, R10 - BLT adjust - CMPU R4, R6 - BLT end - -adjust: - MOVD $1, R21 - ADD R21, R3, R3 - SUB R6, R4, R4 - -end: - MOVD R3, q+24(FP) - MOVD R4, r+32(FP) - - RET - -divbigger: - MOVD $-1, R7 - MOVD R7, q+24(FP) - MOVD R7, r+32(FP) - RET - -TEXT ·divWVW(SB), NOSPLIT, $0 - BR ·divWVW_g(SB) diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s index 59065c3f7ba..a2f7666c7b9 100644 --- a/src/math/big/arith_riscv64.s +++ b/src/math/big/arith_riscv64.s @@ -19,9 +19,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 MOV X8, z0+24(FP) RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - JMP ·divWW_g(SB) // riscv64 has no multiword division TEXT ·addVV(SB),NOSPLIT,$0 JMP ·addVV_g(SB) @@ -47,5 +44,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0 TEXT ·addMulVVW(SB),NOSPLIT,$0 JMP ·addMulVVW_g(SB) -TEXT ·divWVW(SB),NOSPLIT,$0 - JMP ·divWVW_g(SB) diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s index 48917681112..242aca7434a 100644 --- a/src/math/big/arith_s390x.s +++ b/src/math/big/arith_s390x.s @@ -17,15 +17,6 @@ TEXT ·mulWW(SB), NOSPLIT, $0 MOVD R11, z0+24(FP) RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB), NOSPLIT, $0 - MOVD x1+0(FP), R10 - MOVD x0+8(FP), R11 - MOVD y+16(FP), R5 - WORD $0xb98700a5 // dlgr r10,r5 - MOVD R11, q+24(FP) - MOVD R10, r+32(FP) - RET // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 // func addVV(z, x, y []Word) (c Word) @@ -990,27 +981,3 @@ E6: MOVD R4, c+56(FP) RET -// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) -// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i -TEXT ·divWVW(SB), NOSPLIT, $0 - MOVD z+0(FP), R2 - MOVD xn+24(FP), R10 // r = xn - MOVD x+32(FP), R8 - MOVD y+56(FP), R9 - MOVD z_len+8(FP), R7 // i = z - SLD $3, R7, R1 // i*8 - MOVD $0, R0 // make sure it's zero - BR E7 - -L7: - MOVD (R8)(R1*1), R11 - WORD $0xB98700A9 // DLGR R10,R9 - MOVD R11, (R2)(R1*1) - -E7: - SUB $1, R7 // i-- - SUB $8, R1 - BGE L7 // i >= 0 - - MOVD R10, r+64(FP) - RET diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go index fc205934c5c..808d1784592 100644 --- a/src/math/big/arith_test.go +++ b/src/math/big/arith_test.go @@ -7,6 +7,7 @@ package big import ( "fmt" "internal/testenv" + "math/bits" "math/rand" "strings" "testing" @@ -493,7 +494,6 @@ func TestFunVWW(t *testing.T) { if a.y != 0 && a.r < a.y { arg := argWVW{a.x, a.c, a.z, a.y, a.r} - testFunWVW(t, "divWVW_g", divWVW_g, arg) testFunWVW(t, "divWVW", divWVW, arg) } } @@ -536,6 +536,42 @@ func TestMulAddWWW(t *testing.T) { } } +var divWWTests = []struct { + x1, x0, y Word + q, r Word +}{ + {_M >> 1, 0, _M, _M >> 1, _M >> 1}, + {_M - (1 << (_W - 2)), _M, 3 << (_W - 2), _M, _M - (1 << (_W - 2))}, +} + +const testsNumber = 1 << 16 + +func TestDivWW(t *testing.T) { + i := 0 + for i, test := range divWWTests { + rec := reciprocalWord(test.y) + q, r := divWW(test.x1, test.x0, test.y, rec) + if q != test.q || r != test.r { + t.Errorf("#%d got (%x, %x) want (%x, %x)", i, q, r, test.q, test.r) + } + } + //random tests + for ; i < testsNumber; i++ { + x1 := rndW() + x0 := rndW() + y := rndW() + if x1 >= y { + continue + } + rec := reciprocalWord(y) + qGot, rGot := divWW(x1, x0, y, rec) + qWant, rWant := bits.Div(uint(x1), uint(x0), uint(y)) + if uint(qGot) != qWant || uint(rGot) != rWant { + t.Errorf("#%d got (%x, %x) want (%x, %x)", i, qGot, rGot, qWant, rWant) + } + } +} + func BenchmarkMulAddVWW(b *testing.B) { for _, n := range benchSizes { if isRaceBuilder && n > 1e3 { @@ -570,3 +606,19 @@ func BenchmarkAddMulVVW(b *testing.B) { }) } } +func BenchmarkDivWVW(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rndV(n) + y := rndW() + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _W)) + for i := 0; i < b.N; i++ { + divWVW(z, 0, x, y) + } + }) + } +} diff --git a/src/math/big/arith_wasm.s b/src/math/big/arith_wasm.s index 382597c6945..add10644690 100644 --- a/src/math/big/arith_wasm.s +++ b/src/math/big/arith_wasm.s @@ -9,9 +9,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 JMP ·mulWW_g(SB) -TEXT ·divWW(SB),NOSPLIT,$0 - JMP ·divWW_g(SB) - TEXT ·addVV(SB),NOSPLIT,$0 JMP ·addVV_g(SB) @@ -36,5 +33,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0 TEXT ·addMulVVW(SB),NOSPLIT,$0 JMP ·addMulVVW_g(SB) -TEXT ·divWVW(SB),NOSPLIT,$0 - JMP ·divWVW_g(SB) diff --git a/src/math/big/nat.go b/src/math/big/nat.go index 6a3989bf9d8..c2f3787848c 100644 --- a/src/math/big/nat.go +++ b/src/math/big/nat.go @@ -751,6 +751,7 @@ func (q nat) divBasic(u, v nat) { // D2. vn1 := v[n-1] + rec := reciprocalWord(vn1) for j := m; j >= 0; j-- { // D3. qhat := Word(_M) @@ -760,7 +761,7 @@ func (q nat) divBasic(u, v nat) { } if ujn != vn1 { var rhat Word - qhat, rhat = divWW(ujn, u[j+n-1], vn1) + qhat, rhat = divWW(ujn, u[j+n-1], vn1, rec) // x1 | x2 = q̂v_{n-2} vn2 := v[n-2]