1
0
mirror of https://github.com/golang/go synced 2024-09-29 20:34:36 -06:00

math/big: replace division with multiplication by reciprocal word

Division is much slower than multiplication. And the method of using
multiplication by multiplying reciprocal and replacing division with it
can increase the speed of divWVW algorithm by three times,and at the
same time increase the speed of nats division.

The benchmark test on arm64 is as follows:
name                     old time/op    new time/op    delta
DivWVW/1-4                 13.1ns ± 4%    13.3ns ± 4%      ~     (p=0.444 n=5+5)
DivWVW/2-4                 48.6ns ± 1%    51.2ns ± 2%    +5.39%  (p=0.008 n=5+5)
DivWVW/3-4                 82.0ns ± 1%    69.7ns ± 1%   -15.03%  (p=0.008 n=5+5)
DivWVW/4-4                  116ns ± 1%      71ns ± 2%   -38.88%  (p=0.008 n=5+5)
DivWVW/5-4                  152ns ± 1%      84ns ± 4%   -44.70%  (p=0.008 n=5+5)
DivWVW/10-4                 319ns ± 1%     155ns ± 4%   -51.50%  (p=0.008 n=5+5)
DivWVW/100-4               3.44µs ± 3%    1.30µs ± 8%   -62.30%  (p=0.008 n=5+5)
DivWVW/1000-4              33.8µs ± 0%    10.9µs ± 1%   -67.74%  (p=0.008 n=5+5)
DivWVW/10000-4              343µs ± 4%     111µs ± 5%   -67.63%  (p=0.008 n=5+5)
DivWVW/100000-4            3.35ms ± 1%    1.25ms ± 3%   -62.79%  (p=0.008 n=5+5)
QuoRem-4                   3.08µs ± 2%    2.21µs ± 4%   -28.40%  (p=0.008 n=5+5)
ModSqrt225_Tonelli-4        444µs ± 2%     457µs ± 3%      ~     (p=0.095 n=5+5)
ModSqrt225_3Mod4-4          136µs ± 1%     138µs ± 3%      ~     (p=0.151 n=5+5)
ModSqrt231_Tonelli-4        473µs ± 3%     483µs ± 4%      ~     (p=0.548 n=5+5)
ModSqrt231_5Mod8-4          164µs ± 9%     169µs ±12%      ~     (p=0.421 n=5+5)
Sqrt-4                     36.8µs ± 1%    28.6µs ± 0%   -22.17%  (p=0.016 n=5+4)
Div/20/10-4                50.0ns ± 3%    51.3ns ± 6%      ~     (p=0.238 n=5+5)
Div/40/20-4                49.8ns ± 2%    51.3ns ± 6%      ~     (p=0.222 n=5+5)
Div/100/50-4               85.8ns ± 4%    86.5ns ± 5%	   ~     (p=0.246 n=5+5)
Div/200/100-4               335ns ± 3%     296ns ± 2%   -11.60%  (p=0.008 n=5+5)
Div/400/200-4               442ns ± 2%     359ns ± 5%   -18.81%  (p=0.008 n=5+5)
Div/1000/500-4              858ns ± 3%     643ns ± 6%   -25.06%  (p=0.008 n=5+5)
Div/2000/1000-4            1.70µs ± 3%    1.28µs ± 4%   -24.80%  (p=0.008 n=5+5)
Div/20000/10000-4          45.0µs ± 5%    41.8µs ± 4%    -7.17%  (p=0.016 n=5+5)
Div/200000/100000-4        1.51ms ± 7%    1.43ms ± 3%    -5.42%  (p=0.016 n=5+5)
Div/2000000/1000000-4      57.6ms ± 4%    57.5ms ± 3%      ~     (p=1.000 n=5+5)
Div/20000000/10000000-4     2.08s ± 3%     2.04s ± 1%      ~     (p=0.095 n=5+5)

name                     old speed      new speed      delta
DivWVW/1-4               4.87GB/s ± 4%  4.80GB/s ± 4%      ~     (p=0.310 n=5+5)
DivWVW/2-4               2.63GB/s ± 1%  2.50GB/s ± 2%    -5.07%  (p=0.008 n=5+5)
DivWVW/3-4               2.34GB/s ± 1%  2.76GB/s ± 1%   +17.70%  (p=0.008 n=5+5)
DivWVW/4-4               2.21GB/s ± 1%  3.61GB/s ± 2%   +63.42%  (p=0.008 n=5+5)
DivWVW/5-4               2.10GB/s ± 2%  3.81GB/s ± 4%   +80.89%  (p=0.008 n=5+5)
DivWVW/10-4              2.01GB/s ± 0%  4.13GB/s ± 4%  +105.91%  (p=0.008 n=5+5)
DivWVW/100-4             1.86GB/s ± 2%  4.95GB/s ± 7%  +165.63%  (p=0.008 n=5+5)
DivWVW/1000-4            1.89GB/s ± 0%  5.86GB/s ± 1%  +209.96%  (p=0.008 n=5+5)
DivWVW/10000-4           1.87GB/s ± 4%  5.76GB/s ± 5%  +208.96%  (p=0.008 n=5+5)
DivWVW/100000-4          1.91GB/s ± 1%  5.14GB/s ± 3%  +168.85%  (p=0.008 n=5+5)

Change-Id: I049f1196562b20800e6ef8a6493fd147f93ad830
Reviewed-on: https://go-review.googlesource.com/c/go/+/250417
Trust: Giovanni Bajo <rasky@develer.com>
Trust: Keith Randall <khr@golang.org>
Run-TryBot: Giovanni Bajo <rasky@develer.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
SparrowLii 2020-08-25 16:33:50 +08:00 committed by Giovanni Bajo
parent b6632f770f
commit d54a9a9c42
16 changed files with 138 additions and 193 deletions

View File

@ -4022,11 +4022,6 @@ func init() {
return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1])
},
sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64LE, sys.ArchPPC64, sys.ArchS390X)
add("math/big", "divWW",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2])
},
sys.ArchAMD64)
}
// findIntrinsic returns a function which builds the SSA equivalent of the

View File

@ -60,12 +60,6 @@ func nlz(x Word) uint {
return uint(bits.LeadingZeros(uint(x)))
}
// q = (u1<<_W + u0 - r)/v
func divWW_g(u1, u0, v Word) (q, r Word) {
qq, rr := bits.Div(uint(u1), uint(u0), uint(v))
return Word(qq), Word(rr)
}
// The resulting carry c is either 0 or 1.
func addVV_g(z, x, y []Word) (c Word) {
// The comment near the top of this file discusses this for loop condition.
@ -207,10 +201,87 @@ func addMulVVW_g(z, x []Word, y Word) (c Word) {
return
}
func divWVW_g(z []Word, xn Word, x []Word, y Word) (r Word) {
r = xn
for i := len(z) - 1; i >= 0; i-- {
z[i], r = divWW_g(r, x[i], y)
// q = ( x1 << _W + x0 - r)/y. m = floor(( _B^2 - 1 ) / d - _B). Requiring x1<y.
// An approximate reciprocal with a reference to "Improved Division by Invariant Integers
// (IEEE Transactions on Computers, 11 Jun. 2010)"
func divWW(x1, x0, y, m Word) (q, r Word) {
s := nlz(y)
if s != 0 {
x1 = x1<<s | x0>>(_W-s)
x0 <<= s
y <<= s
}
return
d := uint(y)
// We know that
// m = ⎣(B^2-1)/d⎦-B
// ⎣(B^2-1)/d⎦ = m+B
// (B^2-1)/d = m+B+delta1 0 <= delta1 <= (d-1)/d
// B^2/d = m+B+delta2 0 <= delta2 <= 1
// The quotient we're trying to compute is
// quotient = ⎣(x1*B+x0)/d⎦
// = ⎣(x1*B*(B^2/d)+x0*(B^2/d))/B^2⎦
// = ⎣(x1*B*(m+B+delta2)+x0*(m+B+delta2))/B^2⎦
// = ⎣(x1*m+x1*B+x0)/B + x0*m/B^2 + delta2*(x1*B+x0)/B^2⎦
// The latter two terms of this three-term sum are between 0 and 1.
// So we can compute just the first term, and we will be low by at most 2.
t1, t0 := bits.Mul(uint(m), uint(x1))
_, c := bits.Add(t0, uint(x0), 0)
t1, _ = bits.Add(t1, uint(x1), c)
// The quotient is either t1, t1+1, or t1+2.
// We'll try t1 and adjust if needed.
qq := t1
// compute remainder r=x-d*q.
dq1, dq0 := bits.Mul(d, qq)
r0, b := bits.Sub(uint(x0), dq0, 0)
r1, _ := bits.Sub(uint(x1), dq1, b)
// The remainder we just computed is bounded above by B+d:
// r = x1*B + x0 - d*q.
// = x1*B + x0 - d*⎣(x1*m+x1*B+x0)/B⎦
// = x1*B + x0 - d*((x1*m+x1*B+x0)/B-alpha) 0 <= alpha < 1
// = x1*B + x0 - x1*d/B*m - x1*d - x0*d/B + d*alpha
// = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦ - x1*d - x0*d/B + d*alpha
// = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦ - x1*d - x0*d/B + d*alpha
// = x1*B + x0 - x1*d/B*((B^2-1)/d-B-beta) - x1*d - x0*d/B + d*alpha 0 <= beta < 1
// = x1*B + x0 - x1*B + x1/B + x1*d + x1*d/B*beta - x1*d - x0*d/B + d*alpha
// = x0 + x1/B + x1*d/B*beta - x0*d/B + d*alpha
// = x0*(1-d/B) + x1*(1+d*beta)/B + d*alpha
// < B*(1-d/B) + d*B/B + d because x0<B (and 1-d/B>0), x1<d, 1+d*beta<=B, alpha<1
// = B - d + d + d
// = B+d
// So r1 can only be 0 or 1. If r1 is 1, then we know q was too small.
// Add 1 to q and subtract d from r. That guarantees that r is <B, so
// we no longer need to keep track of r1.
if r1 != 0 {
qq++
r0 -= d
}
// If the remainder is still too large, increment q one more time.
if r0 >= d {
qq++
r0 -= d
}
return Word(qq), Word(r0 >> s)
}
func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) {
r = xn
if len(x) == 1 {
qq, rr := bits.Div(uint(r), uint(x[0]), uint(y))
z[0] = Word(qq)
return Word(rr)
}
rec := reciprocalWord(y)
for i := len(z) - 1; i >= 0; i-- {
z[i], r = divWW(r, x[i], y, rec)
}
return r
}
// reciprocalWord return the reciprocal of the divisor. rec = floor(( _B^2 - 1 ) / u - _B). u = d1 << nlz(d1).
func reciprocalWord(d1 Word) Word {
u := uint(d1 << nlz(d1))
x1 := ^u
x0 := uint(_M)
rec, _ := bits.Div(x1, x0, u) // (_B^2-1)/U-_B = (_B*(_M-C)+_M)/U
return Word(rec)
}

View File

@ -18,16 +18,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0
RET
// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB),NOSPLIT,$0
MOVL x1+0(FP), DX
MOVL x0+4(FP), AX
DIVL y+8(FP)
MOVL AX, q+12(FP)
MOVL DX, r+16(FP)
RET
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),NOSPLIT,$0
MOVL z+0(FP), DI
@ -251,21 +241,4 @@ E6: CMPL BX, $0 // i < 0
RET
// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL xn+12(FP), DX // r = xn
MOVL x+16(FP), SI
MOVL y+28(FP), CX
MOVL z_len+4(FP), BX // i = z
JMP E7
L7: MOVL (SI)(BX*4), AX
DIVL CX
MOVL AX, (DI)(BX*4)
E7: SUBL $1, BX // i--
JGE L7 // i >= 0
MOVL DX, r+32(FP)
RET

View File

@ -18,14 +18,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0
RET
// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB),NOSPLIT,$0
MOVQ x1+0(FP), DX
MOVQ x0+8(FP), AX
DIVQ y+16(FP)
MOVQ AX, q+24(FP)
MOVQ DX, r+32(FP)
RET
// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
@ -531,21 +523,3 @@ adx_short:
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),NOSPLIT,$0
MOVQ z+0(FP), R10
MOVQ xn+24(FP), DX // r = xn
MOVQ x+32(FP), R8
MOVQ y+56(FP), R9
MOVQ z_len+8(FP), BX // i = z
JMP E7
L7: MOVQ (R8)(BX*8), AX
DIVQ R9
MOVQ AX, (R10)(BX*8)
E7: SUBQ $1, BX // i--
JGE L7 // i >= 0
MOVQ DX, r+64(FP)
RET

View File

@ -272,17 +272,6 @@ E9:
RET
// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),NOSPLIT,$0
// ARM has no multiword division, so use portable code.
B ·divWVW_g(SB)
// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB),NOSPLIT,$0
// ARM has no multiword division, so use portable code.
B ·divWW_g(SB)
// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),NOSPLIT,$0

View File

@ -23,11 +23,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0
RET
// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB),NOSPLIT,$0
B ·divWW_g(SB) // ARM64 has no multiword division
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),NOSPLIT,$0
MOVD z_len+8(FP), R0
@ -585,6 +580,4 @@ done:
MOVD R4, c+56(FP)
RET
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),NOSPLIT,$0
B ·divWVW_g(SB)

View File

@ -8,7 +8,6 @@ package big
// implemented in arith_$GOARCH.s
func mulWW(x, y Word) (z1, z0 Word)
func divWW(x1, x0, y Word) (q, r Word)
func addVV(z, x, y []Word) (c Word)
func subVV(z, x, y []Word) (c Word)
func addVW(z, x []Word, y Word) (c Word)
@ -17,4 +16,3 @@ func shlVU(z, x []Word, s uint) (c Word)
func shrVU(z, x []Word, s uint) (c Word)
func mulAddVWW(z, x []Word, y, r Word) (c Word)
func addMulVVW(z, x []Word, y Word) (c Word)
func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)

View File

@ -10,10 +10,6 @@ func mulWW(x, y Word) (z1, z0 Word) {
return mulWW_g(x, y)
}
func divWW(x1, x0, y Word) (q, r Word) {
return divWW_g(x1, x0, y)
}
func addVV(z, x, y []Word) (c Word) {
return addVV_g(z, x, y)
}
@ -55,7 +51,3 @@ func mulAddVWW(z, x []Word, y, r Word) (c Word) {
func addMulVVW(z, x []Word, y Word) (c Word) {
return addMulVVW_g(z, x, y)
}
func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) {
return divWVW_g(z, xn, x, y)
}

View File

@ -12,9 +12,6 @@
TEXT ·mulWW(SB),NOSPLIT,$0
JMP ·mulWW_g(SB)
TEXT ·divWW(SB),NOSPLIT,$0
JMP ·divWW_g(SB)
TEXT ·addVV(SB),NOSPLIT,$0
JMP ·addVV_g(SB)
@ -39,5 +36,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
TEXT ·addMulVVW(SB),NOSPLIT,$0
JMP ·addMulVVW_g(SB)
TEXT ·divWVW(SB),NOSPLIT,$0
JMP ·divWVW_g(SB)

View File

@ -12,9 +12,6 @@
TEXT ·mulWW(SB),NOSPLIT,$0
JMP ·mulWW_g(SB)
TEXT ·divWW(SB),NOSPLIT,$0
JMP ·divWW_g(SB)
TEXT ·addVV(SB),NOSPLIT,$0
JMP ·addVV_g(SB)
@ -39,5 +36,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
TEXT ·addMulVVW(SB),NOSPLIT,$0
JMP ·addMulVVW_g(SB)
TEXT ·divWVW(SB),NOSPLIT,$0
JMP ·divWVW_g(SB)

View File

@ -478,44 +478,4 @@ done:
MOVD R4, c+56(FP)
RET
// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB), NOSPLIT, $0
MOVD x1+0(FP), R4
MOVD x0+8(FP), R5
MOVD y+16(FP), R6
CMPU R4, R6
BGE divbigger
// from the programmer's note in ch. 3 of the ISA manual, p.74
DIVDEU R6, R4, R3
DIVDU R6, R5, R7
MULLD R6, R3, R8
MULLD R6, R7, R20
SUB R20, R5, R10
ADD R7, R3, R3
SUB R8, R10, R4
CMPU R4, R10
BLT adjust
CMPU R4, R6
BLT end
adjust:
MOVD $1, R21
ADD R21, R3, R3
SUB R6, R4, R4
end:
MOVD R3, q+24(FP)
MOVD R4, r+32(FP)
RET
divbigger:
MOVD $-1, R7
MOVD R7, q+24(FP)
MOVD R7, r+32(FP)
RET
TEXT ·divWVW(SB), NOSPLIT, $0
BR ·divWVW_g(SB)

View File

@ -19,9 +19,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0
MOV X8, z0+24(FP)
RET
// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB),NOSPLIT,$0
JMP ·divWW_g(SB) // riscv64 has no multiword division
TEXT ·addVV(SB),NOSPLIT,$0
JMP ·addVV_g(SB)
@ -47,5 +44,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
TEXT ·addMulVVW(SB),NOSPLIT,$0
JMP ·addMulVVW_g(SB)
TEXT ·divWVW(SB),NOSPLIT,$0
JMP ·divWVW_g(SB)

View File

@ -17,15 +17,6 @@ TEXT ·mulWW(SB), NOSPLIT, $0
MOVD R11, z0+24(FP)
RET
// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB), NOSPLIT, $0
MOVD x1+0(FP), R10
MOVD x0+8(FP), R11
MOVD y+16(FP), R5
WORD $0xb98700a5 // dlgr r10,r5
MOVD R11, q+24(FP)
MOVD R10, r+32(FP)
RET
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func addVV(z, x, y []Word) (c Word)
@ -990,27 +981,3 @@ E6:
MOVD R4, c+56(FP)
RET
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i
TEXT ·divWVW(SB), NOSPLIT, $0
MOVD z+0(FP), R2
MOVD xn+24(FP), R10 // r = xn
MOVD x+32(FP), R8
MOVD y+56(FP), R9
MOVD z_len+8(FP), R7 // i = z
SLD $3, R7, R1 // i*8
MOVD $0, R0 // make sure it's zero
BR E7
L7:
MOVD (R8)(R1*1), R11
WORD $0xB98700A9 // DLGR R10,R9
MOVD R11, (R2)(R1*1)
E7:
SUB $1, R7 // i--
SUB $8, R1
BGE L7 // i >= 0
MOVD R10, r+64(FP)
RET

View File

@ -7,6 +7,7 @@ package big
import (
"fmt"
"internal/testenv"
"math/bits"
"math/rand"
"strings"
"testing"
@ -493,7 +494,6 @@ func TestFunVWW(t *testing.T) {
if a.y != 0 && a.r < a.y {
arg := argWVW{a.x, a.c, a.z, a.y, a.r}
testFunWVW(t, "divWVW_g", divWVW_g, arg)
testFunWVW(t, "divWVW", divWVW, arg)
}
}
@ -536,6 +536,42 @@ func TestMulAddWWW(t *testing.T) {
}
}
var divWWTests = []struct {
x1, x0, y Word
q, r Word
}{
{_M >> 1, 0, _M, _M >> 1, _M >> 1},
{_M - (1 << (_W - 2)), _M, 3 << (_W - 2), _M, _M - (1 << (_W - 2))},
}
const testsNumber = 1 << 16
func TestDivWW(t *testing.T) {
i := 0
for i, test := range divWWTests {
rec := reciprocalWord(test.y)
q, r := divWW(test.x1, test.x0, test.y, rec)
if q != test.q || r != test.r {
t.Errorf("#%d got (%x, %x) want (%x, %x)", i, q, r, test.q, test.r)
}
}
//random tests
for ; i < testsNumber; i++ {
x1 := rndW()
x0 := rndW()
y := rndW()
if x1 >= y {
continue
}
rec := reciprocalWord(y)
qGot, rGot := divWW(x1, x0, y, rec)
qWant, rWant := bits.Div(uint(x1), uint(x0), uint(y))
if uint(qGot) != qWant || uint(rGot) != rWant {
t.Errorf("#%d got (%x, %x) want (%x, %x)", i, qGot, rGot, qWant, rWant)
}
}
}
func BenchmarkMulAddVWW(b *testing.B) {
for _, n := range benchSizes {
if isRaceBuilder && n > 1e3 {
@ -570,3 +606,19 @@ func BenchmarkAddMulVVW(b *testing.B) {
})
}
}
func BenchmarkDivWVW(b *testing.B) {
for _, n := range benchSizes {
if isRaceBuilder && n > 1e3 {
continue
}
x := rndV(n)
y := rndW()
z := make([]Word, n)
b.Run(fmt.Sprint(n), func(b *testing.B) {
b.SetBytes(int64(n * _W))
for i := 0; i < b.N; i++ {
divWVW(z, 0, x, y)
}
})
}
}

View File

@ -9,9 +9,6 @@
TEXT ·mulWW(SB),NOSPLIT,$0
JMP ·mulWW_g(SB)
TEXT ·divWW(SB),NOSPLIT,$0
JMP ·divWW_g(SB)
TEXT ·addVV(SB),NOSPLIT,$0
JMP ·addVV_g(SB)
@ -36,5 +33,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
TEXT ·addMulVVW(SB),NOSPLIT,$0
JMP ·addMulVVW_g(SB)
TEXT ·divWVW(SB),NOSPLIT,$0
JMP ·divWVW_g(SB)

View File

@ -751,6 +751,7 @@ func (q nat) divBasic(u, v nat) {
// D2.
vn1 := v[n-1]
rec := reciprocalWord(vn1)
for j := m; j >= 0; j-- {
// D3.
qhat := Word(_M)
@ -760,7 +761,7 @@ func (q nat) divBasic(u, v nat) {
}
if ujn != vn1 {
var rhat Word
qhat, rhat = divWW(ujn, u[j+n-1], vn1)
qhat, rhat = divWW(ujn, u[j+n-1], vn1, rec)
// x1 | x2 = q̂v_{n-2}
vn2 := v[n-2]