mirror of
https://github.com/golang/go
synced 2024-11-20 05:04:43 -07:00
big: implemented overlap-tolerant shifts in assembly
- no need to make copies in cases of aliases - removed deprecated internal shift functions - minor unrelated simplifications This change improves pidigits -s -n10000 by almost 20%: user 0m6.156s (old) user 0m4.999s (new) (pidigits -s -n20000 goes from ~25s to ~19s) R=rsc CC=golang-dev https://golang.org/cl/1149041
This commit is contained in:
parent
fbf8d263f9
commit
3f287b5002
@ -325,10 +325,16 @@ func subVW_g(z, x *Word, y Word, n int) (c Word) {
|
||||
|
||||
func shlVW(z, x *Word, s Word, n int) (c Word)
|
||||
func shlVW_g(z, x *Word, s Word, n int) (c Word) {
|
||||
if n > 0 {
|
||||
ŝ := _W - s
|
||||
for i := 0; i < n; i++ {
|
||||
w := *x.at(i)
|
||||
c, *z.at(i) = w>>ŝ, w<<s|c
|
||||
w1 := *x.at(n - 1)
|
||||
c = w1 >> ŝ
|
||||
for i := n - 1; i > 0; i-- {
|
||||
w := w1
|
||||
w1 = *x.at(i - 1)
|
||||
*z.at(i) = w<<s | w1>>ŝ
|
||||
}
|
||||
*z.at(0) = w1 << s
|
||||
}
|
||||
return
|
||||
}
|
||||
@ -336,10 +342,16 @@ func shlVW_g(z, x *Word, s Word, n int) (c Word) {
|
||||
|
||||
func shrVW(z, x *Word, s Word, n int) (c Word)
|
||||
func shrVW_g(z, x *Word, s Word, n int) (c Word) {
|
||||
if n > 0 {
|
||||
ŝ := _W - s
|
||||
for i := n - 1; i >= 0; i-- {
|
||||
w := *x.at(i)
|
||||
c, *z.at(i) = w<<ŝ, w>>s|c
|
||||
w1 := *x.at(0)
|
||||
c = w1 << ŝ
|
||||
for i := 0; i < n-1; i++ {
|
||||
w := w1
|
||||
w1 = *x.at(i + 1)
|
||||
*z.at(i) = w>>s | w1<<ŝ
|
||||
}
|
||||
*z.at(n - 1) = w1 >> s
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -101,53 +101,73 @@ E4: CMPL BX, BP // i < n
|
||||
|
||||
// func shlVW(z, x *Word, s Word, n int) (c Word)
|
||||
TEXT ·shlVW(SB),7,$0
|
||||
MOVL n+12(FP), BX // i = n
|
||||
SUBL $1, BX // i--
|
||||
JL X8b // i < 0 (n <= 0)
|
||||
|
||||
// n > 0
|
||||
MOVL z+0(FP), DI
|
||||
MOVL x+4(FP), SI
|
||||
MOVL s+8(FP), CX
|
||||
MOVL n+12(FP), BX
|
||||
LEAL (DI)(BX*4), DI
|
||||
LEAL (SI)(BX*4), SI
|
||||
NEGL BX // i = -n
|
||||
MOVL $0, AX // c = 0
|
||||
JMP E8
|
||||
|
||||
L8: MOVL (SI)(BX*4), DX
|
||||
MOVL DX, BP
|
||||
SHLL CX, DX:AX
|
||||
MOVL DX, (DI)(BX*4)
|
||||
MOVL BP, AX
|
||||
ADDL $1, BX // i++
|
||||
|
||||
E8: CMPL BX, $0 // i < 0
|
||||
JL L8
|
||||
|
||||
MOVL (SI)(BX*4), AX // w1 = x[n-1]
|
||||
MOVL $0, DX
|
||||
SHLL CX, DX:AX
|
||||
SHLL CX, DX:AX // w1>>ŝ
|
||||
MOVL DX, c+16(FP)
|
||||
|
||||
CMPL BX, $0
|
||||
JLE X8a // i <= 0
|
||||
|
||||
// i > 0
|
||||
L8: MOVL AX, DX // w = w1
|
||||
MOVL -4(SI)(BX*4), AX // w1 = x[i-1]
|
||||
SHLL CX, DX:AX // w<<s | w1>>ŝ
|
||||
MOVL DX, (DI)(BX*4) // z[i] = w<<s | w1>>ŝ
|
||||
SUBL $1, BX // i--
|
||||
JG L8 // i > 0
|
||||
|
||||
// i <= 0
|
||||
X8a: SHLL CX, AX // w1<<s
|
||||
MOVL AX, (DI) // z[0] = w1<<s
|
||||
RET
|
||||
|
||||
X8b: MOVL $0, c+16(FP)
|
||||
RET
|
||||
|
||||
|
||||
// func shrVW(z, x *Word, s Word, n int) (c Word)
|
||||
TEXT ·shrVW(SB),7,$0
|
||||
MOVL n+24(FP), BP
|
||||
SUBL $1, BP // n--
|
||||
JL X9b // n < 0 (n <= 0)
|
||||
|
||||
// n > 0
|
||||
MOVL z+0(FP), DI
|
||||
MOVL x+4(FP), SI
|
||||
MOVL s+8(FP), CX
|
||||
MOVL n+12(FP), BX // i = n
|
||||
MOVL $0, AX // c = 0
|
||||
MOVL (SI), AX // w1 = x[0]
|
||||
MOVL $0, DX
|
||||
SHRL CX, DX:AX // w1<<ŝ
|
||||
MOVL DX, c+16(FP)
|
||||
|
||||
MOVL $0, BX // i = 0
|
||||
JMP E9
|
||||
|
||||
L9: MOVL (SI)(BX*4), DX
|
||||
MOVL DX, BP
|
||||
SHRL CX, DX:AX
|
||||
MOVL DX, (DI)(BX*4)
|
||||
MOVL BP, AX
|
||||
// i < n-1
|
||||
L9: MOVL AX, DX // w = w1
|
||||
MOVL 4(SI)(BX*4), AX // w1 = x[i+1]
|
||||
SHRL CX, DX:AX // w>>s | w1<<ŝ
|
||||
MOVL DX, (DI)(BX*4) // z[i] = w>>s | w1<<ŝ
|
||||
ADDL $1, BX // i++
|
||||
|
||||
E9: SUBL $1, BX // i--
|
||||
JGE L9
|
||||
E9: CMPL BX, BP
|
||||
JL L9 // i < n-1
|
||||
|
||||
MOVL $0, DX
|
||||
SHRL CX, DX:AX
|
||||
MOVL DX, c+16(FP)
|
||||
// i >= n-1
|
||||
X9a: SHRL CX, AX // w1>>s
|
||||
MOVL AX, (DI)(BP*4) // z[n-1] = w1>>s
|
||||
RET
|
||||
|
||||
X9b: MOVL $0, c+16(FP)
|
||||
RET
|
||||
|
||||
|
||||
|
@ -103,47 +103,93 @@ E4: CMPQ BX, R11 // i < n
|
||||
|
||||
// func shlVW(z, x *Word, s Word, n int) (c Word)
|
||||
TEXT ·shlVW(SB),7,$0
|
||||
MOVL n+24(FP), BX // i = n
|
||||
SUBL $1, BX // i--
|
||||
JL X8b // i < 0 (n <= 0)
|
||||
|
||||
// n > 0
|
||||
MOVQ z+0(FP), R10
|
||||
MOVQ x+8(FP), R8
|
||||
MOVQ s+16(FP), CX
|
||||
MOVL n+24(FP), R11
|
||||
MOVQ $0, AX // c = 0
|
||||
MOVQ $0, BX // i = 0
|
||||
JMP E8
|
||||
|
||||
L8: MOVQ (R8)(BX*8), DX
|
||||
MOVQ DX, R12
|
||||
SHLQ CX, DX:AX
|
||||
MOVQ DX, (R10)(BX*8)
|
||||
MOVQ R12, AX
|
||||
ADDL $1, BX // i++
|
||||
|
||||
E8: CMPQ BX, R11 // i < n
|
||||
JL L8
|
||||
|
||||
MOVQ (R8)(BX*8), AX // w1 = x[n-1]
|
||||
MOVQ $0, DX
|
||||
SHLQ CX, DX:AX
|
||||
SHLQ CX, DX:AX // w1>>ŝ
|
||||
MOVQ DX, c+32(FP)
|
||||
|
||||
CMPL BX, $0
|
||||
JLE X8a // i <= 0
|
||||
|
||||
// i > 0
|
||||
L8: MOVQ AX, DX // w = w1
|
||||
MOVQ -8(R8)(BX*8), AX // w1 = x[i-1]
|
||||
SHLQ CX, DX:AX // w<<s | w1>>ŝ
|
||||
MOVQ DX, (R10)(BX*8) // z[i] = w<<s | w1>>ŝ
|
||||
SUBL $1, BX // i--
|
||||
JG L8 // i > 0
|
||||
|
||||
// i <= 0
|
||||
X8a: SHLQ CX, AX // w1<<s
|
||||
MOVQ AX, (R10) // z[0] = w1<<s
|
||||
RET
|
||||
|
||||
X8b: MOVQ $0, c+32(FP)
|
||||
RET
|
||||
|
||||
|
||||
// func shrVW(z, x *Word, s Word, n int) (c Word)
|
||||
TEXT ·shrVW(SB),7,$0
|
||||
MOVL n+24(FP), R11
|
||||
SUBL $1, R11 // n--
|
||||
JL X9b // n < 0 (n <= 0)
|
||||
|
||||
// n > 0
|
||||
MOVQ z+0(FP), R10
|
||||
MOVQ x+8(FP), R8
|
||||
MOVQ s+16(FP), CX
|
||||
MOVQ (R8), AX // w1 = x[0]
|
||||
MOVQ $0, DX
|
||||
SHRQ CX, DX:AX // w1<<ŝ
|
||||
MOVQ DX, c+32(FP)
|
||||
|
||||
MOVQ $0, BX // i = 0
|
||||
JMP E9
|
||||
|
||||
// i < n-1
|
||||
L9: MOVQ AX, DX // w = w1
|
||||
MOVQ 8(R8)(BX*8), AX // w1 = x[i+1]
|
||||
SHRQ CX, DX:AX // w>>s | w1<<ŝ
|
||||
MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ
|
||||
ADDL $1, BX // i++
|
||||
|
||||
E9: CMPQ BX, R11
|
||||
JL L9 // i < n-1
|
||||
|
||||
// i >= n-1
|
||||
X9a: SHRQ CX, AX // w1>>s
|
||||
MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s
|
||||
RET
|
||||
|
||||
X9b: MOVQ $0, c+32(FP)
|
||||
RET
|
||||
|
||||
|
||||
// func shrVW(z, x *Word, s Word, n int) (c Word)
|
||||
TEXT ·shrVW_(SB),7,$0
|
||||
MOVQ z+0(FP), R10
|
||||
MOVQ x+8(FP), R8
|
||||
MOVQ s+16(FP), CX
|
||||
MOVL n+24(FP), BX // i = n
|
||||
MOVQ $0, AX // c = 0
|
||||
JMP E9
|
||||
JMP E9_
|
||||
|
||||
L9: MOVQ (R8)(BX*8), DX
|
||||
L9_: MOVQ (R8)(BX*8), DX
|
||||
MOVQ DX, R12
|
||||
SHRQ CX, DX:AX
|
||||
MOVQ DX, (R10)(BX*8)
|
||||
MOVQ R12, AX
|
||||
|
||||
E9: SUBL $1, BX // i--
|
||||
JGE L9
|
||||
E9_: SUBL $1, BX // i--
|
||||
JGE L9_
|
||||
|
||||
MOVQ $0, DX
|
||||
SHRQ CX, DX:AX
|
||||
|
@ -60,6 +60,11 @@ func (z nat) norm() nat {
|
||||
}
|
||||
|
||||
|
||||
// TODO(gri) Consider changing "make" such that is does not reserve space
|
||||
// for a potential carry; instead callers must provide the correct
|
||||
// m (+1). Should lead to clearer code and shorter allocations on
|
||||
// average.
|
||||
|
||||
func (z nat) make(m int) nat {
|
||||
if cap(z) > m {
|
||||
return z[0:m] // reuse z - has at least one extra word for a carry, if any
|
||||
@ -219,11 +224,7 @@ func (z nat) mulAddWW(x nat, y, r Word) nat {
|
||||
// basicMul multiplies x and y and leaves the result in z.
|
||||
// The (non-normalized) result is placed in z[0 : len(x) + len(y)].
|
||||
func basicMul(z, x, y nat) {
|
||||
// initialize z
|
||||
for i := range z[0 : len(x)+len(y)] {
|
||||
z[i] = 0
|
||||
}
|
||||
// multiply
|
||||
z[0 : len(x)+len(y)].clear() // initialize z
|
||||
for i, d := range y {
|
||||
if d != 0 {
|
||||
z[len(x)+i] = addMulVVW(&z[i], &x[0], d, len(x))
|
||||
@ -534,28 +535,26 @@ func (z nat) div(z2, u, v nat) (q, r nat) {
|
||||
|
||||
|
||||
// q = (uIn-r)/v, with 0 <= r < y
|
||||
// Uses z as storage for q, and u as storage for r if possible.
|
||||
// See Knuth, Volume 2, section 4.3.1, Algorithm D.
|
||||
// Preconditions:
|
||||
// len(v) >= 2
|
||||
// len(uIn) >= len(v)
|
||||
func (z nat) divLarge(z2, uIn, v nat) (q, r nat) {
|
||||
func (z nat) divLarge(u, uIn, v nat) (q, r nat) {
|
||||
n := len(v)
|
||||
m := len(uIn) - len(v)
|
||||
m := len(uIn) - n
|
||||
|
||||
var u nat
|
||||
if z2 == nil || &z2[0] == &uIn[0] {
|
||||
u = u.make(len(uIn) + 1).clear() // uIn is an alias for z2
|
||||
} else {
|
||||
u = z2.make(len(uIn) + 1).clear()
|
||||
}
|
||||
qhatv := make(nat, len(v)+1)
|
||||
q = z.make(m + 1)
|
||||
qhatv := make(nat, n+1)
|
||||
if alias(u, uIn) {
|
||||
u = nil // u is an alias for uIn - cannot reuse
|
||||
}
|
||||
u = u.make(len(uIn) + 1).clear()
|
||||
|
||||
// D1.
|
||||
shift := leadingZeros(v[n-1])
|
||||
v.shiftLeftDeprecated(v, shift)
|
||||
u.shiftLeftDeprecated(uIn, shift)
|
||||
u[len(uIn)] = uIn[len(uIn)-1] >> (_W - shift)
|
||||
shift := Word(leadingZeros(v[n-1]))
|
||||
shlVW(&v[0], &v[0], shift, n)
|
||||
u[len(uIn)] = shlVW(&u[0], &uIn[0], shift, len(uIn))
|
||||
|
||||
// D2.
|
||||
for j := m; j >= 0; j-- {
|
||||
@ -583,12 +582,12 @@ func (z nat) divLarge(z2, uIn, v nat) (q, r nat) {
|
||||
}
|
||||
|
||||
// D4.
|
||||
qhatv[len(v)] = mulAddVWW(&qhatv[0], &v[0], qhat, 0, len(v))
|
||||
qhatv[n] = mulAddVWW(&qhatv[0], &v[0], qhat, 0, n)
|
||||
|
||||
c := subVV(&u[j], &u[j], &qhatv[0], len(qhatv))
|
||||
if c != 0 {
|
||||
c := addVV(&u[j], &u[j], &v[0], len(v))
|
||||
u[j+len(v)] += c
|
||||
c := addVV(&u[j], &u[j], &v[0], n)
|
||||
u[j+n] += c
|
||||
qhat--
|
||||
}
|
||||
|
||||
@ -596,8 +595,8 @@ func (z nat) divLarge(z2, uIn, v nat) (q, r nat) {
|
||||
}
|
||||
|
||||
q = q.norm()
|
||||
u.shiftRightDeprecated(u, shift)
|
||||
v.shiftRightDeprecated(v, shift)
|
||||
shrVW(&u[0], &u[0], shift, len(u))
|
||||
shrVW(&v[0], &v[0], shift, n)
|
||||
r = u.norm()
|
||||
|
||||
return q, r
|
||||
@ -755,15 +754,10 @@ func (z nat) shl(x nat, s uint) nat {
|
||||
}
|
||||
// m > 0
|
||||
|
||||
// determine if z can be reused
|
||||
// TODO(gri) change shlVW so we don't need this
|
||||
if alias(z, x) {
|
||||
z = nil // z is an alias for x - cannot reuse
|
||||
}
|
||||
|
||||
n := m + int(s/_W)
|
||||
z = z.make(n + 1)
|
||||
z[n] = shlVW(&z[n-m], &x[0], Word(s%_W), m)
|
||||
z[0 : n-m].clear()
|
||||
|
||||
return z.norm()
|
||||
}
|
||||
@ -778,12 +772,6 @@ func (z nat) shr(x nat, s uint) nat {
|
||||
}
|
||||
// n > 0
|
||||
|
||||
// determine if z can be reused
|
||||
// TODO(gri) change shrVW so we don't need this
|
||||
if alias(z, x) {
|
||||
z = nil // z is an alias for x - cannot reuse
|
||||
}
|
||||
|
||||
z = z.make(n)
|
||||
shrVW(&z[0], &x[m-n], Word(s%_W), n)
|
||||
|
||||
@ -791,48 +779,6 @@ func (z nat) shr(x nat, s uint) nat {
|
||||
}
|
||||
|
||||
|
||||
// TODO(gri) Remove these shift functions once shlVW and shrVW can be
|
||||
// used directly in divLarge and powersOfTwoDecompose
|
||||
//
|
||||
// To avoid losing the top n bits, z should be sized so that
|
||||
// len(z) == len(x) + 1.
|
||||
func (z nat) shiftLeftDeprecated(x nat, n uint) nat {
|
||||
if len(x) == 0 {
|
||||
return x
|
||||
}
|
||||
|
||||
ñ := _W - n
|
||||
m := x[len(x)-1]
|
||||
if len(z) > len(x) {
|
||||
z[len(x)] = m >> ñ
|
||||
}
|
||||
for i := len(x) - 1; i >= 1; i-- {
|
||||
y := x[i-1]
|
||||
z[i] = m<<n | y>>ñ
|
||||
m = y
|
||||
}
|
||||
z[0] = m << n
|
||||
return z
|
||||
}
|
||||
|
||||
|
||||
func (z nat) shiftRightDeprecated(x nat, n uint) nat {
|
||||
if len(x) == 0 {
|
||||
return x
|
||||
}
|
||||
|
||||
ñ := _W - n
|
||||
m := x[0]
|
||||
for i := 0; i < len(x)-1; i++ {
|
||||
y := x[i+1]
|
||||
z[i] = m>>n | y<<ñ
|
||||
m = y
|
||||
}
|
||||
z[len(x)-1] = m >> n
|
||||
return z
|
||||
}
|
||||
|
||||
|
||||
func (z nat) and(x, y nat) nat {
|
||||
m := len(x)
|
||||
n := len(y)
|
||||
@ -936,7 +882,7 @@ func (n nat) powersOfTwoDecompose() (q nat, k Word) {
|
||||
x := trailingZeroBits(n[zeroWords])
|
||||
|
||||
q = q.make(len(n) - zeroWords)
|
||||
q.shiftRightDeprecated(n[zeroWords:], uint(x))
|
||||
shrVW(&q[0], &n[zeroWords], Word(x), len(q))
|
||||
q = q.norm()
|
||||
|
||||
k = Word(_W*zeroWords + x)
|
||||
|
Loading…
Reference in New Issue
Block a user