mirror of
https://github.com/golang/go
synced 2024-11-17 10:54:50 -07:00
math/big,crypto/internal/bigmod: unroll loop in addMulVVW for ppc64x
This updates the assembly implementation of AddMulVVW to unroll the main loop to do 64 bytes at a time. The code for addMulVVWx is based on the same code and has also been updated to improve performance. goos: linux goarch: ppc64le pkg: crypto/internal/bigmod cpu: POWER10 │ bg.orig.out │ bg.out │ │ sec/op │ sec/op vs base │ ModAdd 116.3n ± 0% 116.9n ± 0% +0.52% (p=0.002 n=6) ModSub 111.5n ± 0% 111.5n ± 0% 0.00% (p=0.273 n=6) MontgomeryRepr 2.195µ ± 0% 1.944µ ± 0% -11.44% (p=0.002 n=6) MontgomeryMul 2.195µ ± 0% 1.943µ ± 0% -11.48% (p=0.002 n=6) ModMul 4.418µ ± 0% 3.900µ ± 0% -11.72% (p=0.002 n=6) ExpBig 5.736m ± 0% 5.117m ± 0% -10.78% (p=0.002 n=6) Exp 5.891m ± 0% 5.237m ± 0% -11.11% (p=0.002 n=6) geomean 9.901µ 9.094µ -8.15% goos: linux goarch: ppc64le pkg: math/big cpu: POWER10 │ am.orig.out │ am.out │ │ sec/op │ sec/op vs base │ AddMulVVW/1 4.456n ± 1% 3.565n ± 0% -20.00% (p=0.002 n=6) AddMulVVW/2 4.875n ± 1% 5.938n ± 1% +21.79% (p=0.002 n=6) AddMulVVW/3 5.484n ± 0% 5.693n ± 0% +3.80% (p=0.002 n=6) AddMulVVW/4 6.370n ± 0% 6.065n ± 0% -4.79% (p=0.002 n=6) AddMulVVW/5 7.321n ± 0% 7.188n ± 0% -1.82% (p=0.002 n=6) AddMulVVW/10 12.26n ± 8% 11.41n ± 0% -6.97% (p=0.002 n=6) AddMulVVW/100 100.70n ± 0% 93.58n ± 0% -7.08% (p=0.002 n=6) AddMulVVW/1000 938.6n ± 0% 845.5n ± 0% -9.92% (p=0.002 n=6) AddMulVVW/10000 9.459µ ± 0% 8.415µ ± 0% -11.04% (p=0.002 n=6) AddMulVVW/100000 94.57µ ± 0% 84.01µ ± 0% -11.16% (p=0.002 n=6) geomean 75.17n 71.21n -5.27% Change-Id: Idd79f5f02387564f4c2cc28d50b1c12bcd9a400f Reviewed-on: https://go-review.googlesource.com/c/go/+/557915 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Paul Murphy <murp@ibm.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Filippo Valsorda <filippo@golang.org>
This commit is contained in:
parent
67d30fc315
commit
4fde3ef2ac
@ -8,44 +8,75 @@
|
||||
|
||||
// func addMulVVW1024(z, x *uint, y uint) (c uint)
|
||||
TEXT ·addMulVVW1024(SB), $0-32
|
||||
MOVD $16, R22 // R22 = z_len
|
||||
JMP addMulVVWx(SB)
|
||||
MOVD $4, R6 // R6 = z_len/4
|
||||
JMP addMulVVWx<>(SB)
|
||||
|
||||
// func addMulVVW1536(z, x *uint, y uint) (c uint)
|
||||
TEXT ·addMulVVW1536(SB), $0-32
|
||||
MOVD $24, R22 // R22 = z_len
|
||||
JMP addMulVVWx(SB)
|
||||
MOVD $6, R6 // R6 = z_len/4
|
||||
JMP addMulVVWx<>(SB)
|
||||
|
||||
// func addMulVVW2048(z, x *uint, y uint) (c uint)
|
||||
TEXT ·addMulVVW2048(SB), $0-32
|
||||
MOVD $32, R22 // R22 = z_len
|
||||
JMP addMulVVWx(SB)
|
||||
MOVD $8, R6 // R6 = z_len/4
|
||||
JMP addMulVVWx<>(SB)
|
||||
|
||||
TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
|
||||
MOVD z+0(FP), R10 // R10 = z[]
|
||||
MOVD x+8(FP), R8 // R8 = x[]
|
||||
MOVD y+16(FP), R9 // R9 = y
|
||||
// This local function expects to be called only by
|
||||
// callers above. R6 contains the z length/4
|
||||
// since 4 values are processed for each
|
||||
// loop iteration, and is guaranteed to be > 0.
|
||||
// If other callers are added this function might
|
||||
// need to change.
|
||||
TEXT addMulVVWx<>(SB), NOSPLIT, $0
|
||||
MOVD z+0(FP), R3
|
||||
MOVD x+8(FP), R4
|
||||
MOVD y+16(FP), R5
|
||||
|
||||
MOVD R0, R3 // R3 will be the index register
|
||||
CMP R0, R22
|
||||
MOVD R0, R4 // R4 = c = 0
|
||||
MOVD R22, CTR // Initialize loop counter
|
||||
BEQ done
|
||||
PCALIGN $16
|
||||
MOVD $0, R9 // R9 = c = 0
|
||||
MOVD R6, CTR // Initialize loop counter
|
||||
PCALIGN $16
|
||||
|
||||
loop:
|
||||
MOVD (R8)(R3), R20 // Load x[i]
|
||||
MOVD (R10)(R3), R21 // Load z[i]
|
||||
MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
|
||||
MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
|
||||
ADDC R21, R6 // R6 = z0
|
||||
ADDZE R7 // R7 = z1
|
||||
ADDC R4, R6 // R6 = z0 + c + 0
|
||||
ADDZE R7, R4 // c += z1
|
||||
MOVD R6, (R10)(R3) // Store z[i]
|
||||
ADD $8, R3
|
||||
BC 16, 0, loop // bdnz
|
||||
MOVD 0(R4), R14 // x[i]
|
||||
MOVD 8(R4), R16 // x[i+1]
|
||||
MOVD 16(R4), R18 // x[i+2]
|
||||
MOVD 24(R4), R20 // x[i+3]
|
||||
MOVD 0(R3), R15 // z[i]
|
||||
MOVD 8(R3), R17 // z[i+1]
|
||||
MOVD 16(R3), R19 // z[i+2]
|
||||
MOVD 24(R3), R21 // z[i+3]
|
||||
MULLD R5, R14, R10 // low x[i]*y
|
||||
MULHDU R5, R14, R11 // high x[i]*y
|
||||
ADDC R15, R10
|
||||
ADDZE R11
|
||||
ADDC R9, R10
|
||||
ADDZE R11, R9
|
||||
MULLD R5, R16, R14 // low x[i+1]*y
|
||||
MULHDU R5, R16, R15 // high x[i+1]*y
|
||||
ADDC R17, R14
|
||||
ADDZE R15
|
||||
ADDC R9, R14
|
||||
ADDZE R15, R9
|
||||
MULLD R5, R18, R16 // low x[i+2]*y
|
||||
MULHDU R5, R18, R17 // high x[i+2]*y
|
||||
ADDC R19, R16
|
||||
ADDZE R17
|
||||
ADDC R9, R16
|
||||
ADDZE R17, R9
|
||||
MULLD R5, R20, R18 // low x[i+3]*y
|
||||
MULHDU R5, R20, R19 // high x[i+3]*y
|
||||
ADDC R21, R18
|
||||
ADDZE R19
|
||||
ADDC R9, R18
|
||||
ADDZE R19, R9
|
||||
MOVD R10, 0(R3) // z[i]
|
||||
MOVD R14, 8(R3) // z[i+1]
|
||||
MOVD R16, 16(R3) // z[i+2]
|
||||
MOVD R18, 24(R3) // z[i+3]
|
||||
ADD $32, R3
|
||||
ADD $32, R4
|
||||
BDNZ loop
|
||||
|
||||
done:
|
||||
MOVD R4, c+24(FP)
|
||||
MOVD R9, c+24(FP)
|
||||
RET
|
||||
|
@ -599,33 +599,80 @@ done:
|
||||
|
||||
// func addMulVVW(z, x []Word, y Word) (c Word)
|
||||
TEXT ·addMulVVW(SB), NOSPLIT, $0
|
||||
MOVD z+0(FP), R10 // R10 = z[]
|
||||
MOVD x+24(FP), R8 // R8 = x[]
|
||||
MOVD y+48(FP), R9 // R9 = y
|
||||
MOVD z_len+8(FP), R22 // R22 = z_len
|
||||
MOVD z+0(FP), R3 // R3 = z[]
|
||||
MOVD x+24(FP), R4 // R4 = x[]
|
||||
MOVD y+48(FP), R5 // R5 = y
|
||||
MOVD z_len+8(FP), R6 // R6 = z_len
|
||||
|
||||
MOVD R0, R3 // R3 will be the index register
|
||||
CMP R0, R22
|
||||
MOVD R0, R4 // R4 = c = 0
|
||||
MOVD R22, CTR // Initialize loop counter
|
||||
BEQ done
|
||||
PCALIGN $16
|
||||
CMP R6, $4
|
||||
MOVD R0, R9 // R9 = c = 0
|
||||
BLT tail
|
||||
SRD $2, R6, R7
|
||||
MOVD R7, CTR // Initialize loop counter
|
||||
PCALIGN $16
|
||||
|
||||
loop:
|
||||
MOVD (R8)(R3), R20 // Load x[i]
|
||||
MOVD (R10)(R3), R21 // Load z[i]
|
||||
MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
|
||||
MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
|
||||
ADDC R21, R6 // R6 = z0
|
||||
ADDZE R7 // R7 = z1
|
||||
ADDC R4, R6 // R6 = z0 + c + 0
|
||||
ADDZE R7, R4 // c += z1
|
||||
MOVD R6, (R10)(R3) // Store z[i]
|
||||
ADD $8, R3
|
||||
BC 16, 0, loop // bdnz
|
||||
MOVD 0(R4), R14 // x[i]
|
||||
MOVD 8(R4), R16 // x[i+1]
|
||||
MOVD 16(R4), R18 // x[i+2]
|
||||
MOVD 24(R4), R20 // x[i+3]
|
||||
MOVD 0(R3), R15 // z[i]
|
||||
MOVD 8(R3), R17 // z[i+1]
|
||||
MOVD 16(R3), R19 // z[i+2]
|
||||
MOVD 24(R3), R21 // z[i+3]
|
||||
MULLD R5, R14, R10 // low x[i]*y
|
||||
MULHDU R5, R14, R11 // high x[i]*y
|
||||
ADDC R15, R10
|
||||
ADDZE R11
|
||||
ADDC R9, R10
|
||||
ADDZE R11, R9
|
||||
MULLD R5, R16, R14 // low x[i+1]*y
|
||||
MULHDU R5, R16, R15 // high x[i+1]*y
|
||||
ADDC R17, R14
|
||||
ADDZE R15
|
||||
ADDC R9, R14
|
||||
ADDZE R15, R9
|
||||
MULLD R5, R18, R16 // low x[i+2]*y
|
||||
MULHDU R5, R18, R17 // high x[i+2]*y
|
||||
ADDC R19, R16
|
||||
ADDZE R17
|
||||
ADDC R9, R16
|
||||
ADDZE R17, R9
|
||||
MULLD R5, R20, R18 // low x[i+3]*y
|
||||
MULHDU R5, R20, R19 // high x[i+3]*y
|
||||
ADDC R21, R18
|
||||
ADDZE R19
|
||||
ADDC R9, R18
|
||||
ADDZE R19, R9
|
||||
MOVD R10, 0(R3) // z[i]
|
||||
MOVD R14, 8(R3) // z[i+1]
|
||||
MOVD R16, 16(R3) // z[i+2]
|
||||
MOVD R18, 24(R3) // z[i+3]
|
||||
ADD $32, R3
|
||||
ADD $32, R4
|
||||
BDNZ loop
|
||||
|
||||
ANDCC $3, R6
|
||||
tail:
|
||||
CMP R0, R6
|
||||
BEQ done
|
||||
MOVD R6, CTR
|
||||
PCALIGN $16
|
||||
tailloop:
|
||||
MOVD 0(R4), R14
|
||||
MOVD 0(R3), R15
|
||||
MULLD R5, R14, R10
|
||||
MULHDU R5, R14, R11
|
||||
ADDC R15, R10
|
||||
ADDZE R11
|
||||
ADDC R9, R10
|
||||
ADDZE R11, R9
|
||||
MOVD R10, 0(R3)
|
||||
ADD $8, R3
|
||||
ADD $8, R4
|
||||
BDNZ tailloop
|
||||
|
||||
done:
|
||||
MOVD R4, c+56(FP)
|
||||
MOVD R9, c+56(FP)
|
||||
RET
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user