1
0
mirror of https://github.com/golang/go synced 2024-11-17 10:54:50 -07:00

math/big,crypto/internal/bigmod: unroll loop in addMulVVW for ppc64x

This updates the assembly implementation of AddMulVVW to
unroll the main loop to do 64 bytes at a time.

The code for addMulVVWx is based on the same code and has
also been updated to improve performance.

goos: linux
goarch: ppc64le
pkg: crypto/internal/bigmod
cpu: POWER10
               │ bg.orig.out │               bg.out               │
               │   sec/op    │   sec/op     vs base               │
ModAdd           116.3n ± 0%   116.9n ± 0%   +0.52% (p=0.002 n=6)
ModSub           111.5n ± 0%   111.5n ± 0%    0.00% (p=0.273 n=6)
MontgomeryRepr   2.195µ ± 0%   1.944µ ± 0%  -11.44% (p=0.002 n=6)
MontgomeryMul    2.195µ ± 0%   1.943µ ± 0%  -11.48% (p=0.002 n=6)
ModMul           4.418µ ± 0%   3.900µ ± 0%  -11.72% (p=0.002 n=6)
ExpBig           5.736m ± 0%   5.117m ± 0%  -10.78% (p=0.002 n=6)
Exp              5.891m ± 0%   5.237m ± 0%  -11.11% (p=0.002 n=6)
geomean          9.901µ        9.094µ        -8.15%


goos: linux
goarch: ppc64le
pkg: math/big
cpu: POWER10
                 │ am.orig.out  │               am.out               │
                 │    sec/op    │   sec/op     vs base               │
AddMulVVW/1         4.456n ± 1%   3.565n ± 0%  -20.00% (p=0.002 n=6)
AddMulVVW/2         4.875n ± 1%   5.938n ± 1%  +21.79% (p=0.002 n=6)
AddMulVVW/3         5.484n ± 0%   5.693n ± 0%   +3.80% (p=0.002 n=6)
AddMulVVW/4         6.370n ± 0%   6.065n ± 0%   -4.79% (p=0.002 n=6)
AddMulVVW/5         7.321n ± 0%   7.188n ± 0%   -1.82% (p=0.002 n=6)
AddMulVVW/10        12.26n ± 8%   11.41n ± 0%   -6.97% (p=0.002 n=6)
AddMulVVW/100      100.70n ± 0%   93.58n ± 0%   -7.08% (p=0.002 n=6)
AddMulVVW/1000      938.6n ± 0%   845.5n ± 0%   -9.92% (p=0.002 n=6)
AddMulVVW/10000     9.459µ ± 0%   8.415µ ± 0%  -11.04% (p=0.002 n=6)
AddMulVVW/100000    94.57µ ± 0%   84.01µ ± 0%  -11.16% (p=0.002 n=6)
geomean             75.17n        71.21n        -5.27%


Change-Id: Idd79f5f02387564f4c2cc28d50b1c12bcd9a400f
Reviewed-on: https://go-review.googlesource.com/c/go/+/557915
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Paul Murphy <murp@ibm.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
This commit is contained in:
Lynn Boger 2024-01-23 12:46:05 -06:00
parent 67d30fc315
commit 4fde3ef2ac
2 changed files with 129 additions and 51 deletions

View File

@ -8,44 +8,75 @@
// func addMulVVW1024(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1024(SB), $0-32
MOVD $16, R22 // R22 = z_len
JMP addMulVVWx(SB)
MOVD $4, R6 // R6 = z_len/4
JMP addMulVVWx<>(SB)
// func addMulVVW1536(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1536(SB), $0-32
MOVD $24, R22 // R22 = z_len
JMP addMulVVWx(SB)
MOVD $6, R6 // R6 = z_len/4
JMP addMulVVWx<>(SB)
// func addMulVVW2048(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW2048(SB), $0-32
MOVD $32, R22 // R22 = z_len
JMP addMulVVWx(SB)
MOVD $8, R6 // R6 = z_len/4
JMP addMulVVWx<>(SB)
TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+8(FP), R8 // R8 = x[]
MOVD y+16(FP), R9 // R9 = y
// This local function expects to be called only by
// callers above. R6 contains the z length/4
// since 4 values are processed for each
// loop iteration, and is guaranteed to be > 0.
// If other callers are added this function might
// need to change.
TEXT addMulVVWx<>(SB), NOSPLIT, $0
MOVD z+0(FP), R3
MOVD x+8(FP), R4
MOVD y+16(FP), R5
MOVD R0, R3 // R3 will be the index register
CMP R0, R22
MOVD R0, R4 // R4 = c = 0
MOVD R22, CTR // Initialize loop counter
BEQ done
PCALIGN $16
MOVD $0, R9 // R9 = c = 0
MOVD R6, CTR // Initialize loop counter
PCALIGN $16
loop:
MOVD (R8)(R3), R20 // Load x[i]
MOVD (R10)(R3), R21 // Load z[i]
MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
ADDC R21, R6 // R6 = z0
ADDZE R7 // R7 = z1
ADDC R4, R6 // R6 = z0 + c + 0
ADDZE R7, R4 // c += z1
MOVD R6, (R10)(R3) // Store z[i]
ADD $8, R3
BC 16, 0, loop // bdnz
MOVD 0(R4), R14 // x[i]
MOVD 8(R4), R16 // x[i+1]
MOVD 16(R4), R18 // x[i+2]
MOVD 24(R4), R20 // x[i+3]
MOVD 0(R3), R15 // z[i]
MOVD 8(R3), R17 // z[i+1]
MOVD 16(R3), R19 // z[i+2]
MOVD 24(R3), R21 // z[i+3]
MULLD R5, R14, R10 // low x[i]*y
MULHDU R5, R14, R11 // high x[i]*y
ADDC R15, R10
ADDZE R11
ADDC R9, R10
ADDZE R11, R9
MULLD R5, R16, R14 // low x[i+1]*y
MULHDU R5, R16, R15 // high x[i+1]*y
ADDC R17, R14
ADDZE R15
ADDC R9, R14
ADDZE R15, R9
MULLD R5, R18, R16 // low x[i+2]*y
MULHDU R5, R18, R17 // high x[i+2]*y
ADDC R19, R16
ADDZE R17
ADDC R9, R16
ADDZE R17, R9
MULLD R5, R20, R18 // low x[i+3]*y
MULHDU R5, R20, R19 // high x[i+3]*y
ADDC R21, R18
ADDZE R19
ADDC R9, R18
ADDZE R19, R9
MOVD R10, 0(R3) // z[i]
MOVD R14, 8(R3) // z[i+1]
MOVD R16, 16(R3) // z[i+2]
MOVD R18, 24(R3) // z[i+3]
ADD $32, R3
ADD $32, R4
BDNZ loop
done:
MOVD R4, c+24(FP)
MOVD R9, c+24(FP)
RET

View File

@ -599,33 +599,80 @@ done:
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB), NOSPLIT, $0
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 // R9 = y
MOVD z_len+8(FP), R22 // R22 = z_len
MOVD z+0(FP), R3 // R3 = z[]
MOVD x+24(FP), R4 // R4 = x[]
MOVD y+48(FP), R5 // R5 = y
MOVD z_len+8(FP), R6 // R6 = z_len
MOVD R0, R3 // R3 will be the index register
CMP R0, R22
MOVD R0, R4 // R4 = c = 0
MOVD R22, CTR // Initialize loop counter
BEQ done
PCALIGN $16
CMP R6, $4
MOVD R0, R9 // R9 = c = 0
BLT tail
SRD $2, R6, R7
MOVD R7, CTR // Initialize loop counter
PCALIGN $16
loop:
MOVD (R8)(R3), R20 // Load x[i]
MOVD (R10)(R3), R21 // Load z[i]
MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
ADDC R21, R6 // R6 = z0
ADDZE R7 // R7 = z1
ADDC R4, R6 // R6 = z0 + c + 0
ADDZE R7, R4 // c += z1
MOVD R6, (R10)(R3) // Store z[i]
ADD $8, R3
BC 16, 0, loop // bdnz
MOVD 0(R4), R14 // x[i]
MOVD 8(R4), R16 // x[i+1]
MOVD 16(R4), R18 // x[i+2]
MOVD 24(R4), R20 // x[i+3]
MOVD 0(R3), R15 // z[i]
MOVD 8(R3), R17 // z[i+1]
MOVD 16(R3), R19 // z[i+2]
MOVD 24(R3), R21 // z[i+3]
MULLD R5, R14, R10 // low x[i]*y
MULHDU R5, R14, R11 // high x[i]*y
ADDC R15, R10
ADDZE R11
ADDC R9, R10
ADDZE R11, R9
MULLD R5, R16, R14 // low x[i+1]*y
MULHDU R5, R16, R15 // high x[i+1]*y
ADDC R17, R14
ADDZE R15
ADDC R9, R14
ADDZE R15, R9
MULLD R5, R18, R16 // low x[i+2]*y
MULHDU R5, R18, R17 // high x[i+2]*y
ADDC R19, R16
ADDZE R17
ADDC R9, R16
ADDZE R17, R9
MULLD R5, R20, R18 // low x[i+3]*y
MULHDU R5, R20, R19 // high x[i+3]*y
ADDC R21, R18
ADDZE R19
ADDC R9, R18
ADDZE R19, R9
MOVD R10, 0(R3) // z[i]
MOVD R14, 8(R3) // z[i+1]
MOVD R16, 16(R3) // z[i+2]
MOVD R18, 24(R3) // z[i+3]
ADD $32, R3
ADD $32, R4
BDNZ loop
ANDCC $3, R6
tail:
CMP R0, R6
BEQ done
MOVD R6, CTR
PCALIGN $16
tailloop:
MOVD 0(R4), R14
MOVD 0(R3), R15
MULLD R5, R14, R10
MULHDU R5, R14, R11
ADDC R15, R10
ADDZE R11
ADDC R9, R10
ADDZE R11, R9
MOVD R10, 0(R3)
ADD $8, R3
ADD $8, R4
BDNZ tailloop
done:
MOVD R4, c+56(FP)
MOVD R9, c+56(FP)
RET