mirror of
https://github.com/golang/go
synced 2024-11-17 20:04:47 -07:00
math/big: optimize mulAddVWW on arm64 for better performance
Unroll the cycle 4 times to reduce load overhead. Benchmarks: name old time/op new time/op delta MulAddVWW/1-8 15.9ns ± 0% 11.9ns ± 0% -24.92% (p=0.000 n=8+8) MulAddVWW/2-8 16.1ns ± 0% 13.9ns ± 1% -13.82% (p=0.000 n=8+8) MulAddVWW/3-8 18.9ns ± 0% 17.3ns ± 0% -8.47% (p=0.000 n=8+8) MulAddVWW/4-8 21.7ns ± 0% 19.5ns ± 0% -10.14% (p=0.000 n=8+8) MulAddVWW/5-8 25.1ns ± 0% 22.5ns ± 0% -10.27% (p=0.000 n=8+8) MulAddVWW/10-8 41.6ns ± 0% 40.0ns ± 0% -3.79% (p=0.000 n=8+8) MulAddVWW/100-8 368ns ± 0% 363ns ± 0% -1.36% (p=0.000 n=8+8) MulAddVWW/1000-8 3.52µs ± 0% 3.52µs ± 0% -0.14% (p=0.000 n=8+8) MulAddVWW/10000-8 35.1µs ± 0% 35.1µs ± 0% -0.01% (p=0.000 n=7+6) MulAddVWW/100000-8 351µs ± 0% 351µs ± 0% +0.15% (p=0.038 n=8+8) Change-Id: I052a4db286ac6e4f3293289c7e9a82027da0405e Reviewed-on: https://go-review.googlesource.com/c/go/+/155780 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
parent
f8f265b9cf
commit
d17d41e58d
@ -363,16 +363,51 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
|||||||
MOVD x+24(FP), R2
|
MOVD x+24(FP), R2
|
||||||
MOVD y+48(FP), R3
|
MOVD y+48(FP), R3
|
||||||
MOVD r+56(FP), R4
|
MOVD r+56(FP), R4
|
||||||
|
// c, z = x * y + r
|
||||||
|
TBZ $0, R0, two
|
||||||
|
MOVD.P 8(R2), R5
|
||||||
|
MUL R3, R5, R7
|
||||||
|
UMULH R3, R5, R8
|
||||||
|
ADDS R4, R7
|
||||||
|
ADC $0, R8, R4 // c, z[i] = x[i] * y + r
|
||||||
|
MOVD.P R7, 8(R1)
|
||||||
|
SUB $1, R0
|
||||||
|
two:
|
||||||
|
TBZ $1, R0, loop
|
||||||
|
LDP.P 16(R2), (R5, R6)
|
||||||
|
MUL R3, R5, R10
|
||||||
|
UMULH R3, R5, R11
|
||||||
|
ADDS R4, R10
|
||||||
|
MUL R3, R6, R12
|
||||||
|
UMULH R3, R6, R13
|
||||||
|
ADCS R12, R11
|
||||||
|
ADC $0, R13, R4
|
||||||
|
|
||||||
|
STP.P (R10, R11), 16(R1)
|
||||||
|
SUB $2, R0
|
||||||
loop:
|
loop:
|
||||||
CBZ R0, done
|
CBZ R0, done
|
||||||
MOVD.P 8(R2), R5
|
LDP.P 32(R2), (R5, R6)
|
||||||
UMULH R5, R3, R7
|
LDP -16(R2), (R7, R8)
|
||||||
MUL R5, R3, R6
|
|
||||||
ADDS R4, R6
|
MUL R3, R5, R10
|
||||||
ADC $0, R7
|
UMULH R3, R5, R11
|
||||||
MOVD.P R6, 8(R1)
|
ADDS R4, R10
|
||||||
MOVD R7, R4
|
MUL R3, R6, R12
|
||||||
SUB $1, R0
|
UMULH R3, R6, R13
|
||||||
|
ADCS R11, R12
|
||||||
|
|
||||||
|
MUL R3, R7, R14
|
||||||
|
UMULH R3, R7, R15
|
||||||
|
ADCS R13, R14
|
||||||
|
MUL R3, R8, R16
|
||||||
|
UMULH R3, R8, R17
|
||||||
|
ADCS R15, R16
|
||||||
|
ADC $0, R17, R4
|
||||||
|
|
||||||
|
STP.P (R10, R12), 32(R1)
|
||||||
|
STP (R14, R16), -16(R1)
|
||||||
|
SUB $4, R0
|
||||||
B loop
|
B loop
|
||||||
done:
|
done:
|
||||||
MOVD R4, c+64(FP)
|
MOVD R4, c+64(FP)
|
||||||
|
@ -371,6 +371,24 @@ func TestMulAddWWW(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func BenchmarkMulAddVWW(b *testing.B) {
|
||||||
|
for _, n := range benchSizes {
|
||||||
|
if isRaceBuilder && n > 1e3 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
z := make([]Word, n+1)
|
||||||
|
x := rndV(n)
|
||||||
|
y := rndW()
|
||||||
|
r := rndW()
|
||||||
|
b.Run(fmt.Sprint(n), func(b *testing.B) {
|
||||||
|
b.SetBytes(int64(n * _W))
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
mulAddVWW(z, x, y, r)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func BenchmarkAddMulVVW(b *testing.B) {
|
func BenchmarkAddMulVVW(b *testing.B) {
|
||||||
for _, n := range benchSizes {
|
for _, n := range benchSizes {
|
||||||
if isRaceBuilder && n > 1e3 {
|
if isRaceBuilder && n > 1e3 {
|
||||||
|
Loading…
Reference in New Issue
Block a user