1
0
mirror of https://github.com/golang/go synced 2024-11-17 20:04:47 -07:00

math/big: optimize mulAddVWW on arm64 for better performance

Unroll the loop 4 times to reduce load overhead.

Benchmarks:
name                old time/op    new time/op    delta
MulAddVWW/1-8         15.9ns ± 0%    11.9ns ± 0%  -24.92%  (p=0.000 n=8+8)
MulAddVWW/2-8         16.1ns ± 0%    13.9ns ± 1%  -13.82%  (p=0.000 n=8+8)
MulAddVWW/3-8         18.9ns ± 0%    17.3ns ± 0%   -8.47%  (p=0.000 n=8+8)
MulAddVWW/4-8         21.7ns ± 0%    19.5ns ± 0%  -10.14%  (p=0.000 n=8+8)
MulAddVWW/5-8         25.1ns ± 0%    22.5ns ± 0%  -10.27%  (p=0.000 n=8+8)
MulAddVWW/10-8        41.6ns ± 0%    40.0ns ± 0%   -3.79%  (p=0.000 n=8+8)
MulAddVWW/100-8        368ns ± 0%     363ns ± 0%   -1.36%  (p=0.000 n=8+8)
MulAddVWW/1000-8      3.52µs ± 0%    3.52µs ± 0%   -0.14%  (p=0.000 n=8+8)
MulAddVWW/10000-8     35.1µs ± 0%    35.1µs ± 0%   -0.01%  (p=0.000 n=7+6)
MulAddVWW/100000-8     351µs ± 0%     351µs ± 0%   +0.15%  (p=0.038 n=8+8)

Change-Id: I052a4db286ac6e4f3293289c7e9a82027da0405e
Reviewed-on: https://go-review.googlesource.com/c/go/+/155780
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
erifan01 2018-05-16 06:25:07 +00:00 committed by Cherry Zhang
parent f8f265b9cf
commit d17d41e58d
2 changed files with 61 additions and 8 deletions

View File

@ -363,16 +363,51 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVD x+24(FP), R2 MOVD x+24(FP), R2
MOVD y+48(FP), R3 MOVD y+48(FP), R3
MOVD r+56(FP), R4 MOVD r+56(FP), R4
// c, z = x * y + r
TBZ $0, R0, two
MOVD.P 8(R2), R5
MUL R3, R5, R7
UMULH R3, R5, R8
ADDS R4, R7
ADC $0, R8, R4 // c, z[i] = x[i] * y + r
MOVD.P R7, 8(R1)
SUB $1, R0
two:
TBZ $1, R0, loop
LDP.P 16(R2), (R5, R6)
MUL R3, R5, R10
UMULH R3, R5, R11
ADDS R4, R10
MUL R3, R6, R12
UMULH R3, R6, R13
ADCS R12, R11
ADC $0, R13, R4
STP.P (R10, R11), 16(R1)
SUB $2, R0
loop: loop:
CBZ R0, done CBZ R0, done
MOVD.P 8(R2), R5 LDP.P 32(R2), (R5, R6)
UMULH R5, R3, R7 LDP -16(R2), (R7, R8)
MUL R5, R3, R6
ADDS R4, R6 MUL R3, R5, R10
ADC $0, R7 UMULH R3, R5, R11
MOVD.P R6, 8(R1) ADDS R4, R10
MOVD R7, R4 MUL R3, R6, R12
SUB $1, R0 UMULH R3, R6, R13
ADCS R11, R12
MUL R3, R7, R14
UMULH R3, R7, R15
ADCS R13, R14
MUL R3, R8, R16
UMULH R3, R8, R17
ADCS R15, R16
ADC $0, R17, R4
STP.P (R10, R12), 32(R1)
STP (R14, R16), -16(R1)
SUB $4, R0
B loop B loop
done: done:
MOVD R4, c+64(FP) MOVD R4, c+64(FP)

View File

@ -371,6 +371,24 @@ func TestMulAddWWW(t *testing.T) {
} }
} }
// BenchmarkMulAddVWW times mulAddVWW for each vector length listed in
// benchSizes, running one named sub-benchmark per length.
func BenchmarkMulAddVWW(b *testing.B) {
	for _, size := range benchSizes {
		// Lengths above 1e3 are skipped when isRaceBuilder is set.
		if size > 1e3 && isRaceBuilder {
			continue
		}

		// Operands are generated once per length, outside the timed loop,
		// and in a fixed order: the vector first, then the two words.
		src := rndV(size)
		factor := rndW()
		addend := rndW()
		// The destination is allocated with size+1 words.
		dst := make([]Word, size+1)

		label := fmt.Sprint(size)
		b.Run(label, func(sub *testing.B) {
			// Report throughput as size words of _W units each per iteration.
			sub.SetBytes(int64(size * _W))
			for iter := 0; iter < sub.N; iter++ {
				mulAddVWW(dst, src, factor, addend)
			}
		})
	}
}
func BenchmarkAddMulVVW(b *testing.B) { func BenchmarkAddMulVVW(b *testing.B) {
for _, n := range benchSizes { for _, n := range benchSizes {
if isRaceBuilder && n > 1e3 { if isRaceBuilder && n > 1e3 {