1
0
mirror of https://github.com/golang/go synced 2024-09-29 22:24:33 -06:00

math/big: optimize mulAddVWW on arm64 for better performance

Unroll the cycle 4 times to reduce load overhead.

Benchmarks:
name                old time/op    new time/op    delta
MulAddVWW/1-8         15.9ns ± 0%    11.9ns ± 0%  -24.92%  (p=0.000 n=8+8)
MulAddVWW/2-8         16.1ns ± 0%    13.9ns ± 1%  -13.82%  (p=0.000 n=8+8)
MulAddVWW/3-8         18.9ns ± 0%    17.3ns ± 0%   -8.47%  (p=0.000 n=8+8)
MulAddVWW/4-8         21.7ns ± 0%    19.5ns ± 0%  -10.14%  (p=0.000 n=8+8)
MulAddVWW/5-8         25.1ns ± 0%    22.5ns ± 0%  -10.27%  (p=0.000 n=8+8)
MulAddVWW/10-8        41.6ns ± 0%    40.0ns ± 0%   -3.79%  (p=0.000 n=8+8)
MulAddVWW/100-8        368ns ± 0%     363ns ± 0%   -1.36%  (p=0.000 n=8+8)
MulAddVWW/1000-8      3.52µs ± 0%    3.52µs ± 0%   -0.14%  (p=0.000 n=8+8)
MulAddVWW/10000-8     35.1µs ± 0%    35.1µs ± 0%   -0.01%  (p=0.000 n=7+6)
MulAddVWW/100000-8     351µs ± 0%     351µs ± 0%   +0.15%  (p=0.038 n=8+8)

Change-Id: I052a4db286ac6e4f3293289c7e9a82027da0405e
Reviewed-on: https://go-review.googlesource.com/c/go/+/155780
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
erifan01 2018-05-16 06:25:07 +00:00 committed by Cherry Zhang
parent f8f265b9cf
commit d17d41e58d
2 changed files with 61 additions and 8 deletions

View File

@ -363,16 +363,51 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVD x+24(FP), R2
MOVD y+48(FP), R3
MOVD r+56(FP), R4
// c, z = x * y + r
TBZ $0, R0, two
MOVD.P 8(R2), R5
MUL R3, R5, R7
UMULH R3, R5, R8
ADDS R4, R7
ADC $0, R8, R4 // c, z[i] = x[i] * y + r
MOVD.P R7, 8(R1)
SUB $1, R0
two:
TBZ $1, R0, loop
LDP.P 16(R2), (R5, R6)
MUL R3, R5, R10
UMULH R3, R5, R11
ADDS R4, R10
MUL R3, R6, R12
UMULH R3, R6, R13
ADCS R12, R11
ADC $0, R13, R4
STP.P (R10, R11), 16(R1)
SUB $2, R0
loop:
CBZ R0, done
MOVD.P 8(R2), R5
UMULH R5, R3, R7
MUL R5, R3, R6
ADDS R4, R6
ADC $0, R7
MOVD.P R6, 8(R1)
MOVD R7, R4
SUB $1, R0
LDP.P 32(R2), (R5, R6)
LDP -16(R2), (R7, R8)
MUL R3, R5, R10
UMULH R3, R5, R11
ADDS R4, R10
MUL R3, R6, R12
UMULH R3, R6, R13
ADCS R11, R12
MUL R3, R7, R14
UMULH R3, R7, R15
ADCS R13, R14
MUL R3, R8, R16
UMULH R3, R8, R17
ADCS R15, R16
ADC $0, R17, R4
STP.P (R10, R12), 32(R1)
STP (R14, R16), -16(R1)
SUB $4, R0
B loop
done:
MOVD R4, c+64(FP)

View File

@ -371,6 +371,24 @@ func TestMulAddWWW(t *testing.T) {
}
}
func BenchmarkMulAddVWW(b *testing.B) {
for _, n := range benchSizes {
if isRaceBuilder && n > 1e3 {
continue
}
z := make([]Word, n+1)
x := rndV(n)
y := rndW()
r := rndW()
b.Run(fmt.Sprint(n), func(b *testing.B) {
b.SetBytes(int64(n * _W))
for i := 0; i < b.N; i++ {
mulAddVWW(z, x, y, r)
}
})
}
}
func BenchmarkAddMulVVW(b *testing.B) {
for _, n := range benchSizes {
if isRaceBuilder && n > 1e3 {