From d17d41e58d2f69d284398f1d86d93c0f31648b16 Mon Sep 17 00:00:00 2001 From: erifan01 Date: Wed, 16 May 2018 06:25:07 +0000 Subject: [PATCH] math/big: optimize mulAddVWW on arm64 for better performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unroll the cycle 4 times to reduce load overhead. Benchmarks: name old time/op new time/op delta MulAddVWW/1-8 15.9ns ± 0% 11.9ns ± 0% -24.92% (p=0.000 n=8+8) MulAddVWW/2-8 16.1ns ± 0% 13.9ns ± 1% -13.82% (p=0.000 n=8+8) MulAddVWW/3-8 18.9ns ± 0% 17.3ns ± 0% -8.47% (p=0.000 n=8+8) MulAddVWW/4-8 21.7ns ± 0% 19.5ns ± 0% -10.14% (p=0.000 n=8+8) MulAddVWW/5-8 25.1ns ± 0% 22.5ns ± 0% -10.27% (p=0.000 n=8+8) MulAddVWW/10-8 41.6ns ± 0% 40.0ns ± 0% -3.79% (p=0.000 n=8+8) MulAddVWW/100-8 368ns ± 0% 363ns ± 0% -1.36% (p=0.000 n=8+8) MulAddVWW/1000-8 3.52µs ± 0% 3.52µs ± 0% -0.14% (p=0.000 n=8+8) MulAddVWW/10000-8 35.1µs ± 0% 35.1µs ± 0% -0.01% (p=0.000 n=7+6) MulAddVWW/100000-8 351µs ± 0% 351µs ± 0% +0.15% (p=0.038 n=8+8) Change-Id: I052a4db286ac6e4f3293289c7e9a82027da0405e Reviewed-on: https://go-review.googlesource.com/c/go/+/155780 Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot Reviewed-by: Cherry Zhang --- src/math/big/arith_arm64.s | 51 ++++++++++++++++++++++++++++++++------ src/math/big/arith_test.go | 18 ++++++++++++++ 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s index bb23751ba3..114d5f67f2 100644 --- a/src/math/big/arith_arm64.s +++ b/src/math/big/arith_arm64.s @@ -363,16 +363,51 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0 MOVD x+24(FP), R2 MOVD y+48(FP), R3 MOVD r+56(FP), R4 + // c, z = x * y + r + TBZ $0, R0, two + MOVD.P 8(R2), R5 + MUL R3, R5, R7 + UMULH R3, R5, R8 + ADDS R4, R7 + ADC $0, R8, R4 // c, z[i] = x[i] * y + r + MOVD.P R7, 8(R1) + SUB $1, R0 +two: + TBZ $1, R0, loop + LDP.P 16(R2), (R5, R6) + MUL R3, R5, R10 + UMULH R3, R5, R11 + ADDS R4, R10 + MUL R3, R6, R12 + UMULH R3, R6, R13 + ADCS R12, R11 + ADC $0, R13, R4 + + STP.P (R10, R11), 16(R1) + SUB $2, R0 loop: CBZ R0, done - MOVD.P 8(R2), R5 - UMULH R5, R3, R7 - MUL R5, R3, R6 - ADDS R4, R6 - ADC $0, R7 - MOVD.P R6, 8(R1) - MOVD R7, R4 - SUB $1, R0 + LDP.P 32(R2), (R5, R6) + LDP -16(R2), (R7, R8) + + MUL R3, R5, R10 + UMULH R3, R5, R11 + ADDS R4, R10 + MUL R3, R6, R12 + UMULH R3, R6, R13 + ADCS R11, R12 + + MUL R3, R7, R14 + UMULH R3, R7, R15 + ADCS R13, R14 + MUL R3, R8, R16 + UMULH R3, R8, R17 + ADCS R15, R16 + ADC $0, R17, R4 + + STP.P (R10, R12), 32(R1) + STP (R14, R16), -16(R1) + SUB $4, R0 B loop done: MOVD R4, c+64(FP) diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go index 8a64321102..d28f680688 100644 --- a/src/math/big/arith_test.go +++ b/src/math/big/arith_test.go @@ -371,6 +371,24 @@ func TestMulAddWWW(t *testing.T) { } } +func BenchmarkMulAddVWW(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + z := make([]Word, n+1) + x := rndV(n) + y := rndW() + r := rndW() + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _W)) + for i := 0; i < b.N; i++ { + mulAddVWW(z, x, y, r) + } + }) + } +} + func BenchmarkAddMulVVW(b *testing.B) { for _, n := range benchSizes { if isRaceBuilder && n > 1e3 {