mirror of
https://github.com/golang/go
synced 2024-11-18 14:54:40 -07:00
math/big: improve performance of addVW/subVW for ppc64x
This change adds a better implementation in asm for addVW/subVW for
ppc64x, with speedups up to 3.16x.

benchmark                       old ns/op     new ns/op     delta
BenchmarkAddVW/1-16             6.87          5.71          -16.89%
BenchmarkAddVW/2-16             7.72          5.94          -23.06%
BenchmarkAddVW/3-16             8.74          6.56          -24.94%
BenchmarkAddVW/4-16             9.66          7.26          -24.84%
BenchmarkAddVW/5-16             10.8          7.26          -32.78%
BenchmarkAddVW/10-16            17.4          9.97          -42.70%
BenchmarkAddVW/100-16           164           56.0          -65.85%
BenchmarkAddVW/1000-16          1638          524           -68.01%
BenchmarkAddVW/10000-16         16421         5201          -68.33%
BenchmarkAddVW/100000-16        165762        53324         -67.83%
BenchmarkSubVW/1-16             6.76          5.62          -16.86%
BenchmarkSubVW/2-16             7.69          6.02          -21.72%
BenchmarkSubVW/3-16             8.85          6.61          -25.31%
BenchmarkSubVW/4-16             10.0          7.34          -26.60%
BenchmarkSubVW/5-16             11.3          7.33          -35.13%
BenchmarkSubVW/10-16            19.5          18.7          -4.10%
BenchmarkSubVW/100-16           153           55.9          -63.46%
BenchmarkSubVW/1000-16          1502          519           -65.45%
BenchmarkSubVW/10000-16         15005         5165          -65.58%
BenchmarkSubVW/100000-16        150620        53124         -64.73%

benchmark                       old MB/s     new MB/s     speedup
BenchmarkAddVW/1-16             1165.12      1400.76      1.20x
BenchmarkAddVW/2-16             2071.39      2693.25      1.30x
BenchmarkAddVW/3-16             2744.72      3656.92      1.33x
BenchmarkAddVW/4-16             3311.63      4407.34      1.33x
BenchmarkAddVW/5-16             3700.52      5512.48      1.49x
BenchmarkAddVW/10-16            4605.63      8026.37      1.74x
BenchmarkAddVW/100-16           4856.15      14296.76     2.94x
BenchmarkAddVW/1000-16          4883.96      15264.21     3.13x
BenchmarkAddVW/10000-16         4871.52      15380.78     3.16x
BenchmarkAddVW/100000-16        4826.17      15002.48     3.11x
BenchmarkSubVW/1-16             1183.20      1423.03      1.20x
BenchmarkSubVW/2-16             2081.92      2657.44      1.28x
BenchmarkSubVW/3-16             2711.52      3632.30      1.34x
BenchmarkSubVW/4-16             3198.30      4360.30      1.36x
BenchmarkSubVW/5-16             3534.43      5460.40      1.54x
BenchmarkSubVW/10-16            4106.34      4273.51      1.04x
BenchmarkSubVW/100-16           5213.48      14306.32     2.74x
BenchmarkSubVW/1000-16          5324.27      15391.21     2.89x
BenchmarkSubVW/10000-16         5331.33      15486.57     2.90x
BenchmarkSubVW/100000-16        5311.35      15059.01     2.84x

Change-Id: Ibaa5b9b38d63fba8e01a9c327eb8bef1e6e908c1
Reviewed-on: https://go-review.googlesource.com/101975
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
This commit is contained in:
parent
a42ea51ae9
commit
a44c72823c
@ -84,11 +84,155 @@ sublend:
|
||||
MOVD R4, c+72(FP)
|
||||
RET
|
||||
|
||||
// func addVW(z, x []Word, y Word) (c Word)
//
// addVW sets z[i] = x[i] + carry for i in [0, z_len), where the carry
// into z[0] is y, and returns the carry out of the last word in c.
// If z_len is zero nothing is stored and c = y.
//
// Register use:
//	R0       used as the constant zero
//	R4       y on entry, c on exit
//	R8       running pointer into x
//	R10      running pointer into z
//	R11      number of words still to be processed
//	R9, CTR  number of 4-word loop iterations
//	R20-R23  words loaded from x
//	R24-R27  sums stored to z (R6 for the first word)
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD	z+0(FP), R10	// R10 = z[]
	MOVD	x+24(FP), R8	// R8 = x[]
	MOVD	y+48(FP), R4	// R4 = y = c
	MOVD	z_len+8(FP), R11	// R11 = z_len

	CMP	R0, R11		// If z_len is zero, return
	BEQ	done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD	0(R8), R20	// R20 = x[0]
	ADD	$-1, R11	// R11 = z_len - 1
	ADDC	R20, R4, R6	// R6 = x[0] + y, sets CA
	CMP	R0, R11		// If z_len was 1, we are done
	MOVD	R6, 0(R10)	// z[0]
	BEQ	final

	// We will read 4 elements per iteration.
	// R11 already excludes the first word, so the loop count is
	// (z_len - 1)/4 and the tail handles the remaining 0..3 words.
	SRD	$2, R11, R9	// R9 = (z_len - 1)/4
	DCBT	(R8)		// Cache-touch hint for the x data
	CMP	R0, R9
	MOVD	R9, CTR		// Set up the loop counter
	BEQ	tail		// If R9 = 0, we can't use the loop

	// R8 and R10 point at the last word already handled, hence the
	// offsets start at 8. MOVDU advances the pointer by 32 bytes.
	// ADD and CMP do not modify CA, so the carry chain established by
	// ADDC survives the bookkeeping between ADDZE instructions.
loop:
	MOVD	8(R8), R20	// R20 = x[i]
	MOVD	16(R8), R21	// R21 = x[i+1]
	MOVD	24(R8), R22	// R22 = x[i+2]
	MOVDU	32(R8), R23	// R23 = x[i+3], R8 += 32
	ADDZE	R20, R24	// R24 = x[i] + CA
	ADDZE	R21, R25	// R25 = x[i+1] + CA
	ADDZE	R22, R26	// R26 = x[i+2] + CA
	ADDZE	R23, R27	// R27 = x[i+3] + CA
	MOVD	R24, 8(R10)	// z[i]
	MOVD	R25, 16(R10)	// z[i+1]
	MOVD	R26, 24(R10)	// z[i+2]
	MOVDU	R27, 32(R10)	// z[i+3], R10 += 32
	ADD	$-4, R11	// R11 = remaining words
	BC	16, 0, loop	// bdnz: decrement CTR, loop while CTR != 0

	// We may have some elements to read
	CMP	R0, R11
	BEQ	final

	// Between 1 and 3 words remain; handle them one at a time.
tail:
	MOVDU	8(R8), R20
	ADDZE	R20, R24
	ADD	$-1, R11
	MOVDU	R24, 8(R10)
	CMP	R0, R11
	BEQ	final

	MOVDU	8(R8), R20
	ADDZE	R20, R24
	ADD	$-1, R11
	MOVDU	R24, 8(R10)
	CMP	R0, R11
	BEQ	final

	// Last possible word: no pointer update or count check needed.
	MOVD	8(R8), R20
	ADDZE	R20, R24
	MOVD	R24, 8(R10)

final:
	ADDZE	R0, R4		// c = 0 + CA

done:
	MOVD	R4, c+56(FP)
	RET
|
||||
|
||||
// func subVW(z, x []Word, y Word) (c Word)
//
// subVW sets z[i] = x[i] - borrow for i in [0, z_len), where the borrow
// into z[0] is y, and returns the borrow out of the last word in c.
// If z_len is zero nothing is stored and c = y.
//
// Register use:
//	R0       used as the constant zero
//	R4       y on entry, c on exit
//	R8       running pointer into x
//	R10      running pointer into z
//	R11      number of words still to be processed
//	R9, CTR  number of 4-word loop iterations
//	R20-R23  words loaded from x, updated in place and stored to z
//	R6       result for the first word
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD	z+0(FP), R10	// R10 = z[]
	MOVD	x+24(FP), R8	// R8 = x[]
	MOVD	y+48(FP), R4	// R4 = y = c
	MOVD	z_len+8(FP), R11	// R11 = z_len

	CMP	R0, R11		// If z_len is zero, return
	BEQ	done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD	0(R8), R20	// R20 = x[0]
	ADD	$-1, R11	// R11 = z_len - 1
	SUBC	R4, R20, R6	// R6 = x[0] - y, CA = 0 on borrow
	CMP	R0, R11		// If z_len was 1, we are done
	MOVD	R6, 0(R10)	// z[0]
	BEQ	final

	// We will read 4 elements per iteration.
	// R11 already excludes the first word, so the loop count is
	// (z_len - 1)/4 and the tail handles the remaining 0..3 words.
	SRD	$2, R11, R9	// R9 = (z_len - 1)/4
	DCBT	(R8)		// Cache-touch hint for the x data
	CMP	R0, R9
	MOVD	R9, CTR		// Set up the loop counter
	BEQ	tail		// If R9 = 0, we can't use the loop

	// The loop here is almost the same as the one used in s390x, but
	// we don't need to capture CA every iteration because we've already
	// done that above.
	// SUBE R0, Rn computes Rn - 0 - borrow using CA; ADD and CMP do not
	// modify CA, so the borrow chain survives the bookkeeping.
loop:
	MOVD	8(R8), R20	// R20 = x[i]
	MOVD	16(R8), R21	// R21 = x[i+1]
	MOVD	24(R8), R22	// R22 = x[i+2]
	MOVDU	32(R8), R23	// R23 = x[i+3], R8 += 32
	SUBE	R0, R20		// R20 = x[i] - borrow
	SUBE	R0, R21		// R21 = x[i+1] - borrow
	SUBE	R0, R22		// R22 = x[i+2] - borrow
	SUBE	R0, R23		// R23 = x[i+3] - borrow
	MOVD	R20, 8(R10)	// z[i]
	MOVD	R21, 16(R10)	// z[i+1]
	MOVD	R22, 24(R10)	// z[i+2]
	MOVDU	R23, 32(R10)	// z[i+3], R10 += 32
	ADD	$-4, R11	// R11 = remaining words
	BC	16, 0, loop	// bdnz: decrement CTR, loop while CTR != 0

	// We may have some elements to read
	CMP	R0, R11
	BEQ	final

	// Between 1 and 3 words remain; handle them one at a time.
tail:
	MOVDU	8(R8), R20
	SUBE	R0, R20
	ADD	$-1, R11
	MOVDU	R20, 8(R10)
	CMP	R0, R11
	BEQ	final

	MOVDU	8(R8), R20
	SUBE	R0, R20
	ADD	$-1, R11
	MOVDU	R20, 8(R10)
	CMP	R0, R11
	BEQ	final

	// Last possible word: no pointer update or count check needed.
	MOVD	8(R8), R20
	SUBE	R0, R20
	MOVD	R20, 8(R10)

final:
	// Capture CA.
	// SUBE R4, R4 yields 0 when CA is set (no borrow) and -1 when CA is
	// clear (borrow); NEG turns that into the 0/1 borrow-out value.
	SUBE	R4, R4
	NEG	R4, R4

done:
	MOVD	R4, c+56(FP)
	RET
|
||||
|
||||
// shlVU does no work of its own in the lines visible here: it branches
// unconditionally to shlVU_g (presumably the portable Go implementation;
// it is defined outside this view - confirm).
TEXT ·shlVU(SB), NOSPLIT, $0
|
||||
BR ·shlVU_g(SB)
|
||||
|
Loading…
Reference in New Issue
Block a user