From a44c72823ca4a0abd595a5ab15225811e979b353 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Seo Date: Mon, 19 Mar 2018 19:23:34 -0300 Subject: [PATCH] math/big: improve performance of addVW/subVW for ppc64x This change adds a better implementation in asm for addVW/subVW for ppc64x, with speedups up to 3.11x. benchmark old ns/op new ns/op delta BenchmarkAddVW/1-16 6.87 5.71 -16.89% BenchmarkAddVW/2-16 7.72 5.94 -23.06% BenchmarkAddVW/3-16 8.74 6.56 -24.94% BenchmarkAddVW/4-16 9.66 7.26 -24.84% BenchmarkAddVW/5-16 10.8 7.26 -32.78% BenchmarkAddVW/10-16 17.4 9.97 -42.70% BenchmarkAddVW/100-16 164 56.0 -65.85% BenchmarkAddVW/1000-16 1638 524 -68.01% BenchmarkAddVW/10000-16 16421 5201 -68.33% BenchmarkAddVW/100000-16 165762 53324 -67.83% BenchmarkSubVW/1-16 6.76 5.62 -16.86% BenchmarkSubVW/2-16 7.69 6.02 -21.72% BenchmarkSubVW/3-16 8.85 6.61 -25.31% BenchmarkSubVW/4-16 10.0 7.34 -26.60% BenchmarkSubVW/5-16 11.3 7.33 -35.13% BenchmarkSubVW/10-16 19.5 18.7 -4.10% BenchmarkSubVW/100-16 153 55.9 -63.46% BenchmarkSubVW/1000-16 1502 519 -65.45% BenchmarkSubVW/10000-16 15005 5165 -65.58% BenchmarkSubVW/100000-16 150620 53124 -64.73% benchmark old MB/s new MB/s speedup BenchmarkAddVW/1-16 1165.12 1400.76 1.20x BenchmarkAddVW/2-16 2071.39 2693.25 1.30x BenchmarkAddVW/3-16 2744.72 3656.92 1.33x BenchmarkAddVW/4-16 3311.63 4407.34 1.33x BenchmarkAddVW/5-16 3700.52 5512.48 1.49x BenchmarkAddVW/10-16 4605.63 8026.37 1.74x BenchmarkAddVW/100-16 4856.15 14296.76 2.94x BenchmarkAddVW/1000-16 4883.96 15264.21 3.13x BenchmarkAddVW/10000-16 4871.52 15380.78 3.16x BenchmarkAddVW/100000-16 4826.17 15002.48 3.11x BenchmarkSubVW/1-16 1183.20 1423.03 1.20x BenchmarkSubVW/2-16 2081.92 2657.44 1.28x BenchmarkSubVW/3-16 2711.52 3632.30 1.34x BenchmarkSubVW/4-16 3198.30 4360.30 1.36x BenchmarkSubVW/5-16 3534.43 5460.40 1.54x BenchmarkSubVW/10-16 4106.34 4273.51 1.04x BenchmarkSubVW/100-16 5213.48 14306.32 2.74x BenchmarkSubVW/1000-16 5324.27 15391.21 2.89x BenchmarkSubVW/10000-16 5331.33 15486.57 2.90x BenchmarkSubVW/100000-16 5311.35 15059.01 2.84x Change-Id: Ibaa5b9b38d63fba8e01a9c327eb8bef1e6e908c1 Reviewed-on: https://go-review.googlesource.com/101975 Reviewed-by: Lynn Boger --- src/math/big/arith_ppc64x.s | 148 +++++++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 2 deletions(-) diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s index 74db48933f..b3ac91e35e 100644 --- a/src/math/big/arith_ppc64x.s +++ b/src/math/big/arith_ppc64x.s @@ -84,11 +84,155 @@ sublend: MOVD R4, c+72(FP) RET +// func addVW(z, x []Word, y Word) (c Word) TEXT ·addVW(SB), NOSPLIT, $0 - BR ·addVW_g(SB) + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R4 // R4 = y = c + MOVD z_len+8(FP), R11 // R11 = z_len + CMP R0, R11 // If z_len is zero, return + BEQ done + + // We will process the first iteration out of the loop so we capture + // the value of c. In the subsequent iterations, we will rely on the + // value of CA set here. + MOVD 0(R8), R20 // R20 = x[i] + ADD $-1, R11 // R11 = z_len - 1 + ADDC R20, R4, R6 // R6 = x[i] + c + CMP R0, R11 // If z_len was 1, we are done + MOVD R6, 0(R10) // z[i] + BEQ final + + // We will read 4 elements per iteration + SRD $2, R11, R9 // R9 = z_len/4 + DCBT (R8) + CMP R0, R9 + MOVD R9, CTR // Set up the loop counter + BEQ tail // If R9 = 0, we can't use the loop + +loop: + MOVD 8(R8), R20 // R20 = x[i] + MOVD 16(R8), R21 // R21 = x[i+1] + MOVD 24(R8), R22 // R22 = x[i+2] + MOVDU 32(R8), R23 // R23 = x[i+3] + ADDZE R20, R24 // R24 = x[i] + CA + ADDZE R21, R25 // R25 = x[i+1] + CA + ADDZE R22, R26 // R26 = x[i+2] + CA + ADDZE R23, R27 // R27 = x[i+3] + CA + MOVD R24, 8(R10) // z[i] + MOVD R25, 16(R10) // z[i+1] + MOVD R26, 24(R10) // z[i+2] + MOVDU R27, 32(R10) // z[i+3] + ADD $-4, R11 // R11 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have some elements to read + CMP R0, R11 + BEQ final + +tail: + MOVDU 8(R8), R20 + ADDZE R20, R24 + ADD $-1, R11 + MOVDU R24, 8(R10) + CMP R0, R11 + BEQ final + + MOVDU 8(R8), R20 + ADDZE R20, R24 + ADD $-1, R11 + MOVDU R24, 8(R10) + CMP R0, R11 + BEQ final + + MOVD 8(R8), R20 + ADDZE R20, R24 + MOVD R24, 8(R10) + +final: + ADDZE R0, R4 // c = CA +done: + MOVD R4, c+56(FP) + RET + +// func subVW(z, x []Word, y Word) (c Word) TEXT ·subVW(SB), NOSPLIT, $0 - BR ·subVW_g(SB) + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R4 // R4 = y = c + MOVD z_len+8(FP), R11 // R11 = z_len + + CMP R0, R11 // If z_len is zero, return + BEQ done + + // We will process the first iteration out of the loop so we capture + // the value of c. In the subsequent iterations, we will rely on the + // value of CA set here. + MOVD 0(R8), R20 // R20 = x[i] + ADD $-1, R11 // R11 = z_len - 1 + SUBC R4, R20, R6 // R6 = x[i] - c + CMP R0, R11 // If z_len was 1, we are done + MOVD R6, 0(R10) // z[i] + BEQ final + + // We will read 4 elements per iteration + SRD $2, R11, R9 // R9 = z_len/4 + DCBT (R8) + CMP R0, R9 + MOVD R9, CTR // Set up the loop counter + BEQ tail // If R9 = 0, we can't use the loop + + // The loop here is almost the same as the one used in s390x, but + // we don't need to capture CA every iteration because we've already + // done that above. +loop: + MOVD 8(R8), R20 + MOVD 16(R8), R21 + MOVD 24(R8), R22 + MOVDU 32(R8), R23 + SUBE R0, R20 + SUBE R0, R21 + SUBE R0, R22 + SUBE R0, R23 + MOVD R20, 8(R10) + MOVD R21, 16(R10) + MOVD R22, 24(R10) + MOVDU R23, 32(R10) + ADD $-4, R11 + BC 16, 0, loop // bdnz + + // We may have some elements to read + CMP R0, R11 + BEQ final + +tail: + MOVDU 8(R8), R20 + SUBE R0, R20 + ADD $-1, R11 + MOVDU R20, 8(R10) + CMP R0, R11 + BEQ final + + MOVDU 8(R8), R20 + SUBE R0, R20 + ADD $-1, R11 + MOVDU R20, 8(R10) + CMP R0, R11 + BEQ final + + MOVD 8(R8), R20 + SUBE R0, R20 + MOVD R20, 8(R10) + +final: + // Capture CA + SUBE R4, R4 + NEG R4, R4 + +done: + MOVD R4, c+56(FP) + RET TEXT ·shlVU(SB), NOSPLIT, $0 BR ·shlVU_g(SB)