From a44c72823ca4a0abd595a5ab15225811e979b353 Mon Sep 17 00:00:00 2001
From: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Date: Mon, 19 Mar 2018 19:23:34 -0300
Subject: [PATCH] math/big: improve performance of addVW/subVW for ppc64x

This change adds a better implementation in asm for addVW/subVW for
ppc64x, with speedups up to 3.11x.

benchmark                    old ns/op     new ns/op     delta
BenchmarkAddVW/1-16          6.87          5.71          -16.89%
BenchmarkAddVW/2-16          7.72          5.94          -23.06%
BenchmarkAddVW/3-16          8.74          6.56          -24.94%
BenchmarkAddVW/4-16          9.66          7.26          -24.84%
BenchmarkAddVW/5-16          10.8          7.26          -32.78%
BenchmarkAddVW/10-16         17.4          9.97          -42.70%
BenchmarkAddVW/100-16        164           56.0          -65.85%
BenchmarkAddVW/1000-16       1638          524           -68.01%
BenchmarkAddVW/10000-16      16421         5201          -68.33%
BenchmarkAddVW/100000-16     165762        53324         -67.83%
BenchmarkSubVW/1-16          6.76          5.62          -16.86%
BenchmarkSubVW/2-16          7.69          6.02          -21.72%
BenchmarkSubVW/3-16          8.85          6.61          -25.31%
BenchmarkSubVW/4-16          10.0          7.34          -26.60%
BenchmarkSubVW/5-16          11.3          7.33          -35.13%
BenchmarkSubVW/10-16         19.5          18.7          -4.10%
BenchmarkSubVW/100-16        153           55.9          -63.46%
BenchmarkSubVW/1000-16       1502          519           -65.45%
BenchmarkSubVW/10000-16      15005         5165          -65.58%
BenchmarkSubVW/100000-16     150620        53124         -64.73%

benchmark                    old MB/s     new MB/s     speedup
BenchmarkAddVW/1-16          1165.12      1400.76      1.20x
BenchmarkAddVW/2-16          2071.39      2693.25      1.30x
BenchmarkAddVW/3-16          2744.72      3656.92      1.33x
BenchmarkAddVW/4-16          3311.63      4407.34      1.33x
BenchmarkAddVW/5-16          3700.52      5512.48      1.49x
BenchmarkAddVW/10-16         4605.63      8026.37      1.74x
BenchmarkAddVW/100-16        4856.15      14296.76     2.94x
BenchmarkAddVW/1000-16       4883.96      15264.21     3.13x
BenchmarkAddVW/10000-16      4871.52      15380.78     3.16x
BenchmarkAddVW/100000-16     4826.17      15002.48     3.11x
BenchmarkSubVW/1-16          1183.20      1423.03      1.20x
BenchmarkSubVW/2-16          2081.92      2657.44      1.28x
BenchmarkSubVW/3-16          2711.52      3632.30      1.34x
BenchmarkSubVW/4-16          3198.30      4360.30      1.36x
BenchmarkSubVW/5-16          3534.43      5460.40      1.54x
BenchmarkSubVW/10-16         4106.34      4273.51      1.04x
BenchmarkSubVW/100-16        5213.48      14306.32     2.74x
BenchmarkSubVW/1000-16       5324.27      15391.21     2.89x
BenchmarkSubVW/10000-16      5331.33      15486.57     2.90x
BenchmarkSubVW/100000-16     5311.35      15059.01     2.84x

Change-Id: Ibaa5b9b38d63fba8e01a9c327eb8bef1e6e908c1
Reviewed-on: https://go-review.googlesource.com/101975
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
---
 src/math/big/arith_ppc64x.s | 148 +++++++++++++++++++++++++++++++++++-
 1 file changed, 146 insertions(+), 2 deletions(-)

diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index 74db48933f..b3ac91e35e 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -84,11 +84,155 @@ sublend:
 	MOVD  R4, c+72(FP)
 	RET
 
+// func addVW(z, x []Word, y Word) (c Word)
 TEXT ·addVW(SB), NOSPLIT, $0
-	BR ·addVW_g(SB)
+	MOVD z+0(FP), R10	// R10 = z[]
+	MOVD x+24(FP), R8	// R8 = x[]
+	MOVD y+48(FP), R4	// R4 = y = c
+	MOVD z_len+8(FP), R11	// R11 = z_len
 
+	CMP   R0, R11		// If z_len is zero, return
+	BEQ   done
+
+	// We will process the first iteration out of the loop so we capture
+	// the value of c. In the subsequent iterations, we will rely on the
+	// value of CA set here.
+	MOVD  0(R8), R20	// R20 = x[i]
+	ADD   $-1, R11		// R11 = z_len - 1
+	ADDC  R20, R4, R6	// R6 = x[i] + c
+	CMP   R0, R11		// If z_len was 1, we are done
+	MOVD  R6, 0(R10)	// z[i]
+	BEQ   final
+
+	// We will read 4 elements per iteration
+	SRD   $2, R11, R9	// R9 = z_len/4
+	DCBT  (R8)
+	CMP   R0, R9
+	MOVD  R9, CTR		// Set up the loop counter
+	BEQ   tail		// If R9 = 0, we can't use the loop
+
+loop:
+	MOVD  8(R8), R20	// R20 = x[i]
+	MOVD  16(R8), R21	// R21 = x[i+1]
+	MOVD  24(R8), R22	// R22 = x[i+2]
+	MOVDU 32(R8), R23	// R23 = x[i+3]
+	ADDZE R20, R24		// R24 = x[i] + CA
+	ADDZE R21, R25		// R25 = x[i+1] + CA
+	ADDZE R22, R26		// R26 = x[i+2] + CA
+	ADDZE R23, R27		// R27 = x[i+3] + CA
+	MOVD  R24, 8(R10)	// z[i]
+	MOVD  R25, 16(R10)	// z[i+1]
+	MOVD  R26, 24(R10)	// z[i+2]
+	MOVDU R27, 32(R10)	// z[i+3]
+	ADD   $-4, R11		// R11 = z_len - 4
+	BC    16, 0, loop	// bdnz
+
+	// We may have some elements to read
+	CMP R0, R11
+	BEQ final
+
+tail:
+	MOVDU 8(R8), R20
+	ADDZE R20, R24
+	ADD $-1, R11
+	MOVDU R24, 8(R10)
+	CMP R0, R11
+	BEQ final
+
+	MOVDU 8(R8), R20
+	ADDZE R20, R24
+	ADD $-1, R11
+	MOVDU R24, 8(R10)
+	CMP R0, R11
+	BEQ final
+
+	MOVD 8(R8), R20
+	ADDZE R20, R24
+	MOVD R24, 8(R10)
+
+final:
+	ADDZE R0, R4		// c = CA
+done:
+	MOVD  R4, c+56(FP)
+	RET
+
+// func subVW(z, x []Word, y Word) (c Word)
 TEXT ·subVW(SB), NOSPLIT, $0
-	BR ·subVW_g(SB)
+	MOVD  z+0(FP), R10	// R10 = z[]
+	MOVD  x+24(FP), R8	// R8 = x[]
+	MOVD  y+48(FP), R4	// R4 = y = c
+	MOVD  z_len+8(FP), R11	// R11 = z_len
+
+	CMP   R0, R11		// If z_len is zero, return
+	BEQ   done
+
+	// We will process the first iteration out of the loop so we capture
+	// the value of c. In the subsequent iterations, we will rely on the
+	// value of CA set here.
+	MOVD  0(R8), R20	// R20 = x[i]
+	ADD   $-1, R11		// R11 = z_len - 1
+	SUBC  R4, R20, R6	// R6 = x[i] - c
+	CMP   R0, R11		// If z_len was 1, we are done
+	MOVD  R6, 0(R10)	// z[i]
+	BEQ   final
+
+	// We will read 4 elements per iteration
+	SRD   $2, R11, R9	// R9 = z_len/4
+	DCBT  (R8)
+	CMP   R0, R9
+	MOVD  R9, CTR		// Set up the loop counter
+	BEQ   tail		// If R9 = 0, we can't use the loop
+
+	// The loop here is almost the same as the one used in s390x, but
+	// we don't need to capture CA every iteration because we've already
+	// done that above.
+loop:
+	MOVD  8(R8), R20
+	MOVD  16(R8), R21
+	MOVD  24(R8), R22
+	MOVDU 32(R8), R23
+	SUBE  R0, R20
+	SUBE  R0, R21
+	SUBE  R0, R22
+	SUBE  R0, R23
+	MOVD  R20, 8(R10)
+	MOVD  R21, 16(R10)
+	MOVD  R22, 24(R10)
+	MOVDU R23, 32(R10)
+	ADD   $-4, R11
+	BC    16, 0, loop	// bdnz
+
+	// We may have some elements to read
+	CMP   R0, R11
+	BEQ   final
+
+tail:
+	MOVDU 8(R8), R20
+	SUBE  R0, R20
+	ADD   $-1, R11
+	MOVDU R20, 8(R10)
+	CMP   R0, R11
+	BEQ   final
+
+	MOVDU 8(R8), R20
+	SUBE  R0, R20
+	ADD   $-1, R11
+	MOVDU R20, 8(R10)
+	CMP   R0, R11
+	BEQ   final
+
+	MOVD  8(R8), R20
+	SUBE  R0, R20
+	MOVD  R20, 8(R10)
+
+final:
+	// Capture CA
+	SUBE  R4, R4
+	NEG   R4, R4
+
+done:
+	MOVD  R4, c+56(FP)
+	RET
 
 TEXT ·shlVU(SB), NOSPLIT, $0
 	BR ·shlVU_g(SB)