math/big: Implement shlVU and shrVU in ASM for PPC64

Currently the shift-left and shift-right functions are implemented in Go on PPC64. Implementing them in assembly, as is already done for AMD and ARM, results in an overall speedup of the shift benchmarks on POWER8/9/10.

name                          old time/op  new time/op  delta
NonZeroShifts/1/shrVU         8.50ns ± 0%  5.21ns ± 0%  -38.66%
NonZeroShifts/1/shlVU         8.85ns ± 1%  5.24ns ± 0%  -40.78%
NonZeroShifts/2/shrVU         9.16ns ± 0%  5.51ns ± 0%  -39.80%
NonZeroShifts/2/shlVU         9.24ns ± 2%  5.61ns ± 0%  -39.28%
NonZeroShifts/3/shrVU         10.6ns ± 0%  6.8ns ± 0%   -35.78%
NonZeroShifts/3/shlVU         10.7ns ± 2%  6.4ns ± 0%   -40.82%
NonZeroShifts/4/shrVU         12.4ns ± 0%  7.7ns ± 0%   -38.12%
NonZeroShifts/4/shlVU         12.3ns ± 1%  7.5ns ± 0%   -38.67%
NonZeroShifts/5/shrVU         13.2ns ± 0%  8.5ns ± 0%   -35.51%
NonZeroShifts/5/shlVU         13.3ns ± 2%  9.3ns ± 0%   -30.05%
NonZeroShifts/10/shrVU        16.5ns ± 0%  13.1ns ± 0%  -20.12%
NonZeroShifts/10/shlVU        16.8ns ± 1%  14.1ns ± 0%  -16.02%
NonZeroShifts/100/shrVU       122ns ± 0%   94ns ± 0%    -22.87%
NonZeroShifts/100/shlVU       115ns ± 0%   103ns ± 0%   -10.50%
NonZeroShifts/1000/shrVU      1.10µs ± 0%  0.91µs ± 0%  -17.03%
NonZeroShifts/1000/shlVU      1.02µs ± 0%  0.93µs ± 0%  -8.74%
NonZeroShifts/10000/shrVU     10.9µs ± 0%  9.1µs ± 0%   -16.66%
NonZeroShifts/10000/shlVU     10.1µs ± 0%  9.3µs ± 0%   -8.19%
NonZeroShifts/100000/shrVU    109µs ± 0%   91µs ± 0%    -16.01%
NonZeroShifts/100000/shlVU    101µs ± 0%   94µs ± 0%    -7.16%

Change-Id: Ia31951cc29a4169beb494d2951427cbe1e963b11
Reviewed-on: https://go-review.googlesource.com/c/go/+/384474
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
Auto-Submit: Russ Cox <rsc@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
2024-11-24 10:50:13 -07:00 · 2022-02-09 07:37:12 -06:00 · 2022-02-09 07:37:12 -06:00 · 6183920a33
commit 6183920a33
parent d85694ab4f
1 changed files with 152 additions and 2 deletions
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@ -346,11 +346,161 @@ done:
 	MOVD  R4, c+56(FP)
 	RET

+// func shlVU(z, x []Word, s uint) (c Word)
+//
+// shlVU shifts the words of x left by s bits into z, working from the most
+// significant word down to z[0]:
+//	z[i] = x[i]<<s | x[i-1]>>ŝ    for len(z) > i > 0
+//	z[0] = x[0]<<s
+//	c    = x[len(z)-1]>>ŝ          with ŝ = 64-s
+// For s == 0 it performs copy(z, x) instead and returns c = 0.
+// For len(z) == 0 it returns c = 0 and neither reads x nor writes z.
+//
+// Register roles on the shift path:
+//	R3  &z[0]                R6  &x[0]
+//	R9  s                    R4  len(z), then ŝ
+//	R5  len(z)-1             R7  carry word c
+//	R15 read cursor in x     R16 write cursor in z
 TEXT ·shlVU(SB), NOSPLIT, $0
-	BR ·shlVU_g(SB)
+	MOVD    z+0(FP), R3
+	MOVD    x+24(FP), R6
+	MOVD    s+48(FP), R9
+	MOVD    z_len+8(FP), R4
+	MOVD    x_len+32(FP), R7
+	CMP     R9, R0          // s==0 copy(z,x)
+	BEQ     zeroshift
+	CMP     R4, R0          // len(z)==0 return
+	BEQ     done

+	ADD     $-1, R4, R5     // len(z)-1
+	SUBC    R9, $64, R4     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
+	SLD     $3, R5, R7      // byte offset of the last word
+	ADD     R6, R7, R15     // save starting address &x[len(z)-1]
+	ADD     R3, R7, R16     // save starting address &z[len(z)-1]
+	MOVD    (R6)(R7), R14
+	SRD     R4, R14, R7     // compute x[len(z)-1]>>ŝ into R7
+	CMP     R5, R0          // iterate from i=len(z)-1 to 0
+	BEQ     loopexit        // Already at end?
+	MOVD    0(R15), R10     // x[i] for i=len(z)-1
+shloop:
+	SLD     R9, R10, R10    // x[i]<<s
+	MOVDU   -8(R15), R14    // load x[i-1] and step R15 down to &x[i-1]
+	SRD     R4, R14, R11    // x[i-1]>>ŝ
+	OR      R11, R10, R10
+	MOVD    R10, 0(R16)     // z[i]=x[i]<<s | x[i-1]>>ŝ
+	MOVD    R14, R10        // reuse x[i-1] for next iteration
+	ADD     $-8, R16        // i--
+	CMP     R15, R6         // &x[i-1]>&x[0]?
+	BGT     shloop
+loopexit:
+	MOVD    0(R6), R4
+	SLD     R9, R4, R4
+	MOVD    R4, 0(R3)       // z[0]=x[0]<<s
+	MOVD    R7, c+56(FP)    // store pre-computed x[len(z)-1]>>ŝ into c
+	RET
+
+zeroshift:
+	CMP     R6, R0          // x is null, nothing to copy
+	BEQ     done
+	CMP     R6, R3          // if x is same as z, nothing to copy
+	BEQ     done
+	CMP     R7, R4
+	ISEL    $0, R7, R4, R7  // n = lower bound of the lengths of x, z
+	CMP     R7, R0          // n==0: copy(z,x) moves nothing. Both copy loops
+	BEQ     done            // below move at least one word, so leave here.
+	SLD     $3, R7, R7      // n*8 = number of bytes to copy
+	SUB     R6, R3, R11     // dest - src
+	CMPU    R11, R7, CR2    // unsigned (dest - src) < byte count?
+	BLT     CR2, backward   // z starts inside the copied part of x, copy backwards
+	MOVD    $0, R14
+	// shlVU processes backwards, but added a forward copy option
+	// since it's faster on POWER
+repeat:
+	MOVD    (R6)(R14), R15  // Copy 8 bytes at a time
+	MOVD    R15, (R3)(R14)
+	ADD     $8, R14
+	CMP     R14, R7         // More 8 bytes left?
+	BLT     repeat
+	BR      done
+backward:
+	ADD     $-8, R7, R14    // byte offset of the last word to copy
+repeatback:
+	MOVD    (R6)(R14), R15  // copy x into z backwards
+	MOVD    R15, (R3)(R14)  // copy 8 bytes at a time
+	SUB     $8, R14
+	CMP     R14, $-8        // More 8 bytes left?
+	BGT     repeatback
+
+done:
+	MOVD    R0, c+56(FP)    // c=0
+	RET
+
+// func shrVU(z, x []Word, s uint) (c Word)
+//
+// shrVU shifts the words of x right by s bits into z, working from z[0]
+// upwards:
+//	z[i-1]      = x[i-1]>>s | x[i]<<ŝ    for 0 < i < len(z)
+//	z[len(z)-1] = x[len(z)-1]>>s
+//	c           = x[0]<<ŝ                 with ŝ = 64-s
+// For s == 0 it performs a forward copy(z, x) instead and returns c = 0.
+// For len(z) == 0 it returns c = 0 and neither reads x nor writes z.
+//
+// Register roles on the shift path:
+//	R3  &z[0]                R6  &x[0]
+//	R9  s                    R5  ŝ
+//	R4  len(z)               R7  carry word c
+//	R8  index i              R16 len(z)-2 (vector loop bound)
+//	V4  s in every byte      V2  ŝ in every byte
 TEXT ·shrVU(SB), NOSPLIT, $0
-	BR ·shrVU_g(SB)
+	MOVD    z+0(FP), R3
+	MOVD    x+24(FP), R6
+	MOVD    s+48(FP), R9
+	MOVD    z_len+8(FP), R4
+	MOVD    x_len+32(FP), R7
+
+	CMP     R9, R0          // s==0, copy(z,x)
+	BEQ     zeroshift
+	CMP     R4, R0          // len(z)==0 return
+	BEQ     done
+	SUBC    R9, $64, R5     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
+
+	MOVD    0(R6), R7
+	SLD     R5, R7, R7      // compute x[0]<<ŝ
+	MOVD    $1, R8          // iterate from i=1 to i<len(z)
+	CMP     R8, R4
+	BGE     loopexit        // Already at end?
+
+	// vectorize if len(z) is >=3, else jump to scalar loop
+	CMP     R4, $3
+	BLT     scalar
+	MTVSRD  R9, VS38        // s (VS38 is the VSX name of V6)
+	VSPLTB  $7, V6, V4      // replicate byte 7 of V6 into every byte of V4
+	MTVSRD  R5, VS39        // ŝ (VS39 is the VSX name of V7)
+	VSPLTB  $7, V7, V2      // replicate byte 7 of V7 into every byte of V2
+	ADD     $-2, R4, R16    // len(z)-2, last i at which x[i+1] may still be loaded
+	PCALIGN $16
+loopback:
+	ADD     $-1, R8, R10
+	SLD     $3, R10         // byte offset of element i-1
+	LXVD2X  (R6)(R10), VS32 // load x[i-1], x[i]
+	SLD     $3, R8, R12     // byte offset of element i
+	LXVD2X  (R6)(R12), VS33 // load x[i], x[i+1]
+
+	VSRD    V0, V4, V3      // x[i-1]>>s, x[i]>>s
+	VSLD    V1, V2, V5      // x[i]<<ŝ, x[i+1]<<ŝ
+	VOR     V3, V5, V5      // Or(|) the two registers together
+	STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
+	ADD     $2, R8          // Done processing 2 entries, i and i+1
+	CMP     R8, R16         // Are there at least a couple of more entries left?
+	BLE     loopback
+	CMP     R8, R4          // Are we at the last element?
+	BEQ     loopexit
+scalar:
+	ADD     $-1, R8, R10
+	SLD     $3, R10
+	MOVD    (R6)(R10), R11
+	SRD     R9, R11, R11    // x[len(z)-2] >> s
+	SLD     $3, R8, R12
+	MOVD    (R6)(R12), R12
+	SLD     R5, R12, R12    // x[len(z)-1]<<ŝ
+	OR      R12, R11, R11   // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
+	MOVD    R11, (R3)(R10)  // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
+loopexit:
+	ADD     $-1, R4
+	SLD     $3, R4          // byte offset of the last word
+	MOVD    (R6)(R4), R5
+	SRD     R9, R5, R5      // x[len(z)-1]>>s
+	MOVD    R5, (R3)(R4)    // z[len(z)-1]=x[len(z)-1]>>s
+	MOVD    R7, c+56(FP)    // store pre-computed x[0]<<ŝ into c
+	RET
+
+zeroshift:
+	CMP     R6, R0          // x is null, nothing to copy
+	BEQ     done
+	CMP     R6, R3          // if x is same as z, nothing to copy
+	BEQ     done
+	CMP     R7, R4
+	ISEL    $0, R7, R4, R7  // n = lower bound of the lengths of x, z
+	CMP     R7, R0          // n==0: copy(z,x) moves nothing. The copy loop
+	BEQ     done            // below moves at least one word, so leave here.
+	SLD     $3, R7, R7      // n*8 = number of bytes to copy
+	MOVD    $0, R14
+repeat:
+	MOVD    (R6)(R14), R15  // copy 8 bytes at a time
+	MOVD    R15, (R3)(R14)  // shrVU processes bytes only forwards
+	ADD     $8, R14
+	CMP     R14, R7         // More 8 bytes left?
+	BLT     repeat
+done:
+	MOVD    R0, c+56(FP)    // c=0
+	RET

 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
 TEXT ·mulAddVWW(SB), NOSPLIT, $0