crypto/subtle: improve xorBytes assembler on PPC64

This makes some improvements to the xorBytes assembler implementation for PPC64 targets. The loops that process large streams of bytes have been changed to handle 64 bytes at a time. Other changes were made to prevent degradations in some of the common sizes, such as 8 and 16. The case for < 8 bytes on power10 has been modified to use the LXVL and STXVL instructions. Change-Id: I7477d12d5375d484af8c274443d595ccdafbda7c Reviewed-on: https://go-review.googlesource.com/c/go/+/530877 Reviewed-by: Paul Murphy <murp@ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Jayanth Krishnamurthy <jayanth.krishnamurthy@ibm.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: Benny Siegert <bsiegert@gmail.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
2024-11-18 09:54:57 -07:00 · 2023-09-25 10:49:11 -05:00 · 2023-09-25 10:49:11 -05:00 · 9b61b0054b
commit 9b61b0054b
parent be3d5fb6e6
1 changed files with 114 additions and 59 deletions
--- a/src/crypto/subtle/xor_ppc64x.s
+++ b/src/crypto/subtle/xor_ppc64x.s
@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
 	MOVD	b+16(FP), R5	// R5 = b
 	MOVD	n+24(FP), R6	// R6 = n
-	CMPU	R6, $32, CR7	// Check if n ≥ 32 bytes
+	CMPU	R6, $64, CR7	// Check if n ≥ 64 bytes
 	MOVD	R0, R8		// R8 = index
-	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 32 bytes
+	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 64 bytes
-	BLT	CR6, small	// Smaller than 8
+	BLE	CR6, small	// <= 8
-	BLT	CR7, xor16	// Case for 16 ≤ n < 32 bytes
+	BLT	CR7, xor32	// Case for 32 ≤ n < 64 bytes
-	// Case for n ≥ 32 bytes
+	// Case for n ≥ 64 bytes
-preloop32:
+preloop64:
-	SRD	$5, R6, R7	// Setup loop counter
+	SRD	$6, R6, R7	// Set up loop counter
 	MOVD	R7, CTR
 	MOVD	$16, R10
-	ANDCC	$31, R6, R9	// Check for tailing bytes for later
+	MOVD	$32, R14
-loop32:
+	MOVD	$48, R15
-	LXVD2X		(R4)(R8), VS32		// VS32 = a[i,...,i+15]
+	ANDCC	$63, R6, R9	// Check for tailing bytes for later
-	LXVD2X		(R4)(R10), VS34
+	PCALIGN $16
-	LXVD2X		(R5)(R8), VS33		// VS33 = b[i,...,i+15]
+	// Case for >= 64 bytes
-	LXVD2X		(R5)(R10), VS35
+	// Process 64 bytes per iteration
-	XXLXOR		VS32, VS33, VS32	// VS34 = a[] ^ b[]
+	// Load 4 vectors of a and b
-	XXLXOR		VS34, VS35, VS34
+	// XOR the corresponding vectors
-	STXVD2X		VS32, (R3)(R8)		// Store to dst
+	// from a and b and store the result
-	STXVD2X		VS34, (R3)(R10)
+loop64:
-	ADD		$32, R8			// Update index
+	LXVD2X	(R4)(R8), VS32
-	ADD		$32, R10
+	LXVD2X	(R4)(R10), VS34
-	BC		16, 0, loop32		// bdnz loop16
+	LXVD2X	(R4)(R14), VS36
-
+	LXVD2X	(R4)(R15), VS38
-	BEQ		CR0, done
+	LXVD2X	(R5)(R8), VS33
-
+	LXVD2X	(R5)(R10), VS35
-	MOVD		R9, R6
+	LXVD2X	(R5)(R14), VS37
-	CMP		R6, $8
+	LXVD2X	(R5)(R15), VS39
-	BLT		small
+	XXLXOR	VS32, VS33, VS32
 	XXLXOR	VS34, VS35, VS34
 	XXLXOR	VS36, VS37, VS36
 	XXLXOR	VS38, VS39, VS38
 	STXVD2X	VS32, (R3)(R8)
 	STXVD2X	VS34, (R3)(R10)
 	STXVD2X	VS36, (R3)(R14)
 	STXVD2X	VS38, (R3)(R15)
 	ADD	$64, R8
 	ADD	$64, R10
 	ADD	$64, R14
 	ADD	$64, R15
 	BDNZ	loop64
 	BC	12,2,LR		// BEQLR
 	MOVD	R9, R6
 	CMP	R6, $8
 	BLE	small
 	// Case for 8 <= n < 64 bytes
 	// Process 32 bytes if available
 xor32:
 	CMP	R6, $32
 	BLT	xor16
 	ADD	$16, R8, R9
 	LXVD2X	(R4)(R8), VS32
 	LXVD2X	(R4)(R9), VS33
 	LXVD2X	(R5)(R8), VS34
 	LXVD2X	(R5)(R9), VS35
 	XXLXOR	VS32, VS34, VS32
 	XXLXOR	VS33, VS35, VS33
 	STXVD2X	VS32, (R3)(R8)
 	STXVD2X	VS33, (R3)(R9)
 	ADD	$32, R8
 	ADD	$-32, R6
 	CMP	R6, $8
 	BLE	small
 	// Case for 8 <= n < 32 bytes
 	// Process 16 bytes if available
 xor16:
-	CMP		R6, $16
+	CMP	R6, $16
-	BLT		xor8
+	BLT	xor8
-	LXVD2X		(R4)(R8), VS32
+	LXVD2X	(R4)(R8), VS32
-	LXVD2X		(R5)(R8), VS33
+	LXVD2X	(R5)(R8), VS33
-	XXLXOR		VS32, VS33, VS32
+	XXLXOR	VS32, VS33, VS32
-	STXVD2X		VS32, (R3)(R8)
+	STXVD2X	VS32, (R3)(R8)
-	ADD		$16, R8
+	ADD	$16, R8
-	ADD		$-16, R6
+	ADD	$-16, R6
 	CMP		R6, $8
 	BLT		small
 xor8:
 	// Case for 8 ≤ n < 16 bytes
 	MOVD    (R4)(R8), R14   // R14 = a[i,...,i+7]
 	MOVD    (R5)(R8), R15   // R15 = b[i,...,i+7]
 	XOR     R14, R15, R16   // R16 = a[] ^ b[]
 	SUB     $8, R6          // n = n - 8
 	MOVD    R16, (R3)(R8)   // Store to dst
 	ADD     $8, R8
 	// Check if we're finished
 	CMP     R6, R0
 	BGT     small
 	RET
 	// Case for n < 8 bytes and tailing bytes from the
 	// previous cases.
 small:
 	CMP	R6, R0
-	BEQ	done
+	BC	12,2,LR		// BEQLR
-	MOVD	R6, CTR		// Setup loop counter
+xor8:
-
+#ifdef GOPPC64_power10
-loop:
+	SLD	$56,R6,R17
 	ADD	R4,R8,R18
 	ADD	R5,R8,R19
 	ADD	R3,R8,R20
 	LXVL	R18,R17,V0
 	LXVL	R19,R17,V1
 	VXOR	V0,V1,V1
 	STXVL	V1,R20,R17
 	RET
 #else
 	CMP	R6, $8
 	BLT	xor4
 	// Case for 8 ≤ n < 16 bytes
 	MOVD	(R4)(R8), R14   // R14 = a[i,...,i+7]
 	MOVD	(R5)(R8), R15   // R15 = b[i,...,i+7]
 	XOR	R14, R15, R16   // R16 = a[] ^ b[]
 	SUB	$8, R6          // n = n - 8
 	MOVD	R16, (R3)(R8)   // Store to dst
 	ADD	$8, R8
 xor4:
 	CMP	R6, $4
 	BLT	xor2
 	MOVWZ	(R4)(R8), R14
 	MOVWZ	(R5)(R8), R15
 	XOR	R14, R15, R16
 	MOVW	R16, (R3)(R8)
 	ADD	$4,R8
 	ADD	$-4,R6
 xor2:
 	CMP	R6, $2
 	BLT	xor1
 	MOVHZ	(R4)(R8), R14
 	MOVHZ	(R5)(R8), R15
 	XOR	R14, R15, R16
 	MOVH	R16, (R3)(R8)
 	ADD	$2,R8
 	ADD	$-2,R6
 xor1:
 	CMP	R6, R0
 	BC	12,2,LR		// BEQLR
 	MOVBZ	(R4)(R8), R14	// R14 = a[i]
 	MOVBZ	(R5)(R8), R15	// R15 = b[i]
 	XOR	R14, R15, R16	// R16 = a[i] ^ b[i]
 	MOVB	R16, (R3)(R8)	// Store to dst
-	ADD	$1, R8
+#endif
 	BC	16, 0, loop	// bdnz loop
 done:
 	RET