internal/bytealg: improve performance of IndexByte for ppc64x

Use addi+lvx instruction fusion and remove register dependencies in the main loop to improve performance. benchmark old ns/op new ns/op delta BenchmarkIndexByte/10-192 9.86 9.75 -1.12% BenchmarkIndexByte/32-192 15.6 11.2 -28.21% BenchmarkIndexByte/4K-192 155 97.6 -37.03% BenchmarkIndexByte/4M-192 171790 129650 -24.53% BenchmarkIndexByte/64M-192 6530982 5018424 -23.16% benchmark old MB/s new MB/s speedup BenchmarkIndexByte/10-192 1013.72 1025.76 1.01x BenchmarkIndexByte/32-192 2049.47 2868.01 1.40x BenchmarkIndexByte/4K-192 26422.69 41975.67 1.59x BenchmarkIndexByte/4M-192 24415.17 32350.74 1.33x BenchmarkIndexByte/64M-192 10275.46 13372.50 1.30x Change-Id: Iedf17f01f374d58e85dcd6a972209bfcb7eb6063 Reviewed-on: https://go-review.googlesource.com/137415 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
2024-11-23 14:40:02 -07:00 · 2018-09-21 16:06:32 -03:00 · 2018-09-21 16:06:32 -03:00 · 23f7554194
commit 23f7554194
parent 5bba505367
1 changed files with 16 additions and 12 deletions
--- a/src/internal/bytealg/indexbyte_ppc64x.s
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@ -38,14 +38,14 @@ TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
 	BR	indexbytebody<>(SB)

 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
-	DCBT	(R3)		// Prepare cache line.
 	MOVD	R3,R17		// Save base address for calculating the index later.
 	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
 	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
 	ADD	R4,R3,R7	// Last acceptable address in R7.
+	DCBT	(R8)		// Prepare cache line.

 	RLDIMI	$16,R5,$32,R5
-	CMPU	R4,$32		// Check if it's a small string (<32 bytes). Those will be processed differently.
+	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
 	MOVD	$-1,R9
 	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
 	RLDIMI	$32,R5,$0,R5
@ -56,7 +56,7 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 #else
 	SRD	R6,R9,R9	// Same for Big Endian
 #endif
-	BLE	small_string	// Jump to the small string case if it's <32 bytes.
+	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.

 	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
 	// in V0, V1 and V10, then branch to the preloop.
@ -97,7 +97,7 @@ qw_align:
 	LVSL	  (R0+R0),V11
 	VSLB	  V11,V10,V10
 	VSPLTB	  $7,V1,V1	// Replicate byte across V1
-	CMPU	  R4, $64	// If len <= 64, don't use the vectorized loop
+	CMPU	  R4, $64	// If len ≤ 64, don't use the vectorized loop
 	BLE	  tail

 	// We will load 4 quardwords per iteration in the loop, so check for
@ -131,7 +131,7 @@ qw_align:
 	// 64-byte aligned. Prepare for the main loop.
 preloop:
 	CMPU	R4,$64
-	BLE	tail	      // If len <= 64, don't use the vectorized loop
+	BLE	tail	      // If len ≤ 64, don't use the vectorized loop

 	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
 	// per loop iteration. The last doubleword is in R10, so our loop counter
@ -140,30 +140,34 @@ preloop:
 	SRD	$6,R6,R9      // Loop counter in R9
 	MOVD	R9,CTR

+	ADD	$-64,R8,R8   // Adjust index for loop entry
 	MOVD	$16,R11      // Load offsets for the vector loads
 	MOVD	$32,R9
 	MOVD	$48,R7

 	// Main loop we will load 64 bytes per iteration
 loop:
+	ADD	    $64,R8,R8	      // Fuse addi+lvx for performance
 	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
-	LVX	    (R11+R8),V3
-	LVX	    (R9+R8),V4
-	LVX	    (R7+R8),V5
+	LVX	    (R8+R11),V3
 	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
 	VCMPEQUB    V1,V3,V7
+
+	LVX	    (R8+R9),V4
+	LVX	    (R8+R7),V5
 	VCMPEQUB    V1,V4,V8
 	VCMPEQUB    V1,V5,V9
+
 	VOR	    V6,V7,V11	      // Compress the result in a single vector
 	VOR	    V8,V9,V12
-	VOR	    V11,V12,V11
-	VCMPEQUBCC  V0,V11,V11	      // Check for byte
+	VOR	    V11,V12,V13
+	VCMPEQUBCC  V0,V13,V14	      // Check for byte
 	BGE	    CR6,found
-	ADD	    $64,R8,R8
 	BC	    16,0,loop	      // bdnz loop

-	// Handle the tailing bytes or R4 <= 64
+	// Handle the tailing bytes or R4 ≤ 64
 	RLDICL	$0,R6,$58,R4
+	ADD	$64,R8,R8
 tail:
 	CMPU	    R4,$0
 	BEQ	    notfound