runtime: improve memclr on ppc64x

This improves memclr performance for sizes >= 64 and < 512 by
unrolling the loop to clear 64 bytes at a time instead of the 32
bytes it cleared before (a sketch of the Go-level benchmark appears
after the commit details below).

On a POWER9, the improvement is:

Memclr/64       6.07ns ± 0%    5.17ns ± 0%  -14.86%  (p=1.000 n=1+1)
Memclr/256      11.8ns ± 0%     8.3ns ± 0%  -30.10%  (p=1.000 n=1+1)

GoMemclr/64     5.58ns ± 0%    5.02ns ± 0%  -10.04%  (p=1.000 n=1+1)
GoMemclr/256    12.0ns ± 0%     8.8ns ± 0%  -26.62%  (p=1.000 n=1+1)

Change-Id: I929389ae9e50128cba81e0c412e7ba431da7facc
Reviewed-on: https://go-review.googlesource.com/c/go/+/399895
Reviewed-by: Cherry Mui <cherryyz@google.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Author: Lynn Boger 2022-04-12 09:37:31 -05:00
parent 740a490f71
commit 91b9915d3f
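
For context: the GoMemclr numbers above measure clearing a byte slice with a
plain Go loop, which the compiler lowers to the runtime's memclr routine that
this change touches. Below is a minimal, illustrative sketch of that style of
benchmark, not the actual benchmark code in the runtime package; the package
name, benchSizes, and the sub-benchmark naming are invented for the example.

package memclr_test

import (
	"fmt"
	"testing"
)

// benchSizes lists two of the sizes reported above (illustrative only).
var benchSizes = []int{64, 256}

// BenchmarkGoMemclr clears a byte slice with a plain Go range loop.
// The compiler recognizes this clear idiom and lowers it to a call of
// the runtime memclr routine, so on ppc64x it exercises the assembly
// changed in this commit.
func BenchmarkGoMemclr(b *testing.B) {
	for _, n := range benchSizes {
		buf := make([]byte, n)
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				for j := range buf {
					buf[j] = 0
				}
			}
		})
	}
}

With a toolchain built from this tree, the runtime's own Memclr and GoMemclr
benchmarks can be run with something like:

go test -run='^$' -bench=Memclr runtime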

src/runtime/memclr_ppc64x.s

@@ -52,37 +52,50 @@ byte4:
	BR zero512xsetup // ptr should now be 8 byte aligned
under512:
	MOVD R6, CTR // R6 = number of double words
	SRDCC $2, R6, R7 // 32 byte chunks?
	BNE zero32setup
	// Clear double words
zero8:
	MOVD R0, 0(R3) // double word
	ADD $8, R3
	ADD $-8, R4
	BC 16, 0, zero8 // dec ctr, br zero8 if ctr not 0
	BR nozerolarge // handle leftovers
	// Prepare to clear 32 bytes at a time.
zero32setup:
	DCBTST (R3) // prepare data cache
	SRDCC $3, R6, R7 // 64 byte chunks?
	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
	MOVD R7, CTR // number of 32 byte chunks
	MOVD $16, R8
	BEQ lt64gt8
zero32:
	// Prepare to clear 64 bytes at a time.
zero64setup:
	DCBTST (R3) // prepare data cache
	MOVD R7, CTR // number of 64 byte chunks
	MOVD $16, R8
	MOVD $32, R16
	MOVD $48, R17
zero64:
	STXVD2X VS32, (R3+R0) // store 16 bytes
	STXVD2X VS32, (R3+R8)
	ADD $32, R3
	ADD $-32, R4
	BC 16, 0, zero32 // dec ctr, br zero32 if ctr not 0
	RLDCLCC $61, R4, $3, R6 // remaining doublewords
	STXVD2X VS32, (R3+R16)
	STXVD2X VS32, (R3+R17)
	ADD $64, R3
	ADD $-64, R4
	BDNZ zero64 // dec ctr, br zero64 if ctr not 0
	SRDCC $3, R4, R6 // remaining doublewords
	BEQ nozerolarge
	MOVD R6, CTR // set up the CTR for doublewords
	BR zero8
lt64gt8:
	CMP R4, $32
	BLT lt32gt8
	MOVD $16, R8
	STXVD2X VS32, (R3+R0)
	STXVD2X VS32, (R3+R8)
	ADD $-32, R4
	ADD $32, R3
lt32gt8:
	CMP R4, $16
	BLT lt16gt8
	STXVD2X VS32, (R3+R0)
	ADD $16, R3
	ADD $-16, R4
lt16gt8:
	CMP R4, $8
	BLT nozerolarge
	MOVD R0, 0(R3)
	ADD $8, R3
	ADD $-8, R4
nozerolarge:
	ANDCC $7, R4, R5 // any remaining bytes
@@ -94,7 +107,7 @@ zerotail:
zerotailloop:
	MOVB R0, 0(R3) // clear single bytes
	ADD $1, R3
	BC 16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
	BDNZ zerotailloop // dec ctr, br zerotailloop if ctr not 0
	RET
zero512xsetup: // 512 chunk with extra needed
@@ -119,7 +132,7 @@ zero512preloop: // clear up to 128 alignment
	STXVD2X VS32, (R3+R0) // clear 16 bytes
	ADD $16, R3 // update ptr
	ADD $-16, R4 // dec count
	BC 16, 0, zero512preloop
	BDNZ zero512preloop
zero512setup: // setup for dcbz loop
	CMP R4, $512 // check if at least 512
@@ -129,6 +142,7 @@ zero512setup: // setup for dcbz loop
	MOVD $128, R9 // index regs for 128 bytes
	MOVD $256, R10
	MOVD $384, R11
	PCALIGN $32
zero512:
	DCBZ (R3+R0) // clear first chunk
@@ -136,8 +150,8 @@ zero512:
	DCBZ (R3+R10) // clear third chunk
	DCBZ (R3+R11) // clear fourth chunk
	ADD $512, R3
	ADD $-512, R4
	BC 16, 0, zero512
	BDNZ zero512
	ANDCC $511, R4
remain:
	CMP R4, $128 // check if 128 byte chunks left
@@ -150,16 +164,11 @@ remain:
smaller:
	ANDCC $127, R4, R7 // find leftovers
	BEQ done
	CMP R7, $64 // more than 64, do 32 at a time
	BLT zero8setup // less than 64, do 8 at a time
	SRD $5, R7, R7 // set up counter for 32
	BR zero32setup
zero8setup:
	SRDCC $3, R7, R7 // less than 8 bytes
	BEQ nozerolarge
	MOVD R7, CTR
	BR zero8
	CMP R7, $64 // more than 64, do 64 at a time
	XXLXOR VS32, VS32, VS32
	BLT lt64gt8 // less than 64
	SRD $6, R7, R7 // set up counter for 64
	BR zero64setup
done:
	RET