runtime: improve memmove for short moves on ppc64

This improves the performance of memmove for almost all moves <= 16 bytes for the ppc64 assembler, improving linux/ppc64le, linux/ppc64, aix/ppc64. Only the forward moves were changed, the backward moves were left as is. Additional macro defines were added to improve the readability of the asm. Results from power8: name old time/op new time/op delta Memmove/0 5.70ns ± 0% 5.69ns ± 0% -0.18% (p=0.029 n=4+4) Memmove/1 5.54ns ± 0% 5.39ns ± 0% -2.71% (p=0.029 n=4+4) Memmove/2 6.31ns ± 0% 5.55ns ± 0% -12.08% (p=0.029 n=4+4) Memmove/3 7.41ns ± 0% 5.54ns ± 0% -25.24% (p=0.029 n=4+4) Memmove/4 8.41ns ± 0% 5.56ns ± 0% -33.87% (p=0.029 n=4+4) Memmove/5 10.1ns ± 5% 5.5ns ± 0% -45.30% (p=0.029 n=4+4) Memmove/6 10.3ns ± 0% 5.6ns ± 0% -45.92% (p=0.029 n=4+4) Memmove/7 11.4ns ± 0% 5.7ns ± 0% -50.33% (p=0.029 n=4+4) Memmove/8 5.66ns ± 0% 5.54ns ± 0% -2.12% (p=0.029 n=4+4) Memmove/9 5.66ns ± 0% 6.47ns ± 0% +14.31% (p=0.029 n=4+4) Memmove/10 6.67ns ± 0% 6.22ns ± 0% -6.82% (p=0.029 n=4+4) Memmove/11 7.83ns ± 0% 6.45ns ± 0% -17.60% (p=0.029 n=4+4) Memmove/12 8.91ns ± 0% 6.25ns ± 0% -29.85% (p=0.029 n=4+4) Memmove/13 9.81ns ± 0% 6.48ns ± 0% -33.94% (p=0.029 n=4+4) Memmove/14 10.7ns ± 1% 6.4ns ± 0% -40.00% (p=0.029 n=4+4) Memmove/15 11.8ns ± 0% 6.7ns ± 0% -42.84% (p=0.029 n=4+4) Memmove/16 5.63ns ± 0% 5.56ns ± 0% -1.20% (p=0.029 n=4+4) Change-Id: I2de434f543c5a017395e0850fb9b9f7219583bbb Reviewed-on: https://go-review.googlesource.com/c/go/+/223317 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2024-09-29 21:34:28 -06:00 · 2020-03-12 10:26:01 -04:00 · 2020-03-12 10:26:01 -04:00 · cbd421f75b
commit cbd421f75b
parent f9f57c4443
1 changed files with 107 additions and 73 deletions
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@ -9,78 +9,112 @@
 // See memmove Go doc for important implementation constraints.

 // func memmove(to, from unsafe.Pointer, n uintptr)
+
+// target address
+#define TGT R3
+// source address
+#define SRC R4
+// length to move
+#define LEN R5
+// number of doublewords
+#define DWORDS R6
+// number of bytes < 8
+#define BYTES R7
+// const 16 used as index
+#define IDX16 R8
+// temp used for copies, etc.
+#define TMP R9
+// number of 32 byte chunks
+#define QWORDS R10
+
 TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
-	MOVD	to+0(FP), R3
-	MOVD	from+8(FP), R4
-	MOVD	n+16(FP), R5
+	MOVD	to+0(FP), TGT
+	MOVD	from+8(FP), SRC
+	MOVD	n+16(FP), LEN

 	// Determine if there are doublewords to
 	// copy so a more efficient move can be done
 check:
-	ANDCC	$7, R5, R7	// R7: bytes to copy
-	SRD	$3, R5, R6	// R6: double words to copy
-	CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy
+	ANDCC	$7, LEN, BYTES	// R7: bytes to copy
+	SRD	$3, LEN, DWORDS	// R6: double words to copy
+	MOVFL	CR0, CR3	// save CR from ANDCC
+	CMP	DWORDS, $0, CR1	// CR1[EQ] set if no double words to copy

 	// Determine overlap by subtracting dest - src and comparing against the
-	// length.  The catches the cases where src and dest are in different types
+	// length.  This catches the cases where src and dest are in different types
 	// of storage such as stack and static to avoid doing backward move when not
 	// necessary.

-	SUB	R4, R3, R8	// dest - src
-	CMPU	R8, R5, CR2	// < len?
+	SUB	SRC, TGT, TMP	// dest - src
+	CMPU	TMP, LEN, CR2	// < len?
 	BC	12, 8, backward // BLT CR2 backward

 	// Copying forward if no overlap.

-	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
-	SRDCC	$2,R6,R8		// 32 byte chunks?
-	BNE	forward32setup		//
-	MOVD	R6,CTR			// R6 = number of double words
-
-	// Move double words
-
-forward8:
-	MOVD    0(R4), R8		// double word
-	ADD     $8,R4
-	MOVD    R8, 0(R3)		//
-	ADD     $8,R3
-	BC      16, 0, forward8
-	BR	noforwardlarge		// handle remainder
+	BC	12, 6, checkbytes	// BEQ CR1, checkbytes
+	SRDCC	$2, DWORDS, QWORDS	// 32 byte chunks?
+	BEQ	lt32gt8			// < 32 bytes

 	// Prepare for moves of 32 bytes at a time.

 forward32setup:
-	DCBTST	(R3)			// prepare data cache
-	DCBT	(R4)
-	MOVD	R8, CTR			// double work count
-	MOVD	$16, R8
+	DCBTST	(TGT)			// prepare data cache
+	DCBT	(SRC)
+	MOVD	QWORDS, CTR		// Number of 32 byte chunks
+	MOVD	$16, IDX16		// 16 for index

 forward32:
-	LXVD2X	(R4+R0), VS32		// load 16 bytes
-	LXVD2X	(R4+R8), VS33
-	ADD	$32, R4
-	STXVD2X	VS32, (R3+R0)		// store 16 bytes
-	STXVD2X	VS33, (R3+R8)
-	ADD	$32,R3			// bump up for next set
+	LXVD2X	(R0)(SRC), VS32		// load 16 bytes
+	LXVD2X	(IDX16)(SRC), VS33	// load 16 bytes
+	ADD	$32, SRC
+	STXVD2X	VS32, (R0)(TGT)		// store 16 bytes
+	STXVD2X	VS33, (IDX16)(TGT)
+	ADD	$32,TGT			// bump up for next set
 	BC	16, 0, forward32	// continue
-	RLDCLCC	$61,R5,$3,R6		// remaining doublewords
-	BEQ	noforwardlarge
-	MOVD	R6,CTR			// set up the CTR
-	BR	forward8
+	ANDCC	$3, DWORDS		// remaining doublewords
+	BEQ	checkbytes		// only bytes remain

-noforwardlarge:
-	CMP	R7,$0			// any remaining bytes
-	BC	4, 1, LR		// ble lr
+lt32gt8:
+        // At this point >= 8 and < 32
+	// Move 16 bytes if possible
+	CMP     DWORDS, $2
+	BLT     lt16
+	LXVD2X	(R0)(SRC), VS32
+	ADD	$-2, DWORDS
+	STXVD2X	VS32, (R0)(TGT)
+	ADD     $16, SRC
+	ADD     $16, TGT

-forwardtail:
-	MOVD	R7, CTR			// move tail bytes
-
-forwardtailloop:
-	MOVBZ	0(R4), R8		// move single bytes
-	ADD	$1,R4
-	MOVBZ	R8, 0(R3)
-	ADD	$1,R3
-	BC	16, 0, forwardtailloop
+lt16:	// Move 8 bytes if possible
+	CMP     DWORDS, $1
+	BLT     checkbytes
+	MOVD    0(SRC), TMP
+	ADD	$8, SRC
+	MOVD    TMP, 0(TGT)
+	ADD     $8, TGT
+checkbytes:
+	BC	12, 14, LR		// BEQ lr
+lt8:	// Move word if possible
+	CMP BYTES, $4
+	BLT lt4
+	MOVWZ 0(SRC), TMP
+	ADD $-4, BYTES
+	MOVW TMP, 0(TGT)
+	ADD $4, SRC
+	ADD $4, TGT
+lt4:	// Move halfword if possible
+	CMP BYTES, $2
+	BLT lt2
+	MOVHZ 0(SRC), TMP
+	ADD $-2, BYTES
+	MOVH TMP, 0(TGT)
+	ADD $2, SRC
+	ADD $2, TGT
+lt2:	// Move last byte if 1 left
+	CMP BYTES, $1
+	BC 12, 0, LR	// ble lr
+	MOVBZ 0(SRC), TMP
+	MOVBZ TMP, 0(TGT)
 	RET

 backward:
@ -88,51 +122,51 @@ backward:
 	// R3 and R4 are advanced to the end of the destination/source buffers
 	// respectively and moved back as we copy.

-	ADD	R5, R4, R4		// end of source
-	ADD	R3, R5, R3		// end of dest
+	ADD	LEN, SRC, SRC		// end of source
+	ADD	TGT, LEN, TGT		// end of dest

 	BEQ	nobackwardtail		// earlier condition

-	MOVD	R7, CTR			// bytes to move
+	MOVD	BYTES, CTR			// bytes to move

 backwardtailloop:
-	MOVBZ 	-1(R4), R8		// point to last byte
-	SUB	$1,R4
-	MOVBZ 	R8, -1(R3)
-	SUB	$1,R3
+	MOVBZ 	-1(SRC), TMP		// point to last byte
+	SUB	$1,SRC
+	MOVBZ 	TMP, -1(TGT)
+	SUB	$1,TGT
 	BC	16, 0, backwardtailloop // bndz

 nobackwardtail:
 	BC	4, 5, LR		// ble CR1 lr

 backwardlarge:
-	MOVD	R6, CTR
-	SUB	R3, R4, R9		// Use vsx if moving
-	CMP	R9, $32			// at least 32 byte chunks
+	MOVD	DWORDS, CTR
+	SUB	TGT, SRC, TMP		// Use vsx if moving
+	CMP	TMP, $32		// at least 32 byte chunks
 	BLT	backwardlargeloop	// and distance >= 32
-	SRDCC	$2,R6,R8		// 32 byte chunks
+	SRDCC	$2,DWORDS,QWORDS	// 32 byte chunks
 	BNE	backward32setup

 backwardlargeloop:
-	MOVD 	-8(R4), R8
-	SUB	$8,R4
-	MOVD 	R8, -8(R3)
-	SUB	$8,R3
+	MOVD 	-8(SRC), TMP
+	SUB	$8,SRC
+	MOVD 	TMP, -8(TGT)
+	SUB	$8,TGT
 	BC	16, 0, backwardlargeloop // bndz
 	RET

 backward32setup:
-	MOVD	R8, CTR			// set up loop ctr
-	MOVD	$16, R8			// 32 bytes at at time
+	MOVD	QWORDS, CTR			// set up loop ctr
+	MOVD	$16, IDX16			// 32 bytes at at time

 backward32loop:
-	SUB	$32, R4
-	SUB	$32, R3
-	LXVD2X	(R4+R0), VS32           // load 16 bytes
-	LXVD2X	(R4+R8), VS33
-	STXVD2X	VS32, (R3+R0)           // store 16 bytes
-	STXVD2X	VS33, (R3+R8)
+	SUB	$32, TGT
+	SUB	$32, SRC
+	LXVD2X	(R0)(TGT), VS32           // load 16 bytes
+	LXVD2X	(IDX16)(TGT), VS33
+	STXVD2X	VS32, (R0)(SRC)           // store 16 bytes
+	STXVD2X	VS33, (IDX16)(SRC)
 	BC      16, 0, backward32loop   // bndz
 	BC	4, 5, LR		// ble CR1 lr
-	MOVD	R6, CTR
+	MOVD	DWORDS, CTR
 	BR	backwardlargeloop