Mirror of https://github.com/golang/go — synced 2024-11-23 04:40:09 -07:00

internal/bytealg: optimize Count/CountString for PPC64/Power10

Power10 adds a handful of new instructions which make this
noticeably quicker for smaller values.

Likewise, since the vector loop requires 32B to enter,
unroll it once to count 32B per iteration. This
improvement benefits all PPC64 cpus.

On Power10 comparing a binary built with GOPPC64=power8

CountSingle/10     8.99ns ± 0%    5.55ns ± 3%   -38.24%
CountSingle/16     7.55ns ± 0%    5.56ns ± 3%   -26.37%
CountSingle/17     7.45ns ± 0%    5.25ns ± 0%   -29.52%
CountSingle/31     18.4ns ± 0%     6.2ns ± 0%   -66.41%
CountSingle/32     6.17ns ± 0%    5.04ns ± 0%   -18.37%
CountSingle/33     7.13ns ± 0%    5.99ns ± 0%   -15.94%
CountSingle/4K      198ns ± 0%     115ns ± 0%   -42.08%
CountSingle/4M      190µs ± 0%     109µs ± 0%   -42.49%
CountSingle/64M    3.28ms ± 0%    2.08ms ± 0%   -36.53%

Furthermore, comparing the new tail implementation on
GOPPC64=power8 with GOPPC64=power10:

CountSingle/10     5.55ns ± 3%    4.52ns ± 1%  -18.66%
CountSingle/16     5.56ns ± 3%    4.80ns ± 0%  -13.65%
CountSingle/17     5.25ns ± 0%    4.79ns ± 0%   -8.78%
CountSingle/31     6.17ns ± 0%    4.82ns ± 0%  -21.79%
CountSingle/32     5.04ns ± 0%    5.09ns ± 6%   +1.01%
CountSingle/33     5.99ns ± 0%    5.42ns ± 2%   -9.54%

Change-Id: I62d80be3b5d706e1abbb4bec7d6278a939a5eed4
Reviewed-on: https://go-review.googlesource.com/c/go/+/512695
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Author: Paul E. Murphy — 2023-07-17 15:23:28 -05:00 (committed by Paul Murphy)
Parent: 02b548e5c8
Commit: 756841bffa

View File

@@ -8,89 +8,147 @@
#include "textflag.h"
// func Count(b []byte, c byte) int
// Splats the byte to count across V1 for the vector loop, and copies it
// into R5 for the scalar (pre-power10) tail, then tail-calls the shared body.
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	// R6 = byte to count
	MTVRD	R6, V1		// move compare byte
	MOVD	R6, R5		// countbytebody also expects the byte in R5
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)
// func CountString(s string, c byte) int
// Same contract as Count, but the byte already arrives in R5.
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
	// R3 = byte array pointer
	// R4 = length
	// R5 = byte to count
	MTVRD	R5, V1		// move compare byte
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)
// R3: addr of string
// R4: len of string
// R5: byte to count
// V1: byte to count, splatted.
// On exit:
// R3: return value
// endianness shouldn't matter since we are just counting and order
// is irrelevant
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD	$0, R18		// byte count

#ifndef GOPPC64_power10
	// Pre-power10 tails use CMPB, which needs the byte replicated
	// across all 8 bytes of R5.
	RLDIMI	$8, R5, $48, R5
	RLDIMI	$16, R5, $32, R5
	RLDIMI	$32, R5, $0, R5	// fill reg with the byte to count
#endif

	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
	BLT	tail		// Jump to the small string case
	SRD	$5, R4, R20	// 32B iterations
	MOVD	R20, CTR
	MOVD	$16, R21	// offset of the second 16B load
	XXLXOR	V4, V4, V4	// clear both accumulators
	XXLXOR	V5, V5, V5

	PCALIGN	$16
cmploop:
	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
	LXVD2X	(R21)(R3), V2
	VCMPEQUB V2, V1, V2
	VCMPEQUB V0, V1, V0
	VPOPCNTD V2, V2		// A match is 0xFF or 0. Count the bits into doubleword buckets.
	VPOPCNTD V0, V0
	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
	ADD	$32, R3
	BDNZ	cmploop

	// Fold both accumulators (two doublewords each) into R18.
	VADDUDM	V4, V5, V5
	MFVSRD	V5, R18
	VSLDOI	$8, V5, V5, V5
	MFVSRD	V5, R21
	ADD	R21, R18, R18
	ANDCC	$31, R4, R4	// R4 = remaining bytes (0-31)
	// Skip the tail processing if no bytes remaining.
	BEQ	tail_0

#ifdef GOPPC64_power10
	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.

tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLE	small_tail_p10
	LXV	0(R3), V0
	VCMPEQUB V0, V1, V0
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ANDCC	$15, R4, R4

small_tail_p10:	// Count the last 0 - 16 bytes with a partial load.
	SLD	$56, R4, R6	// LXVLL takes the length in the top byte of R6.
	LXVLL	R3, R6, V0
	VCMPEQUB V0, V1, V0
	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R3	// Return the count; already fixed up on this path.
	RET

#else
tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLT	tail_8
	MOVD	(R3), R12
	MOVD	8(R3), R14
	CMPB	R12, R5, R12	// 0xFF in each matching byte
	CMPB	R14, R5, R14
	POPCNTD	R12, R12	// 8x the number of matches
	POPCNTD	R14, R14
	ADD	R12, R18, R18
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ADD	$-16, R4, R4

tail_8:	// Count the remaining 0 - 15 bytes.
	CMP	R4, $8
	BLT	tail_4
	MOVD	(R3), R12
	CMPB	R12, R5, R12
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$8, R3, R3
	ADD	$-8, R4, R4

tail_4:	// Count the remaining 0 - 7 bytes.
	CMP	R4, $4
	BLT	tail_2
	MOVWZ	(R3), R12
	CMPB	R12, R5, R12
	SLD	$32, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$4, R3, R3
	ADD	$-4, R4, R4

tail_2:	// Count the remaining 0 - 3 bytes.
	CMP	R4, $2
	BLT	tail_1
	MOVHZ	(R3), R12
	CMPB	R12, R5, R12
	SLD	$48, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$2, R3, R3
	ADD	$-2, R4, R4

tail_1:	// Count the remaining 0 - 1 bytes.
	CMP	R4, $1
	BLT	tail_0
	MOVBZ	(R3), R12
	CMPB	R12, R5, R12	// R12 = 0xFF on match, 0 otherwise
	ANDCC	$0x8, R12, R12	// keep 8 (8x one match), consistent with the other tails
	ADD	R12, R18, R18
#endif

tail_0:	// No remaining tail to count.
	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
	RET