internal/bytealg: rewrite indexbytebody on PPC64

Use P8 instructions throughout to be backwards compatible, but otherwise not impede performance. Use overlapping loads where possible, and prioritize larger checks over smaller check. However, some newer instructions can be used surgically when targeting a newer GOPPC64. These can lead to noticeable performance improvements with minimal impact to readability. All tests run below on a Power10/ppc64le, and use a small modification to BenchmarkIndexByte to ensure the IndexByte wrapper call is inlined (as it likely is under realistic usage). This wrapper adds substantial overhead if not inlined. Previous (power9 path, GOPPC64=power8) vs. GOPPC64=power8: IndexByte/1 3.81ns ± 8% 3.11ns ± 5% -18.39% IndexByte/2 3.82ns ± 3% 3.20ns ± 6% -16.23% IndexByte/3 3.61ns ± 4% 3.25ns ± 6% -10.13% IndexByte/4 3.66ns ± 5% 3.08ns ± 1% -15.91% IndexByte/5 3.82ns ± 0% 3.75ns ± 2% -1.94% IndexByte/6 3.83ns ± 0% 3.87ns ± 4% +1.04% IndexByte/7 3.83ns ± 0% 3.82ns ± 0% -0.27% IndexByte/8 3.82ns ± 0% 2.92ns ±11% -23.70% IndexByte/9 3.70ns ± 2% 3.08ns ± 2% -16.87% IndexByte/10 3.74ns ± 2% 3.04ns ± 0% -18.75% IndexByte/11 3.75ns ± 0% 3.31ns ± 8% -11.79% IndexByte/12 3.74ns ± 0% 3.04ns ± 0% -18.86% IndexByte/13 3.83ns ± 4% 3.04ns ± 0% -20.64% IndexByte/14 3.80ns ± 1% 3.30ns ± 8% -13.18% IndexByte/15 3.77ns ± 1% 3.04ns ± 0% -19.33% IndexByte/16 3.81ns ± 0% 2.78ns ± 7% -26.88% IndexByte/17 4.12ns ± 0% 3.04ns ± 1% -26.11% IndexByte/18 4.27ns ± 6% 3.05ns ± 0% -28.64% IndexByte/19 4.30ns ± 4% 3.02ns ± 2% -29.65% IndexByte/20 4.43ns ± 7% 3.45ns ± 7% -22.15% IndexByte/21 4.12ns ± 0% 3.03ns ± 1% -26.35% IndexByte/22 4.40ns ± 6% 3.05ns ± 0% -30.82% IndexByte/23 4.40ns ± 6% 3.01ns ± 2% -31.48% IndexByte/24 4.32ns ± 5% 3.07ns ± 0% -28.98% IndexByte/25 4.76ns ± 2% 3.04ns ± 1% -36.11% IndexByte/26 4.82ns ± 0% 3.05ns ± 0% -36.66% IndexByte/27 4.82ns ± 0% 2.97ns ± 3% -38.39% IndexByte/28 4.82ns ± 0% 2.96ns ± 3% -38.57% IndexByte/29 4.82ns ± 0% 3.34ns ± 9% -30.71% IndexByte/30 4.82ns ± 0% 3.05ns ± 0% -36.77% IndexByte/31 4.81ns ± 0% 3.05ns ± 0% -36.70% IndexByte/32 3.52ns ± 0% 3.44ns ± 1% -2.15% IndexByte/33 4.77ns ± 1% 3.35ns ± 0% -29.81% IndexByte/34 5.01ns ± 5% 3.35ns ± 0% -33.15% IndexByte/35 4.92ns ± 9% 3.35ns ± 0% -31.89% IndexByte/36 4.81ns ± 5% 3.35ns ± 0% -30.37% IndexByte/37 4.99ns ± 6% 3.35ns ± 0% -32.86% IndexByte/38 5.06ns ± 5% 3.35ns ± 0% -33.84% IndexByte/39 5.02ns ± 5% 3.48ns ± 9% -30.58% IndexByte/40 5.21ns ± 9% 3.55ns ± 4% -31.82% IndexByte/41 5.18ns ± 0% 3.42ns ± 2% -33.98% IndexByte/42 5.19ns ± 0% 3.55ns ±11% -31.56% IndexByte/43 5.18ns ± 0% 3.45ns ± 5% -33.46% IndexByte/44 5.18ns ± 0% 3.39ns ± 0% -34.56% IndexByte/45 5.18ns ± 0% 3.43ns ± 4% -33.74% IndexByte/46 5.18ns ± 0% 3.47ns ± 1% -33.03% IndexByte/47 5.18ns ± 0% 3.44ns ± 2% -33.54% IndexByte/48 5.18ns ± 0% 3.39ns ± 0% -34.52% IndexByte/49 5.69ns ± 0% 3.79ns ± 0% -33.45% IndexByte/50 5.70ns ± 0% 3.70ns ± 3% -34.98% IndexByte/51 5.70ns ± 0% 3.70ns ± 2% -35.05% IndexByte/52 5.69ns ± 0% 3.80ns ± 1% -33.35% IndexByte/53 5.69ns ± 0% 3.78ns ± 0% -33.54% IndexByte/54 5.69ns ± 0% 3.78ns ± 1% -33.51% IndexByte/55 5.69ns ± 0% 3.78ns ± 0% -33.61% IndexByte/56 5.69ns ± 0% 3.81ns ± 3% -33.12% IndexByte/57 6.20ns ± 0% 3.79ns ± 4% -38.89% IndexByte/58 6.20ns ± 0% 3.74ns ± 2% -39.58% IndexByte/59 6.20ns ± 0% 3.69ns ± 2% -40.47% IndexByte/60 6.20ns ± 0% 3.79ns ± 1% -38.81% IndexByte/61 6.20ns ± 0% 3.77ns ± 1% -39.23% IndexByte/62 6.20ns ± 0% 3.79ns ± 0% -38.89% IndexByte/63 6.20ns ± 0% 3.79ns ± 0% -38.90% IndexByte/64 4.17ns ± 0% 3.47ns ± 3% -16.70% IndexByte/65 5.38ns ± 0% 4.21ns ± 0% -21.59% IndexByte/66 5.38ns ± 0% 4.21ns ± 0% -21.58% IndexByte/67 5.38ns ± 0% 4.22ns ± 0% -21.58% IndexByte/68 5.38ns ± 0% 4.22ns ± 0% -21.59% IndexByte/69 5.38ns ± 0% 4.22ns ± 0% -21.56% IndexByte/70 5.38ns ± 0% 4.21ns ± 0% -21.59% IndexByte/71 5.37ns ± 0% 4.21ns ± 0% -21.51% IndexByte/72 5.37ns ± 0% 4.22ns ± 0% -21.46% IndexByte/73 5.71ns ± 0% 4.22ns ± 0% -26.20% IndexByte/74 5.71ns ± 0% 4.21ns ± 0% -26.21% IndexByte/75 5.71ns ± 0% 4.21ns ± 0% -26.17% IndexByte/76 5.71ns ± 0% 4.22ns ± 0% -26.22% IndexByte/77 5.71ns ± 0% 4.22ns ± 0% -26.22% IndexByte/78 5.71ns ± 0% 4.21ns ± 0% -26.22% IndexByte/79 5.71ns ± 0% 4.22ns ± 0% -26.21% IndexByte/80 5.71ns ± 0% 4.21ns ± 0% -26.19% IndexByte/81 6.20ns ± 0% 4.39ns ± 0% -29.13% IndexByte/82 6.20ns ± 0% 4.36ns ± 0% -29.67% IndexByte/83 6.20ns ± 0% 4.36ns ± 0% -29.63% IndexByte/84 6.20ns ± 0% 4.39ns ± 0% -29.21% IndexByte/85 6.20ns ± 0% 4.36ns ± 0% -29.64% IndexByte/86 6.20ns ± 0% 4.36ns ± 0% -29.63% IndexByte/87 6.20ns ± 0% 4.39ns ± 0% -29.21% IndexByte/88 6.20ns ± 0% 4.36ns ± 0% -29.65% IndexByte/89 6.74ns ± 0% 4.36ns ± 0% -35.33% IndexByte/90 6.75ns ± 0% 4.37ns ± 0% -35.22% IndexByte/91 6.74ns ± 0% 4.36ns ± 0% -35.30% IndexByte/92 6.74ns ± 0% 4.36ns ± 0% -35.34% IndexByte/93 6.74ns ± 0% 4.37ns ± 0% -35.20% IndexByte/94 6.74ns ± 0% 4.36ns ± 0% -35.33% IndexByte/95 6.75ns ± 0% 4.36ns ± 0% -35.32% IndexByte/96 4.83ns ± 0% 4.34ns ± 2% -10.24% IndexByte/97 5.91ns ± 0% 4.65ns ± 0% -21.24% IndexByte/98 5.91ns ± 0% 4.65ns ± 0% -21.24% IndexByte/99 5.91ns ± 0% 4.65ns ± 0% -21.23% IndexByte/100 5.90ns ± 0% 4.65ns ± 0% -21.21% IndexByte/101 5.90ns ± 0% 4.65ns ± 0% -21.22% IndexByte/102 5.90ns ± 0% 4.65ns ± 0% -21.23% IndexByte/103 5.91ns ± 0% 4.65ns ± 0% -21.23% IndexByte/104 5.91ns ± 0% 4.65ns ± 0% -21.24% IndexByte/105 6.25ns ± 0% 4.65ns ± 0% -25.59% IndexByte/106 6.25ns ± 0% 4.65ns ± 0% -25.59% IndexByte/107 6.25ns ± 0% 4.65ns ± 0% -25.60% IndexByte/108 6.25ns ± 0% 4.65ns ± 0% -25.58% IndexByte/109 6.24ns ± 0% 4.65ns ± 0% -25.50% IndexByte/110 6.25ns ± 0% 4.65ns ± 0% -25.56% IndexByte/111 6.25ns ± 0% 4.65ns ± 0% -25.60% IndexByte/112 6.25ns ± 0% 4.65ns ± 0% -25.59% IndexByte/113 6.76ns ± 0% 5.05ns ± 0% -25.37% IndexByte/114 6.76ns ± 0% 5.05ns ± 0% -25.31% IndexByte/115 6.76ns ± 0% 5.05ns ± 0% -25.38% IndexByte/116 6.76ns ± 0% 5.05ns ± 0% -25.31% IndexByte/117 6.76ns ± 0% 5.05ns ± 0% -25.38% IndexByte/118 6.76ns ± 0% 5.05ns ± 0% -25.31% IndexByte/119 6.76ns ± 0% 5.05ns ± 0% -25.38% IndexByte/120 6.76ns ± 0% 5.05ns ± 0% -25.36% IndexByte/121 7.35ns ± 0% 5.05ns ± 0% -31.33% IndexByte/122 7.36ns ± 0% 5.05ns ± 0% -31.42% IndexByte/123 7.38ns ± 0% 5.05ns ± 0% -31.60% IndexByte/124 7.38ns ± 0% 5.05ns ± 0% -31.59% IndexByte/125 7.38ns ± 0% 5.05ns ± 0% -31.60% IndexByte/126 7.38ns ± 0% 5.05ns ± 0% -31.58% IndexByte/128 5.28ns ± 0% 5.10ns ± 0% -3.41% IndexByte/256 7.27ns ± 0% 7.28ns ± 2% +0.13% IndexByte/512 12.1ns ± 0% 11.8ns ± 0% -2.51% IndexByte/1K 23.1ns ± 3% 22.0ns ± 0% -4.66% IndexByte/2K 42.6ns ± 0% 42.4ns ± 0% -0.41% IndexByte/4K 90.3ns ± 0% 89.4ns ± 0% -0.98% IndexByte/8K 170ns ± 0% 170ns ± 0% -0.59% IndexByte/16K 331ns ± 0% 330ns ± 0% -0.27% IndexByte/32K 660ns ± 0% 660ns ± 0% -0.08% IndexByte/64K 1.30µs ± 0% 1.30µs ± 0% -0.08% IndexByte/128K 2.58µs ± 0% 2.58µs ± 0% -0.04% IndexByte/256K 5.15µs ± 0% 5.15µs ± 0% -0.04% IndexByte/512K 10.3µs ± 0% 10.3µs ± 0% -0.03% IndexByte/1M 20.6µs ± 0% 20.5µs ± 0% -0.03% IndexByte/2M 41.1µs ± 0% 41.1µs ± 0% -0.03% IndexByte/4M 82.2µs ± 0% 82.1µs ± 0% -0.02% IndexByte/8M 164µs ± 0% 164µs ± 0% -0.01% IndexByte/16M 328µs ± 0% 328µs ± 0% -0.01% IndexByte/32M 657µs ± 0% 657µs ± 0% -0.00% GOPPC64=power8 vs GOPPC64=power9. The Improvement is most noticed between 16 and 64B, and goes away around 128B. IndexByte/16 2.78ns ± 7% 2.65ns ±15% -4.74% IndexByte/17 3.04ns ± 1% 2.80ns ± 3% -7.85% IndexByte/18 3.05ns ± 0% 2.71ns ± 4% -11.00% IndexByte/19 3.02ns ± 2% 2.76ns ±10% -8.74% IndexByte/20 3.45ns ± 7% 2.91ns ± 0% -15.46% IndexByte/21 3.03ns ± 1% 2.84ns ± 9% -6.33% IndexByte/22 3.05ns ± 0% 2.67ns ± 1% -12.38% IndexByte/23 3.01ns ± 2% 2.67ns ± 1% -11.24% IndexByte/24 3.07ns ± 0% 2.92ns ±12% -4.79% IndexByte/25 3.04ns ± 1% 3.15ns ±15% +3.63% IndexByte/26 3.05ns ± 0% 2.83ns ±13% -7.33% IndexByte/27 2.97ns ± 3% 2.98ns ±10% +0.56% IndexByte/28 2.96ns ± 3% 2.96ns ± 9% -0.05% IndexByte/29 3.34ns ± 9% 3.03ns ±12% -9.33% IndexByte/30 3.05ns ± 0% 2.68ns ± 1% -12.05% IndexByte/31 3.05ns ± 0% 2.83ns ±12% -7.27% IndexByte/32 3.44ns ± 1% 3.21ns ±10% -6.78% IndexByte/33 3.35ns ± 0% 3.41ns ± 2% +1.95% IndexByte/34 3.35ns ± 0% 3.13ns ± 0% -6.53% IndexByte/35 3.35ns ± 0% 3.13ns ± 0% -6.54% IndexByte/36 3.35ns ± 0% 3.13ns ± 0% -6.52% IndexByte/37 3.35ns ± 0% 3.13ns ± 0% -6.52% IndexByte/38 3.35ns ± 0% 3.24ns ± 4% -3.30% IndexByte/39 3.48ns ± 9% 3.44ns ± 2% -1.19% IndexByte/40 3.55ns ± 4% 3.46ns ± 2% -2.44% IndexByte/41 3.42ns ± 2% 3.39ns ± 4% -0.86% IndexByte/42 3.55ns ±11% 3.46ns ± 1% -2.65% IndexByte/43 3.45ns ± 5% 3.44ns ± 2% -0.31% IndexByte/44 3.39ns ± 0% 3.43ns ± 3% +1.23% IndexByte/45 3.43ns ± 4% 3.50ns ± 1% +2.07% IndexByte/46 3.47ns ± 1% 3.46ns ± 2% -0.31% IndexByte/47 3.44ns ± 2% 3.47ns ± 1% +0.78% IndexByte/48 3.39ns ± 0% 3.46ns ± 2% +1.96% IndexByte/49 3.79ns ± 0% 3.47ns ± 0% -8.41% IndexByte/50 3.70ns ± 3% 3.64ns ± 5% -1.66% IndexByte/51 3.70ns ± 2% 3.75ns ± 0% +1.40% IndexByte/52 3.80ns ± 1% 3.77ns ± 0% -0.70% IndexByte/53 3.78ns ± 0% 3.77ns ± 0% -0.46% IndexByte/54 3.78ns ± 1% 3.53ns ± 7% -6.74% IndexByte/55 3.78ns ± 0% 3.47ns ± 0% -8.17% IndexByte/56 3.81ns ± 3% 3.45ns ± 0% -9.43% IndexByte/57 3.79ns ± 4% 3.47ns ± 0% -8.45% IndexByte/58 3.74ns ± 2% 3.55ns ± 4% -5.16% IndexByte/59 3.69ns ± 2% 3.61ns ± 4% -2.01% IndexByte/60 3.79ns ± 1% 3.45ns ± 0% -9.09% IndexByte/61 3.77ns ± 1% 3.47ns ± 0% -7.93% IndexByte/62 3.79ns ± 0% 3.45ns ± 0% -8.97% IndexByte/63 3.79ns ± 0% 3.47ns ± 0% -8.44% IndexByte/64 3.47ns ± 3% 3.18ns ± 0% -8.41% GOPPC64=power9 vs GOPPC64=power10. Only sizes <16 will show meaningful changes. IndexByte/1 3.27ns ± 8% 2.36ns ± 2% -27.58% IndexByte/2 3.06ns ± 4% 2.34ns ± 1% -23.42% IndexByte/3 3.77ns ±11% 2.48ns ± 7% -34.03% IndexByte/4 3.18ns ± 8% 2.33ns ± 1% -26.69% IndexByte/5 3.18ns ± 5% 2.34ns ± 4% -26.26% IndexByte/6 3.13ns ± 3% 2.35ns ± 1% -24.97% IndexByte/7 3.25ns ± 1% 2.33ns ± 1% -28.22% IndexByte/8 2.79ns ± 2% 2.36ns ± 1% -15.32% IndexByte/9 2.90ns ± 0% 2.34ns ± 2% -19.36% IndexByte/10 2.99ns ± 3% 2.31ns ± 1% -22.70% IndexByte/11 3.13ns ± 7% 2.31ns ± 0% -26.08% IndexByte/12 3.01ns ± 4% 2.32ns ± 1% -22.91% IndexByte/13 2.98ns ± 3% 2.31ns ± 1% -22.72% IndexByte/14 2.92ns ± 2% 2.61ns ±16% -10.58% IndexByte/15 3.02ns ± 5% 2.69ns ± 7% -10.90% IndexByte/16 2.65ns ±15% 2.29ns ± 1% -13.61% Change-Id: I4482f762d25eabf60def4981a0b2bc0c10ccf50c Reviewed-on: https://go-review.googlesource.com/c/go/+/478656 Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: Bryan Mills <bcmills@google.com> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Paul Murphy <murp@ibm.com> Reviewed-by: Archana Ravindar <aravind5@in.ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org>
2024-11-17 00:14:50 -07:00 · 2023-03-06 16:51:31 -06:00 · 2023-03-06 16:51:31 -06:00 · db32aba508
commit db32aba508
parent ccb8db88c5
1 changed files with 249 additions and 326 deletions
--- a/src/internal/bytealg/indexbyte_ppc64x.s
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@ -11,381 +11,304 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
 	// R3 = byte array pointer
 	// R4 = length
 	MOVD	R6, R5		// R5 = byte
-	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
 	BR	indexbytebody<>(SB)

 TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
 	// R3 = string
 	// R4 = length
 	// R5 = byte
-	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
 	BR	indexbytebody<>(SB)

+#ifndef GOPPC64_power9
+#ifdef GOARCH_ppc64le
+DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
+DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
+#else
+DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
+DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
+#endif
+GLOBL indexbytevbperm<>+0(SB), RODATA, $16
+#endif
+
+// Some operations are endian specific, choose the correct opcode base on GOARCH.
+// Note, _VCZBEBB is only available on power9 and newer.
+#ifdef GOARCH_ppc64le
+#define _LDBEX	MOVDBR
+#define _LWBEX	MOVWBR
+#define _LHBEX	MOVHBR
+#define _VCZBEBB VCTZLSBB
+#else
+#define _LDBEX	MOVD
+#define _LWBEX	MOVW
+#define _LHBEX	MOVH
+#define _VCZBEBB VCLZLSBB
+#endif
+
 // R3 = addr of string
 // R4 = len of string
 // R5 = byte to find
-// R16 = 1 if running on a POWER9 system, 0 otherwise
 // On exit:
 // R3 = return value
 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
-	MOVD	R3,R17		// Save base address for calculating the index later.
-	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
-	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
-	ADD	R4,R3,R7	// Last acceptable address in R7.
+	CMPU	R4,$32

-	RLDIMI	$16,R5,$32,R5
-	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
-	MOVD	$-1,R9
-	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask (r3&0x7)*8
-	RLDIMI	$32,R5,$0,R5
-	MOVD	R7,R10		// Save last acceptable address in R10 for later.
-	ADD	$-1,R7,R7
-#ifdef GOARCH_ppc64le
-	SLD	R6,R9,R9	// Prepare mask for Little Endian
-#else
-	SRD	R6,R9,R9	// Same for Big Endian
+#ifndef GOPPC64_power9
+	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
+	MOVD	$indexbytevbperm<>+00(SB),R16
+	LXVD2X	(R16),V0	// Set up swap string
 #endif
-	BLT	small_string	// Jump to the small string case if it's <32 bytes.
-	CMP	R16,$1		// optimize for power8 v power9
-	BNE	power8
-	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
+
 	MTVRD	R5,V1
-	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
-	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
 	VSPLTB	$7,V1,V1	// Replicate byte across V1
-	CMP	R4,$64
+
+	BLT	cmp16		// Jump to the small string case if it's <32 bytes.
+
+	CMP	R4,$64,CR1
 	MOVD	$16,R11
 	MOVD	R3,R8
-	BLT	cmp32
+	BLT	CR1,cmp32	// Special case for length 32 - 63
 	MOVD	$32,R12
 	MOVD	$48,R6

+	RLDICR  $0,R4,$63-6,R9	// R9 = len &^ 63
+	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
+	ANDCC	$63,R4		// (len &= 63) cmp 0.
+
+	PCALIGN	$16
 loop64:
-	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
+	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat0	// match found at R8, jump out
+	BNE	CR6,foundat0	// Match found at R8, jump out

-	LXVB16X	(R8)(R11),V2
+	LXVD2X	(R11)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out
+	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out

-	LXVB16X	(R8)(R12),V2
+	LXVD2X	(R12)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out
+	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out

-	LXVB16X	(R8)(R6),V2
+	LXVD2X	(R6)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
+	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out
+
 	ADD	$64,R8
-	ADD	$-64,R4
-	CMP	R4,$64		// >=64 bytes left to scan?
-	BGE	loop64
-	CMP	R4,$32
-	BLT	rem		// jump to rem if there are < 32 bytes left
-cmp32:
-	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
+	CMPU	R8,R9,CR1
+	BNE	CR1,loop64	// R8 != &s[len &^ 63]?
+
+	PCALIGN	$32
+	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.
+
+	CMP	R4,$32		// Tail length >= 32, use cmp32 path.
+	CMP	R4,$16,CR1
+	BGE	cmp32
+
+	ADD	R8,R4,R9
+	ADD	$-16,R9
+	BLE	CR1,cmp64_tail_gt0
+
+cmp64_tail_gt16:	// Tail length 17 - 32
+	LXVD2X	(R0)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat0	// match found at R8
+	BNE	CR6,foundat0

-	LXVB16X	(R11)(R8),V2
+cmp64_tail_gt0:	// Tail length 1 - 16
+	MOVD	R9,R8
+	LXVD2X	(R0)(R9),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat1	// match found at R8+16
+	BNE	CR6,foundat0

-	ADD	$32,R8
-	ADD	$-32,R4
-rem:
-	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
-	BR	small_string
+	BR	notfound

+cmp32:	// Length 32 - 63
+
+	// Bytes 0 - 15
+	LXVD2X	(R0)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0
+
+	// Bytes 16 - 31
+	LXVD2X	(R8)(R11),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1		// Match found at R8+16 bytes, jump out
+
+	BEQ	notfound		// Is length <= 32? (CR0 holds this comparison on entry to cmp32)
+	CMP	R4,$48
+
+	ADD	R4,R8,R9		// Compute &s[len(s)-16]
+	ADD	$32,R8,R8
+	ADD	$-16,R9,R9
+	ISEL	CR0GT,R8,R9,R8		// R8 = len(s) <= 48 ? R9 : R8
+
+	// Bytes 33 - 47
+	LXVD2X	(R0)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0		// match found at R8+32 bytes, jump out
+
+	BLE	notfound
+
+	// Bytes 48 - 63
+	MOVD	R9,R8			// R9 holds the final check.
+	LXVD2X	(R0)(R9),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0		// Match found at R8+48 bytes, jump out
+
+	BR	notfound
+
+// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
+#ifndef GOPPC64_power9
+#define ADJUST_FOR_CNTLZW -16
+#else
+#define ADJUST_FOR_CNTLZW 0
+#endif
+
+// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
+// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
 foundat3:
-	ADD	$16,R8
+	SUB	R3,R8,R3
+	ADD	$48+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
 foundat2:
-	ADD	$16,R8
+	SUB	R3,R8,R3
+	ADD	$32+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
 foundat1:
-	ADD	$16,R8
+	SUB	R3,R8,R3
+	ADD	$16+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
 foundat0:
-	// Compress the result into a single doubleword and
-	// move it to a GPR for the final calculation.
-	VBPERMQ	V6,V10,V6
-	MFVRD	V6,R3
-	// count leading zeroes upto the match that ends up in low 16 bits
-	// in both endian modes, compute index by subtracting the number by 16
-	CNTLZW	R3,R11
-	ADD	$-16,R11
-	ADD	R8,R11,R3	// Calculate byte address
-	SUB	R17,R3
+	SUB	R3,R8,R3
+	ADD	$0+ADJUST_FOR_CNTLZW,R3
+vfound:
+	// Map equal values into a 16 bit value with earlier matches setting higher bits.
+#ifndef GOPPC64_power9
+	VBPERMQ	V6,V0,V6
+	MFVRD	V6,R4
+	CNTLZW	R4,R4
+#else
+#ifdef GOARCH_ppc64le
+	// Put the value back into LE ordering by swapping doublewords.
+	XXPERMDI	V6,V6,$2,V6
+#endif
+	_VCZBEBB	V6,R4
+#endif
+	ADD	R3,R4,R3
 	RET
-power8:
-	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
-	// in V0, V1 and V10, then branch to the preloop.
-	ANDCC	$63,R3,R11
-	BEQ	CR0,qw_align
-	RLDICL	$0,R3,$61,R11

-	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
-	CMPB	R12,R5,R3	// Check for a match.
-	AND	R9,R3,R3	// Mask bytes below s_base
-	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
-	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
-	BNE	CR7,done
-	ADD	$8,R8,R8
+cmp16:	// Length 16 - 31
+	CMPU	R4,$16
+	ADD	R4,R3,R9
+	BLT	cmp8
+
+	ADD	$-16,R9,R9		// &s[len(s)-16]
+
+	// Bytes 0 - 15
+	LXVD2X	(R0)(R3),V2
+	VCMPEQUBCC	V2,V1,V6
+	MOVD	R3,R8
+	BNE	CR6,foundat0		// Match found at R8+32 bytes, jump out
+
+	BEQ	notfound
+
+	// Bytes 16 - 30
+	MOVD	R9,R8			// R9 holds the final check.
+	LXVD2X	(R0)(R9),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0		// Match found at R8+48 bytes, jump out
+
+	BR	notfound
+
+
+cmp8:	// Length 8 - 15
+#ifdef GOPPC64_power10
+	// Load all the bytes into a single VSR in BE order.
+	SLD	$56,R4,R5
+	LXVLL	R3,R5,V2
+	// Compare and count the number which don't match.
+	VCMPEQUB	V2,V1,V6
+	VCLZLSBB	V6,R3
+	// If count is the number of bytes, or more. No matches are found.
+	CMPU	R3,R4
+	MOVD	$-1,R5
+	// Otherwise, the count is the index of the first match.
+	ISEL	CR0LT,R3,R5,R3
+	RET
+#else
+	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
+	RLDIMI	$16,R5,$32,R5
+	RLDIMI	$32,R5,$0,R5
+	CMPU	R4,$8
+	BLT	cmp4
+	MOVD	$-8,R11
 	ADD	$-8,R4,R4
+
+	_LDBEX	(R0)(R3),R10
+	_LDBEX	(R11)(R9),R11
+	CMPB	R10,R5,R10
+	CMPB	R11,R5,R11
+	CMPU	R10,$0
+	CMPU	R11,$0,CR1
+	CNTLZD	R10,R10
+	CNTLZD	R11,R11
+	SRD	$3,R10,R3
+	SRD	$3,R11,R11
+	BNE	found
+
 	ADD	R4,R11,R4
-
-	// Check for quadword alignment
-	ANDCC	$15,R8,R11
-	BEQ	CR0,qw_align
-
-	// Not aligned, so handle the next doubleword
-	MOVD	0(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR7
-	BNE	CR7,done
-	ADD	$8,R8,R8
-	ADD	$-8,R4,R4
-
-	// Either quadword aligned or 64-byte at this point. We can use LVX.
-qw_align:
-
-	// Set up auxiliary data for the vectorized algorithm.
-	VSPLTISB  $0,V0		// Replicate 0 across V0
-	VSPLTISB  $3,V10	// Use V10 as control for VBPERMQ
-	MTVRD	  R5,V1
-	LVSL	  (R0+R0),V11
-	VSLB	  V11,V10,V10
-	VSPLTB	  $7,V1,V1	// Replicate byte across V1
-	CMPU	  R4, $64	// If len ≤ 64, don't use the vectorized loop
-	BLE	  tail
-
-	// We will load 4 quardwords per iteration in the loop, so check for
-	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
-	ANDCC	  $63,R8,R11
-	BEQ	  CR0,preloop
-
-	// Not 64-byte aligned. Load one quadword at a time until aligned.
-	LVX	    (R8+R0),V4
-	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
-	BNE	    CR6,found_qw_align
-	ADD	    $16,R8,R8
-	ADD	    $-16,R4,R4
-
-	ANDCC	    $63,R8,R11
-	BEQ	    CR0,preloop
-	LVX	    (R8+R0),V4
-	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
-	BNE	    CR6,found_qw_align
-	ADD	    $16,R8,R8
-	ADD	    $-16,R4,R4
-
-	ANDCC	    $63,R8,R11
-	BEQ	    CR0,preloop
-	LVX	    (R8+R0),V4
-	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
-	BNE	    CR6,found_qw_align
-	ADD	    $-16,R4,R4
-	ADD	    $16,R8,R8
-
-	// 64-byte aligned. Prepare for the main loop.
-preloop:
-	CMPU	R4,$64
-	BLE	tail	      // If len ≤ 64, don't use the vectorized loop
-
-	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
-	// per loop iteration. The last doubleword is in R10, so our loop counter
-	// starts at (R10-R8)/64.
-	SUB	R8,R10,R6
-	SRD	$6,R6,R9      // Loop counter in R9
-	MOVD	R9,CTR
-
-	ADD	$-64,R8,R8   // Adjust index for loop entry
-	MOVD	$16,R11      // Load offsets for the vector loads
-	MOVD	$32,R9
-	MOVD	$48,R7
-
-	// Main loop we will load 64 bytes per iteration
-loop:
-	ADD	    $64,R8,R8	      // Fuse addi+lvx for performance
-	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
-	LVX	    (R8+R11),V3
-	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
-	VCMPEQUB    V1,V3,V7
-
-	LVX	    (R8+R9),V4
-	LVX	    (R8+R7),V5
-	VCMPEQUB    V1,V4,V8
-	VCMPEQUB    V1,V5,V9
-
-	VOR	    V6,V7,V11	      // Compress the result in a single vector
-	VOR	    V8,V9,V12
-	VOR	    V11,V12,V13
-	VCMPEQUBCC  V0,V13,V14	      // Check for byte
-	BGE	    CR6,found
-	BC	    16,0,loop	      // bdnz loop
-
-	// Handle the tailing bytes or R4 ≤ 64
-	RLDICL	$0,R6,$58,R4
-	ADD	$64,R8,R8
-tail:
-	CMPU	    R4,$0
-	BEQ	    notfound
-	LVX	    (R8+R0),V4
-	VCMPEQUBCC  V1,V4,V6
-	BNE	    CR6,found_qw_align
-	ADD	    $16,R8,R8
-	CMPU	    R4,$16,CR6
-	BLE	    CR6,notfound
-	ADD	    $-16,R4,R4
-
-	LVX	    (R8+R0),V4
-	VCMPEQUBCC  V1,V4,V6
-	BNE	    CR6,found_qw_align
-	ADD	    $16,R8,R8
-	CMPU	    R4,$16,CR6
-	BLE	    CR6,notfound
-	ADD	    $-16,R4,R4
-
-	LVX	    (R8+R0),V4
-	VCMPEQUBCC  V1,V4,V6
-	BNE	    CR6,found_qw_align
-	ADD	    $16,R8,R8
-	CMPU	    R4,$16,CR6
-	BLE	    CR6,notfound
-	ADD	    $-16,R4,R4
-
-	LVX	    (R8+R0),V4
-	VCMPEQUBCC  V1,V4,V6
-	BNE	    CR6,found_qw_align
-
-notfound:
-	MOVD	$-1, R3
+	MOVD	$-1,R3
+	ISEL	CR1EQ,R3,R4,R3
 	RET

+cmp4:	// Length 4 - 7
+	CMPU	R4,$4
+	BLT	cmp2
+	MOVD	$-4,R11
+	ADD	$-4,R4,R4
+
+	_LWBEX	(R0)(R3),R10
+	_LWBEX	(R11)(R9),R11
+	CMPB	R10,R5,R10
+	CMPB	R11,R5,R11
+	CNTLZW	R10,R10
+	CNTLZW	R11,R11
+	CMPU	R10,$32
+	CMPU	R11,$32,CR1
+	SRD	$3,R10,R3
+	SRD	$3,R11,R11
+	BNE	found
+
+	ADD	R4,R11,R4
+	MOVD	$-1,R3
+	ISEL	CR1EQ,R3,R4,R3
+	RET
+
+cmp2:	// Length 2 - 3
+	CMPU	R4,$2
+	BLT	cmp1
+
+	_LHBEX	(R0)(R3),R10
+	CMPB	R10,R5,R10
+	SLDCC	$48,R10,R10
+	CNTLZD	R10,R10
+	SRD	$3,R10,R3
+	BNE	found
+
+cmp1:	// Length 1
+	MOVD	$-1,R3
+	ANDCC	$1,R4,R31
+	BEQ	found
+
+	MOVBZ	-1(R9),R10
+	CMPB	R10,R5,R10
+	ANDCC	$1,R10
+	ADD	$-1,R4
+	ISEL	CR0EQ,R3,R4,R3
+
 found:
-	// We will now compress the results into a single doubleword,
-	// so it can be moved to a GPR for the final index calculation.
-
-	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
-	// first bit of each byte into bits 48-63.
-	VBPERMQ	  V6,V10,V6
-	VBPERMQ	  V7,V10,V7
-	VBPERMQ	  V8,V10,V8
-	VBPERMQ	  V9,V10,V9
-
-	// Shift each 16-bit component into its correct position for
-	// merging into a single doubleword.
-#ifdef GOARCH_ppc64le
-	VSLDOI	  $2,V7,V7,V7
-	VSLDOI	  $4,V8,V8,V8
-	VSLDOI	  $6,V9,V9,V9
-#else
-	VSLDOI	  $6,V6,V6,V6
-	VSLDOI	  $4,V7,V7,V7
-	VSLDOI	  $2,V8,V8,V8
+	RET
 #endif

-	// Merge V6-V9 into a single doubleword and move to a GPR.
-	VOR	V6,V7,V11
-	VOR	V8,V9,V4
-	VOR	V4,V11,V4
-	MFVRD	V4,R3
-
-#ifdef GOARCH_ppc64le
-	ADD	  $-1,R3,R11
-	ANDN	  R3,R11,R11
-	POPCNTD	  R11,R11	// Count trailing zeros (Little Endian).
-#else
-	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
-#endif
-	ADD	R8,R11,R3	// Calculate byte address
-
-return:
-	SUB	R17, R3
+notfound:
+	MOVD $-1,R3
 	RET

-found_qw_align:
-	// Use the same algorithm as above. Compress the result into
-	// a single doubleword and move it to a GPR for the final
-	// calculation.
-	VBPERMQ	  V6,V10,V6
-
-#ifdef GOARCH_ppc64le
-	MFVRD	  V6,R3
-	ADD	  $-1,R3,R11
-	ANDN	  R3,R11,R11
-	POPCNTD	  R11,R11
-#else
-	VSLDOI	  $6,V6,V6,V6
-	MFVRD	  V6,R3
-	CNTLZD	  R3,R11
-#endif
-	ADD	  R8,R11,R3
-	CMPU	  R11,R4
-	BLT	  return
-	BR	  notfound
-	PCALIGN	  $16
-
-done:
-	ADD	$-1,R10,R6
-	// Offset of last index for the final
-	// doubleword comparison
-	RLDICL	$0,R6,$61,R6
-	// At this point, R3 has 0xFF in the same position as the byte we are
-	// looking for in the doubleword. Use that to calculate the exact index
-	// of the byte.
-#ifdef GOARCH_ppc64le
-	ADD	$-1,R3,R11
-	ANDN	R3,R11,R11
-	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
-#else
-	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
-#endif
-	CMPU	R8,R7		// Check if we are at the last doubleword.
-	SRD	$3,R11		// Convert trailing zeros to bytes.
-	ADD	R11,R8,R3
-	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
-	BNE	return
-	BLE	CR7,return
-	BR	notfound
-
-small_string:
-	// process string of length < 32 bytes
-	// We unroll this loop for better performance.
-	CMPU	R4,$0		// Check for length=0
-	BEQ	notfound
-
-	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
-	CMPB	R12,R5,R3	// Check for a match.
-	AND	R9,R3,R3	// Mask bytes below s_base.
-	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
-	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
-	CMPU	R8,R7
-	BNE	CR7,done
-	BEQ	notfound	// Hit length.
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	CMPU	R8,R7
-	BNE	CR6,done
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	CMPU	R8,R7
-	BNE	CR6,done
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	CMPU	R8,R7
-	BNE	CR6,done
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	BNE	CR6,done
-	BR	notfound
-