internal/bytealg: add power9 version of bytes index

This adds a power9 version of the bytes.Index function for little endian. Here is the improvement on power9 for some of the Index benchmarks: Index/10 -0.14% Index/32 -3.19% Index/4K -12.66% Index/4M -13.34% Index/64M -13.17% Count/10 -0.59% Count/32 -2.88% Count/4K -12.63% Count/4M -13.35% Count/64M -13.17% IndexHard1 -23.03% IndexHard2 -13.01% IndexHard3 -22.12% IndexHard4 +0.16% CountHard1 -23.02% CountHard2 -13.01% CountHard3 -22.12% IndexPeriodic/IndexPeriodic2 -22.85% IndexPeriodic/IndexPeriodic4 -23.15% Change-Id: Id72353e2771eba2efbb1544d5f0be65f8a9f0433 Reviewed-on: https://go-review.googlesource.com/c/go/+/311380 Run-TryBot: Carlos Eduardo Seo <carlos.seo@linaro.org> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <carlos.seo@linaro.org> Trust: Lynn Boger <laboger@linux.vnet.ibm.com>
2024-11-26 17:26:56 -07:00 · 2021-04-19 15:37:54 -05:00 · 2021-04-19 15:37:54 -05:00 · 54af9fd9e6
commit 54af9fd9e6
parent 122fca49f9
3 changed files with 353 additions and 2 deletions
--- a/src/internal/bytealg/bytealg.go
+++ b/src/internal/bytealg/bytealg.go
@ -17,6 +17,8 @@ const (
 	offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
 	offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
 	offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
 )
 // MaxLen is the maximum length of the string to be searched for (argument b) in Index.
--- a/src/internal/bytealg/index_ppc64x.go
+++ b/src/internal/bytealg/index_ppc64x.go
@ -8,8 +8,12 @@
 package bytealg
 import "internal/cpu"
 const MaxBruteForce = 16
 var SupportsPower9 = cpu.PPC64.IsPOWER9
 func init() {
 	MaxLen = 32
 }
--- a/src/internal/bytealg/index_ppc64x.s
+++ b/src/internal/bytealg/index_ppc64x.s
@ -51,7 +51,16 @@ TEXT ·Index(SB), NOSPLIT|NOFRAME, $0-56
 	MOVD b_base+24(FP), R5 // R5 = separator pointer
 	MOVD b_len+32(FP), R6  // R6 = separator length
 	MOVD $ret+48(FP), R14  // R14 = &ret
-	BR   indexbody<>(SB)
+
 #ifdef GOARCH_ppc64le
 	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
 	CMP   R7, $1
 	BNE   power8
 	BR    indexbodyp9<>(SB)
 #endif
 power8:
 	BR indexbody<>(SB)
 TEXT ·IndexString(SB), NOSPLIT|NOFRAME, $0-40
 	MOVD a_base+0(FP), R3  // R3 = string
@ -59,7 +68,16 @@ TEXT ·IndexString(SB), NOSPLIT|NOFRAME, $0-40
 	MOVD b_base+16(FP), R5 // R5 = separator pointer
 	MOVD b_len+24(FP), R6  // R6 = separator length
 	MOVD $ret+32(FP), R14  // R14 = &ret
-	BR   indexbody<>(SB)
+
 #ifdef GOARCH_ppc64le
 	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
 	CMP   R7, $1
 	BNE   power8
 	BR    indexbody<>(SB)
 #endif
 power8:
 	BR indexbody<>(SB)
 	// s: string we are searching
 	// sep: string to search for
@ -420,3 +438,330 @@ found:
 	SUB  R3, R7    // Return byte where found
 	MOVD R7, (R14)
 	RET
 TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
 	CMP      R6, R4                // Compare lengths
 	BGT      notfound              // If sep len is > string, notfound
 	ADD      R4, R3, LASTBYTE      // find last byte addr
 	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
 	CMP      R6, $0                // Check sep len
 	BEQ      notfound              // sep len 0 -- not found
 	MOVD     R3, R7                // Copy of string addr
 	MOVD     $16, R16              // Index value 16
 	MOVD     $17, R17              // Index value 17
 	MOVD     $18, R18              // Index value 18
 	MOVD     $1, R19               // Index value 1
 	VSPLTISB $0xFF, ONES           // splat all 1s
 	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
 	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
 	BGE    CR4, loadge16       // Load for len(sep) >= 16
 	SUB    R6, R16, R9         // 16-len of sep
 	SLD    $3, R9              // Set up for VSLO
 	MTVSRD R9, V9_             // Set up for VSLO
 	VSLDOI $8, V9, V9, V9      // Set up for VSLO
 	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
 loadge16:
 	ANDCC $15, R5, R9 // Find byte offset of sep
 	ADD   R9, R6, R10 // Add sep len
 	CMP   R10, $16    // Check if sep len+offset > 16
 	BGE   sepcross16  // Sep crosses 16 byte boundary
 	RLDICR  $0, R5, $59, R8 // Adjust addr to 16 byte container
 	LXVB16X (R8)(R0), V0_   // Load 16 bytes @R8 into V0
 	SLD     $3, R9          // Set up shift count for VSLO
 	MTVSRD  R9, V8_         // Set up shift count for VSLO
 	VSLDOI  $8, V8, V8, V8
 	VSLO    V0, V8, V0      // Shift by start byte
 	VAND V0, SEPMASK, V0 // Mask separator (< 16)
 	BR   index2plus
 sepcross16:
 	LXVB16X (R5)(R0), V0_ // Load 16 bytes @R5 into V0
 	VAND V0, SEPMASK, V0 // mask out separator
 	BLE  CR4, index2to16
 	BR   index17plus     // Handle sep > 16
 index2plus:
 	CMP      R6, $2       // Check length of sep
 	BNE      index3plus   // If not 2, check for 3
 	ADD      $16, R7, R9  // Check if next 16 bytes past last
 	CMP      R9, LASTBYTE // compare with last
 	BGE      index2to16   // 2 <= len(string) <= 16
 	MOVD     $0xff00, R21 // Mask for later
 	MTVSRD   R21, V25     // Move to Vreg
 	VSPLTH   $3, V25, V31 // Splat mask
 	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
 	VSPLTISB $0, V10      // Clear V10
 	// First case: 2 byte separator
 	// V1: 2 byte separator splatted
 	// V2: 16 bytes at addr
 	// V4: 16 bytes at addr+1
 	// Compare 2 byte separator at start
 	// and at start+1. Use VSEL to combine
 	// those results to find the first
 	// matching start byte, returning
 	// that value when found. Loop as
 	// long as len(string) > 16
 index2loop2:
 	LXVB16X (R7)(R19), V3_ // Load 16 bytes @R7+1 into V3
 index2loop:
 	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7 into V2
 	VCMPEQUH V1, V2, V5      // Search for sep
 	VCMPEQUH V1, V3, V6      // Search for sep offset by 1
 	VSEL     V6, V5, V31, V7 // merge even and odd indices
 	VCLZD    V7, V18         // find index of first match
 	MFVSRD   V18, R25        // get first value
 	CMP      R25, $64        // Found if < 64
 	BLT      foundR25        // Return byte index where found
 	MFVSRLD V18, R25        // get second value
 	CMP     R25, $64        // Found if < 64
 	ADD     $64, R25        // Update byte offset
 	BLT     foundR25        // Return value
 	ADD     $16, R7         // R7+=16 Update string pointer
 	ADD     $17, R7, R9     // R9=F7+17 since loop unrolled
 	CMP     R9, LASTBYTE    // Compare addr+17 against last byte
 	BLT     index2loop2     // If < last, continue loop
 	CMP     R7, LASTBYTE    // Compare addr+16 against last byte
 	BLT     index2to16      // If < 16 handle specially
 	LXVB16X (R7)(R0), V3_   // Load 16 bytes @R7 into V3
 	VSLDOI  $1, V3, V10, V3 // Shift left by 1 byte
 	BR      index2loop
 index3plus:
 	CMP    R6, $3       // Check if sep == 3
 	BNE    index4plus   // If not check larger
 	ADD    $19, R7, R9  // Find bytes for use in this loop
 	CMP    R9, LASTBYTE // Compare against last byte
 	BGE    index2to16   // Remaining string 2<=len<=16
 	MOVD   $0xff00, R21 // Set up mask for upcoming loop
 	MTVSRD R21, V25     // Move mask to Vreg
 	VSPLTH $3, V25, V31 // Splat mask
 	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
 	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
 	// Loop to process 3 byte separator.
 	// string[0:16] is in V2
 	// string[2:18] is in V3
 	// sep[0:2] splatted in V1
 	// sec[3] splatted in v8
 	// Load vectors at string, string+1
 	// and string+2. Compare string, string+1
 	// against first 2 bytes of separator
 	// splatted, and string+2 against 3rd
 	// byte splatted. Merge the results with
 	// VSEL to find the first byte of a match.
 	// Special handling for last 16 bytes if the
 	// string fits in 16 byte multiple.
 index3loop2:
 	MOVD     $2, R21          // Set up index for 2
 	VSPLTISB $0, V10          // Clear V10
 	LXVB16X  (R7)(R21), V3_   // Load 16 bytes @R7+2 into V3
 	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
 index3loop:
 	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7
 	VSLDOI   $1, V2, V3, V4  // string[1:17]
 	VSLDOI   $2, V2, V3, V9  // string[2:18]
 	VCMPEQUH V1, V2, V5      // compare hw even indices
 	VCMPEQUH V1, V4, V6      // compare hw odd indices
 	VCMPEQUB V8, V9, V10     // compare 3rd to last byte
 	VSEL     V6, V5, V31, V7 // Find 1st matching byte using mask
 	VAND     V7, V10, V7     // AND matched bytes with matched 3rd byte
 	VCLZD    V7, V18         // Find first nonzero indexes
 	MFVSRD   V18, R25        // Move 1st doubleword
 	CMP      R25, $64        // If < 64 found
 	BLT      foundR25        // Return matching index
 	MFVSRLD  V18, R25     // Move 2nd doubleword
 	CMP      R25, $64     // If < 64 found
 	ADD      $64, R25     // Update byte index
 	BLT      foundR25     // Return matching index
 	ADD      $16, R7      // R7+=16 string ptr
 	ADD      $19, R7, R9  // Number of string bytes for loop
 	CMP      R9, LASTBYTE // Compare against last byte of string
 	BLT      index3loop2  // If within, continue this loop
 	CMP      R7, LASTSTR  // Compare against last start byte
 	BLT      index2to16   // Process remainder
 	VSPLTISB $0, V3       // Special case for last 16 bytes
 	BR       index3loop   // Continue this loop
 	// Loop to process 4 byte separator
 	// string[0:16] in V2
 	// string[3:16] in V3
 	// sep[0:4] splatted in V1
 	// Set up vectors with strings at offsets
 	// 0, 1, 2, 3 and compare against the 4 byte
 	// separator also splatted. Use VSEL with the
 	// compare results to find the first byte where
 	// a separator match is found.
 index4plus:
 	CMP  R6, $4       // Check if 4 byte separator
 	BNE  index5plus   // If not next higher
 	ADD  $20, R7, R9  // Check string size to load
 	CMP  R9, LASTBYTE // Verify string length
 	BGE  index2to16   // If not large enough, process remaining
 	MOVD $2, R15      // Set up index
 	// Set up masks for use with VSEL
 	MOVD    $0xff, R21 // Set up mask 0xff000000ff000000...
 	SLD     $24, R21
 	MTVSRWS R21, V29
 	VSLDOI  $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
 	MOVD    $0xffff, R21
 	SLD     $16, R21
 	MTVSRWS R21, V31
 	VSPLTW $0, V0, V1 // Splat 1st word of separator
 index4loop:
 	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
 next4:
 	VSPLTISB $0, V10            // Clear
 	MOVD     $3, R9             // Number of bytes beyond 16
 	LXVB16X  (R7)(R9), V3_      // Load 16 bytes @R7 into V2
 	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
 	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
 	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
 	VSLDOI   $3, V2, V3, V10    // V10=(V2:v3)<<3
 	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
 	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
 	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
 	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
 	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
 	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
 	VSEL     V14, V13, V31, V7  // final merge
 	VCLZD    V7, V18            // Find first index for each half
 	MFVSRD   V18, R25           // Isolate value
 	CMP      R25, $64           // If < 64, found
 	BLT      foundR25           // Return found index
 	MFVSRLD V18, R25     // Isolate other value
 	CMP     R25, $64     // If < 64, found
 	ADD     $64, R25     // Update index for high doubleword
 	BLT     foundR25     // Return found index
 	ADD     $16, R7      // R7+=16 for next string
 	ADD     $20, R7, R9  // R+20 for all bytes to load
 	CMP     R9, LASTBYTE // Past end? Maybe check for extra?
 	BLT     index4loop   // If not, continue loop
 	CMP     R7, LASTSTR  // Check remainder
 	BLE     index2to16   // Process remainder
 	BR      notfound     // Not found
 index5plus:
 	CMP R6, $16     // Check for sep > 16
 	BGT index17plus // Handle large sep
 	// Assumption is that the separator is smaller than the string at this point
 index2to16:
 	CMP R7, LASTSTR // Compare last start byte
 	BGT notfound    // last takes len(sep) into account
 	ADD $16, R7, R9    // Check for last byte of string
 	CMP R9, LASTBYTE
 	BGT index2to16tail
 	// At least 16 bytes of string left
 	// Mask the number of bytes in sep
 index2to16loop:
 	LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
 compare:
 	VAND       V1, SEPMASK, V2 // Mask out sep size
 	VCMPEQUBCC V0, V2, V3      // Compare masked string
 	BLT        CR6, found      // All equal
 	ADD        $1, R7          // Update ptr to next byte
 	CMP        R7, LASTSTR     // Still less than last start byte
 	BGT        notfound        // Not found
 	ADD        $16, R7, R9     // Verify remaining bytes
 	CMP        R9, LASTBYTE    // At least 16
 	BLT        index2to16loop  // Try again
 	// Less than 16 bytes remaining in string
 	// Separator >= 2
 index2to16tail:
 	ADD     R3, R4, R9     // End of string
 	SUB     R7, R9, R9     // Number of bytes left
 	ANDCC   $15, R7, R10   // 16 byte offset
 	ADD     R10, R9, R11   // offset + len
 	CMP     R11, $16       // >= 16?
 	BLE     short          // Does not cross 16 bytes
 	LXVB16X (R7)(R0), V1_  // Load 16 bytes @R7 into V1
 	BR      index2to16next // Continue on
 short:
 	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
 	LXVB16X  (R9)(R0), V1_   // Load 16 bytes @R9 into V1
 	SLD      $3, R10         // Set up shift
 	MTVSRD   R10, V8_        // Set up shift
 	VSLDOI   $8, V8, V8, V8
 	VSLO     V1, V8, V1      // Shift by start byte
 	VSPLTISB $0, V25         // Clear for later use
 index2to16next:
 	VAND       V1, SEPMASK, V2 // Just compare size of sep
 	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
 	BLT        CR6, found      // Found
 	ADD        $1, R7          // Not found, try next partial string
 	CMP        R7, LASTSTR     // Check for end of string
 	BGT        notfound        // If at end, then not found
 	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
 	BR         index2to16next  // Check the next partial string
 index17plus:
 	CMP      R6, $32       // Check if 17 < len(sep) <= 32
 	BGT      index33plus
 	SUB      $16, R6, R9   // Extra > 16
 	SLD      $56, R9, R10  // Shift to use in VSLO
 	MTVSRD   R10, V9_      // Set up for VSLO
 	LXVB16X  (R5)(R9), V1_ // Load 16 bytes @R5+R9 into V1
 	VSLO     V1, V9, V1    // Shift left
 	VSPLTISB $0xff, V7     // Splat 1s
 	VSPLTISB $0, V27       // Splat 0
 index17to32loop:
 	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
 next17:
 	LXVB16X    (R7)(R9), V3_   // Load 16 bytes @R7+R9 into V3
 	VSLO       V3, V9, V3      // Shift left
 	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
 	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
 	VAND       V4, V5, V6      // Check if both equal
 	VCMPEQUBCC V6, V7, V8      // All equal?
 	BLT        CR6, found      // Yes
 	ADD        $1, R7          // On to next byte
 	CMP        R7, LASTSTR     // Check if last start byte
 	BGT        notfound        // If too high, not found
 	BR         index17to32loop // Continue
 notfound:
 	MOVD $-1, R8   // Return -1 if not found
 	MOVD R8, (R14)
 	RET
 index33plus:
 	MOVD $0, (R0) // Case not implemented
 	RET           // Crash before return
 foundR25:
 	SRD  $3, R25   // Convert from bits to bytes
 	ADD  R25, R7   // Add to current string address
 	SUB  R3, R7    // Subtract from start of string
 	MOVD R7, (R14) // Return byte where found
 	RET
 found:
 	SUB  R3, R7    // Return byte where found
 	MOVD R7, (R14)
 	RET