mirror of
https://github.com/golang/go
synced 2024-11-23 04:50:06 -07:00
runtime: improve index on ppc64x/power10
Rewrite index asm function to use the new power10 instruction lxvl, stxvl or the load, store vector with length which can specify the number of bytes to be stored in a register. This avoids the need to create a separator mask and extra AND instructions. It also allows us to process the tail end of the string using a lot fewer instructions as we can load bytes of separator length directly rather than loading 16 bytes and masking out bytes that are greater than separator length On power9 and power8 the code remains unchanged. The performance for smaller sizes improve the most, on larger sizes we see minimal improvement. name old time/op new time/op delta Index/10 10.6ns ± 3% 9.8ns ± 2% -7.20% Index/11 11.2ns ± 4% 10.6ns ± 0% -5.99% Index/12 12.7ns ± 3% 11.3ns ± 0% -11.21% Index/13 13.5ns ± 2% 11.7ns ± 0% -13.11% Index/14 14.1ns ± 1% 12.0ns ± 0% -14.43% Index/15 14.3ns ± 2% 12.4ns ± 0% -13.39% Index/16 14.5ns ± 1% 12.7ns ± 0% -12.57% Index/17 26.7ns ± 0% 25.9ns ± 0% -2.99% Index/18 27.3ns ± 0% 26.4ns ± 1% -3.35% Index/19 35.7ns ±16% 26.1ns ± 1% -26.87% Index/20 29.4ns ± 0% 27.3ns ± 1% -7.06% Index/21 29.3ns ± 0% 26.9ns ± 1% -8.37% Index/22 30.0ns ± 0% 27.4ns ± 0% -8.68% Index/23 29.9ns ± 0% 27.7ns ± 0% -7.15% Index/24 31.0ns ± 0% 28.0ns ± 0% -9.92% Index/25 31.7ns ± 0% 28.4ns ± 0% -10.54% Index/26 30.6ns ± 0% 28.9ns ± 1% -5.67% Index/27 31.4ns ± 0% 29.3ns ± 0% -6.71% Index/28 32.7ns ± 0% 29.6ns ± 1% -9.36% Index/29 33.3ns ± 0% 30.1ns ± 1% -9.70% Index/30 32.4ns ± 0% 30.7ns ± 0% -5.23% Index/31 33.2ns ± 0% 30.6ns ± 1% -7.83% Index/32 34.3ns ± 0% 30.9ns ± 0% -9.94% Index/64 46.8ns ± 0% 44.2ns ± 0% -5.66% Index/128 71.2ns ± 0% 67.3ns ± 0% -5.43% Index/256 129ns ± 0% 127ns ± 0% -1.67% Index/2K 838ns ± 0% 804ns ± 0% -4.03% Index/4K 1.65µs ± 0% 1.58µs ± 0% -4.25% Index/2M 829µs ± 0% 793µs ± 0% -4.42% Index/4M 1.65ms ± 0% 1.59ms ± 0% -4.19% Index/64M 26.5ms ± 0% 25.4ms ± 0% -4.18% IndexHard2 412µs ± 0% 396µs ± 0% -3.76% IndexEasy/10 10.0ns ± 0% 9.3ns ± 1% -7.20% IndexEasy/11 10.8ns ± 1% 11.0ns ± 1% +2.22% IndexEasy/12 12.3ns ± 2% 11.5ns ± 1% -6.37% IndexEasy/13 13.1ns ± 0% 11.7ns ± 2% -10.83% IndexEasy/14 13.8ns ± 2% 11.9ns ± 1% -13.52% IndexEasy/15 14.0ns ± 2% 12.4ns ± 2% -11.46% IndexEasy/16 14.3ns ± 1% 12.5ns ± 0% -12.40% CountHard2 415µs ± 0% 396µs ± 0% -4.48% Change-Id: Id3efa5ed9c662a29f58125c7f866a09f29a59b6c Reviewed-on: https://go-review.googlesource.com/c/go/+/478918 Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Paul Murphy <murp@ibm.com> Run-TryBot: Archana Ravindar <aravind5@in.ibm.com> Reviewed-by: David Chase <drchase@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
This commit is contained in:
parent
e875cbd9aa
commit
56c4422770
@ -434,41 +434,49 @@ TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
|
|||||||
CMP R6, $0 // Check sep len
|
CMP R6, $0 // Check sep len
|
||||||
BEQ notfound // sep len 0 -- not found
|
BEQ notfound // sep len 0 -- not found
|
||||||
MOVD R3, R7 // Copy of string addr
|
MOVD R3, R7 // Copy of string addr
|
||||||
|
#ifndef GOPPC64_power10
|
||||||
MOVD $16, R16 // Index value 16
|
MOVD $16, R16 // Index value 16
|
||||||
MOVD $17, R17 // Index value 17
|
MOVD $17, R17 // Index value 17
|
||||||
MOVD $18, R18 // Index value 18
|
MOVD $18, R18 // Index value 18
|
||||||
MOVD $1, R19 // Index value 1
|
|
||||||
VSPLTISB $0xFF, ONES // splat all 1s
|
VSPLTISB $0xFF, ONES // splat all 1s
|
||||||
|
|
||||||
CMP R6, $16, CR4 // CR4 for len(sep) >= 16
|
|
||||||
VOR ONES, ONES, SEPMASK // Set up full SEPMASK
|
VOR ONES, ONES, SEPMASK // Set up full SEPMASK
|
||||||
|
#else
|
||||||
|
SLD $56, R6, R14 // Set up separator length for LXVLL
|
||||||
|
#endif
|
||||||
|
MOVD $1, R19 // Index value 1
|
||||||
|
CMP R6, $16, CR4 // CR4 for len(sep) >= 16
|
||||||
BGE CR4, loadge16 // Load for len(sep) >= 16
|
BGE CR4, loadge16 // Load for len(sep) >= 16
|
||||||
|
#ifndef GOPPC64_power10
|
||||||
SUB R6, R16, R9 // 16-len of sep
|
SUB R6, R16, R9 // 16-len of sep
|
||||||
SLD $3, R9 // Set up for VSLO
|
SLD $3, R9 // Set up for VSLO
|
||||||
MTVSRD R9, V9 // Set up for VSLO
|
MTVSRD R9, V9 // Set up for VSLO
|
||||||
VSLDOI $8, V9, V9, V9 // Set up for VSLO
|
VSLDOI $8, V9, V9, V9 // Set up for VSLO
|
||||||
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
|
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
|
||||||
|
#endif
|
||||||
loadge16:
|
loadge16:
|
||||||
ANDCC $15, R5, R9 // Find byte offset of sep
|
ANDCC $15, R5, R9 // Find byte offset of sep
|
||||||
ADD R9, R6, R10 // Add sep len
|
ADD R9, R6, R10 // Add sep len
|
||||||
CMP R10, $16 // Check if sep len+offset > 16
|
CMP R10, $16 // Check if sep len+offset > 16
|
||||||
BGT sepcross16 // Sep crosses 16 byte boundary
|
BGT sepcross16 // Sep crosses 16 byte boundary
|
||||||
|
#ifdef GOPPC64_power10
|
||||||
|
LXVLL R5, R14, V0 // Load separator
|
||||||
|
#else
|
||||||
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
|
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
|
||||||
LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0
|
LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0
|
||||||
SLD $3, R9 // Set up shift count for VSLO
|
SLD $3, R9 // Set up shift count for VSLO
|
||||||
MTVSRD R9, V8 // Set up shift count for VSLO
|
MTVSRD R9, V8 // Set up shift count for VSLO
|
||||||
VSLDOI $8, V8, V8, V8
|
VSLDOI $8, V8, V8, V8
|
||||||
VSLO V0, V8, V0 // Shift by start byte
|
VSLO V0, V8, V0 // Shift by start byte
|
||||||
|
|
||||||
VAND V0, SEPMASK, V0 // Mask separator (< 16)
|
VAND V0, SEPMASK, V0 // Mask separator (< 16)
|
||||||
BR index2plus
|
#endif
|
||||||
|
BR index2plus
|
||||||
sepcross16:
|
sepcross16:
|
||||||
LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0
|
#ifdef GOPPC64_power10
|
||||||
|
LXVLL R5, R14, V0 // Load separator
|
||||||
|
#else
|
||||||
|
LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0\
|
||||||
VAND V0, SEPMASK, V0 // mask out separator
|
VAND V0, SEPMASK, V0 // mask out separator
|
||||||
|
#endif
|
||||||
BLE CR4, index2to16
|
BLE CR4, index2to16
|
||||||
BR index17plus // Handle sep > 16
|
BR index17plus // Handle sep > 16
|
||||||
|
|
||||||
@ -659,9 +667,23 @@ index2to16:
|
|||||||
VSPLTISB $0, V10 // Clear
|
VSPLTISB $0, V10 // Clear
|
||||||
BGT index2to16tail
|
BGT index2to16tail
|
||||||
|
|
||||||
MOVD $3, R17 // Number of bytes beyond 16
|
#ifdef GOPPC64_power10
|
||||||
|
ADD $3,R7, R17 // Base+3
|
||||||
|
ADD $2,R7, R8 // Base+2
|
||||||
|
ADD $1,R7, R10 // Base+1
|
||||||
|
#else
|
||||||
|
MOVD $3, R17 // Number of bytes beyond 16
|
||||||
|
#endif
|
||||||
PCALIGN $32
|
PCALIGN $32
|
||||||
|
|
||||||
index2to16loop:
|
index2to16loop:
|
||||||
|
|
||||||
|
#ifdef GOPPC64_power10
|
||||||
|
LXVLL R7, R14, V8 // Load next 16 bytes of string from Base
|
||||||
|
LXVLL R10, R14, V9 // Load next 16 bytes of string from Base+1
|
||||||
|
LXVLL R8, R14, V11 // Load next 16 bytes of string from Base+2
|
||||||
|
LXVLL R17,R14, V12 // Load next 16 bytes of string from Base+3
|
||||||
|
#else
|
||||||
LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7
|
LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7
|
||||||
LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3
|
LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3
|
||||||
|
|
||||||
@ -672,6 +694,7 @@ index2to16loop:
|
|||||||
VAND V3, SEPMASK, V9 // Mask out sep size 1st index
|
VAND V3, SEPMASK, V9 // Mask out sep size 1st index
|
||||||
VAND V4, SEPMASK, V11 // Mask out sep size 2nd index
|
VAND V4, SEPMASK, V11 // Mask out sep size 2nd index
|
||||||
VAND V5, SEPMASK, V12 // Mask out sep size 3rd index
|
VAND V5, SEPMASK, V12 // Mask out sep size 3rd index
|
||||||
|
#endif
|
||||||
VCMPEQUBCC V0, V8, V8 // compare masked string
|
VCMPEQUBCC V0, V8, V8 // compare masked string
|
||||||
BLT CR6, found // All equal while comparing 0th index
|
BLT CR6, found // All equal while comparing 0th index
|
||||||
VCMPEQUBCC V0, V9, V9 // compare masked string
|
VCMPEQUBCC V0, V9, V9 // compare masked string
|
||||||
@ -682,14 +705,29 @@ index2to16loop:
|
|||||||
BLT CR6, found4 // All equal while comparing 3rd index
|
BLT CR6, found4 // All equal while comparing 3rd index
|
||||||
|
|
||||||
ADD $4, R7 // Update ptr to next 4 bytes
|
ADD $4, R7 // Update ptr to next 4 bytes
|
||||||
|
#ifdef GOPPC64_power10
|
||||||
|
ADD $4, R17 // Update ptr to next 4 bytes
|
||||||
|
ADD $4, R8 // Update ptr to next 4 bytes
|
||||||
|
ADD $4, R10 // Update ptr to next 4 bytes
|
||||||
|
#endif
|
||||||
CMP R7, LASTSTR // Still less than last start byte
|
CMP R7, LASTSTR // Still less than last start byte
|
||||||
BGT notfound // Not found
|
BGT notfound // Not found
|
||||||
ADD $19, R7, R9 // Verify remaining bytes
|
ADD $19, R7, R9 // Verify remaining bytes
|
||||||
CMP R9, LASTBYTE // length of string at least 19
|
CMP R9, LASTBYTE // length of string at least 19
|
||||||
BLE index2to16loop // Try again, else do post processing and jump to index2to16next
|
BLE index2to16loop // Try again, else do post processing and jump to index2to16next
|
||||||
|
PCALIGN $32
|
||||||
// <19 bytes left, post process the remaining string
|
// <19 bytes left, post process the remaining string
|
||||||
index2to16tail:
|
index2to16tail:
|
||||||
|
#ifdef GOPPC64_power10
|
||||||
|
index2to16next_p10:
|
||||||
|
LXVLL R7,R14, V1 // Load 16 bytes @R7 into V1
|
||||||
|
VCMPEQUBCC V1, V0, V3 // Compare sep and partial string
|
||||||
|
BLT CR6, found // Found
|
||||||
|
ADD $1, R7 // Not found, try next partial string
|
||||||
|
CMP R7, LASTSTR // Check for end of string
|
||||||
|
BLE index2to16next_p10 // If at end, then not found
|
||||||
|
BR notfound // go to remainder loop
|
||||||
|
#else
|
||||||
ADD R3, R4, R9 // End of string
|
ADD R3, R4, R9 // End of string
|
||||||
SUB R7, R9, R9 // Number of bytes left
|
SUB R7, R9, R9 // Number of bytes left
|
||||||
ANDCC $15, R7, R10 // 16 byte offset
|
ANDCC $15, R7, R10 // 16 byte offset
|
||||||
@ -748,6 +786,7 @@ index2to16next:
|
|||||||
BGT notfound // If at end, then not found
|
BGT notfound // If at end, then not found
|
||||||
VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte
|
VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte
|
||||||
BR index2to16next // Check the next partial string
|
BR index2to16next // Check the next partial string
|
||||||
|
#endif // Tail processing if GOPPC64!=power10
|
||||||
|
|
||||||
index17plus:
|
index17plus:
|
||||||
CMP R6, $32 // Check if 17 < len(sep) <= 32
|
CMP R6, $32 // Check if 17 < len(sep) <= 32
|
||||||
|
Loading…
Reference in New Issue
Block a user