1
0
mirror of https://github.com/golang/go synced 2024-11-23 01:40:03 -07:00

runtime: improve index on ppc64x/power10

Rewrite index asm function to use the new power10 instruction lxvl,
stxvl or the load, store vector with length which can specify the
number of bytes to be stored in a register. This avoids the need to
create a separator mask and extra AND instructions. It also allows
us to process the tail end of the string using a lot fewer instructions
as we can load bytes of separator length directly rather than loading
16 bytes and masking out bytes that are greater than separator length
On power9 and power8 the code remains unchanged.
The performance for smaller sizes improve the most, on larger sizes
we see minimal improvement.

name          old time/op    new time/op     delta
Index/10          10.6ns ± 3%     9.8ns ± 2%   -7.20%
Index/11          11.2ns ± 4%    10.6ns ± 0%   -5.99%
Index/12          12.7ns ± 3%    11.3ns ± 0%  -11.21%
Index/13          13.5ns ± 2%    11.7ns ± 0%  -13.11%
Index/14          14.1ns ± 1%    12.0ns ± 0%  -14.43%
Index/15          14.3ns ± 2%    12.4ns ± 0%  -13.39%
Index/16          14.5ns ± 1%    12.7ns ± 0%  -12.57%
Index/17          26.7ns ± 0%    25.9ns ± 0%   -2.99%
Index/18          27.3ns ± 0%    26.4ns ± 1%   -3.35%
Index/19          35.7ns ±16%    26.1ns ± 1%  -26.87%
Index/20          29.4ns ± 0%    27.3ns ± 1%   -7.06%
Index/21          29.3ns ± 0%    26.9ns ± 1%   -8.37%
Index/22          30.0ns ± 0%    27.4ns ± 0%   -8.68%
Index/23          29.9ns ± 0%    27.7ns ± 0%   -7.15%
Index/24          31.0ns ± 0%    28.0ns ± 0%   -9.92%
Index/25          31.7ns ± 0%    28.4ns ± 0%  -10.54%
Index/26          30.6ns ± 0%    28.9ns ± 1%   -5.67%
Index/27          31.4ns ± 0%    29.3ns ± 0%   -6.71%
Index/28          32.7ns ± 0%    29.6ns ± 1%   -9.36%
Index/29          33.3ns ± 0%    30.1ns ± 1%   -9.70%
Index/30          32.4ns ± 0%    30.7ns ± 0%   -5.23%
Index/31          33.2ns ± 0%    30.6ns ± 1%   -7.83%
Index/32          34.3ns ± 0%    30.9ns ± 0%   -9.94%
Index/64          46.8ns ± 0%    44.2ns ± 0%   -5.66%
Index/128         71.2ns ± 0%    67.3ns ± 0%   -5.43%
Index/256          129ns ± 0%     127ns ± 0%   -1.67%
Index/2K           838ns ± 0%     804ns ± 0%   -4.03%
Index/4K          1.65µs ± 0%    1.58µs ± 0%   -4.25%
Index/2M           829µs ± 0%     793µs ± 0%   -4.42%
Index/4M          1.65ms ± 0%    1.59ms ± 0%   -4.19%
Index/64M         26.5ms ± 0%    25.4ms ± 0%   -4.18%
IndexHard2         412µs ± 0%     396µs ± 0%   -3.76%
IndexEasy/10      10.0ns ± 0%     9.3ns ± 1%   -7.20%
IndexEasy/11      10.8ns ± 1%    11.0ns ± 1%   +2.22%
IndexEasy/12      12.3ns ± 2%    11.5ns ± 1%   -6.37%
IndexEasy/13      13.1ns ± 0%    11.7ns ± 2%  -10.83%
IndexEasy/14      13.8ns ± 2%    11.9ns ± 1%  -13.52%
IndexEasy/15      14.0ns ± 2%    12.4ns ± 2%  -11.46%
IndexEasy/16      14.3ns ± 1%    12.5ns ± 0%  -12.40%
CountHard2         415µs ± 0%     396µs ± 0%   -4.48%

Change-Id: Id3efa5ed9c662a29f58125c7f866a09f29a59b6c
Reviewed-on: https://go-review.googlesource.com/c/go/+/478918
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
This commit is contained in:
Archana R 2023-03-23 12:35:39 -05:00 committed by Lynn Boger
parent e875cbd9aa
commit 56c4422770

View File

@ -434,41 +434,49 @@ TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
CMP R6, $0 // Check sep len
BEQ notfound // sep len 0 -- not found
MOVD R3, R7 // Copy of string addr
#ifndef GOPPC64_power10
MOVD $16, R16 // Index value 16
MOVD $17, R17 // Index value 17
MOVD $18, R18 // Index value 18
MOVD $1, R19 // Index value 1
VSPLTISB $0xFF, ONES // splat all 1s
CMP R6, $16, CR4 // CR4 for len(sep) >= 16
VOR ONES, ONES, SEPMASK // Set up full SEPMASK
#else
SLD $56, R6, R14 // Set up separator length for LXVLL
#endif
MOVD $1, R19 // Index value 1
CMP R6, $16, CR4 // CR4 for len(sep) >= 16
BGE CR4, loadge16 // Load for len(sep) >= 16
#ifndef GOPPC64_power10
SUB R6, R16, R9 // 16-len of sep
SLD $3, R9 // Set up for VSLO
MTVSRD R9, V9 // Set up for VSLO
VSLDOI $8, V9, V9, V9 // Set up for VSLO
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
#endif
loadge16:
ANDCC $15, R5, R9 // Find byte offset of sep
ADD R9, R6, R10 // Add sep len
CMP R10, $16 // Check if sep len+offset > 16
BGT sepcross16 // Sep crosses 16 byte boundary
#ifdef GOPPC64_power10
LXVLL R5, R14, V0 // Load separator
#else
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0
SLD $3, R9 // Set up shift count for VSLO
MTVSRD R9, V8 // Set up shift count for VSLO
VSLDOI $8, V8, V8, V8
VSLO V0, V8, V0 // Shift by start byte
VAND V0, SEPMASK, V0 // Mask separator (< 16)
#endif
BR index2plus
sepcross16:
LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0
#ifdef GOPPC64_power10
LXVLL R5, R14, V0 // Load separator
#else
LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0\
VAND V0, SEPMASK, V0 // mask out separator
#endif
BLE CR4, index2to16
BR index17plus // Handle sep > 16
@ -659,9 +667,23 @@ index2to16:
VSPLTISB $0, V10 // Clear
BGT index2to16tail
#ifdef GOPPC64_power10
ADD $3,R7, R17 // Base+3
ADD $2,R7, R8 // Base+2
ADD $1,R7, R10 // Base+1
#else
MOVD $3, R17 // Number of bytes beyond 16
#endif
PCALIGN $32
index2to16loop:
#ifdef GOPPC64_power10
LXVLL R7, R14, V8 // Load next 16 bytes of string from Base
LXVLL R10, R14, V9 // Load next 16 bytes of string from Base+1
LXVLL R8, R14, V11 // Load next 16 bytes of string from Base+2
LXVLL R17,R14, V12 // Load next 16 bytes of string from Base+3
#else
LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7
LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3
@ -672,6 +694,7 @@ index2to16loop:
VAND V3, SEPMASK, V9 // Mask out sep size 1st index
VAND V4, SEPMASK, V11 // Mask out sep size 2nd index
VAND V5, SEPMASK, V12 // Mask out sep size 3rd index
#endif
VCMPEQUBCC V0, V8, V8 // compare masked string
BLT CR6, found // All equal while comparing 0th index
VCMPEQUBCC V0, V9, V9 // compare masked string
@ -682,14 +705,29 @@ index2to16loop:
BLT CR6, found4 // All equal while comparing 3rd index
ADD $4, R7 // Update ptr to next 4 bytes
#ifdef GOPPC64_power10
ADD $4, R17 // Update ptr to next 4 bytes
ADD $4, R8 // Update ptr to next 4 bytes
ADD $4, R10 // Update ptr to next 4 bytes
#endif
CMP R7, LASTSTR // Still less than last start byte
BGT notfound // Not found
ADD $19, R7, R9 // Verify remaining bytes
CMP R9, LASTBYTE // length of string at least 19
BLE index2to16loop // Try again, else do post processing and jump to index2to16next
PCALIGN $32
// <19 bytes left, post process the remaining string
index2to16tail:
#ifdef GOPPC64_power10
index2to16next_p10:
LXVLL R7,R14, V1 // Load 16 bytes @R7 into V1
VCMPEQUBCC V1, V0, V3 // Compare sep and partial string
BLT CR6, found // Found
ADD $1, R7 // Not found, try next partial string
CMP R7, LASTSTR // Check for end of string
BLE index2to16next_p10 // If at end, then not found
BR notfound // go to remainder loop
#else
ADD R3, R4, R9 // End of string
SUB R7, R9, R9 // Number of bytes left
ANDCC $15, R7, R10 // 16 byte offset
@ -748,6 +786,7 @@ index2to16next:
BGT notfound // If at end, then not found
VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte
BR index2to16next // Check the next partial string
#endif // Tail processing if GOPPC64!=power10
index17plus:
CMP R6, $32 // Check if 17 < len(sep) <= 32