1
0
mirror of https://github.com/golang/go synced 2024-11-17 00:14:50 -07:00

internal/bytealg: rewrite indexbytebody on PPC64

Use P8 instructions throughout to be backwards compatible, but
otherwise not impede performance. Use overlapping loads where
possible, and prioritize larger checks over smaller check.

However, some newer instructions can be used surgically when
targeting a newer GOPPC64. These can lead to noticeable
performance improvements with minimal impact to readability.

All tests run below on a Power10/ppc64le, and use a small
modification to BenchmarkIndexByte to ensure the IndexByte
wrapper call is inlined (as it likely is under realistic usage).
This wrapper adds substantial overhead if not inlined.

Previous (power9 path, GOPPC64=power8) vs. GOPPC64=power8:

IndexByte/1       3.81ns ± 8%     3.11ns ± 5%  -18.39%
IndexByte/2       3.82ns ± 3%     3.20ns ± 6%  -16.23%
IndexByte/3       3.61ns ± 4%     3.25ns ± 6%  -10.13%
IndexByte/4       3.66ns ± 5%     3.08ns ± 1%  -15.91%
IndexByte/5       3.82ns ± 0%     3.75ns ± 2%   -1.94%
IndexByte/6       3.83ns ± 0%     3.87ns ± 4%   +1.04%
IndexByte/7       3.83ns ± 0%     3.82ns ± 0%   -0.27%
IndexByte/8       3.82ns ± 0%     2.92ns ±11%  -23.70%
IndexByte/9       3.70ns ± 2%     3.08ns ± 2%  -16.87%
IndexByte/10      3.74ns ± 2%     3.04ns ± 0%  -18.75%
IndexByte/11      3.75ns ± 0%     3.31ns ± 8%  -11.79%
IndexByte/12      3.74ns ± 0%     3.04ns ± 0%  -18.86%
IndexByte/13      3.83ns ± 4%     3.04ns ± 0%  -20.64%
IndexByte/14      3.80ns ± 1%     3.30ns ± 8%  -13.18%
IndexByte/15      3.77ns ± 1%     3.04ns ± 0%  -19.33%
IndexByte/16      3.81ns ± 0%     2.78ns ± 7%  -26.88%
IndexByte/17      4.12ns ± 0%     3.04ns ± 1%  -26.11%
IndexByte/18      4.27ns ± 6%     3.05ns ± 0%  -28.64%
IndexByte/19      4.30ns ± 4%     3.02ns ± 2%  -29.65%
IndexByte/20      4.43ns ± 7%     3.45ns ± 7%  -22.15%
IndexByte/21      4.12ns ± 0%     3.03ns ± 1%  -26.35%
IndexByte/22      4.40ns ± 6%     3.05ns ± 0%  -30.82%
IndexByte/23      4.40ns ± 6%     3.01ns ± 2%  -31.48%
IndexByte/24      4.32ns ± 5%     3.07ns ± 0%  -28.98%
IndexByte/25      4.76ns ± 2%     3.04ns ± 1%  -36.11%
IndexByte/26      4.82ns ± 0%     3.05ns ± 0%  -36.66%
IndexByte/27      4.82ns ± 0%     2.97ns ± 3%  -38.39%
IndexByte/28      4.82ns ± 0%     2.96ns ± 3%  -38.57%
IndexByte/29      4.82ns ± 0%     3.34ns ± 9%  -30.71%
IndexByte/30      4.82ns ± 0%     3.05ns ± 0%  -36.77%
IndexByte/31      4.81ns ± 0%     3.05ns ± 0%  -36.70%
IndexByte/32      3.52ns ± 0%     3.44ns ± 1%   -2.15%
IndexByte/33      4.77ns ± 1%     3.35ns ± 0%  -29.81%
IndexByte/34      5.01ns ± 5%     3.35ns ± 0%  -33.15%
IndexByte/35      4.92ns ± 9%     3.35ns ± 0%  -31.89%
IndexByte/36      4.81ns ± 5%     3.35ns ± 0%  -30.37%
IndexByte/37      4.99ns ± 6%     3.35ns ± 0%  -32.86%
IndexByte/38      5.06ns ± 5%     3.35ns ± 0%  -33.84%
IndexByte/39      5.02ns ± 5%     3.48ns ± 9%  -30.58%
IndexByte/40      5.21ns ± 9%     3.55ns ± 4%  -31.82%
IndexByte/41      5.18ns ± 0%     3.42ns ± 2%  -33.98%
IndexByte/42      5.19ns ± 0%     3.55ns ±11%  -31.56%
IndexByte/43      5.18ns ± 0%     3.45ns ± 5%  -33.46%
IndexByte/44      5.18ns ± 0%     3.39ns ± 0%  -34.56%
IndexByte/45      5.18ns ± 0%     3.43ns ± 4%  -33.74%
IndexByte/46      5.18ns ± 0%     3.47ns ± 1%  -33.03%
IndexByte/47      5.18ns ± 0%     3.44ns ± 2%  -33.54%
IndexByte/48      5.18ns ± 0%     3.39ns ± 0%  -34.52%
IndexByte/49      5.69ns ± 0%     3.79ns ± 0%  -33.45%
IndexByte/50      5.70ns ± 0%     3.70ns ± 3%  -34.98%
IndexByte/51      5.70ns ± 0%     3.70ns ± 2%  -35.05%
IndexByte/52      5.69ns ± 0%     3.80ns ± 1%  -33.35%
IndexByte/53      5.69ns ± 0%     3.78ns ± 0%  -33.54%
IndexByte/54      5.69ns ± 0%     3.78ns ± 1%  -33.51%
IndexByte/55      5.69ns ± 0%     3.78ns ± 0%  -33.61%
IndexByte/56      5.69ns ± 0%     3.81ns ± 3%  -33.12%
IndexByte/57      6.20ns ± 0%     3.79ns ± 4%  -38.89%
IndexByte/58      6.20ns ± 0%     3.74ns ± 2%  -39.58%
IndexByte/59      6.20ns ± 0%     3.69ns ± 2%  -40.47%
IndexByte/60      6.20ns ± 0%     3.79ns ± 1%  -38.81%
IndexByte/61      6.20ns ± 0%     3.77ns ± 1%  -39.23%
IndexByte/62      6.20ns ± 0%     3.79ns ± 0%  -38.89%
IndexByte/63      6.20ns ± 0%     3.79ns ± 0%  -38.90%
IndexByte/64      4.17ns ± 0%     3.47ns ± 3%  -16.70%
IndexByte/65      5.38ns ± 0%     4.21ns ± 0%  -21.59%
IndexByte/66      5.38ns ± 0%     4.21ns ± 0%  -21.58%
IndexByte/67      5.38ns ± 0%     4.22ns ± 0%  -21.58%
IndexByte/68      5.38ns ± 0%     4.22ns ± 0%  -21.59%
IndexByte/69      5.38ns ± 0%     4.22ns ± 0%  -21.56%
IndexByte/70      5.38ns ± 0%     4.21ns ± 0%  -21.59%
IndexByte/71      5.37ns ± 0%     4.21ns ± 0%  -21.51%
IndexByte/72      5.37ns ± 0%     4.22ns ± 0%  -21.46%
IndexByte/73      5.71ns ± 0%     4.22ns ± 0%  -26.20%
IndexByte/74      5.71ns ± 0%     4.21ns ± 0%  -26.21%
IndexByte/75      5.71ns ± 0%     4.21ns ± 0%  -26.17%
IndexByte/76      5.71ns ± 0%     4.22ns ± 0%  -26.22%
IndexByte/77      5.71ns ± 0%     4.22ns ± 0%  -26.22%
IndexByte/78      5.71ns ± 0%     4.21ns ± 0%  -26.22%
IndexByte/79      5.71ns ± 0%     4.22ns ± 0%  -26.21%
IndexByte/80      5.71ns ± 0%     4.21ns ± 0%  -26.19%
IndexByte/81      6.20ns ± 0%     4.39ns ± 0%  -29.13%
IndexByte/82      6.20ns ± 0%     4.36ns ± 0%  -29.67%
IndexByte/83      6.20ns ± 0%     4.36ns ± 0%  -29.63%
IndexByte/84      6.20ns ± 0%     4.39ns ± 0%  -29.21%
IndexByte/85      6.20ns ± 0%     4.36ns ± 0%  -29.64%
IndexByte/86      6.20ns ± 0%     4.36ns ± 0%  -29.63%
IndexByte/87      6.20ns ± 0%     4.39ns ± 0%  -29.21%
IndexByte/88      6.20ns ± 0%     4.36ns ± 0%  -29.65%
IndexByte/89      6.74ns ± 0%     4.36ns ± 0%  -35.33%
IndexByte/90      6.75ns ± 0%     4.37ns ± 0%  -35.22%
IndexByte/91      6.74ns ± 0%     4.36ns ± 0%  -35.30%
IndexByte/92      6.74ns ± 0%     4.36ns ± 0%  -35.34%
IndexByte/93      6.74ns ± 0%     4.37ns ± 0%  -35.20%
IndexByte/94      6.74ns ± 0%     4.36ns ± 0%  -35.33%
IndexByte/95      6.75ns ± 0%     4.36ns ± 0%  -35.32%
IndexByte/96      4.83ns ± 0%     4.34ns ± 2%  -10.24%
IndexByte/97      5.91ns ± 0%     4.65ns ± 0%  -21.24%
IndexByte/98      5.91ns ± 0%     4.65ns ± 0%  -21.24%
IndexByte/99      5.91ns ± 0%     4.65ns ± 0%  -21.23%
IndexByte/100     5.90ns ± 0%     4.65ns ± 0%  -21.21%
IndexByte/101     5.90ns ± 0%     4.65ns ± 0%  -21.22%
IndexByte/102     5.90ns ± 0%     4.65ns ± 0%  -21.23%
IndexByte/103     5.91ns ± 0%     4.65ns ± 0%  -21.23%
IndexByte/104     5.91ns ± 0%     4.65ns ± 0%  -21.24%
IndexByte/105     6.25ns ± 0%     4.65ns ± 0%  -25.59%
IndexByte/106     6.25ns ± 0%     4.65ns ± 0%  -25.59%
IndexByte/107     6.25ns ± 0%     4.65ns ± 0%  -25.60%
IndexByte/108     6.25ns ± 0%     4.65ns ± 0%  -25.58%
IndexByte/109     6.24ns ± 0%     4.65ns ± 0%  -25.50%
IndexByte/110     6.25ns ± 0%     4.65ns ± 0%  -25.56%
IndexByte/111     6.25ns ± 0%     4.65ns ± 0%  -25.60%
IndexByte/112     6.25ns ± 0%     4.65ns ± 0%  -25.59%
IndexByte/113     6.76ns ± 0%     5.05ns ± 0%  -25.37%
IndexByte/114     6.76ns ± 0%     5.05ns ± 0%  -25.31%
IndexByte/115     6.76ns ± 0%     5.05ns ± 0%  -25.38%
IndexByte/116     6.76ns ± 0%     5.05ns ± 0%  -25.31%
IndexByte/117     6.76ns ± 0%     5.05ns ± 0%  -25.38%
IndexByte/118     6.76ns ± 0%     5.05ns ± 0%  -25.31%
IndexByte/119     6.76ns ± 0%     5.05ns ± 0%  -25.38%
IndexByte/120     6.76ns ± 0%     5.05ns ± 0%  -25.36%
IndexByte/121     7.35ns ± 0%     5.05ns ± 0%  -31.33%
IndexByte/122     7.36ns ± 0%     5.05ns ± 0%  -31.42%
IndexByte/123     7.38ns ± 0%     5.05ns ± 0%  -31.60%
IndexByte/124     7.38ns ± 0%     5.05ns ± 0%  -31.59%
IndexByte/125     7.38ns ± 0%     5.05ns ± 0%  -31.60%
IndexByte/126     7.38ns ± 0%     5.05ns ± 0%  -31.58%
IndexByte/128     5.28ns ± 0%     5.10ns ± 0%   -3.41%
IndexByte/256     7.27ns ± 0%     7.28ns ± 2%   +0.13%
IndexByte/512     12.1ns ± 0%     11.8ns ± 0%   -2.51%
IndexByte/1K      23.1ns ± 3%     22.0ns ± 0%   -4.66%
IndexByte/2K      42.6ns ± 0%     42.4ns ± 0%   -0.41%
IndexByte/4K      90.3ns ± 0%     89.4ns ± 0%   -0.98%
IndexByte/8K       170ns ± 0%      170ns ± 0%   -0.59%
IndexByte/16K      331ns ± 0%      330ns ± 0%   -0.27%
IndexByte/32K      660ns ± 0%      660ns ± 0%   -0.08%
IndexByte/64K     1.30µs ± 0%     1.30µs ± 0%   -0.08%
IndexByte/128K    2.58µs ± 0%     2.58µs ± 0%   -0.04%
IndexByte/256K    5.15µs ± 0%     5.15µs ± 0%   -0.04%
IndexByte/512K    10.3µs ± 0%     10.3µs ± 0%   -0.03%
IndexByte/1M      20.6µs ± 0%     20.5µs ± 0%   -0.03%
IndexByte/2M      41.1µs ± 0%     41.1µs ± 0%   -0.03%
IndexByte/4M      82.2µs ± 0%     82.1µs ± 0%   -0.02%
IndexByte/8M       164µs ± 0%      164µs ± 0%   -0.01%
IndexByte/16M      328µs ± 0%      328µs ± 0%   -0.01%
IndexByte/32M      657µs ± 0%      657µs ± 0%   -0.00%

GOPPC64=power8 vs GOPPC64=power9. The Improvement is
most noticed between 16 and 64B, and goes away around
128B.

IndexByte/16      2.78ns ± 7%     2.65ns ±15%   -4.74%
IndexByte/17      3.04ns ± 1%     2.80ns ± 3%   -7.85%
IndexByte/18      3.05ns ± 0%     2.71ns ± 4%  -11.00%
IndexByte/19      3.02ns ± 2%     2.76ns ±10%   -8.74%
IndexByte/20      3.45ns ± 7%     2.91ns ± 0%  -15.46%
IndexByte/21      3.03ns ± 1%     2.84ns ± 9%   -6.33%
IndexByte/22      3.05ns ± 0%     2.67ns ± 1%  -12.38%
IndexByte/23      3.01ns ± 2%     2.67ns ± 1%  -11.24%
IndexByte/24      3.07ns ± 0%     2.92ns ±12%   -4.79%
IndexByte/25      3.04ns ± 1%     3.15ns ±15%   +3.63%
IndexByte/26      3.05ns ± 0%     2.83ns ±13%   -7.33%
IndexByte/27      2.97ns ± 3%     2.98ns ±10%   +0.56%
IndexByte/28      2.96ns ± 3%     2.96ns ± 9%   -0.05%
IndexByte/29      3.34ns ± 9%     3.03ns ±12%   -9.33%
IndexByte/30      3.05ns ± 0%     2.68ns ± 1%  -12.05%
IndexByte/31      3.05ns ± 0%     2.83ns ±12%   -7.27%
IndexByte/32      3.44ns ± 1%     3.21ns ±10%   -6.78%
IndexByte/33      3.35ns ± 0%     3.41ns ± 2%   +1.95%
IndexByte/34      3.35ns ± 0%     3.13ns ± 0%   -6.53%
IndexByte/35      3.35ns ± 0%     3.13ns ± 0%   -6.54%
IndexByte/36      3.35ns ± 0%     3.13ns ± 0%   -6.52%
IndexByte/37      3.35ns ± 0%     3.13ns ± 0%   -6.52%
IndexByte/38      3.35ns ± 0%     3.24ns ± 4%   -3.30%
IndexByte/39      3.48ns ± 9%     3.44ns ± 2%   -1.19%
IndexByte/40      3.55ns ± 4%     3.46ns ± 2%   -2.44%
IndexByte/41      3.42ns ± 2%     3.39ns ± 4%   -0.86%
IndexByte/42      3.55ns ±11%     3.46ns ± 1%   -2.65%
IndexByte/43      3.45ns ± 5%     3.44ns ± 2%   -0.31%
IndexByte/44      3.39ns ± 0%     3.43ns ± 3%   +1.23%
IndexByte/45      3.43ns ± 4%     3.50ns ± 1%   +2.07%
IndexByte/46      3.47ns ± 1%     3.46ns ± 2%   -0.31%
IndexByte/47      3.44ns ± 2%     3.47ns ± 1%   +0.78%
IndexByte/48      3.39ns ± 0%     3.46ns ± 2%   +1.96%
IndexByte/49      3.79ns ± 0%     3.47ns ± 0%   -8.41%
IndexByte/50      3.70ns ± 3%     3.64ns ± 5%   -1.66%
IndexByte/51      3.70ns ± 2%     3.75ns ± 0%   +1.40%
IndexByte/52      3.80ns ± 1%     3.77ns ± 0%   -0.70%
IndexByte/53      3.78ns ± 0%     3.77ns ± 0%   -0.46%
IndexByte/54      3.78ns ± 1%     3.53ns ± 7%   -6.74%
IndexByte/55      3.78ns ± 0%     3.47ns ± 0%   -8.17%
IndexByte/56      3.81ns ± 3%     3.45ns ± 0%   -9.43%
IndexByte/57      3.79ns ± 4%     3.47ns ± 0%   -8.45%
IndexByte/58      3.74ns ± 2%     3.55ns ± 4%   -5.16%
IndexByte/59      3.69ns ± 2%     3.61ns ± 4%   -2.01%
IndexByte/60      3.79ns ± 1%     3.45ns ± 0%   -9.09%
IndexByte/61      3.77ns ± 1%     3.47ns ± 0%   -7.93%
IndexByte/62      3.79ns ± 0%     3.45ns ± 0%   -8.97%
IndexByte/63      3.79ns ± 0%     3.47ns ± 0%   -8.44%
IndexByte/64      3.47ns ± 3%     3.18ns ± 0%   -8.41%

GOPPC64=power9 vs GOPPC64=power10. Only sizes <16 will
show meaningful changes.

IndexByte/1       3.27ns ± 8%     2.36ns ± 2%  -27.58%
IndexByte/2       3.06ns ± 4%     2.34ns ± 1%  -23.42%
IndexByte/3       3.77ns ±11%     2.48ns ± 7%  -34.03%
IndexByte/4       3.18ns ± 8%     2.33ns ± 1%  -26.69%
IndexByte/5       3.18ns ± 5%     2.34ns ± 4%  -26.26%
IndexByte/6       3.13ns ± 3%     2.35ns ± 1%  -24.97%
IndexByte/7       3.25ns ± 1%     2.33ns ± 1%  -28.22%
IndexByte/8       2.79ns ± 2%     2.36ns ± 1%  -15.32%
IndexByte/9       2.90ns ± 0%     2.34ns ± 2%  -19.36%
IndexByte/10      2.99ns ± 3%     2.31ns ± 1%  -22.70%
IndexByte/11      3.13ns ± 7%     2.31ns ± 0%  -26.08%
IndexByte/12      3.01ns ± 4%     2.32ns ± 1%  -22.91%
IndexByte/13      2.98ns ± 3%     2.31ns ± 1%  -22.72%
IndexByte/14      2.92ns ± 2%     2.61ns ±16%  -10.58%
IndexByte/15      3.02ns ± 5%     2.69ns ± 7%  -10.90%
IndexByte/16      2.65ns ±15%     2.29ns ± 1%  -13.61%

Change-Id: I4482f762d25eabf60def4981a0b2bc0c10ccf50c
Reviewed-on: https://go-review.googlesource.com/c/go/+/478656
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Archana Ravindar <aravind5@in.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Paul E. Murphy 2023-03-06 16:51:31 -06:00 committed by Paul Murphy
parent ccb8db88c5
commit db32aba508

View File

@ -11,381 +11,304 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// R3 = byte array pointer
// R4 = length
MOVD R6, R5 // R5 = byte
MOVBZ internalcpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
BR indexbytebody<>(SB)
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
// R3 = string
// R4 = length
// R5 = byte
MOVBZ internalcpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
BR indexbytebody<>(SB)
#ifndef GOPPC64_power9
#ifdef GOARCH_ppc64le
DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
#else
DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
#endif
GLOBL indexbytevbperm<>+0(SB), RODATA, $16
#endif
// Some operations are endian specific, choose the correct opcode base on GOARCH.
// Note, _VCZBEBB is only available on power9 and newer.
#ifdef GOARCH_ppc64le
#define _LDBEX MOVDBR
#define _LWBEX MOVWBR
#define _LHBEX MOVHBR
#define _VCZBEBB VCTZLSBB
#else
#define _LDBEX MOVD
#define _LWBEX MOVW
#define _LHBEX MOVH
#define _VCZBEBB VCLZLSBB
#endif
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// R16 = 1 if running on a POWER9 system, 0 otherwise
// On exit:
// R3 = return value
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R17 // Save base address for calculating the index later.
RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
ADD R4,R3,R7 // Last acceptable address in R7.
CMPU R4,$32
RLDIMI $16,R5,$32,R5
CMPU R4,$32 // Check if it's a small string (32 bytes). Those will be processed differently.
MOVD $-1,R9
RLWNM $3,R3,$26,$28,R6 // shift amount for mask (r3&0x7)*8
RLDIMI $32,R5,$0,R5
MOVD R7,R10 // Save last acceptable address in R10 for later.
ADD $-1,R7,R7
#ifdef GOARCH_ppc64le
SLD R6,R9,R9 // Prepare mask for Little Endian
#else
SRD R6,R9,R9 // Same for Big Endian
#ifndef GOPPC64_power9
// Load VBPERMQ constant to reduce compare into an ordered bit mask.
MOVD $indexbytevbperm<>+00(SB),R16
LXVD2X (R16),V0 // Set up swap string
#endif
BLT small_string // Jump to the small string case if it's <32 bytes.
CMP R16,$1 // optimize for power8 v power9
BNE power8
VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
MTVRD R5,V1
LVSL (R0+R0),V11 // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
VSLB V11,V10,V10 // to extract the first bit of match result into GPR
VSPLTB $7,V1,V1 // Replicate byte across V1
CMP R4,$64
BLT cmp16 // Jump to the small string case if it's <32 bytes.
CMP R4,$64,CR1
MOVD $16,R11
MOVD R3,R8
BLT cmp32
BLT CR1,cmp32 // Special case for length 32 - 63
MOVD $32,R12
MOVD $48,R6
RLDICR $0,R4,$63-6,R9 // R9 = len &^ 63
ADD R3,R9,R9 // R9 = &s[len &^ 63]
ANDCC $63,R4 // (len &= 63) cmp 0.
PCALIGN $16
loop64:
LXVB16X (R0)(R8),V2 // scan 64 bytes at a time
LXVD2X (R0)(R8),V2 // Scan 64 bytes at a time, starting at &s[0]
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0 // match found at R8, jump out
BNE CR6,foundat0 // Match found at R8, jump out
LXVB16X (R8)(R11),V2
LXVD2X (R11)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat1 // match found at R8+16 bytes, jump out
BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
LXVB16X (R8)(R12),V2
LXVD2X (R12)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat2 // match found at R8+32 bytes, jump out
BNE CR6,foundat2 // Match found at R8+32 bytes, jump out
LXVB16X (R8)(R6),V2
LXVD2X (R6)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat3 // match found at R8+48 bytes, jump out
BNE CR6,foundat3 // Match found at R8+48 bytes, jump out
ADD $64,R8
ADD $-64,R4
CMP R4,$64 // >=64 bytes left to scan?
BGE loop64
CMP R4,$32
BLT rem // jump to rem if there are < 32 bytes left
cmp32:
LXVB16X (R0)(R8),V2 // 32-63 bytes left
CMPU R8,R9,CR1
BNE CR1,loop64 // R8 != &s[len &^ 63]?
PCALIGN $32
BEQ notfound // Is tail length 0? CR0 is set before entering loop64.
CMP R4,$32 // Tail length >= 32, use cmp32 path.
CMP R4,$16,CR1
BGE cmp32
ADD R8,R4,R9
ADD $-16,R9
BLE CR1,cmp64_tail_gt0
cmp64_tail_gt16: // Tail length 17 - 32
LXVD2X (R0)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0 // match found at R8
BNE CR6,foundat0
LXVB16X (R11)(R8),V2
cmp64_tail_gt0: // Tail length 1 - 16
MOVD R9,R8
LXVD2X (R0)(R9),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat1 // match found at R8+16
BNE CR6,foundat0
ADD $32,R8
ADD $-32,R4
rem:
RLDICR $0,R8,$60,R8 // align address to reuse code for tail end processing
BR small_string
BR notfound
cmp32: // Length 32 - 63
// Bytes 0 - 15
LXVD2X (R0)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0
// Bytes 16 - 31
LXVD2X (R8)(R11),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
BEQ notfound // Is length <= 32? (CR0 holds this comparison on entry to cmp32)
CMP R4,$48
ADD R4,R8,R9 // Compute &s[len(s)-16]
ADD $32,R8,R8
ADD $-16,R9,R9
ISEL CR0GT,R8,R9,R8 // R8 = len(s) <= 48 ? R9 : R8
// Bytes 33 - 47
LXVD2X (R0)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0 // match found at R8+32 bytes, jump out
BLE notfound
// Bytes 48 - 63
MOVD R9,R8 // R9 holds the final check.
LXVD2X (R0)(R9),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0 // Match found at R8+48 bytes, jump out
BR notfound
// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
#ifndef GOPPC64_power9
#define ADJUST_FOR_CNTLZW -16
#else
#define ADJUST_FOR_CNTLZW 0
#endif
// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
foundat3:
ADD $16,R8
SUB R3,R8,R3
ADD $48+ADJUST_FOR_CNTLZW,R3
BR vfound
foundat2:
ADD $16,R8
SUB R3,R8,R3
ADD $32+ADJUST_FOR_CNTLZW,R3
BR vfound
foundat1:
ADD $16,R8
SUB R3,R8,R3
ADD $16+ADJUST_FOR_CNTLZW,R3
BR vfound
foundat0:
// Compress the result into a single doubleword and
// move it to a GPR for the final calculation.
VBPERMQ V6,V10,V6
MFVRD V6,R3
// count leading zeroes upto the match that ends up in low 16 bits
// in both endian modes, compute index by subtracting the number by 16
CNTLZW R3,R11
ADD $-16,R11
ADD R8,R11,R3 // Calculate byte address
SUB R17,R3
SUB R3,R8,R3
ADD $0+ADJUST_FOR_CNTLZW,R3
vfound:
// Map equal values into a 16 bit value with earlier matches setting higher bits.
#ifndef GOPPC64_power9
VBPERMQ V6,V0,V6
MFVRD V6,R4
CNTLZW R4,R4
#else
#ifdef GOARCH_ppc64le
// Put the value back into LE ordering by swapping doublewords.
XXPERMDI V6,V6,$2,V6
#endif
_VCZBEBB V6,R4
#endif
ADD R3,R4,R3
RET
power8:
// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
// in V0, V1 and V10, then branch to the preloop.
ANDCC $63,R3,R11
BEQ CR0,qw_align
RLDICL $0,R3,$61,R11
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base
RLDICR $0,R7,$60,R7 // Last doubleword in R7
CMPU R3,$0,CR7 // If we have a match, jump to the final computation
BNE CR7,done
ADD $8,R8,R8
cmp16: // Length 16 - 31
CMPU R4,$16
ADD R4,R3,R9
BLT cmp8
ADD $-16,R9,R9 // &s[len(s)-16]
// Bytes 0 - 15
LXVD2X (R0)(R3),V2
VCMPEQUBCC V2,V1,V6
MOVD R3,R8
BNE CR6,foundat0 // Match found at R8+32 bytes, jump out
BEQ notfound
// Bytes 16 - 30
MOVD R9,R8 // R9 holds the final check.
LXVD2X (R0)(R9),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0 // Match found at R8+48 bytes, jump out
BR notfound
cmp8: // Length 8 - 15
#ifdef GOPPC64_power10
// Load all the bytes into a single VSR in BE order.
SLD $56,R4,R5
LXVLL R3,R5,V2
// Compare and count the number which don't match.
VCMPEQUB V2,V1,V6
VCLZLSBB V6,R3
// If count is the number of bytes, or more. No matches are found.
CMPU R3,R4
MOVD $-1,R5
// Otherwise, the count is the index of the first match.
ISEL CR0LT,R3,R5,R3
RET
#else
RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
RLDIMI $16,R5,$32,R5
RLDIMI $32,R5,$0,R5
CMPU R4,$8
BLT cmp4
MOVD $-8,R11
ADD $-8,R4,R4
_LDBEX (R0)(R3),R10
_LDBEX (R11)(R9),R11
CMPB R10,R5,R10
CMPB R11,R5,R11
CMPU R10,$0
CMPU R11,$0,CR1
CNTLZD R10,R10
CNTLZD R11,R11
SRD $3,R10,R3
SRD $3,R11,R11
BNE found
ADD R4,R11,R4
// Check for quadword alignment
ANDCC $15,R8,R11
BEQ CR0,qw_align
// Not aligned, so handle the next doubleword
MOVD 0(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR7
BNE CR7,done
ADD $8,R8,R8
ADD $-8,R4,R4
// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:
// Set up auxiliary data for the vectorized algorithm.
VSPLTISB $0,V0 // Replicate 0 across V0
VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
MTVRD R5,V1
LVSL (R0+R0),V11
VSLB V11,V10,V10
VSPLTB $7,V1,V1 // Replicate byte across V1
CMPU R4, $64 // If len 64, don't use the vectorized loop
BLE tail
// We will load 4 quardwords per iteration in the loop, so check for
// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
ANDCC $63,R8,R11
BEQ CR0,preloop
// Not 64-byte aligned. Load one quadword at a time until aligned.
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $16,R8,R8
ADD $-16,R4,R4
ANDCC $63,R8,R11
BEQ CR0,preloop
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $16,R8,R8
ADD $-16,R4,R4
ANDCC $63,R8,R11
BEQ CR0,preloop
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $-16,R4,R4
ADD $16,R8,R8
// 64-byte aligned. Prepare for the main loop.
preloop:
CMPU R4,$64
BLE tail // If len 64, don't use the vectorized loop
// We are now aligned to a 64-byte boundary. We will load 4 quadwords
// per loop iteration. The last doubleword is in R10, so our loop counter
// starts at (R10-R8)/64.
SUB R8,R10,R6
SRD $6,R6,R9 // Loop counter in R9
MOVD R9,CTR
ADD $-64,R8,R8 // Adjust index for loop entry
MOVD $16,R11 // Load offsets for the vector loads
MOVD $32,R9
MOVD $48,R7
// Main loop we will load 64 bytes per iteration
loop:
ADD $64,R8,R8 // Fuse addi+lvx for performance
LVX (R8+R0),V2 // Load 4 16-byte vectors
LVX (R8+R11),V3
VCMPEQUB V1,V2,V6 // Look for byte in each vector
VCMPEQUB V1,V3,V7
LVX (R8+R9),V4
LVX (R8+R7),V5
VCMPEQUB V1,V4,V8
VCMPEQUB V1,V5,V9
VOR V6,V7,V11 // Compress the result in a single vector
VOR V8,V9,V12
VOR V11,V12,V13
VCMPEQUBCC V0,V13,V14 // Check for byte
BGE CR6,found
BC 16,0,loop // bdnz loop
// Handle the tailing bytes or R4 64
RLDICL $0,R6,$58,R4
ADD $64,R8,R8
tail:
CMPU R4,$0
BEQ notfound
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
notfound:
MOVD $-1, R3
MOVD $-1,R3
ISEL CR1EQ,R3,R4,R3
RET
cmp4: // Length 4 - 7
CMPU R4,$4
BLT cmp2
MOVD $-4,R11
ADD $-4,R4,R4
_LWBEX (R0)(R3),R10
_LWBEX (R11)(R9),R11
CMPB R10,R5,R10
CMPB R11,R5,R11
CNTLZW R10,R10
CNTLZW R11,R11
CMPU R10,$32
CMPU R11,$32,CR1
SRD $3,R10,R3
SRD $3,R11,R11
BNE found
ADD R4,R11,R4
MOVD $-1,R3
ISEL CR1EQ,R3,R4,R3
RET
cmp2: // Length 2 - 3
CMPU R4,$2
BLT cmp1
_LHBEX (R0)(R3),R10
CMPB R10,R5,R10
SLDCC $48,R10,R10
CNTLZD R10,R10
SRD $3,R10,R3
BNE found
cmp1: // Length 1
MOVD $-1,R3
ANDCC $1,R4,R31
BEQ found
MOVBZ -1(R9),R10
CMPB R10,R5,R10
ANDCC $1,R10
ADD $-1,R4
ISEL CR0EQ,R3,R4,R3
found:
// We will now compress the results into a single doubleword,
// so it can be moved to a GPR for the final index calculation.
// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
// first bit of each byte into bits 48-63.
VBPERMQ V6,V10,V6
VBPERMQ V7,V10,V7
VBPERMQ V8,V10,V8
VBPERMQ V9,V10,V9
// Shift each 16-bit component into its correct position for
// merging into a single doubleword.
#ifdef GOARCH_ppc64le
VSLDOI $2,V7,V7,V7
VSLDOI $4,V8,V8,V8
VSLDOI $6,V9,V9,V9
#else
VSLDOI $6,V6,V6,V6
VSLDOI $4,V7,V7,V7
VSLDOI $2,V8,V8,V8
RET
#endif
// Merge V6-V9 into a single doubleword and move to a GPR.
VOR V6,V7,V11
VOR V8,V9,V4
VOR V4,V11,V4
MFVRD V4,R3
#ifdef GOARCH_ppc64le
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
ADD R8,R11,R3 // Calculate byte address
return:
SUB R17, R3
notfound:
MOVD $-1,R3
RET
found_qw_align:
// Use the same algorithm as above. Compress the result into
// a single doubleword and move it to a GPR for the final
// calculation.
VBPERMQ V6,V10,V6
#ifdef GOARCH_ppc64le
MFVRD V6,R3
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11
#else
VSLDOI $6,V6,V6,V6
MFVRD V6,R3
CNTLZD R3,R11
#endif
ADD R8,R11,R3
CMPU R11,R4
BLT return
BR notfound
PCALIGN $16
done:
ADD $-1,R10,R6
// Offset of last index for the final
// doubleword comparison
RLDICL $0,R6,$61,R6
// At this point, R3 has 0xFF in the same position as the byte we are
// looking for in the doubleword. Use that to calculate the exact index
// of the byte.
#ifdef GOARCH_ppc64le
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
CMPU R8,R7 // Check if we are at the last doubleword.
SRD $3,R11 // Convert trailing zeros to bytes.
ADD R11,R8,R3
CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
BNE return
BLE CR7,return
BR notfound
small_string:
// process string of length < 32 bytes
// We unroll this loop for better performance.
CMPU R4,$0 // Check for length=0
BEQ notfound
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base.
CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
RLDICR $0,R7,$60,R7 // Last doubleword in R7.
CMPU R8,R7
BNE CR7,done
BEQ notfound // Hit length.
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
BNE CR6,done
BR notfound