1
0
mirror of https://github.com/golang/go synced 2024-09-29 22:24:33 -06:00

internal/bytealg: improve PPC64 equal

Rewrite the vector loop to process 64B per iteration,
this greatly improves performance on POWER9/POWER10 for
large sizes.

Likewise, use a similar tricks for sizes >= 8 and <= 64.

And, rewrite small comparisons, it's a little slower for 1 byte,
but constant time for 1-7 bytes. Thus, it is increasingly faster
for 2-7B.

Benchmarks results below are from P8/P9 ppc64le (in that order),
several additional testcases have been added to test interesting
sizes. Likewise, the old variant was padded to the same code
size of the new variant to minimize layout related noise:

POWER8/ppc64le/linux:
name       old speed      new speed       delta
Equal/1     110MB/s ± 0%    106MB/s ± 0%    -3.26%
Equal/2     202MB/s ± 0%    203MB/s ± 0%    +0.18%
Equal/3     280MB/s ± 0%    319MB/s ± 0%   +13.89%
Equal/4     350MB/s ± 0%    414MB/s ± 0%   +18.27%
Equal/5     412MB/s ± 0%    533MB/s ± 0%   +29.19%
Equal/6     462MB/s ± 0%    620MB/s ± 0%   +34.11%
Equal/7     507MB/s ± 0%    745MB/s ± 0%   +47.02%
Equal/8     913MB/s ± 0%    994MB/s ± 0%    +8.84%
Equal/9     909MB/s ± 0%   1117MB/s ± 0%   +22.85%
Equal/10    937MB/s ± 0%   1242MB/s ± 0%   +32.59%
Equal/11    962MB/s ± 0%   1370MB/s ± 0%   +42.37%
Equal/12    989MB/s ± 0%   1490MB/s ± 0%   +50.60%
Equal/13   1.01GB/s ± 0%   1.61GB/s ± 0%   +60.27%
Equal/14   1.02GB/s ± 0%   1.74GB/s ± 0%   +71.22%
Equal/15   1.03GB/s ± 0%   1.86GB/s ± 0%   +81.45%
Equal/16   1.60GB/s ± 0%   1.99GB/s ± 0%   +24.21%
Equal/17   1.54GB/s ± 0%   2.04GB/s ± 0%   +32.28%
Equal/20   1.48GB/s ± 0%   2.40GB/s ± 0%   +62.64%
Equal/32   3.58GB/s ± 0%   3.84GB/s ± 0%    +7.18%
Equal/63   3.74GB/s ± 0%   7.17GB/s ± 0%   +91.79%
Equal/64   6.35GB/s ± 0%   7.29GB/s ± 0%   +14.75%
Equal/65   5.85GB/s ± 0%   7.00GB/s ± 0%   +19.66%
Equal/127  6.74GB/s ± 0%  13.74GB/s ± 0%  +103.77%
Equal/128  10.6GB/s ± 0%   12.9GB/s ± 0%   +21.98%
Equal/129  9.66GB/s ± 0%  11.96GB/s ± 0%   +23.85%
Equal/191  9.12GB/s ± 0%  17.80GB/s ± 0%   +95.26%
Equal/192  13.4GB/s ± 0%   17.2GB/s ± 0%   +28.66%
Equal/4K   29.5GB/s ± 0%   37.3GB/s ± 0%   +26.39%
Equal/4M   22.6GB/s ± 0%   23.1GB/s ± 0%    +2.40%
Equal/64M  10.6GB/s ± 0%   11.2GB/s ± 0%    +5.83%

POWER9/ppc64le/linux:
name       old speed      new speed       delta
Equal/1     122MB/s ± 0%    121MB/s ± 0%    -0.94%
Equal/2     223MB/s ± 0%    241MB/s ± 0%    +8.29%
Equal/3     289MB/s ± 0%    362MB/s ± 0%   +24.90%
Equal/4     366MB/s ± 0%    483MB/s ± 0%   +31.82%
Equal/5     427MB/s ± 0%    603MB/s ± 0%   +41.28%
Equal/6     462MB/s ± 0%    723MB/s ± 0%   +56.65%
Equal/7     509MB/s ± 0%    843MB/s ± 0%   +65.57%
Equal/8     974MB/s ± 0%   1066MB/s ± 0%    +9.46%
Equal/9    1.00GB/s ± 0%   1.20GB/s ± 0%   +19.53%
Equal/10   1.00GB/s ± 0%   1.33GB/s ± 0%   +32.81%
Equal/11   1.01GB/s ± 0%   1.47GB/s ± 0%   +45.28%
Equal/12   1.04GB/s ± 0%   1.60GB/s ± 0%   +53.46%
Equal/13   1.05GB/s ± 0%   1.73GB/s ± 0%   +64.67%
Equal/14   1.02GB/s ± 0%   1.87GB/s ± 0%   +82.93%
Equal/15   1.04GB/s ± 0%   2.00GB/s ± 0%   +92.07%
Equal/16   1.83GB/s ± 0%   2.13GB/s ± 0%   +16.58%
Equal/17   1.78GB/s ± 0%   2.18GB/s ± 0%   +22.65%
Equal/20   1.72GB/s ± 0%   2.57GB/s ± 0%   +49.24%
Equal/32   3.89GB/s ± 0%   4.10GB/s ± 0%    +5.53%
Equal/63   3.63GB/s ± 0%   7.63GB/s ± 0%  +110.45%
Equal/64   6.69GB/s ± 0%   7.75GB/s ± 0%   +15.84%
Equal/65   6.28GB/s ± 0%   7.07GB/s ± 0%   +12.46%
Equal/127  6.41GB/s ± 0%  13.65GB/s ± 0%  +112.95%
Equal/128  11.1GB/s ± 0%   14.1GB/s ± 0%   +26.56%
Equal/129  10.2GB/s ± 0%   11.2GB/s ± 0%    +9.44%
Equal/191  8.64GB/s ± 0%  16.39GB/s ± 0%   +89.75%
Equal/192  15.3GB/s ± 0%   17.8GB/s ± 0%   +16.31%
Equal/4K   24.6GB/s ± 0%   27.8GB/s ± 0%   +13.12%
Equal/4M   21.1GB/s ± 0%   22.7GB/s ± 0%    +7.66%
Equal/64M  20.8GB/s ± 0%   22.4GB/s ± 0%    +8.06%

Change-Id: Ie3c582133d526cc14e8846ef364c44c93eb7b9a2
Reviewed-on: https://go-review.googlesource.com/c/go/+/399976
Reviewed-by: Carlos Amedee <carlos@golang.org>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
This commit is contained in:
Paul E. Murphy 2022-04-11 15:21:03 -05:00 committed by Paul Murphy
parent 3b01a80319
commit 8375b54d44

View File

@ -7,6 +7,21 @@
#include "go_asm.h"
#include "textflag.h"
// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095
// TODO: At writing, ISEL and BC do not support CR bit type arguments,
// define them here for readability.
#define CR0LT 4*0+0
#define CR0EQ 4*0+2
#define CR1LT 4*1+0
#define CR6LT 4*6+0
// Likewise, the BC opcode is hard to read, and no extended
// mnemonics are offered for these forms.
#define BGELR_CR6 BC 4, CR6LT, (LR)
#define BEQLR BC 12, CR0EQ, (LR)
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
// R3 = a
@ -33,66 +48,158 @@ eq:
// On exit:
// R3 = return value
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R5,CTR
CMP R5,$8 // only optimize >=8
BLT simplecheck
DCBT (R3) // cache hint
DCBT (R4)
CMP R5,$32 // optimize >= 32
MOVD R5,R6 // needed if setup8a branch
BLT setup8a // 8 byte moves only
setup32a: // 8 byte aligned, >= 32 bytes
SRADCC $5,R5,R6 // number of 32 byte chunks to compare
MOVD R6,CTR
MOVD $16,R14 // index for VSX loads and stores
loop32a:
LXVD2X (R3+R0), VS32 // VS32 = V0
LXVD2X (R4+R0), VS33 // VS33 = V1
MOVD R3, R8 // Move s1 into R8
ADD R5, R3, R9 // &s1[len(s1)]
ADD R5, R4, R10 // &s2[len(s2)]
MOVD $1, R11
CMP R5, $16 // Use GPR checks for check for len <= 16
BLE check0_16
MOVD $0, R3 // Assume no-match in case BGELR CR6 returns
CMP R5, $32 // Use overlapping VSX loads for len <= 32
BLE check17_32 // Do a pair of overlapping VSR compares
CMP R5, $64
BLE check33_64 // Hybrid check + overlap compare.
setup64:
SRD $6, R5, R6 // number of 64 byte chunks to compare
MOVD R6, CTR
MOVD $16, R14 // index for VSX loads and stores
MOVD $32, R15
MOVD $48, R16
ANDCC $0x3F, R5, R5 // len%64==0?
PCALIGN $32
loop64:
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2 // compare, setting CR6
BGE CR6, noteq
LXVD2X (R3+R14), VS32
LXVD2X (R4+R14), VS33
VCMPEQUBCC V0, V1, V2
BGE CR6, noteq
ADD $32,R3 // bump up to next 32
ADD $32,R4
BC 16, 0, loop32a // br ctr and cr
ANDCC $24,R5,R6 // Any 8 byte chunks?
BEQ leftover // and result is 0
setup8a:
SRADCC $3,R6,R6 // get the 8 byte count
BEQ leftover // shifted value is 0
MOVD R6,CTR
loop8:
MOVD 0(R3),R6 // doublewords to compare
ADD $8,R3
MOVD 0(R4),R7
ADD $8,R4
CMP R6,R7 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr
BNE noteq
leftover:
ANDCC $7,R5,R6 // check for leftover bytes
BEQ equal
MOVD R6,CTR
BR simple
simplecheck:
CMP R5,$0
BEQ equal
simple:
MOVBZ 0(R3), R6
ADD $1,R3
MOVBZ 0(R4), R7
ADD $1,R4
CMP R6, R7
BNE noteq
BC 8,2,simple
BNE noteq
BR equal
noteq:
MOVD $0, R3
RET
equal:
MOVD $1, R3
BGELR_CR6
LXVD2X (R8+R14), V0
LXVD2X (R4+R14), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R15), V0
LXVD2X (R4+R15), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R16), V0
LXVD2X (R4+R16), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
ADD $64,R8 // bump up to next 64
ADD $64,R4
BDNZ loop64
ISEL $CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
BEQLR // return if no tail.
ADD $-64, R9, R8
ADD $-64, R10, R4
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R14), V0
LXVD2X (R4+R14), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R15), V0
LXVD2X (R4+R15), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R16), V0
LXVD2X (R4+R16), V1
VCMPEQUBCC V0, V1, V2
ISEL $CR6LT, R11, R0, R3
RET
check33_64:
// Bytes 0-15
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
ADD $16, R8
ADD $16, R4
// Bytes 16-31
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
// A little tricky, but point R4,R8 to &sx[len-32],
// and reuse check17_32 to check the next 1-31 bytes (with some overlap)
ADD $-32, R9, R8
ADD $-32, R10, R4
// Fallthrough
check17_32:
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL $CR6LT, R11, R0, R5
// Load sX[len(sX)-16:len(sX)] and compare.
ADD $-16, R9
ADD $-16, R10
LXVD2X (R9+R0), V0
LXVD2X (R10+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL $CR6LT, R5, R0, R3
RET
check0_16:
CMP R5, $8
BLT check0_7
// Load sX[0:7] and compare.
MOVD (R8), R6
MOVD (R4), R7
CMP R6, R7
ISEL $CR0EQ, R11, R0, R5
// Load sX[len(sX)-8:len(sX)] and compare.
MOVD -8(R9), R6
MOVD -8(R10), R7
CMP R6, R7
ISEL $CR0EQ, R5, R0, R3
RET
check0_7:
CMP R5,$0
MOVD $1, R3
BEQLR // return if len == 0
// Check < 8B loads with a single compare, but select the load address
// such that it cannot cross a page boundary. Load a few bytes from the
// lower address if that does not cross the lower page. Or, load a few
// extra bytes from the higher addresses. And align those values
// consistently in register as either address may have differing
// alignment requirements.
ANDCC $PAGE_OFFSET, R8, R6 // &sX & PAGE_OFFSET
ANDCC $PAGE_OFFSET, R4, R9
SUBC R5, $8, R12 // 8-len
SLD $3, R12, R14 // (8-len)*8
CMPU R6, R12, CR1 // Enough bytes lower in the page to load lower?
CMPU R9, R12, CR0
SUB R12, R8, R6 // compute lower load address
SUB R12, R4, R9
ISEL $CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
ISEL $CR0LT, R4, R9, R4 // Similar for s2
MOVD (R8), R15
MOVD (R4), R16
SLD R14, R15, R7
SLD R14, R16, R17
SRD R14, R7, R7 // Clear the upper (8-len) bytes (with 2 shifts)
SRD R14, R17, R17
SRD R14, R15, R6 // Clear the lower (8-len) bytes
SRD R14, R16, R9
#ifdef GOARCH_ppc64le
ISEL $CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment
ISEL $CR0LT, R17, R9, R4
#else
ISEL $CR1LT, R6, R7, R8
ISEL $CR0LT, R9, R17, R4
#endif
CMP R4, R8
ISEL $CR0EQ, R11, R0, R3
RET