1
0
mirror of https://github.com/golang/go synced 2024-09-30 00:24:29 -06:00

internal/bytealg: improve PPC64 equal

Rewrite the vector loop to process 64B per iteration,
this greatly improves performance on POWER9/POWER10 for
large sizes.

Likewise, use a similar tricks for sizes >= 8 and <= 64.

And, rewrite small comparisons, it's a little slower for 1 byte,
but constant time for 1-7 bytes. Thus, it is increasingly faster
for 2-7B.

Benchmarks results below are from P8/P9 ppc64le (in that order),
several additional testcases have been added to test interesting
sizes. Likewise, the old variant was padded to the same code
size of the new variant to minimize layout related noise:

POWER8/ppc64le/linux:
name       old speed      new speed       delta
Equal/1     110MB/s ± 0%    106MB/s ± 0%    -3.26%
Equal/2     202MB/s ± 0%    203MB/s ± 0%    +0.18%
Equal/3     280MB/s ± 0%    319MB/s ± 0%   +13.89%
Equal/4     350MB/s ± 0%    414MB/s ± 0%   +18.27%
Equal/5     412MB/s ± 0%    533MB/s ± 0%   +29.19%
Equal/6     462MB/s ± 0%    620MB/s ± 0%   +34.11%
Equal/7     507MB/s ± 0%    745MB/s ± 0%   +47.02%
Equal/8     913MB/s ± 0%    994MB/s ± 0%    +8.84%
Equal/9     909MB/s ± 0%   1117MB/s ± 0%   +22.85%
Equal/10    937MB/s ± 0%   1242MB/s ± 0%   +32.59%
Equal/11    962MB/s ± 0%   1370MB/s ± 0%   +42.37%
Equal/12    989MB/s ± 0%   1490MB/s ± 0%   +50.60%
Equal/13   1.01GB/s ± 0%   1.61GB/s ± 0%   +60.27%
Equal/14   1.02GB/s ± 0%   1.74GB/s ± 0%   +71.22%
Equal/15   1.03GB/s ± 0%   1.86GB/s ± 0%   +81.45%
Equal/16   1.60GB/s ± 0%   1.99GB/s ± 0%   +24.21%
Equal/17   1.54GB/s ± 0%   2.04GB/s ± 0%   +32.28%
Equal/20   1.48GB/s ± 0%   2.40GB/s ± 0%   +62.64%
Equal/32   3.58GB/s ± 0%   3.84GB/s ± 0%    +7.18%
Equal/63   3.74GB/s ± 0%   7.17GB/s ± 0%   +91.79%
Equal/64   6.35GB/s ± 0%   7.29GB/s ± 0%   +14.75%
Equal/65   5.85GB/s ± 0%   7.00GB/s ± 0%   +19.66%
Equal/127  6.74GB/s ± 0%  13.74GB/s ± 0%  +103.77%
Equal/128  10.6GB/s ± 0%   12.9GB/s ± 0%   +21.98%
Equal/129  9.66GB/s ± 0%  11.96GB/s ± 0%   +23.85%
Equal/191  9.12GB/s ± 0%  17.80GB/s ± 0%   +95.26%
Equal/192  13.4GB/s ± 0%   17.2GB/s ± 0%   +28.66%
Equal/4K   29.5GB/s ± 0%   37.3GB/s ± 0%   +26.39%
Equal/4M   22.6GB/s ± 0%   23.1GB/s ± 0%    +2.40%
Equal/64M  10.6GB/s ± 0%   11.2GB/s ± 0%    +5.83%

POWER9/ppc64le/linux:
name       old speed      new speed       delta
Equal/1     122MB/s ± 0%    121MB/s ± 0%    -0.94%
Equal/2     223MB/s ± 0%    241MB/s ± 0%    +8.29%
Equal/3     289MB/s ± 0%    362MB/s ± 0%   +24.90%
Equal/4     366MB/s ± 0%    483MB/s ± 0%   +31.82%
Equal/5     427MB/s ± 0%    603MB/s ± 0%   +41.28%
Equal/6     462MB/s ± 0%    723MB/s ± 0%   +56.65%
Equal/7     509MB/s ± 0%    843MB/s ± 0%   +65.57%
Equal/8     974MB/s ± 0%   1066MB/s ± 0%    +9.46%
Equal/9    1.00GB/s ± 0%   1.20GB/s ± 0%   +19.53%
Equal/10   1.00GB/s ± 0%   1.33GB/s ± 0%   +32.81%
Equal/11   1.01GB/s ± 0%   1.47GB/s ± 0%   +45.28%
Equal/12   1.04GB/s ± 0%   1.60GB/s ± 0%   +53.46%
Equal/13   1.05GB/s ± 0%   1.73GB/s ± 0%   +64.67%
Equal/14   1.02GB/s ± 0%   1.87GB/s ± 0%   +82.93%
Equal/15   1.04GB/s ± 0%   2.00GB/s ± 0%   +92.07%
Equal/16   1.83GB/s ± 0%   2.13GB/s ± 0%   +16.58%
Equal/17   1.78GB/s ± 0%   2.18GB/s ± 0%   +22.65%
Equal/20   1.72GB/s ± 0%   2.57GB/s ± 0%   +49.24%
Equal/32   3.89GB/s ± 0%   4.10GB/s ± 0%    +5.53%
Equal/63   3.63GB/s ± 0%   7.63GB/s ± 0%  +110.45%
Equal/64   6.69GB/s ± 0%   7.75GB/s ± 0%   +15.84%
Equal/65   6.28GB/s ± 0%   7.07GB/s ± 0%   +12.46%
Equal/127  6.41GB/s ± 0%  13.65GB/s ± 0%  +112.95%
Equal/128  11.1GB/s ± 0%   14.1GB/s ± 0%   +26.56%
Equal/129  10.2GB/s ± 0%   11.2GB/s ± 0%    +9.44%
Equal/191  8.64GB/s ± 0%  16.39GB/s ± 0%   +89.75%
Equal/192  15.3GB/s ± 0%   17.8GB/s ± 0%   +16.31%
Equal/4K   24.6GB/s ± 0%   27.8GB/s ± 0%   +13.12%
Equal/4M   21.1GB/s ± 0%   22.7GB/s ± 0%    +7.66%
Equal/64M  20.8GB/s ± 0%   22.4GB/s ± 0%    +8.06%

Change-Id: Ie3c582133d526cc14e8846ef364c44c93eb7b9a2
Reviewed-on: https://go-review.googlesource.com/c/go/+/399976
Reviewed-by: Carlos Amedee <carlos@golang.org>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
This commit is contained in:
Paul E. Murphy 2022-04-11 15:21:03 -05:00 committed by Paul Murphy
parent 3b01a80319
commit 8375b54d44

View File

@ -7,6 +7,21 @@
#include "go_asm.h" #include "go_asm.h"
#include "textflag.h" #include "textflag.h"
// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095
// TODO: At writing, ISEL and BC do not support CR bit type arguments,
// define them here for readability.
#define CR0LT 4*0+0
#define CR0EQ 4*0+2
#define CR1LT 4*1+0
#define CR6LT 4*6+0
// Likewise, the BC opcode is hard to read, and no extended
// mnemonics are offered for these forms.
#define BGELR_CR6 BC 4, CR6LT, (LR)
#define BEQLR BC 12, CR0EQ, (LR)
// memequal(a, b unsafe.Pointer, size uintptr) bool // memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25 TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
// R3 = a // R3 = a
@ -33,66 +48,158 @@ eq:
// On exit: // On exit:
// R3 = return value // R3 = return value
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0 TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R5,CTR MOVD R3, R8 // Move s1 into R8
CMP R5,$8 // only optimize >=8 ADD R5, R3, R9 // &s1[len(s1)]
BLT simplecheck ADD R5, R4, R10 // &s2[len(s2)]
DCBT (R3) // cache hint MOVD $1, R11
DCBT (R4) CMP R5, $16 // Use GPR checks for check for len <= 16
CMP R5,$32 // optimize >= 32 BLE check0_16
MOVD R5,R6 // needed if setup8a branch MOVD $0, R3 // Assume no-match in case BGELR CR6 returns
BLT setup8a // 8 byte moves only CMP R5, $32 // Use overlapping VSX loads for len <= 32
setup32a: // 8 byte aligned, >= 32 bytes BLE check17_32 // Do a pair of overlapping VSR compares
SRADCC $5,R5,R6 // number of 32 byte chunks to compare CMP R5, $64
MOVD R6,CTR BLE check33_64 // Hybrid check + overlap compare.
MOVD $16,R14 // index for VSX loads and stores
loop32a: setup64:
LXVD2X (R3+R0), VS32 // VS32 = V0 SRD $6, R5, R6 // number of 64 byte chunks to compare
LXVD2X (R4+R0), VS33 // VS33 = V1 MOVD R6, CTR
MOVD $16, R14 // index for VSX loads and stores
MOVD $32, R15
MOVD $48, R16
ANDCC $0x3F, R5, R5 // len%64==0?
PCALIGN $32
loop64:
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2 // compare, setting CR6 VCMPEQUBCC V0, V1, V2 // compare, setting CR6
BGE CR6, noteq BGELR_CR6
LXVD2X (R3+R14), VS32 LXVD2X (R8+R14), V0
LXVD2X (R4+R14), VS33 LXVD2X (R4+R14), V1
VCMPEQUBCC V0, V1, V2 VCMPEQUBCC V0, V1, V2
BGE CR6, noteq BGELR_CR6
ADD $32,R3 // bump up to next 32 LXVD2X (R8+R15), V0
ADD $32,R4 LXVD2X (R4+R15), V1
BC 16, 0, loop32a // br ctr and cr VCMPEQUBCC V0, V1, V2
ANDCC $24,R5,R6 // Any 8 byte chunks? BGELR_CR6
BEQ leftover // and result is 0 LXVD2X (R8+R16), V0
setup8a: LXVD2X (R4+R16), V1
SRADCC $3,R6,R6 // get the 8 byte count VCMPEQUBCC V0, V1, V2
BEQ leftover // shifted value is 0 BGELR_CR6
MOVD R6,CTR ADD $64,R8 // bump up to next 64
loop8: ADD $64,R4
MOVD 0(R3),R6 // doublewords to compare BDNZ loop64
ADD $8,R3
MOVD 0(R4),R7 ISEL $CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
ADD $8,R4 BEQLR // return if no tail.
CMP R6,R7 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr ADD $-64, R9, R8
BNE noteq ADD $-64, R10, R4
leftover: LXVD2X (R8+R0), V0
ANDCC $7,R5,R6 // check for leftover bytes LXVD2X (R4+R0), V1
BEQ equal VCMPEQUBCC V0, V1, V2
MOVD R6,CTR BGELR_CR6
BR simple LXVD2X (R8+R14), V0
simplecheck: LXVD2X (R4+R14), V1
CMP R5,$0 VCMPEQUBCC V0, V1, V2
BEQ equal BGELR_CR6
simple: LXVD2X (R8+R15), V0
MOVBZ 0(R3), R6 LXVD2X (R4+R15), V1
ADD $1,R3 VCMPEQUBCC V0, V1, V2
MOVBZ 0(R4), R7 BGELR_CR6
ADD $1,R4 LXVD2X (R8+R16), V0
CMP R6, R7 LXVD2X (R4+R16), V1
BNE noteq VCMPEQUBCC V0, V1, V2
BC 8,2,simple ISEL $CR6LT, R11, R0, R3
BNE noteq
BR equal
noteq:
MOVD $0, R3
RET
equal:
MOVD $1, R3
RET RET
check33_64:
// Bytes 0-15
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
ADD $16, R8
ADD $16, R4
// Bytes 16-31
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
// A little tricky, but point R4,R8 to &sx[len-32],
// and reuse check17_32 to check the next 1-31 bytes (with some overlap)
ADD $-32, R9, R8
ADD $-32, R10, R4
// Fallthrough
check17_32:
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL $CR6LT, R11, R0, R5
// Load sX[len(sX)-16:len(sX)] and compare.
ADD $-16, R9
ADD $-16, R10
LXVD2X (R9+R0), V0
LXVD2X (R10+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL $CR6LT, R5, R0, R3
RET
check0_16:
CMP R5, $8
BLT check0_7
// Load sX[0:7] and compare.
MOVD (R8), R6
MOVD (R4), R7
CMP R6, R7
ISEL $CR0EQ, R11, R0, R5
// Load sX[len(sX)-8:len(sX)] and compare.
MOVD -8(R9), R6
MOVD -8(R10), R7
CMP R6, R7
ISEL $CR0EQ, R5, R0, R3
RET
check0_7:
CMP R5,$0
MOVD $1, R3
BEQLR // return if len == 0
// Check < 8B loads with a single compare, but select the load address
// such that it cannot cross a page boundary. Load a few bytes from the
// lower address if that does not cross the lower page. Or, load a few
// extra bytes from the higher addresses. And align those values
// consistently in register as either address may have differing
// alignment requirements.
ANDCC $PAGE_OFFSET, R8, R6 // &sX & PAGE_OFFSET
ANDCC $PAGE_OFFSET, R4, R9
SUBC R5, $8, R12 // 8-len
SLD $3, R12, R14 // (8-len)*8
CMPU R6, R12, CR1 // Enough bytes lower in the page to load lower?
CMPU R9, R12, CR0
SUB R12, R8, R6 // compute lower load address
SUB R12, R4, R9
ISEL $CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
ISEL $CR0LT, R4, R9, R4 // Similar for s2
MOVD (R8), R15
MOVD (R4), R16
SLD R14, R15, R7
SLD R14, R16, R17
SRD R14, R7, R7 // Clear the upper (8-len) bytes (with 2 shifts)
SRD R14, R17, R17
SRD R14, R15, R6 // Clear the lower (8-len) bytes
SRD R14, R16, R9
#ifdef GOARCH_ppc64le
ISEL $CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment
ISEL $CR0LT, R17, R9, R4
#else
ISEL $CR1LT, R6, R7, R8
ISEL $CR0LT, R9, R17, R4
#endif
CMP R4, R8
ISEL $CR0EQ, R11, R0, R3
RET