1
0
mirror of https://github.com/golang/go synced 2024-11-24 04:40:24 -07:00

internal/bytealg: optimize cmpbody for ppc64le/ppc64

Vectorize the cmpbody loop for inputs of 32 bytes or more
on both POWER8 (LE and BE) and POWER9 (LE and BE), and improve
the performance of smaller compares.

Performance improves for most sizes with this change on POWER8, 9
and POWER10. For the very small sizes (up to 8) the overhead of
calling the function starts to impact performance.

POWER9:
name               old time/op  new time/op  delta
BytesCompare/1     4.60ns ± 0%  5.49ns ± 0%  +19.27%
BytesCompare/2     4.68ns ± 0%  5.46ns ± 0%  +16.71%
BytesCompare/4     6.58ns ± 0%  5.49ns ± 0%  -16.58%
BytesCompare/8     4.89ns ± 0%  5.46ns ± 0%  +11.64%
BytesCompare/16    5.21ns ± 0%  4.96ns ± 0%   -4.70%
BytesCompare/32    5.09ns ± 0%  4.98ns ± 0%   -2.14%
BytesCompare/64    6.40ns ± 0%  5.96ns ± 0%   -6.84%
BytesCompare/128   11.3ns ± 0%   8.1ns ± 0%  -28.09%
BytesCompare/256   15.1ns ± 0%  12.8ns ± 0%  -15.16%
BytesCompare/512   26.5ns ± 0%  23.3ns ± 5%  -12.03%
BytesCompare/1024  50.2ns ± 0%  41.6ns ± 2%  -17.01%
BytesCompare/2048  99.3ns ± 0%  86.5ns ± 0%  -12.88%

Change-Id: I24f93b2910591e6829ddd8509aa6eeaa6355c609
Reviewed-on: https://go-review.googlesource.com/c/go/+/362797
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Than McIntosh <thanm@google.com>
This commit is contained in:
Archana R 2021-11-10 01:18:42 -06:00 committed by Lynn Boger
parent 1e5987635c
commit 78fb1d03d3

View File

@ -21,11 +21,12 @@ TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
CMP R5,R6,CR7
CMP R3,R4,CR6
BEQ CR7,equal
#ifdef GOARCH_ppc64le
BR cmpbodyLE<>(SB)
#else
BR cmpbodyBE<>(SB)
#endif
MOVBZ internalcpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
CMP R16,$1
BNE power8
BR cmpbodyp9<>(SB)
power8:
BR cmpbody<>(SB)
equal:
BEQ CR6,done
MOVD $1, R8
@ -52,11 +53,12 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
CMP R5,R6,CR7
CMP R3,R4,CR6
BEQ CR7,equal
#ifdef GOARCH_ppc64le
BR cmpbodyLE<>(SB)
#else
BR cmpbodyBE<>(SB)
#endif
MOVBZ internalcpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
CMP R16,$1
BNE power8
BR cmpbodyp9<>(SB)
power8:
BR cmpbody<>(SB)
equal:
BEQ CR6,done
MOVD $1, R8
@ -70,209 +72,431 @@ done:
MOVD $0, R3
RET
// Do an efficient memcmp for ppc64le
#ifdef GOARCH_ppc64le
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
#define SWAP V21
#endif
// Do an efficient memcmp for ppc64le/ppc64/POWER8
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value
TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal?
BC 12,8,setuplen // BLT CR2
BLT CR2,setuplen // BLT CR2
MOVD R4,R8 // use R4 for comparison len
setuplen:
MOVD R8,CTR // set up loop counter
CMP R8,$8 // only optimize >=8
BLT simplecheck
DCBT (R5) // cache hint
DCBT (R6)
CMP R8,$32 // optimize >= 32
MOVD R8,R9
BLT setup8a // 8 byte moves only
setup32a:
SRADCC $5,R8,R9 // number of 32 byte chunks
MOVD R9,CTR
BLT setup8a // optimize < 32
MOVD $16,R10 // set offsets to load into vectors
CMP R8,$64
BLT cmp32 // process size 32-63
// Special processing for 32 bytes or longer.
// Loading this way is faster and correct as long as the
// doublewords being compared are equal. Once they
// are found unequal, reload them in proper byte order
// to determine greater or less than.
loop32a:
MOVD 0(R5),R9 // doublewords to compare
MOVD 0(R6),R10 // get 4 doublewords
MOVD 8(R5),R14
MOVD 8(R6),R15
CMPU R9,R10 // bytes equal?
MOVD $0,R16 // set up for cmpne
BNE cmpne // further compare for LT or GT
MOVD 16(R5),R9 // get next pair of doublewords
MOVD 16(R6),R10
CMPU R14,R15 // bytes match?
MOVD $8,R16 // set up for cmpne
BNE cmpne // further compare for LT or GT
MOVD 24(R5),R14 // get next pair of doublewords
MOVD 24(R6),R15
CMPU R9,R10 // bytes match?
MOVD $16,R16 // set up for cmpne
BNE cmpne // further compare for LT or GT
MOVD $-8,R16 // for cmpne, R5,R6 already inc by 32
ADD $32,R5 // bump up to next 32
ADD $32,R6
CMPU R14,R15 // bytes match?
BC 8,2,loop32a // br ctr and cr
BNE cmpne
DCBT (R5) // optimize >= 64
DCBT (R6) // cache hint
MOVD $32,R11 // set offsets to load into vector
MOVD $48,R12 // set offsets to load into vector
loop64a:// process size 64 and greater
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different // jump out if its different
LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R12),V3 // load bytes of A at offset 64 into vector
LXVD2X (R6)(R12),V4 // load bytes of B at offset 64 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
ADD $-64,R9,R9 // reduce remaining size by 64
ADD $64,R5,R5 // increment to next 64 bytes of A
ADD $64,R6,R6 // increment to next 64 bytes of B
CMPU R9,$64
BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
CMPU R9,$32
BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining
CMPU R9,$0
BNE rem // loop to rem if the remainder is not 0
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
cmp32:
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
ADD $-32,R9,R9 // reduce remaining size by 32
ADD $32,R5,R5 // increment to next 32 bytes of A
ADD $32,R6,R6 // increment to next 32 bytes of B
CMPU R9,$0
BNE rem // loop to rem if the remainder is not 0
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
rem:
MOVD R9,R8
ANDCC $24,R8,R9 // Any 8 byte chunks?
BEQ leftover // and result is 0
BR setup8a
// Reached when the 16-byte chunks in V3 (from A) and V4 (from B) were found
// to differ. Decide less/greater by comparing them one doubleword at a time.
different:
#ifdef GOARCH_ppc64le
MOVD $byteswap<>+00(SB), R16
LXVD2X (R16)(R0),SWAP // load the byteswap<> permute pattern
VPERM V3,V3,SWAP,V3 // reorder the bytes of A's chunk using the pattern
VPERM V4,V4,SWAP,V4 // reorder the bytes of B's chunk the same way
#endif
MFVSRD VS35,R16 // move upper doublewords of A and B into GPRs for comparison
MFVSRD VS36,R10 // (VS35/VS36 are the VSX names of V3/V4)
CMPU R16,R10
BEQ lower // upper doublewords match, so look at the lower ones
BGT greater
MOVD $-1,R3 // return value if A < B
RET
lower:
VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPRs for comparison
MFVSRD VS35,R16
VSLDOI $8,V4,V4,V4
MFVSRD VS36,R10
CMPU R16,R10
BGT greater
// No equality test here: the chunks were already found to differ and the
// upper doublewords matched, so not-greater is treated as less.
MOVD $-1,R3 // return value if A < B
RET
setup8a:
SRADCC $3,R9,R9 // get the 8 byte count
SRADCC $3,R8,R9 // get the 8 byte count
BEQ leftover // shifted value is 0
CMPU R8,$8 // optimize 8byte move
BEQ size8
CMPU R8,$16
BEQ size16
MOVD R9,CTR // loop count for doublewords
loop8:
MOVDBR (R5+R0),R9 // doublewords to compare
#ifdef GOARCH_ppc64le
MOVDBR (R5+R0),R16 // doublewords to compare
MOVDBR (R6+R0),R10 // LE compare order
#else
MOVD (R5+R0),R16 // doublewords to compare
MOVD (R6+R0),R10 // BE compare order
#endif
ADD $8,R5
ADD $8,R6
CMPU R9,R10 // match?
CMPU R16,R10 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr
BGT greater
BLT less
leftover:
ANDCC $7,R8,R9 // check for leftover bytes
MOVD R9,CTR // save the ctr
BNE simple // leftover bytes
BC 12,10,equal // test CR2 for length comparison
BC 12,8,less
BR greater
BEQ zeroremainder
simplecheck:
CMP R8,$0 // remaining compare length 0
BNE simple // do simple compare
BC 12,10,equal // test CR2 for length comparison
BC 12,8,less // 1st len < 2nd len, result less
BR greater // 1st len > 2nd len must be greater
simple:
MOVBZ 0(R5), R9 // get byte from 1st operand
ADD $1,R5
MOVBZ 0(R6), R10 // get byte from 2nd operand
ADD $1,R6
CMPU R9, R10
BC 8,2,simple // bc ctr <> 0 && cr
BGT greater // 1st > 2nd
BLT less // 1st < 2nd
BC 12,10,equal // test CR2 for length comparison
BC 12,9,greater // 2nd len > 1st len
BR less // must be less
cmpne: // only here is not equal
MOVDBR (R5+R16),R8 // reload in reverse order
MOVDBR (R6+R16),R9
CMPU R8,R9 // compare correct endianness
BGT greater // here only if NE
less:
MOVD $-1, R3 // return value if A < B
MOVD R0,R14
CMP R9,$4 // process 4 bytes
BLT halfword
#ifdef GOARCH_ppc64le
MOVWBR (R5)(R14),R10
MOVWBR (R6)(R14),R11
#else
MOVWZ (R5)(R14),R10
MOVWZ (R6)(R14),R11
#endif
CMPU R10,R11
BGT greater
BLT less
ADD $-4,R9
ADD $4,R14
PCALIGN $16
halfword:
CMP R9,$2 // process 2 bytes
BLT byte
#ifdef GOARCH_ppc64le
MOVHBR (R5)(R14),R10
MOVHBR (R6)(R14),R11
#else
MOVHZ (R5)(R14),R10
MOVHZ (R6)(R14),R11
#endif
CMPU R10,R11
BGT greater
BLT less
ADD $-2,R9
ADD $2,R14
PCALIGN $16
byte:
CMP R9,$0 // process 1 byte
BEQ skip
MOVBZ (R5)(R14),R10
MOVBZ (R6)(R14),R11
CMPU R10,R11
BGT greater
BLT less
PCALIGN $16
skip:
BEQ CR2,equal
BGT CR2,greater
less: MOVD $-1,R3 // return value if A < B
RET
size16:
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
zeroremainder:
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
// Compare exactly one doubleword from A (R5) and B (R6), then fall back to
// the length comparison held in CR2 (CMP of a len in R3 against b len in R4).
size8:
#ifdef GOARCH_ppc64le
MOVDBR (R5+R0),R16 // doublewords to compare
MOVDBR (R6+R0),R10 // LE compare order
#else
MOVD (R5+R0),R16 // doublewords to compare
MOVD (R6+R0),R10 // BE compare order
#endif
CMPU R16,R10 // match?
BGT greater
BLT less
BGT CR2,greater // bytes match; 1st len > 2nd len
BLT CR2,less // bytes match; 1st len < 2nd len
// Otherwise the lengths are equal too: fall through to equal.
equal:
MOVD $0, R3 // return value if A == B
RET
greater:
MOVD $1, R3 // return value if A > B
MOVD $1,R3 // return value if A > B
RET
// Do an efficient memcmp for ppc64 (BE)
// Do an efficient memcmp for ppc64le/ppc64/POWER9
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value
TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal?
BC 12,8,setuplen // BLT CR2
BLT CR2,setuplen // BLT CR2
MOVD R4,R8 // use R4 for comparison len
setuplen:
MOVD R8,CTR // set up loop counter
CMP R8,$8 // only optimize >=8
BLT simplecheck
DCBT (R5) // cache hint
DCBT (R6)
CMP R8,$32 // optimize >= 32
CMP R8,$16 // optimize for size<16
MOVD R8,R9
BLT setup8a // 8 byte moves only
BLT simplecheck
MOVD $16,R10 // set offsets to load into vectors
CMP R8,$32 // optimize for size 16-31
BLT cmp16
CMP R8,$64
BLT cmp32 // optimize for size 32-63
DCBT (R5) // optimize for size>=64
DCBT (R6) // cache hint
setup32a:
SRADCC $5,R8,R9 // number of 32 byte chunks
MOVD R9,CTR
loop32a:
MOVD 0(R5),R9 // doublewords to compare
MOVD 0(R6),R10 // get 4 doublewords
MOVD 8(R5),R14
MOVD 8(R6),R15
CMPU R9,R10 // bytes equal?
BLT less // found to be less
BGT greater // found to be greater
MOVD 16(R5),R9 // get next pair of doublewords
MOVD 16(R6),R10
CMPU R14,R15 // bytes match?
BLT less // found less
BGT greater // found greater
MOVD 24(R5),R14 // get next pair of doublewords
MOVD 24(R6),R15
CMPU R9,R10 // bytes match?
BLT less // found to be less
BGT greater // found to be greater
ADD $32,R5 // bump up to next 32
ADD $32,R6
CMPU R14,R15 // bytes match?
BC 8,2,loop32a // br ctr and cr
BLT less // with BE, byte ordering is
BGT greater // good for compare
ANDCC $24,R8,R9 // Any 8 byte chunks?
BEQ leftover // and result is 0
setup8a:
SRADCC $3,R9,R9 // get the 8 byte count
BEQ leftover // shifted value is 0
MOVD R9,CTR // loop count for doublewords
loop8:
MOVD (R5),R9
MOVD (R6),R10
ADD $8,R5
ADD $8,R6
CMPU R9,R10 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr
MOVD $32,R11 // set offsets to load into vector
MOVD $48,R12 // set offsets to load into vector
// Compare 64 bytes of A (R5) and B (R6) per iteration, 16 bytes at a time.
// R10, R11 and R12 hold the offsets 16, 32 and 48; R9 is the remaining length.
loop64a:// process size 64 and greater
LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
VCMPNEBCC V3,V4,V1 // record comparison into V1
BNE CR6,different // jump out if they differ
LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
VCMPNEBCC V3,V4,V1
BNE CR6,different
LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector
LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector
VCMPNEBCC V3,V4,V1
BNE CR6,different
LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector
LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector
VCMPNEBCC V3,V4,V1
BNE CR6,different
ADD $-64,R9,R9 // reduce remaining size by 64
ADD $64,R5,R5 // increment to next 64 bytes of A
ADD $64,R6,R6 // increment to next 64 bytes of B
CMPU R9,$64
BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
CMPU R9,$32
BGE cmp32 // branch to cmp32 if there are 32-63 bytes remaining
CMPU R9,$16
BGE cmp16 // branch to cmp16 if there are 16-31 bytes left
CMPU R9,$0
BNE simplecheck // branch to simplecheck for the remaining bytes
// All compared bytes matched; CR2 (a len vs b len) decides the result.
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
cmp32:
LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
VCMPNEBCC V3,V4,V1 // record comparison into V1
BNE CR6,different // jump out if its different
LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
VCMPNEBCC V3,V4,V1
BNE CR6,different
ADD $-32,R9,R9 // reduce remaining size by 32
ADD $32,R5,R5 // increment to next 32 bytes of A
ADD $32,R6,R6 // increment to next 32 bytes of B
CMPU R9,$16 // loop to cmp16 if there are 16-31 bytes left
BGE cmp16
CMPU R9,$0
BNE simplecheck // loop to simplecheck for remainder bytes
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
// Reached when the 16-byte chunks in V3 (from A) and V4 (from B) were found
// to differ. Decide less/greater by comparing them one doubleword at a time.
different:
MFVSRD VS35,R16 // move upper doublewords of A and B into GPRs for comparison
MFVSRD VS36,R10 // (VS35/VS36 are the VSX names of V3/V4)
CMPU R16,R10
BEQ lower // upper doublewords match, so look at the lower ones
BGT greater
MOVD $-1,R3 // return value if A < B
RET
lower:
MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPRs for comparison
MFVSRLD VS36,R10
CMPU R16,R10
BGT greater
// No equality test here: the chunks were already found to differ and the
// upper doublewords matched, so not-greater is treated as less.
MOVD $-1,R3 // return value if A < B
RET
greater:
MOVD $1,R3 // return value if A > B
RET
// Compare the last 16-31 bytes: one 16-byte chunk at the current position,
// then, if the length is not a multiple of 16, the final 16 bytes of the data.
// R9 is the remaining length; R5/R6 point at the next bytes of A/B.
cmp16:
ANDCC $16,R9,R31 // is there a full 16-byte chunk left?
BEQ tail
LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
ADD $16,R5
ADD $16,R6
tail:
// NOTE(review): the original comment here said "we know there are at least
// 32b"; the visible branches into cmp16 only guarantee 16 or more bytes,
// which still keeps the backward-looking load below within the data - confirm.
ANDCC $15,R9 // R9 = remaining length mod 16
BEQ end // nothing left beyond the 16-byte chunks
ADD R9,R5 // advance A and B to the end of the data
ADD R9,R6
MOVD $-16,R10
LXVB16X (R10)(R5),V3 // load the final 16 bytes of A (offset -16 from the end)
LXVB16X (R10)(R6),V4 // load the final 16 bytes of B (offset -16 from the end)
VCMPEQUDCC V3,V4,V1
BGE CR6,different
end:
// All compared bytes matched; CR2 (a len vs b len) decides the result.
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
simplecheck:
MOVD $0,R14 // process 8 bytes
CMP R9,$8
BLT word
#ifdef GOARCH_ppc64le
MOVDBR (R5+R14),R10
MOVDBR (R6+R14),R11
#else
MOVD (R5+R14),R10
MOVD (R6+R14),R11
#endif
CMPU R10,R11
BGT greater
BLT less
leftover:
ANDCC $7,R8,R9 // check for leftover bytes
MOVD R9,CTR // save the ctr
BNE simple // leftover bytes
BC 12,10,equal // test CR2 for length comparison
BC 12,8,less
BR greater
simplecheck:
CMP R8,$0 // remaining compare length 0
BNE simple // do simple compare
BC 12,10,equal // test CR2 for length comparison
BC 12,8,less // 1st len < 2nd len, result less
BR greater // same len, must be equal
simple:
MOVBZ 0(R5),R9 // get byte from 1st operand
ADD $1,R5
MOVBZ 0(R6),R10 // get byte from 2nd operand
ADD $1,R6
CMPU R9,R10
BC 8,2,simple // bc ctr <> 0 && cr
BGT greater // 1st > 2nd
BLT less // 1st < 2nd
BC 12,10,equal // test CR2 for length comparison
BC 12,9,greater // 2nd len > 1st len
ADD $8,R14
ADD $-8,R9
PCALIGN $16
word:
CMP R9,$4 // process 4 bytes
BLT halfword
#ifdef GOARCH_ppc64le
MOVWBR (R5+R14),R10
MOVWBR (R6+R14),R11
#else
MOVWZ (R5+R14),R10
MOVWZ (R6+R14),R11
#endif
CMPU R10,R11
BGT greater
BLT less
ADD $4,R14
ADD $-4,R9
PCALIGN $16
halfword:
CMP R9,$2 // process 2 bytes
BLT byte
#ifdef GOARCH_ppc64le
MOVHBR (R5+R14),R10
MOVHBR (R6+R14),R11
#else
MOVHZ (R5+R14),R10
MOVHZ (R6+R14),R11
#endif
CMPU R10,R11
BGT greater
BLT less
ADD $2,R14
ADD $-2,R9
PCALIGN $16
byte:
CMP R9,$0 // process 1 byte
BEQ skip
MOVBZ (R5+R14),R10
MOVBZ (R6+R14),R11
CMPU R10,R11
BGT greater
BLT less
PCALIGN $16
skip:
BEQ CR2,equal
BGT CR2,greater
less:
MOVD $-1, R3 // return value if A < B
MOVD $-1,R3 // return value if A < B
RET
equal:
MOVD $0, R3 // return value if A == B
RET
greater:
MOVD $1, R3 // return value if A > B
RET