mirror of
https://github.com/golang/go
synced 2024-11-23 04:40:09 -07:00
internal/bytealg: optimize Count/CountString for PPC64/Power10
Power10 adds a handful of new instructions which make this noticeably
quicker for smaller values. Likewise, since the vector loop requires
32B to enter, unroll it once to count 32B per iteration. This
improvement benefits all PPC64 CPUs.

On Power10, comparing a binary built with GOPPC64=power8:

CountSingle/10     8.99ns ± 0%    5.55ns ± 3%   -38.24%
CountSingle/16     7.55ns ± 0%    5.56ns ± 3%   -26.37%
CountSingle/17     7.45ns ± 0%    5.25ns ± 0%   -29.52%
CountSingle/31     18.4ns ± 0%     6.2ns ± 0%   -66.41%
CountSingle/32     6.17ns ± 0%    5.04ns ± 0%   -18.37%
CountSingle/33     7.13ns ± 0%    5.99ns ± 0%   -15.94%
CountSingle/4K      198ns ± 0%     115ns ± 0%   -42.08%
CountSingle/4M      190µs ± 0%     109µs ± 0%   -42.49%
CountSingle/64M    3.28ms ± 0%    2.08ms ± 0%   -36.53%

Furthermore, comparing the new tail implementation on GOPPC64=power8
with GOPPC64=power10:

CountSingle/10     5.55ns ± 3%    4.52ns ± 1%   -18.66%
CountSingle/16     5.56ns ± 3%    4.80ns ± 0%   -13.65%
CountSingle/17     5.25ns ± 0%    4.79ns ± 0%    -8.78%
CountSingle/31     6.17ns ± 0%    4.82ns ± 0%   -21.79%
CountSingle/32     5.04ns ± 0%    5.09ns ± 6%    +1.01%
CountSingle/33     5.99ns ± 0%    5.42ns ± 2%    -9.54%

Change-Id: I62d80be3b5d706e1abbb4bec7d6278a939a5eed4
Reviewed-on: https://go-review.googlesource.com/c/go/+/512695
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
parent
02b548e5c8
commit
756841bffa
@ -8,89 +8,147 @@
|
|||||||
#include "textflag.h"
|
#include "textflag.h"
|
||||||
|
|
||||||
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
|
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
|
||||||
// R3 = byte array pointer
|
// R3 = byte array pointer
|
||||||
// R4 = length
|
// R4 = length
|
||||||
MOVBZ R6, R5 // R5 = byte
|
// R6 = byte to count
|
||||||
BR countbytebody<>(SB)
|
MTVRD R6, V1 // move compare byte
|
||||||
|
MOVD R6, R5
|
||||||
|
VSPLTB $7, V1, V1 // replicate byte across V1
|
||||||
|
BR countbytebody<>(SB)
|
||||||
|
|
||||||
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
|
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
|
||||||
// R3 = byte array pointer
|
// R3 = byte array pointer
|
||||||
// R4 = length
|
// R4 = length
|
||||||
MOVBZ R5, R5 // R5 = byte
|
// R5 = byte to count
|
||||||
BR countbytebody<>(SB)
|
MTVRD R5, V1 // move compare byte
|
||||||
|
VSPLTB $7, V1, V1 // replicate byte across V1
|
||||||
|
BR countbytebody<>(SB)
|
||||||
|
|
||||||
// R3: addr of string
|
// R3: addr of string
|
||||||
// R4: len of string
|
// R4: len of string
|
||||||
// R5: byte to count
|
// R5: byte to count
|
||||||
|
// V1: byte to count, splatted.
|
||||||
// On exit:
|
// On exit:
|
||||||
// R3: return value
|
// R3: return value
|
||||||
// endianness shouldn't matter since we are just counting and order
|
|
||||||
// is irrelevant
|
|
||||||
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
|
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
|
||||||
DCBT (R3) // Prepare cache line.
|
MOVD $0, R18 // byte count
|
||||||
MOVD R0, R18 // byte count
|
|
||||||
MOVD R3, R19 // Save base address for calculating the index later.
|
|
||||||
MOVD R4, R16
|
|
||||||
|
|
||||||
MOVD R5, R6
|
#ifndef GOPPC64_power10
|
||||||
RLDIMI $8, R6, $48, R6
|
RLDIMI $8, R5, $48, R5
|
||||||
RLDIMI $16, R6, $32, R6
|
RLDIMI $16, R5, $32, R5
|
||||||
RLDIMI $32, R6, $0, R6 // fill reg with the byte to count
|
RLDIMI $32, R5, $0, R5 // fill reg with the byte to count
|
||||||
|
#endif
|
||||||
|
|
||||||
VSPLTISW $3, V4 // used for shift
|
CMPU R4, $32 // Check if it's a small string (<32 bytes)
|
||||||
MTVRD R6, V1 // move compare byte
|
BLT tail // Jump to the small string case
|
||||||
VSPLTB $7, V1, V1 // replicate byte across V1
|
SRD $5, R4, R20
|
||||||
|
MOVD R20, CTR
|
||||||
CMPU R4, $32 // Check if it's a small string (<32 bytes)
|
MOVD $16, R21
|
||||||
BLT tail // Jump to the small string case
|
XXLXOR V4, V4, V4
|
||||||
XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
|
XXLXOR V5, V5, V5
|
||||||
|
|
||||||
|
PCALIGN $16
|
||||||
cmploop:
|
cmploop:
|
||||||
LXVW4X (R3), VS32 // load bytes from string
|
LXVD2X (R0)(R3), V0 // Count 32B per loop with two vector accumulators.
|
||||||
|
LXVD2X (R21)(R3), V2
|
||||||
|
VCMPEQUB V2, V1, V2
|
||||||
|
VCMPEQUB V0, V1, V0
|
||||||
|
VPOPCNTD V2, V2 // A match is 0xFF or 0. Count the bits into doubleword buckets.
|
||||||
|
VPOPCNTD V0, V0
|
||||||
|
VADDUDM V0, V4, V4 // Accumulate the popcounts. They are 8x the count.
|
||||||
|
VADDUDM V2, V5, V5 // The count will be fixed up afterwards.
|
||||||
|
ADD $32, R3
|
||||||
|
BDNZ cmploop
|
||||||
|
|
||||||
// when the bytes match, the corresponding byte contains all 1s
|
VADDUDM V4, V5, V5
|
||||||
VCMPEQUB V1, V0, V2 // compare bytes
|
MFVSRD V5, R18
|
||||||
VPOPCNTD V2, V3 // each double word contains its count
|
VSLDOI $8, V5, V5, V5
|
||||||
VADDUDM V3, V5, V5 // accumulate bit count in each double word
|
MFVSRD V5, R21
|
||||||
ADD $16, R3, R3 // increment pointer
|
ADD R21, R18, R18
|
||||||
SUB $16, R16, R16 // remaining bytes
|
ANDCC $31, R4, R4
|
||||||
CMP R16, $16 // at least 16 remaining?
|
// Skip the tail processing if no bytes remaining.
|
||||||
BGE cmploop
|
BEQ tail_0
|
||||||
VSRD V5, V4, V5 // shift by 3 to convert bits to bytes
|
|
||||||
VSLDOI $8, V5, V5, V6 // get the double word values from vector
|
|
||||||
MFVSRD V5, R9
|
|
||||||
MFVSRD V6, R10
|
|
||||||
ADD R9, R10, R9
|
|
||||||
ADD R9, R18, R18
|
|
||||||
|
|
||||||
tail:
|
#ifdef GOPPC64_power10
|
||||||
CMP R16, $8 // 8 bytes left?
|
SRD $3, R18, R18 // Fix the vector loop count before counting the tail on P10.
|
||||||
BLT small
|
|
||||||
|
|
||||||
MOVD (R3), R12 // load 8 bytes
|
tail: // Count the last 0 - 31 bytes.
|
||||||
CMPB R12, R6, R17 // compare bytes
|
CMP R4, $16
|
||||||
POPCNTD R17, R15 // bit count
|
BLE small_tail_p10
|
||||||
SRD $3, R15, R15 // byte count
|
LXV 0(R3), V0
|
||||||
ADD R15, R18, R18 // add to byte count
|
VCMPEQUB V0, V1, V0
|
||||||
|
VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
|
||||||
|
SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
|
||||||
|
ADD R14, R18, R18
|
||||||
|
ADD $16, R3, R3
|
||||||
|
ANDCC $15, R4, R4
|
||||||
|
|
||||||
next1:
|
small_tail_p10:
|
||||||
ADD $8, R3, R3
|
SLD $56, R4, R6
|
||||||
SUB $8, R16, R16 // remaining bytes
|
LXVLL R3, R6, V0
|
||||||
BR tail
|
VCMPEQUB V0, V1, V0
|
||||||
|
VCLRRB V0, R4, V0 // If <16B being compared, clear matches of the 16-R4 bytes.
|
||||||
small:
|
VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
|
||||||
CMP $0, R16 // any remaining
|
SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
|
||||||
BEQ done
|
ADD R14, R18, R3
|
||||||
MOVBZ (R3), R12 // check each remaining byte
|
RET
|
||||||
CMP R12, R5
|
|
||||||
BNE next2
|
#else
|
||||||
ADD $1, R18
|
tail: // Count the last 0 - 31 bytes.
|
||||||
|
CMP R4, $16
|
||||||
next2:
|
BLT tail_8
|
||||||
SUB $1, R16
|
MOVD (R3), R12
|
||||||
ADD $1, R3 // inc address
|
MOVD 8(R3), R14
|
||||||
BR small
|
CMPB R12, R5, R12
|
||||||
|
CMPB R14, R5, R14
|
||||||
done:
|
POPCNTD R12, R12
|
||||||
MOVD R18, R3 // return count
|
POPCNTD R14, R14
|
||||||
|
ADD R12, R18, R18
|
||||||
|
ADD R14, R18, R18
|
||||||
|
ADD $16, R3, R3
|
||||||
|
ADD $-16, R4, R4
|
||||||
|
|
||||||
|
tail_8: // Count the remaining 0 - 15 bytes.
|
||||||
|
CMP R4, $8
|
||||||
|
BLT tail_4
|
||||||
|
MOVD (R3), R12
|
||||||
|
CMPB R12, R5, R12
|
||||||
|
POPCNTD R12, R12
|
||||||
|
ADD R12, R18, R18
|
||||||
|
ADD $8, R3, R3
|
||||||
|
ADD $-8, R4, R4
|
||||||
|
|
||||||
|
tail_4: // Count the remaining 0 - 7 bytes.
|
||||||
|
CMP R4, $4
|
||||||
|
BLT tail_2
|
||||||
|
MOVWZ (R3), R12
|
||||||
|
CMPB R12, R5, R12
|
||||||
|
SLD $32, R12, R12 // Remove non-participating matches.
|
||||||
|
POPCNTD R12, R12
|
||||||
|
ADD R12, R18, R18
|
||||||
|
ADD $4, R3, R3
|
||||||
|
ADD $-4, R4, R4
|
||||||
|
|
||||||
|
tail_2: // Count the remaining 0 - 3 bytes.
|
||||||
|
CMP R4, $2
|
||||||
|
BLT tail_1
|
||||||
|
MOVHZ (R3), R12
|
||||||
|
CMPB R12, R5, R12
|
||||||
|
SLD $48, R12, R12 // Remove non-participating matches.
|
||||||
|
POPCNTD R12, R12
|
||||||
|
ADD R12, R18, R18
|
||||||
|
ADD $2, R3, R3
|
||||||
|
ADD $-2, R4, R4
|
||||||
|
|
||||||
|
tail_1: // Count the remaining 0 - 1 bytes.
|
||||||
|
CMP R4, $1
|
||||||
|
BLT tail_0
|
||||||
|
MOVBZ (R3), R12
|
||||||
|
CMPB R12, R5, R12
|
||||||
|
ANDCC $0x8, R12, R12
|
||||||
|
ADD R12, R18, R18
|
||||||
|
#endif
|
||||||
|
|
||||||
|
tail_0: // No remaining tail to count.
|
||||||
|
SRD $3, R18, R3 // Fixup count, it is off by 8x.
|
||||||
RET
|
RET
|
||||||
|
Loading…
Reference in New Issue
Block a user