1
0
mirror of https://github.com/golang/go synced 2024-09-30 15:38:33 -06:00

internal/bytealg: process two AVX2 lanes per Count loop

The branch taken by the bytealg.Count algorithm used to process a single
32 bytes block per loop iteration. Throughput of the algorithm can be
improved by unrolling two iterations per loop: the lack of data
dependencies between each iteration allows for better utilization of the
CPU pipeline. The improvement is most significant on medium size payloads
that fit in the L1 cache; beyond the L1 cache size, memory bandwidth is
likely the bottleneck and the change does not show any measurable
improvements.

goos: linux
goarch: amd64
pkg: bytes
cpu: Intel(R) Xeon(R) CPU @ 2.60GHz
                │   old.txt   │               new.txt               │
                │   sec/op    │   sec/op     vs base                │
CountSingle/10    4.800n ± 0%   4.811n ± 0%   +0.23% (p=0.000 n=10)
CountSingle/32    5.445n ± 0%   5.430n ± 0%        ~ (p=0.085 n=10)
CountSingle/4K    81.38n ± 1%   63.12n ± 0%  -22.43% (p=0.000 n=10)
CountSingle/4M    133.0µ ± 7%   130.1µ ± 4%        ~ (p=0.280 n=10)
CountSingle/64M   4.079m ± 1%   4.070m ± 3%        ~ (p=0.796 n=10)
geomean           1.029µ        973.3n        -5.41%

                │   old.txt    │               new.txt                │
                │     B/s      │     B/s       vs base                │
CountSingle/10    1.940Gi ± 0%   1.936Gi ± 0%   -0.22% (p=0.000 n=10)
CountSingle/32    5.474Gi ± 0%   5.488Gi ± 0%        ~ (p=0.075 n=10)
CountSingle/4K    46.88Gi ± 1%   60.43Gi ± 0%  +28.92% (p=0.000 n=10)
CountSingle/4M    29.39Gi ± 7%   30.02Gi ± 4%        ~ (p=0.280 n=10)
CountSingle/64M   15.32Gi ± 1%   15.36Gi ± 3%        ~ (p=0.796 n=10)
geomean           11.75Gi        12.42Gi        +5.71%

Change-Id: I1098228c726a2ee814806dcb438b7e92febf4370
Reviewed-on: https://go-review.googlesource.com/c/go/+/532457
Reviewed-by: Mauri de Souza Meneguzzo <mauri870@gmail.com>
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Achille Roussel 2023-10-04 04:58:03 +00:00 committed by Gopher Robot
parent ad76a98d5e
commit 8b6e0e6e8e

View File

@ -51,8 +51,8 @@ TEXT countbody<>(SB),NOSPLIT,$0
MOVQ SI, DI
CMPQ BX, $32
JA avx2
CMPQ BX, $64
JAE avx2
sse:
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
JMP sseloopentry
@ -162,44 +162,63 @@ avx2:
JNE sse
#endif
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
LEAQ -64(SI)(BX*1), R11
LEAQ (SI)(BX*1), R13
VPBROADCASTB X0, Y1
PCALIGN $32
avx2_loop:
VMOVDQU (DI), Y2
VMOVDQU 32(DI), Y4
VPCMPEQB Y1, Y2, Y3
VPCMPEQB Y1, Y4, Y5
VPMOVMSKB Y3, DX
VPMOVMSKB Y5, CX
POPCNTL DX, DX
POPCNTL CX, CX
ADDQ DX, R12
ADDQ $32, DI
ADDQ CX, R12
ADDQ $64, DI
CMPQ DI, R11
JLE avx2_loop
// If last block is already processed,
// skip to the end.
CMPQ DI, R11
//
// This check is NOT an optimization; if the input length is a
// multiple of 64, we must not go through the last leg of the
// function because the bit shift count passed to SALQ below would
// be 64, which is outside of the 0-63 range supported by those
// instructions.
//
// Tests in the bytes and strings packages with input lengths that
// are multiples of 64 will break if this condition were removed.
CMPQ DI, R13
JEQ endavx
// Load address of the last 32 bytes.
// Load address of the last 64 bytes.
// There is an overlap with the previous block.
MOVQ R11, DI
VMOVDQU (DI), Y2
VMOVDQU 32(DI), Y4
VPCMPEQB Y1, Y2, Y3
VPCMPEQB Y1, Y4, Y5
VPMOVMSKB Y3, DX
VPMOVMSKB Y5, CX
// Exit AVX mode.
VZEROUPPER
SALQ $32, CX
ORQ CX, DX
// Create mask to ignore overlap between previous 32 byte block
// Create mask to ignore overlap between previous 64 byte block
// and the next.
ANDQ $31, BX
MOVQ $32,CX
ANDQ $63, BX
MOVQ $64, CX
SUBQ BX, CX
MOVQ $0xFFFFFFFF, R10
SARQ CL, R10
MOVQ $0xFFFFFFFFFFFFFFFF, R10
SALQ CL, R10
// Apply mask
ANDQ R10, DX
POPCNTL DX, DX
POPCNTQ DX, DX
ADDQ DX, R12
MOVQ R12, (R8)
RET