mirror of
https://github.com/golang/go
synced 2024-11-18 13:54:59 -07:00
internal/bytealg: process two AVX2 lanes per Count loop
The AVX2 branch of the bytealg.Count algorithm used to process a single
32-byte block per loop iteration. Throughput of the algorithm can be
improved by unrolling two iterations per loop: the lack of data
dependencies between each iteration allows for better utilization of the
CPU pipeline. The improvement is most significant on medium-size payloads
that fit in the L1 cache; beyond the L1 cache size, memory bandwidth is
likely the bottleneck and the change does not show any measurable
improvement.

goos: linux
goarch: amd64
pkg: bytes
cpu: Intel(R) Xeon(R) CPU @ 2.60GHz
                │   old.txt   │               new.txt               │
                │   sec/op    │   sec/op     vs base                │
CountSingle/10    4.800n ± 0%   4.811n ± 0%   +0.23% (p=0.000 n=10)
CountSingle/32    5.445n ± 0%   5.430n ± 0%        ~ (p=0.085 n=10)
CountSingle/4K    81.38n ± 1%   63.12n ± 0%  -22.43% (p=0.000 n=10)
CountSingle/4M    133.0µ ± 7%   130.1µ ± 4%        ~ (p=0.280 n=10)
CountSingle/64M   4.079m ± 1%   4.070m ± 3%        ~ (p=0.796 n=10)
geomean           1.029µ        973.3n        -5.41%

                │   old.txt    │               new.txt                │
                │     B/s      │     B/s       vs base                │
CountSingle/10    1.940Gi ± 0%   1.936Gi ± 0%   -0.22% (p=0.000 n=10)
CountSingle/32    5.474Gi ± 0%   5.488Gi ± 0%        ~ (p=0.075 n=10)
CountSingle/4K    46.88Gi ± 1%   60.43Gi ± 0%  +28.92% (p=0.000 n=10)
CountSingle/4M    29.39Gi ± 7%   30.02Gi ± 4%        ~ (p=0.280 n=10)
CountSingle/64M   15.32Gi ± 1%   15.36Gi ± 3%        ~ (p=0.796 n=10)
geomean           11.75Gi        12.42Gi        +5.71%

Change-Id: I1098228c726a2ee814806dcb438b7e92febf4370
Reviewed-on: https://go-review.googlesource.com/c/go/+/532457
Reviewed-by: Mauri de Souza Meneguzzo <mauri870@gmail.com>
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
ad76a98d5e
commit
8b6e0e6e8e
@ -51,8 +51,8 @@ TEXT countbody<>(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
MOVQ SI, DI
|
MOVQ SI, DI
|
||||||
|
|
||||||
CMPQ BX, $32
|
CMPQ BX, $64
|
||||||
JA avx2
|
JAE avx2
|
||||||
sse:
|
sse:
|
||||||
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
|
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
|
||||||
JMP sseloopentry
|
JMP sseloopentry
|
||||||
@ -162,44 +162,63 @@ avx2:
|
|||||||
JNE sse
|
JNE sse
|
||||||
#endif
|
#endif
|
||||||
MOVD AX, X0
|
MOVD AX, X0
|
||||||
LEAQ -32(SI)(BX*1), R11
|
LEAQ -64(SI)(BX*1), R11
|
||||||
|
LEAQ (SI)(BX*1), R13
|
||||||
VPBROADCASTB X0, Y1
|
VPBROADCASTB X0, Y1
|
||||||
PCALIGN $32
|
PCALIGN $32
|
||||||
avx2_loop:
|
avx2_loop:
|
||||||
VMOVDQU (DI), Y2
|
VMOVDQU (DI), Y2
|
||||||
|
VMOVDQU 32(DI), Y4
|
||||||
VPCMPEQB Y1, Y2, Y3
|
VPCMPEQB Y1, Y2, Y3
|
||||||
|
VPCMPEQB Y1, Y4, Y5
|
||||||
VPMOVMSKB Y3, DX
|
VPMOVMSKB Y3, DX
|
||||||
|
VPMOVMSKB Y5, CX
|
||||||
POPCNTL DX, DX
|
POPCNTL DX, DX
|
||||||
|
POPCNTL CX, CX
|
||||||
ADDQ DX, R12
|
ADDQ DX, R12
|
||||||
ADDQ $32, DI
|
ADDQ CX, R12
|
||||||
|
ADDQ $64, DI
|
||||||
CMPQ DI, R11
|
CMPQ DI, R11
|
||||||
JLE avx2_loop
|
JLE avx2_loop
|
||||||
|
|
||||||
// If last block is already processed,
|
// If last block is already processed,
|
||||||
// skip to the end.
|
// skip to the end.
|
||||||
CMPQ DI, R11
|
//
|
||||||
|
// This check is NOT an optimization; if the input length is a
|
||||||
|
// multiple of 64, we must not go through the last leg of the
|
||||||
|
// function because the bit shift count passed to SALQ below would
|
||||||
|
// be 64, which is outside of the 0-63 range supported by 64-bit shift
|
||||||
|
// instructions.
|
||||||
|
//
|
||||||
|
// Tests in the bytes and strings packages with input lengths that
|
||||||
|
// are multiples of 64 would break if this condition were removed.
|
||||||
|
CMPQ DI, R13
|
||||||
JEQ endavx
|
JEQ endavx
|
||||||
|
|
||||||
// Load address of the last 32 bytes.
|
// Load address of the last 64 bytes.
|
||||||
// There is an overlap with the previous block.
|
// There is an overlap with the previous block.
|
||||||
MOVQ R11, DI
|
MOVQ R11, DI
|
||||||
VMOVDQU (DI), Y2
|
VMOVDQU (DI), Y2
|
||||||
|
VMOVDQU 32(DI), Y4
|
||||||
VPCMPEQB Y1, Y2, Y3
|
VPCMPEQB Y1, Y2, Y3
|
||||||
|
VPCMPEQB Y1, Y4, Y5
|
||||||
VPMOVMSKB Y3, DX
|
VPMOVMSKB Y3, DX
|
||||||
|
VPMOVMSKB Y5, CX
|
||||||
// Exit AVX mode.
|
// Exit AVX mode.
|
||||||
VZEROUPPER
|
VZEROUPPER
|
||||||
|
SALQ $32, CX
|
||||||
|
ORQ CX, DX
|
||||||
|
|
||||||
// Create mask to ignore overlap between previous 32 byte block
|
// Create mask to ignore overlap between previous 64 byte block
|
||||||
// and the next.
|
// and the next.
|
||||||
ANDQ $31, BX
|
ANDQ $63, BX
|
||||||
MOVQ $32,CX
|
MOVQ $64, CX
|
||||||
SUBQ BX, CX
|
SUBQ BX, CX
|
||||||
MOVQ $0xFFFFFFFF, R10
|
MOVQ $0xFFFFFFFFFFFFFFFF, R10
|
||||||
SARQ CL, R10
|
|
||||||
SALQ CL, R10
|
SALQ CL, R10
|
||||||
// Apply mask
|
// Apply mask
|
||||||
ANDQ R10, DX
|
ANDQ R10, DX
|
||||||
POPCNTL DX, DX
|
POPCNTQ DX, DX
|
||||||
ADDQ DX, R12
|
ADDQ DX, R12
|
||||||
MOVQ R12, (R8)
|
MOVQ R12, (R8)
|
||||||
RET
|
RET
|
||||||
|
Loading…
Reference in New Issue
Block a user