From 7e95b8bfb035b53175f5a1b7d8750113933a7e17 Mon Sep 17 00:00:00 2001 From: qiulaidongfeng <2645477756@qq.com> Date: Tue, 31 Oct 2023 21:53:44 +0800 Subject: [PATCH] internal/bytealg: optimize indexbyte in amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit goos: windows goarch: amd64 pkg: bytes cpu: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ IndexByte/10-16 2.613n ± 1% 2.558n ± 1% -2.09% (p=0.014 n=10) IndexByte/32-16 3.034n ± 1% 3.010n ± 2% ~ (p=0.305 n=10) IndexByte/4K-16 57.20n ± 2% 39.58n ± 2% -30.81% (p=0.000 n=10) IndexByte/4M-16 34.48µ ± 1% 33.83µ ± 2% -1.87% (p=0.023 n=10) IndexByte/64M-16 1.493m ± 2% 1.450m ± 2% -2.89% (p=0.000 n=10) IndexBytePortable/10-16 3.172n ± 4% 3.163n ± 2% ~ (p=0.684 n=10) IndexBytePortable/32-16 8.465n ± 2% 8.375n ± 3% ~ (p=0.631 n=10) IndexBytePortable/4K-16 852.0n ± 1% 846.6n ± 3% ~ (p=0.971 n=10) IndexBytePortable/4M-16 868.2µ ± 2% 856.6µ ± 2% ~ (p=0.393 n=10) IndexBytePortable/64M-16 13.81m ± 2% 13.88m ± 3% ~ (p=0.684 n=10) geomean 1.204µ 1.148µ -4.63% │ old.txt │ new.txt │ │ B/s │ B/s vs base │ IndexByte/10-16 3.565Gi ± 1% 3.641Gi ± 1% +2.15% (p=0.015 n=10) IndexByte/32-16 9.821Gi ± 1% 9.899Gi ± 2% ~ (p=0.315 n=10) IndexByte/4K-16 66.70Gi ± 2% 96.39Gi ± 2% +44.52% (p=0.000 n=10) IndexByte/4M-16 113.3Gi ± 1% 115.5Gi ± 2% +1.91% (p=0.023 n=10) IndexByte/64M-16 41.85Gi ± 2% 43.10Gi ± 2% +2.98% (p=0.000 n=10) IndexBytePortable/10-16 2.936Gi ± 4% 2.945Gi ± 2% ~ (p=0.684 n=10) IndexBytePortable/32-16 3.521Gi ± 2% 3.559Gi ± 3% ~ (p=0.631 n=10) IndexBytePortable/4K-16 4.477Gi ± 1% 4.506Gi ± 3% ~ (p=0.971 n=10) IndexBytePortable/4M-16 4.499Gi ± 2% 4.560Gi ± 2% ~ (p=0.393 n=10) IndexBytePortable/64M-16 4.525Gi ± 2% 4.504Gi ± 3% ~ (p=0.684 n=10) geomean 10.04Gi 10.53Gi +4.86% --- src/internal/bytealg/indexbyte_amd64.s | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s index 0f07121a8c..c097dc6b07 100644 --- a/src/internal/bytealg/indexbyte_amd64.s +++ b/src/internal/bytealg/indexbyte_amd64.s @@ -45,6 +45,7 @@ sse: LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes JMP sseloopentry + PCALIGN $16 sseloop: // Move the next 16-byte chunk of the data into X1. MOVOU (DI), X1 @@ -124,6 +125,8 @@ avx2: MOVD AX, X0 LEAQ -32(SI)(BX*1), R11 VPBROADCASTB X0, Y1 + + PCALIGN $32 avx2_loop: VMOVDQU (DI), Y2 VPCMPEQB Y1, Y2, Y3