
runtime: use ERMS in memclr_amd64

This patch adds support for REP STOSB in memclr(). The new
implementation uses REP STOSB when 1) ERMS is supported and
2) the size is larger than 2KB and smaller than 32MB.

The threshold of 2KB is chosen based on benchmark results and is
close to what Intel reports in its comparison of ERMSB and 128-bit
AVX (Table 3-4, "Relative Performance of Memcpy() Using ERMSB Vs.
128-bit AVX", in the Intel Optimization Guide).

While REP STOS uses a no-RFO write protocol, ERMS can show the same
or slower performance compared to Non-Temporal Stores when the size
is bigger than the LLC, depending on the hardware.
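
As a rough Go sketch of the resulting dispatch (illustrative only; the
real code is assembly and also has dedicated paths for clears of 256
bytes and below, and the names chooseClearStrategy/ermsMin/ntsMin are
made up for this example):

package main

import "fmt"

// chooseClearStrategy mirrors the selection described above: REP STOSB
// only pays off past ~2KB, and past ~32MB non-temporal stores win.
func chooseClearStrategy(size uintptr, hasERMS, hasAVX2 bool) string {
	const (
		ermsMin = 2 << 10 // 2KB: below this, REP STOSB start-up cost dominates
		ntsMin  = 1 << 25 // 32MB (0x2000000): above this, prefer non-temporal stores
	)
	switch {
	case size >= ntsMin && hasAVX2:
		return "MOVNTDQ non-temporal stores"
	case size >= ermsMin && hasERMS:
		return "REP STOSB (ERMS)"
	case hasAVX2:
		return "AVX2 VMOVDQU loop"
	default:
		return "SSE loop"
	}
}

func main() {
	for _, n := range []uintptr{256, 4 << 10, 1 << 20, 64 << 20} {
		fmt.Printf("%8d bytes -> %s\n", n, chooseClearStrategy(n, true, true))
	}
}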

Benchmarks (including MemclrRange from CL373362)
goos: darwin
goarch: amd64
pkg: runtime
cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
name                           old time/op    new time/op    delta
Memclr/5-12                      1.90ns ± 2%    2.13ns ± 2%  +11.72%  (p=0.001 n=7+7)
Memclr/16-12                     2.33ns ± 4%    2.36ns ± 4%     ~     (p=0.259 n=7+7)
Memclr/64-12                     2.58ns ± 2%    2.61ns ± 3%     ~     (p=0.091 n=7+7)
Memclr/256-12                    4.89ns ± 4%    4.94ns ± 3%     ~     (p=0.620 n=7+7)
Memclr/4096-12                   38.4ns ± 2%    39.5ns ± 5%     ~     (p=0.078 n=7+7)
Memclr/65536-12                   929ns ± 2%    1040ns ±19%     ~     (p=0.268 n=5+7)
Memclr/1M-12                     24.2µs ± 5%    19.0µs ± 9%  -21.62%  (p=0.001 n=7+7)
Memclr/4M-12                     93.3µs ± 3%    73.2µs ± 4%  -21.50%  (p=0.001 n=7+7)
Memclr/8M-12                      209µs ± 6%     164µs ± 3%  -21.55%  (p=0.001 n=7+7)
Memclr/16M-12                     731µs ± 4%     507µs ± 6%  -30.71%  (p=0.001 n=7+7)
Memclr/64M-12                    1.79ms ± 1%    1.83ms ± 3%   +2.47%  (p=0.041 n=6+6)
MemclrRange/1_2_47K-12            873ns ± 3%     899ns ± 5%     ~     (p=0.053 n=7+7)
MemclrRange/2_8_166K-12          2.98µs ± 4%    2.90µs ± 5%     ~     (p=0.165 n=7+7)
MemclrRange/4_16_315K-12         6.81µs ± 4%    5.31µs ± 9%  -22.01%  (p=0.001 n=7+7)
MemclrRange/128_256_1623K-12     37.5µs ± 4%    28.1µs ± 4%  -25.19%  (p=0.001 n=7+6)
[Geo mean]                       1.56µs         1.43µs        -8.43%

name                           old speed      new speed      delta
Memclr/5-12                    2.63GB/s ± 2%  2.35GB/s ± 2%  -10.50%  (p=0.001 n=7+7)
Memclr/16-12                   6.86GB/s ± 4%  6.79GB/s ± 4%     ~     (p=0.259 n=7+7)
Memclr/64-12                   24.8GB/s ± 2%  24.5GB/s ± 3%     ~     (p=0.097 n=7+7)
Memclr/256-12                  52.4GB/s ± 4%  51.9GB/s ± 3%     ~     (p=0.620 n=7+7)
Memclr/4096-12                  107GB/s ± 2%   104GB/s ± 5%     ~     (p=0.073 n=7+7)
Memclr/65536-12                70.6GB/s ± 2%  64.2GB/s ±21%     ~     (p=0.268 n=5+7)
Memclr/1M-12                   43.4GB/s ± 5%  55.5GB/s ±10%  +28.04%  (p=0.001 n=7+7)
Memclr/4M-12                   45.0GB/s ± 4%  57.3GB/s ± 4%  +27.38%  (p=0.001 n=7+7)
Memclr/8M-12                   40.1GB/s ± 5%  51.1GB/s ± 3%  +27.37%  (p=0.001 n=7+7)
Memclr/16M-12                  23.0GB/s ± 4%  33.1GB/s ± 6%  +44.39%  (p=0.001 n=7+7)
Memclr/64M-12                  37.6GB/s ± 1%  36.7GB/s ± 3%   -2.38%  (p=0.041 n=6+6)
MemclrRange/1_2_47K-12         55.9GB/s ± 3%  54.3GB/s ± 5%     ~     (p=0.053 n=7+7)
MemclrRange/2_8_166K-12        57.4GB/s ± 5%  58.9GB/s ± 5%     ~     (p=0.165 n=7+7)
MemclrRange/4_16_315K-12       47.4GB/s ± 4%  60.9GB/s ± 9%  +28.40%  (p=0.001 n=7+7)
MemclrRange/128_256_1623K-12   44.3GB/s ± 4%  58.4GB/s ± 9%  +31.73%  (p=0.001 n=7+7)
[Geo mean]                     33.6GB/s       36.8GB/s        +9.27%

goos: linux
goarch: amd64
pkg: runtime
cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz
name                     old time/op    new time/op     delta
Memclr/5-2                 2.53ns ± 0%     2.52ns ± 0%   -0.25%  (p=0.001 n=7+7)
Memclr/16-2                2.77ns ± 0%     2.55ns ± 0%   -7.97%  (p=0.000 n=5+7)
Memclr/64-2                3.16ns ± 0%     3.16ns ± 0%     ~     (p=0.432 n=7+7)
Memclr/256-2               7.26ns ± 0%     7.26ns ± 0%     ~     (p=0.220 n=7+7)
Memclr/4096-2              49.3ns ± 0%     43.5ns ± 0%  -11.80%  (p=0.001 n=7+7)
Memclr/65536-2             1.32µs ± 1%     1.24µs ± 0%   -6.31%  (p=0.001 n=7+7)
Memclr/1M-2                27.3µs ± 0%     26.6µs ± 5%     ~     (p=0.195 n=7+7)
Memclr/4M-2                 195µs ± 0%      148µs ± 4%  -24.22%  (p=0.001 n=7+7)
Memclr/8M-2                 391µs ± 0%      308µs ± 0%  -21.09%  (p=0.001 n=7+6)
Memclr/16M-2                782µs ± 0%      639µs ± 1%  -18.31%  (p=0.001 n=7+7)
Memclr/64M-2               2.83ms ± 1%     2.84ms ± 1%     ~     (p=0.620 n=7+7)
MemclrRange/1K_2K-2        1.24µs ± 0%     1.24µs ± 0%     ~     (p=1.000 n=7+6)
MemclrRange/2K_8K-2        3.89µs ± 0%     3.11µs ± 0%  -20.00%  (p=0.001 n=6+7)
MemclrRange/4K_16K-2       3.63µs ± 0%     2.37µs ± 0%  -34.61%  (p=0.001 n=7+7)
MemclrRange/160K_228K-2    31.0µs ± 0%     30.6µs ± 1%   -1.50%  (p=0.001 n=7+7)
[Geo mean]                 1.97µs          1.76µs       -10.59%

name                     old speed      new speed       delta
Memclr/5-2               1.98GB/s ± 0%   1.98GB/s ± 0%   +0.27%  (p=0.001 n=7+7)
Memclr/16-2              5.78GB/s ± 0%   6.28GB/s ± 0%   +8.67%  (p=0.001 n=7+7)
Memclr/64-2              20.2GB/s ± 0%   20.3GB/s ± 0%     ~     (p=0.535 n=7+7)
Memclr/256-2             35.3GB/s ± 0%   35.2GB/s ± 0%     ~     (p=0.259 n=7+7)
Memclr/4096-2            83.1GB/s ± 0%   94.2GB/s ± 0%  +13.39%  (p=0.001 n=7+7)
Memclr/65536-2           49.7GB/s ± 1%   53.0GB/s ± 0%   +6.73%  (p=0.001 n=7+7)
Memclr/1M-2              38.4GB/s ± 0%   39.4GB/s ± 4%     ~     (p=0.209 n=7+7)
Memclr/4M-2              21.5GB/s ± 0%   28.4GB/s ± 4%  +32.02%  (p=0.001 n=7+7)
Memclr/8M-2              21.5GB/s ± 0%   27.2GB/s ± 0%  +26.73%  (p=0.001 n=7+6)
Memclr/16M-2             21.4GB/s ± 0%   26.2GB/s ± 1%  +22.42%  (p=0.001 n=7+7)
Memclr/64M-2             23.7GB/s ± 1%   23.7GB/s ± 1%     ~     (p=0.620 n=7+7)
MemclrRange/1K_2K-2      77.3GB/s ± 0%   77.3GB/s ± 0%     ~     (p=0.710 n=7+7)
MemclrRange/2K_8K-2      85.7GB/s ± 0%  107.1GB/s ± 0%  +25.00%  (p=0.001 n=6+7)
MemclrRange/4K_16K-2     89.0GB/s ± 0%  136.1GB/s ± 0%  +52.92%  (p=0.001 n=7+7)
MemclrRange/160K_228K-2  53.6GB/s ± 0%   54.4GB/s ± 1%   +1.52%  (p=0.001 n=7+7)
[Geo mean]               29.2GB/s        32.7GB/s       +11.86%
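
The numbers above come from the runtime package's own Memclr benchmarks.
A minimal standalone approximation (benchmark name and size list made up
for illustration) relies on the compiler lowering the range-clear idiom
below to a single bulk memory clear:

package memclr_test

import (
	"fmt"
	"testing"
)

// BenchmarkClear reports MB/s for clearing a []byte of each size; the
// inner range loop is compiled into one bulk memory-clear call, so the
// throughput tracks memclr performance.
func BenchmarkClear(b *testing.B) {
	for _, n := range []int{256, 4096, 65536, 1 << 20, 16 << 20} {
		buf := make([]byte, n)
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				for j := range buf {
					buf[j] = 0
				}
			}
		})
	}
}

Running it with go test -bench=Clear -count=7 on the old and new
toolchains and comparing with benchstat gives tables like the ones above.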

Change-Id: I8f3533f88ebd303ae1666a77391fec304bea9724
Reviewed-on: https://go-review.googlesource.com/c/go/+/374396
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
nimelehin 2021-12-24 15:29:18 +03:00 committed by Gopher Robot
parent 12eedc00b8
commit dcfe57b8c2

@@ -41,9 +41,20 @@ tail:
CMPQ BX, $256
JBE _129through256
CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
JNE skip_erms
// If the size is less than 2KB, do not use ERMS as it has a big start-up cost.
// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
// in the Intel Optimization Guide shows better performance for ERMSB starting
// from 2KB. Benchmarks show a similar threshold for REP STOS vs AVX.
CMPQ BX, $2048
JAE loop_preheader_erms
skip_erms:
#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JE loop_preheader_avx2
// TODO: for really big clears, use MOVNTDQ, even without AVX2.
loop:
@@ -71,12 +82,13 @@ loop:
#endif
loop_preheader_avx2:
VPXOR Y0, Y0, Y0
VPXOR X0, X0, X0
// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
// For larger sizes it is always faster, even on dual Xeons with 30M cache.
// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
CMPQ BX, $0x2000000
JAE loop_preheader_avx2_huge
loop_avx2:
VMOVDQU Y0, 0(DI)
VMOVDQU Y0, 32(DI)
@@ -92,6 +104,25 @@ loop_avx2:
VMOVDQU Y0, -128(DI)(BX*1)
VZEROUPPER
RET
loop_preheader_erms:
#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JNE loop_erms
#endif
VPXOR X0, X0, X0
// At this point both ERMS and AVX2 are supported. While REP STOS can use a no-RFO
// write protocol, ERMS could show the same or slower performance compared to
// Non-Temporal Stores when the size is bigger than the LLC, depending on hardware.
CMPQ BX, $0x2000000
JAE loop_preheader_avx2_huge
loop_erms:
MOVQ BX, CX
REP; STOSB
RET
loop_preheader_avx2_huge:
// Align to 32 byte boundary
VMOVDQU Y0, 0(DI)
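
For reference, the ERMS and AVX2 bits that the assembly reads from
internal∕cpu can be checked from ordinary Go code via golang.org/x/sys/cpu
(a sketch, assuming that package's HasERMS and HasAVX2 flags):

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// Prints whether this CPU advertises the features gating the REP STOSB
// and non-temporal paths in memclr_amd64.s.
func main() {
	fmt.Println("ERMS (enhanced REP MOVSB/STOSB):", cpu.X86.HasERMS)
	fmt.Println("AVX2:", cpu.X86.HasAVX2)
}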