1
0
mirror of https://github.com/golang/go synced 2024-09-23 09:33:31 -06:00

cmd/compile/internal/ssa: optimize rules Zero and Move on loong64

goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
             │  old.bench   │              new.bench               │
             │    sec/op    │    sec/op     vs base                │
ClearFat7      3.6020n ± 0%   0.5087n ± 1%  -85.88% (p=0.000 n=20)
ClearFat8      0.5137n ± 0%   0.8004n ± 0%  +55.81% (p=0.000 n=20)
ClearFat11     5.2030n ± 0%   0.5082n ± 1%  -90.23% (p=0.000 n=20)
ClearFat12     0.8244n ± 0%   0.8004n ± 0%   -2.91% (p=0.000 n=20)
ClearFat13     6.0030n ± 0%   0.5077n ± 1%  -91.54% (p=0.000 n=20)
ClearFat14     6.4030n ± 0%   0.8004n ± 0%  -87.50% (p=0.000 n=20)
ClearFat15     6.8030n ± 0%   0.5065n ± 1%  -92.55% (p=0.000 n=20)
ClearFat16     2.4010n ± 0%   0.8004n ± 0%  -66.66% (p=0.000 n=20)
ClearFat24      3.202n ± 0%    1.601n ± 0%  -50.00% (p=0.000 n=20)
ClearFat32      4.002n ± 0%    2.001n ± 0%  -50.00% (p=0.000 n=20)
ClearFat40      4.802n ± 0%    1.601n ± 0%  -66.66% (p=0.000 n=20)
ClearFat48      5.603n ± 0%    2.001n ± 0%  -64.29% (p=0.000 n=20)
ClearFat56      6.403n ± 0%    2.001n ± 0%  -68.75% (p=0.000 n=20)
ClearFat64      7.204n ± 0%    2.401n ± 0%  -66.67% (p=0.000 n=20)
ClearFat72      8.004n ± 0%    2.001n ± 0%  -75.00% (p=0.000 n=20)
ClearFat128    14.010n ± 0%    3.218n ± 0%  -77.03% (p=0.000 n=20)
ClearFat256    26.810n ± 0%    6.727n ± 0%  -74.91% (p=0.000 n=20)
ClearFat512     52.43n ± 0%    16.40n ± 0%  -68.72% (p=0.000 n=20)
ClearFat1024   103.65n ± 0%    37.49n ± 0%  -63.83% (p=0.000 n=20)
ClearFat1032   104.50n ± 0%    52.83n ± 0%  -49.44% (p=0.000 n=20)
ClearFat1040   105.30n ± 0%    53.23n ± 0%  -49.45% (p=0.000 n=20)
CopyFat7       6.0030n ± 0%   0.6048n ± 0%  -89.93% (p=0.000 n=20)
CopyFat8       0.8004n ± 0%   0.5974n ± 0%  -25.37% (p=0.000 n=20)
CopyFat11      9.2050n ± 0%   0.6057n ± 0%  -93.42% (p=0.000 n=20)
CopyFat12      0.8103n ± 0%   0.6064n ± 0%  -25.16% (p=0.000 n=20)
CopyFat13      6.4030n ± 0%   0.6052n ± 0%  -90.55% (p=0.000 n=20)
CopyFat14      6.8040n ± 0%   0.6064n ± 0%  -91.09% (p=0.000 n=20)
CopyFat15      7.2040n ± 0%   0.6071n ± 0%  -91.57% (p=0.000 n=20)
CopyFat16      2.8010n ± 0%   0.6064n ± 0%  -78.35% (p=0.000 n=20)
CopyFat24       3.602n ± 0%    2.001n ± 0%  -44.45% (p=0.000 n=20)
CopyFat32       4.402n ± 0%    2.001n ± 0%  -54.54% (p=0.000 n=20)
CopyFat64       7.604n ± 0%    2.802n ± 0%  -63.15% (p=0.000 n=20)
CopyFat72       8.405n ± 0%    3.202n ± 0%  -61.90% (p=0.000 n=20)
CopyFat128     14.410n ± 0%    5.480n ± 0%  -61.97% (p=0.000 n=20)
CopyFat256      28.57n ± 0%    12.16n ± 0%  -57.44% (p=0.000 n=20)
CopyFat512      63.63n ± 0%    24.88n ± 0%  -60.90% (p=0.000 n=20)
CopyFat520      67.23n ± 0%    24.11n ± 0%  -64.14% (p=0.000 n=20)
CopyFat1024    125.00n ± 0%    50.60n ± 0%  -59.52% (p=0.000 n=20)
CopyFat1032    121.30n ± 0%    64.32n ± 0%  -46.97% (p=0.000 n=20)
CopyFat1040    124.50n ± 0%    67.23n ± 0%  -46.00% (p=0.000 n=20)
geomean         9.539n         2.779n       -70.87%

Change-Id: Ic04e5f849f20ec3ec748d6763d4c9f8a1f21ee49
Reviewed-on: https://go-review.googlesource.com/c/go/+/592115
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Tim King <taking@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
This commit is contained in:
Xiaolin Zhao 2024-06-12 14:27:34 +08:00 committed by abner chenc
parent a9bd84e037
commit 9b88f58099
2 changed files with 601 additions and 666 deletions

View File

@ -260,136 +260,137 @@
// zeroing
(Zero [0] _ mem) => mem
(Zero [1] ptr mem) => (MOVBstore ptr (MOVVconst [0]) mem)
(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
(MOVHstore ptr (MOVVconst [0]) mem)
(Zero [2] ptr mem) =>
(MOVBstore [1] ptr (MOVVconst [0])
(MOVBstore [0] ptr (MOVVconst [0]) mem))
(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
(MOVWstore ptr (MOVVconst [0]) mem)
(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
(MOVHstore [2] ptr (MOVVconst [0])
(MOVHstore [0] ptr (MOVVconst [0]) mem))
(Zero [4] ptr mem) =>
(MOVBstore [3] ptr (MOVVconst [0])
(MOVBstore [2] ptr (MOVVconst [0])
(MOVBstore [1] ptr (MOVVconst [0])
(MOVBstore [0] ptr (MOVVconst [0]) mem))))
(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 =>
(MOVVstore ptr (MOVVconst [0]) mem)
(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 =>
(MOVWstore [4] ptr (MOVVconst [0])
(MOVWstore [0] ptr (MOVVconst [0]) mem))
(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 =>
(MOVHstore [6] ptr (MOVVconst [0])
(MOVHstore [4] ptr (MOVVconst [0])
(MOVHstore [2] ptr (MOVVconst [0])
(MOVHstore [0] ptr (MOVVconst [0]) mem))))
(Zero [2] ptr mem) => (MOVHstore ptr (MOVVconst [0]) mem)
(Zero [3] ptr mem) =>
(MOVBstore [2] ptr (MOVVconst [0])
(MOVBstore [1] ptr (MOVVconst [0])
(MOVBstore [0] ptr (MOVVconst [0]) mem)))
(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 =>
(MOVHstore [4] ptr (MOVVconst [0])
(MOVHstore [2] ptr (MOVVconst [0])
(MOVHstore [0] ptr (MOVVconst [0]) mem)))
(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
(MOVWstore [8] ptr (MOVVconst [0])
(MOVWstore [4] ptr (MOVVconst [0])
(MOVWstore [0] ptr (MOVVconst [0]) mem)))
(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
(MOVVstore [8] ptr (MOVVconst [0])
(MOVVstore [0] ptr (MOVVconst [0]) mem))
(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
(MOVVstore [16] ptr (MOVVconst [0])
(MOVVstore [8] ptr (MOVVconst [0])
(MOVVstore [0] ptr (MOVVconst [0]) mem)))
(MOVBstore [2] ptr (MOVVconst [0])
(MOVHstore ptr (MOVVconst [0]) mem))
(Zero [4] {t} ptr mem) => (MOVWstore ptr (MOVVconst [0]) mem)
(Zero [5] ptr mem) =>
(MOVBstore [4] ptr (MOVVconst [0])
(MOVWstore ptr (MOVVconst [0]) mem))
(Zero [6] ptr mem) =>
(MOVHstore [4] ptr (MOVVconst [0])
(MOVWstore ptr (MOVVconst [0]) mem))
(Zero [7] ptr mem) =>
(MOVWstore [3] ptr (MOVVconst [0])
(MOVWstore ptr (MOVVconst [0]) mem))
(Zero [8] {t} ptr mem) => (MOVVstore ptr (MOVVconst [0]) mem)
(Zero [9] ptr mem) =>
(MOVBstore [8] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
(Zero [10] ptr mem) =>
(MOVHstore [8] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
(Zero [11] ptr mem) =>
(MOVWstore [7] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
(Zero [12] ptr mem) =>
(MOVWstore [8] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
(Zero [13] ptr mem) =>
(MOVVstore [5] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
(Zero [14] ptr mem) =>
(MOVVstore [6] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
(Zero [15] ptr mem) =>
(MOVVstore [7] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
(Zero [16] ptr mem) =>
(MOVVstore [8] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
// strip off fractional word zeroing
(Zero [s] ptr mem) && s%8 != 0 && s > 16 =>
(Zero [s%8]
(OffPtr <ptr.Type> ptr [s-s%8])
(Zero [s-s%8] ptr mem))
// medium zeroing uses a duff device
// 8 and 128 are magic constants; see runtime/mkduff.go
(Zero [s] {t} ptr mem)
&& s%8 == 0 && s > 24 && s <= 8*128
&& t.Alignment()%8 == 0 && !config.noDuffDevice =>
(DUFFZERO [8 * (128 - s/8)] ptr mem)
(Zero [s] ptr mem)
&& s%8 == 0 && s > 16 && s <= 8*128
&& !config.noDuffDevice =>
(DUFFZERO [8 * (128 - s/8)] ptr mem)
// large or unaligned zeroing uses a loop
(Zero [s] {t} ptr mem)
&& (s > 8*128 || config.noDuffDevice) || t.Alignment()%8 != 0 =>
(LoweredZero [t.Alignment()]
ptr
(ADDVconst <ptr.Type> ptr [s-moveSize(t.Alignment(), config)])
mem)
// large zeroing uses a loop
(Zero [s] ptr mem)
&& s%8 == 0 && s > 8*128 =>
(LoweredZero
ptr
(ADDVconst <ptr.Type> ptr [s-8])
mem)
// moves
(Move [0] _ _ mem) => mem
(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
(MOVHstore dst (MOVHload src mem) mem)
(Move [2] dst src mem) =>
(MOVBstore [1] dst (MOVBload [1] src mem)
(MOVBstore dst (MOVBload src mem) mem))
(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
(MOVWstore dst (MOVWload src mem) mem)
(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
(MOVHstore [2] dst (MOVHload [2] src mem)
(MOVHstore dst (MOVHload src mem) mem))
(Move [4] dst src mem) =>
(MOVBstore [3] dst (MOVBload [3] src mem)
(MOVBstore [2] dst (MOVBload [2] src mem)
(MOVBstore [1] dst (MOVBload [1] src mem)
(MOVBstore dst (MOVBload src mem) mem))))
(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 =>
(MOVVstore dst (MOVVload src mem) mem)
(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
(MOVWstore [4] dst (MOVWload [4] src mem)
(MOVWstore dst (MOVWload src mem) mem))
(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 =>
(MOVHstore [6] dst (MOVHload [6] src mem)
(MOVHstore [4] dst (MOVHload [4] src mem)
(MOVHstore [2] dst (MOVHload [2] src mem)
(MOVHstore dst (MOVHload src mem) mem))))
(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem)
(Move [2] dst src mem) => (MOVHstore dst (MOVHUload src mem) mem)
(Move [3] dst src mem) =>
(MOVBstore [2] dst (MOVBload [2] src mem)
(MOVBstore [1] dst (MOVBload [1] src mem)
(MOVBstore dst (MOVBload src mem) mem)))
(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 =>
(MOVHstore [4] dst (MOVHload [4] src mem)
(MOVHstore [2] dst (MOVHload [2] src mem)
(MOVHstore dst (MOVHload src mem) mem)))
(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
(MOVWstore [8] dst (MOVWload [8] src mem)
(MOVWstore [4] dst (MOVWload [4] src mem)
(MOVWstore dst (MOVWload src mem) mem)))
(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
(MOVVstore [8] dst (MOVVload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem))
(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
(MOVVstore [16] dst (MOVVload [16] src mem)
(MOVVstore [8] dst (MOVVload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem)))
(MOVBstore [2] dst (MOVBUload [2] src mem)
(MOVHstore dst (MOVHUload src mem) mem))
(Move [4] dst src mem) => (MOVWstore dst (MOVWUload src mem) mem)
(Move [5] dst src mem) =>
(MOVBstore [4] dst (MOVBUload [4] src mem)
(MOVWstore dst (MOVWUload src mem) mem))
(Move [6] dst src mem) =>
(MOVHstore [4] dst (MOVHUload [4] src mem)
(MOVWstore dst (MOVWUload src mem) mem))
(Move [7] dst src mem) =>
(MOVWstore [3] dst (MOVWUload [3] src mem)
(MOVWstore dst (MOVWUload src mem) mem))
(Move [8] dst src mem) => (MOVVstore dst (MOVVload src mem) mem)
(Move [9] dst src mem) =>
(MOVBstore [8] dst (MOVBUload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem))
(Move [10] dst src mem) =>
(MOVHstore [8] dst (MOVHUload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem))
(Move [11] dst src mem) =>
(MOVWstore [7] dst (MOVWload [7] src mem)
(MOVVstore dst (MOVVload src mem) mem))
(Move [12] dst src mem) =>
(MOVWstore [8] dst (MOVWUload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem))
(Move [13] dst src mem) =>
(MOVVstore [5] dst (MOVVload [5] src mem)
(MOVVstore dst (MOVVload src mem) mem))
(Move [14] dst src mem) =>
(MOVVstore [6] dst (MOVVload [6] src mem)
(MOVVstore dst (MOVVload src mem) mem))
(Move [15] dst src mem) =>
(MOVVstore [7] dst (MOVVload [7] src mem)
(MOVVstore dst (MOVVload src mem) mem))
(Move [16] dst src mem) =>
(MOVVstore [8] dst (MOVVload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem))
// strip off fractional word move
(Move [s] dst src mem) && s%8 != 0 && s > 16 =>
(Move [s%8]
(OffPtr <dst.Type> dst [s-s%8])
(OffPtr <src.Type> src [s-s%8])
(Move [s-s%8] dst src mem))
// medium move uses a duff device
(Move [s] {t} dst src mem)
&& s%8 == 0 && s >= 24 && s <= 8*128 && t.Alignment()%8 == 0
&& !config.noDuffDevice && logLargeCopy(v, s) =>
(DUFFCOPY [16 * (128 - s/8)] dst src mem)
(Move [s] dst src mem)
&& s%8 == 0 && s > 16 && s <= 8*128
&& !config.noDuffDevice && logLargeCopy(v, s) =>
(DUFFCOPY [16 * (128 - s/8)] dst src mem)
// 16 and 128 are magic constants. 16 is the number of bytes to encode:
// MOVV (R1), R23
// ADDV $8, R1
// MOVV R23, (R2)
// ADDV $8, R2
// and 128 is the number of such blocks. See runtime/duff_mips64.s:duffcopy.
// MOVV (R20), R30
// ADDV $8, R20
// MOVV R30, (R21)
// ADDV $8, R21
// and 128 is the number of such blocks. See runtime/duff_loong64.s:duffcopy.
// large move uses a loop
(Move [s] dst src mem)
&& s%8 == 0 && s > 1024 && logLargeCopy(v, s) =>
(LoweredMove
dst
src
(ADDVconst <src.Type> src [s-8])
mem)
// large or unaligned move uses a loop
(Move [s] {t} dst src mem)
&& s > 24 && logLargeCopy(v, s) || t.Alignment()%8 != 0 =>
(LoweredMove [t.Alignment()]
dst
src
(ADDVconst <src.Type> src [s-moveSize(t.Alignment(), config)])
mem)
// calls
(StaticCall ...) => (CALLstatic ...)

File diff suppressed because it is too large. Load Diff