1
0
mirror of https://github.com/golang/go synced 2024-11-25 00:07:56 -07:00

cmd/compile: add loong64-specific inlining for runtime.memmove

goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                                 |   bench.old   |               bench.new                |
                                 |    sec/op     |    sec/op     vs base                  |
Memmove/0                          0.8004n ±  0%   0.4002n ± 0%  -50.00% (p=0.000 n=20)
Memmove/1                           2.494n ±  0%    2.136n ± 0%  -14.35% (p=0.000 n=20)
Memmove/2                           2.802n ±  0%    2.512n ± 0%  -10.35% (p=0.000 n=20)
Memmove/3                           2.802n ±  0%    2.497n ± 0%  -10.92% (p=0.000 n=20)
Memmove/4                           3.202n ±  0%    2.808n ± 0%  -12.30% (p=0.000 n=20)
Memmove/5                           2.821n ±  0%    2.658n ± 0%   -5.76% (p=0.000 n=20)
Memmove/6                           2.819n ±  0%    2.657n ± 0%   -5.73% (p=0.000 n=20)
Memmove/7                           2.820n ±  0%    2.654n ± 0%   -5.87% (p=0.000 n=20)
Memmove/8                           3.202n ±  0%    2.814n ± 0%  -12.12% (p=0.000 n=20)
Memmove/9                           3.202n ±  0%    3.009n ± 0%   -6.03% (p=0.000 n=20)
Memmove/10                          3.202n ±  0%    3.009n ± 0%   -6.03% (p=0.000 n=20)
Memmove/11                          3.202n ±  0%    3.009n ± 0%   -6.03% (p=0.000 n=20)
Memmove/12                          3.202n ±  0%    3.010n ± 0%   -6.01% (p=0.000 n=20)
Memmove/13                          3.202n ±  0%    3.009n ± 0%   -6.03% (p=0.000 n=20)
Memmove/14                          3.202n ±  0%    3.009n ± 0%   -6.03% (p=0.000 n=20)
Memmove/15                          3.202n ±  0%    3.010n ± 0%   -6.01% (p=0.000 n=20)
Memmove/16                          3.202n ±  0%    3.009n ± 0%   -6.03% (p=0.000 n=20)
Memmove/32                          3.602n ±  0%    3.603n ± 0%   +0.03% (p=0.000 n=20)
Memmove/64                          4.202n ±  0%    4.204n ± 0%   +0.05% (p=0.000 n=20)
Memmove/128                         8.005n ±  0%    8.007n ± 0%   +0.02% (p=0.000 n=20)
Memmove/256                         11.21n ±  0%    10.81n ± 0%   -3.57% (p=0.000 n=20)
Memmove/512                         17.65n ±  0%    17.96n ± 0%   +1.73% (p=0.000 n=20)
Memmove/1024                        30.48n ±  0%    30.46n ± 0%   -0.07% (p=0.000 n=20)
Memmove/2048                        56.43n ±  0%    56.30n ± 0%   -0.24% (p=0.000 n=20)
Memmove/4096                        107.7n ±  0%    107.6n ± 0%   -0.09% (p=0.000 n=20)
MemmoveOverlap/32                   4.002n ±  0%    4.003n ± 0%   +0.02% (p=0.002 n=20)
MemmoveOverlap/64                   4.603n ±  0%    4.603n ± 0%        ~ (p=0.286 n=20)
MemmoveOverlap/128                  8.704n ±  0%    8.699n ± 0%        ~ (p=0.180 n=20)
MemmoveOverlap/256                  12.01n ±  0%    11.76n ± 0%   -2.08% (p=0.000 n=20)
MemmoveOverlap/512                  18.42n ±  0%    18.36n ± 0%   -0.33% (p=0.000 n=20)
MemmoveOverlap/1024                 31.23n ±  0%    31.16n ± 0%   -0.21% (p=0.000 n=20)
MemmoveOverlap/2048                 57.42n ±  0%    56.82n ± 0%   -1.04% (p=0.000 n=20)
MemmoveOverlap/4096                 108.5n ±  0%    108.0n ± 0%   -0.46% (p=0.000 n=20)
MemmoveUnalignedDst/0               2.804n ±  0%    2.447n ± 0%  -12.70% (p=0.000 n=20)
MemmoveUnalignedDst/1               2.802n ±  0%    2.491n ± 0%  -11.12% (p=0.000 n=20)
MemmoveUnalignedDst/2               3.202n ±  0%    2.808n ± 0%  -12.29% (p=0.000 n=20)
MemmoveUnalignedDst/3               3.202n ±  0%    2.814n ± 0%  -12.12% (p=0.000 n=20)
MemmoveUnalignedDst/4               3.602n ±  0%    3.202n ± 0%  -11.10% (p=0.000 n=20)
MemmoveUnalignedDst/5               3.202n ±  0%    3.203n ± 0%   +0.03% (p=0.014 n=20)
MemmoveUnalignedDst/6               3.202n ±  0%    3.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedDst/7               3.202n ±  0%    3.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedDst/8               3.602n ±  0%    3.202n ± 0%  -11.10% (p=0.000 n=20)
MemmoveUnalignedDst/9               3.602n ±  0%    3.602n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedDst/10              3.602n ±  0%    3.602n ± 0%        ~ (p=0.091 n=20)
MemmoveUnalignedDst/11              3.602n ±  0%    3.602n ± 0%        ~ (p=0.613 n=20)
MemmoveUnalignedDst/12              3.602n ±  0%    3.602n ± 0%        ~ (p=0.165 n=20)
MemmoveUnalignedDst/13              3.602n ±  0%    3.602n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedDst/14              3.602n ±  0%    3.602n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedDst/15              3.602n ±  0%    3.602n ± 0%    0.00% (p=0.027 n=20)
MemmoveUnalignedDst/16              3.602n ±  0%    3.602n ± 0%        ~ (p=0.661 n=20)
MemmoveUnalignedDst/32              4.002n ±  0%    4.002n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedDst/64              6.804n ±  0%    6.804n ± 0%        ~ (p=0.204 n=20)
MemmoveUnalignedDst/128             12.61n ±  0%    12.61n ± 0%        ~ (p=1.000 n=20) ¹
MemmoveUnalignedDst/256             16.33n ±  2%    16.32n ± 2%        ~ (p=0.839 n=20)
MemmoveUnalignedDst/512             25.61n ±  0%    24.71n ± 0%   -3.51% (p=0.000 n=20)
MemmoveUnalignedDst/1024            42.81n ±  0%    42.82n ± 0%        ~ (p=0.973 n=20)
MemmoveUnalignedDst/2048            74.86n ±  0%    76.03n ± 0%   +1.56% (p=0.000 n=20)
MemmoveUnalignedDst/4096            152.0n ± 11%    152.0n ± 0%    0.00% (p=0.013 n=20)
MemmoveUnalignedDstOverlap/32       5.319n ±  0%    5.558n ± 1%   +4.50% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/64       8.006n ±  0%    8.025n ± 0%   +0.24% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/128      9.631n ±  0%    9.601n ± 0%   -0.31% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/256      13.79n ±  2%    13.58n ± 1%        ~ (p=0.234 n=20)
MemmoveUnalignedDstOverlap/512      21.38n ±  0%    21.30n ± 0%   -0.37% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/1024     41.71n ±  0%    41.70n ± 0%        ~ (p=0.887 n=20)
MemmoveUnalignedDstOverlap/2048     81.63n ±  0%    81.61n ± 0%        ~ (p=0.481 n=20)
MemmoveUnalignedDstOverlap/4096     162.6n ±  0%    162.6n ± 0%        ~ (p=0.171 n=20)
MemmoveUnalignedSrc/0               2.808n ±  0%    2.482n ± 0%  -11.61% (p=0.000 n=20)
MemmoveUnalignedSrc/1               2.804n ±  0%    2.577n ± 0%   -8.08% (p=0.000 n=20)
MemmoveUnalignedSrc/2               3.202n ±  0%    2.806n ± 0%  -12.37% (p=0.000 n=20)
MemmoveUnalignedSrc/3               3.202n ±  0%    2.808n ± 0%  -12.30% (p=0.000 n=20)
MemmoveUnalignedSrc/4               3.602n ±  0%    3.202n ± 0%  -11.10% (p=0.000 n=20)
MemmoveUnalignedSrc/5               3.202n ±  0%    3.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrc/6               3.202n ±  0%    3.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrc/7               3.202n ±  0%    3.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrc/8               3.602n ±  0%    3.202n ± 0%  -11.10% (p=0.000 n=20)
MemmoveUnalignedSrc/9               3.602n ±  0%    3.602n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrc/10              3.602n ±  0%    3.602n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrc/11              3.602n ±  0%    3.602n ± 0%        ~ (p=0.746 n=20)
MemmoveUnalignedSrc/12              3.602n ±  0%    3.602n ± 0%        ~ (p=0.407 n=20)
MemmoveUnalignedSrc/13              3.603n ±  0%    3.602n ± 0%   -0.03% (p=0.001 n=20)
MemmoveUnalignedSrc/14              3.603n ±  0%    3.602n ± 0%   -0.01% (p=0.013 n=20)
MemmoveUnalignedSrc/15              3.602n ±  0%    3.602n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrc/16              3.602n ±  0%    3.602n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrc/32              4.002n ±  0%    4.002n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrc/64              4.803n ±  0%    4.803n ± 0%    0.00% (p=0.008 n=20)
MemmoveUnalignedSrc/128             8.405n ±  0%    8.405n ± 0%    0.00% (p=0.003 n=20)
MemmoveUnalignedSrc/256             12.04n ±  3%    12.20n ± 2%        ~ (p=0.151 n=20)
MemmoveUnalignedSrc/512             19.11n ±  0%    19.10n ± 3%        ~ (p=0.621 n=20)
MemmoveUnalignedSrc/1024            35.62n ±  0%    35.62n ± 0%        ~ (p=0.407 n=20)
MemmoveUnalignedSrc/2048            68.04n ±  0%    68.35n ± 0%   +0.46% (p=0.000 n=20)
MemmoveUnalignedSrc/4096            133.2n ±  1%    133.3n ± 0%        ~ (p=0.131 n=20)
MemmoveUnalignedSrcDst/f_16_0       4.202n ±  0%    4.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/b_16_0       4.202n ±  0%    4.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/f_16_1       4.202n ±  0%    4.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/b_16_1       4.202n ±  0%    4.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/f_16_4       4.202n ±  0%    4.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/b_16_4       4.202n ±  0%    4.202n ± 0%        ~ (p=0.661 n=20)
MemmoveUnalignedSrcDst/f_16_7       4.202n ±  0%    4.202n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/b_16_7       4.203n ±  0%    4.202n ± 0%   -0.02% (p=0.008 n=20)
MemmoveUnalignedSrcDst/f_64_0       6.103n ±  0%    6.100n ± 0%        ~ (p=0.595 n=20)
MemmoveUnalignedSrcDst/b_64_0       6.103n ±  0%    6.102n ± 0%        ~ (p=0.973 n=20)
MemmoveUnalignedSrcDst/f_64_1       7.419n ±  0%    7.226n ± 0%   -2.59% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_1       6.745n ±  0%    6.941n ± 0%   +2.89% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_4       7.420n ±  0%    7.223n ± 0%   -2.65% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_4       6.753n ±  0%    6.941n ± 0%   +2.79% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_7       7.423n ±  0%    7.204n ± 0%   -2.96% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_7       6.750n ±  0%    6.941n ± 0%   +2.83% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_0      12.96n ±  0%    12.99n ± 0%   +0.27% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_256_0      12.91n ±  0%    12.94n ± 0%   +0.23% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_1      17.21n ±  0%    17.21n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/b_256_1      17.61n ±  0%    17.61n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/f_256_4      16.21n ±  0%    16.21n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/b_256_4      16.41n ±  0%    16.41n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/f_256_7      14.12n ±  0%    14.10n ± 0%        ~ (p=0.307 n=20)
MemmoveUnalignedSrcDst/b_256_7      14.81n ±  0%    14.81n ± 0%        ~ (p=1.000 n=20) ¹
MemmoveUnalignedSrcDst/f_4096_0     109.3n ±  0%    109.4n ± 0%   +0.09% (p=0.004 n=20)
MemmoveUnalignedSrcDst/b_4096_0     109.6n ±  0%    109.6n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/f_4096_1     113.5n ±  0%    113.5n ± 0%        ~ (p=1.000 n=20)
MemmoveUnalignedSrcDst/b_4096_1     113.7n ±  0%    113.7n ± 0%        ~ (p=1.000 n=20) ¹
MemmoveUnalignedSrcDst/f_4096_4     112.3n ±  0%    112.3n ± 0%        ~ (p=0.763 n=20)
MemmoveUnalignedSrcDst/b_4096_4     112.6n ±  0%    112.9n ± 1%   +0.31% (p=0.032 n=20)
MemmoveUnalignedSrcDst/f_4096_7     110.6n ±  0%    110.6n ± 0%        ~ (p=1.000 n=20) ¹
MemmoveUnalignedSrcDst/b_4096_7     111.1n ±  0%    111.1n ± 0%        ~ (p=1.000 n=20) ¹
MemmoveUnalignedSrcDst/f_65536_0    4.801µ ±  0%    4.818µ ± 0%   +0.34% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_0    5.027µ ±  0%    5.036µ ± 0%   +0.19% (p=0.007 n=20)
MemmoveUnalignedSrcDst/f_65536_1    4.815µ ±  0%    4.729µ ± 0%   -1.78% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_1    4.659µ ±  0%    4.737µ ± 1%   +1.69% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_4    4.807µ ±  0%    4.721µ ± 0%   -1.78% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_4    4.659µ ±  0%    4.601µ ± 0%   -1.23% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_7    4.868µ ±  0%    4.759µ ± 0%   -2.23% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_7    4.665µ ±  0%    4.709µ ± 0%   +0.93% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/32       6.804n ±  0%    6.810n ± 0%   +0.09% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/64       10.41n ±  0%    10.42n ± 0%   +0.10% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/128      11.59n ±  0%    11.58n ± 0%        ~ (p=0.414 n=20)
MemmoveUnalignedSrcOverlap/256      14.22n ±  0%    14.29n ± 0%   +0.46% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/512      23.11n ±  0%    23.04n ± 0%   -0.28% (p=0.001 n=20)
MemmoveUnalignedSrcOverlap/1024     41.44n ±  0%    41.47n ± 0%        ~ (p=0.693 n=20)
MemmoveUnalignedSrcOverlap/2048     81.25n ±  0%    81.25n ± 0%        ~ (p=0.405 n=20)
MemmoveUnalignedSrcOverlap/4096     166.1n ±  0%    166.1n ± 0%        ~ (p=0.451 n=20)
geomean                             13.02n          12.69n        -2.51%
¹ all samples are equal

Change-Id: I712adc7670f6ae360714ec5a770d00d76c8700ed
Reviewed-on: https://go-review.googlesource.com/c/go/+/618815
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
This commit is contained in:
Xiaolin Zhao 2024-10-09 15:42:23 +08:00 committed by abner chenc
parent 47a48ebf34
commit 5f88755f43
3 changed files with 49 additions and 0 deletions

View File

@ -796,3 +796,13 @@
(GTZ (MOVVconst [c]) yes no) && c <= 0 => (First no yes)
(GEZ (MOVVconst [c]) yes no) && c >= 0 => (First yes no)
(GEZ (MOVVconst [c]) yes no) && c < 0 => (First no yes)
// Arch-specific inlining for small or disjoint runtime.memmove
// Match post-lowering calls, register version: dst, src and the constant
// size (MOVVconst [sz]) appear directly as arguments of the CALLstatic,
// followed by the memory argument.
// The rewrite only fires for result 0 of the call, when sz is non-negative,
// the callee is runtime.memmove, the call has exactly one use, and
// isInlinableMemmove accepts (dst, src, sz, config). The SelectN is then
// replaced by (Move [sz] dst src mem).
(SelectN [0] call:(CALLstatic {sym} dst src (MOVVconst [sz]) mem))
&& sz >= 0
&& isSameCall(sym, "runtime.memmove")
&& call.Uses == 1
&& isInlinableMemmove(dst, src, sz, config)
&& clobber(call)
=> (Move [sz] dst src mem)

View File

@ -652,6 +652,8 @@ func rewriteValueLOONG64(v *Value) bool {
return rewriteValueLOONG64_OpSelect0(v)
case OpSelect1:
return rewriteValueLOONG64_OpSelect1(v)
case OpSelectN:
return rewriteValueLOONG64_OpSelectN(v)
case OpSignExt16to32:
v.Op = OpLOONG64MOVHreg
return true
@ -8933,6 +8935,40 @@ func rewriteValueLOONG64_OpSelect1(v *Value) bool {
}
return false
}
// rewriteValueLOONG64_OpSelectN applies the single LOONG64 rewrite for
// SelectN values and reports whether v was rewritten.
//
// match: (SelectN [0] call:(CALLstatic {sym} dst src (MOVVconst [sz]) mem))
// cond: sz >= 0 && isSameCall(sym, "runtime.memmove") && call.Uses == 1 && isInlinableMemmove(dst, src, sz, config) && clobber(call)
// result: (Move [sz] dst src mem)
func rewriteValueLOONG64_OpSelectN(v *Value) bool {
	memmoveCall := v.Args[0]
	cfg := v.Block.Func.Config

	// Only result index 0 of the call is a candidate.
	if auxIntToInt64(v.AuxInt) != 0 {
		return false
	}
	// The argument must be a static call with exactly four arguments:
	// dst, src, size and memory.
	if memmoveCall.Op != OpLOONG64CALLstatic || len(memmoveCall.Args) != 4 {
		return false
	}
	target := auxToCall(memmoveCall.Aux)
	dst, src, sizeArg, mem := memmoveCall.Args[0], memmoveCall.Args[1], memmoveCall.Args[2], memmoveCall.Args[3]
	// The size has to be a constant so it can become the Move's AuxInt.
	if sizeArg.Op != OpLOONG64MOVVconst {
		return false
	}
	n := auxIntToInt64(sizeArg.AuxInt)
	// Conditions are checked left to right with short-circuiting, so
	// clobber is only invoked once every other condition has held.
	if n < 0 || !isSameCall(target, "runtime.memmove") || memmoveCall.Uses != 1 || !isInlinableMemmove(dst, src, n, cfg) || !clobber(memmoveCall) {
		return false
	}
	v.reset(OpMove)
	v.AuxInt = int64ToAuxInt(n)
	v.AddArg3(dst, src, mem)
	return true
}
func rewriteValueLOONG64_OpSlicemask(v *Value) bool {
v_0 := v.Args[0]
b := v.Block

View File

@ -95,6 +95,7 @@ func moveArchLowering1(b []byte, x *[1]byte) {
_ = b[1]
// amd64:-".*memmove"
// arm64:-".*memmove"
// loong64:-".*memmove"
// ppc64x:-".*memmove"
copy(b, x[:])
}
@ -103,6 +104,7 @@ func moveArchLowering2(b []byte, x *[2]byte) {
_ = b[2]
// amd64:-".*memmove"
// arm64:-".*memmove"
// loong64:-".*memmove"
// ppc64x:-".*memmove"
copy(b, x[:])
}
@ -111,6 +113,7 @@ func moveArchLowering4(b []byte, x *[4]byte) {
_ = b[4]
// amd64:-".*memmove"
// arm64:-".*memmove"
// loong64:-".*memmove"
// ppc64x:-".*memmove"
copy(b, x[:])
}