mirror of
https://github.com/golang/go
synced 2024-11-25 00:07:56 -07:00
cmd/compile: add loong64-specific inlining for runtime.memmove
goos: linux goarch: loong64 pkg: runtime cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Memmove/0 0.8004n ± 0% 0.4002n ± 0% -50.00% (p=0.000 n=20) Memmove/1 2.494n ± 0% 2.136n ± 0% -14.35% (p=0.000 n=20) Memmove/2 2.802n ± 0% 2.512n ± 0% -10.35% (p=0.000 n=20) Memmove/3 2.802n ± 0% 2.497n ± 0% -10.92% (p=0.000 n=20) Memmove/4 3.202n ± 0% 2.808n ± 0% -12.30% (p=0.000 n=20) Memmove/5 2.821n ± 0% 2.658n ± 0% -5.76% (p=0.000 n=20) Memmove/6 2.819n ± 0% 2.657n ± 0% -5.73% (p=0.000 n=20) Memmove/7 2.820n ± 0% 2.654n ± 0% -5.87% (p=0.000 n=20) Memmove/8 3.202n ± 0% 2.814n ± 0% -12.12% (p=0.000 n=20) Memmove/9 3.202n ± 0% 3.009n ± 0% -6.03% (p=0.000 n=20) Memmove/10 3.202n ± 0% 3.009n ± 0% -6.03% (p=0.000 n=20) Memmove/11 3.202n ± 0% 3.009n ± 0% -6.03% (p=0.000 n=20) Memmove/12 3.202n ± 0% 3.010n ± 0% -6.01% (p=0.000 n=20) Memmove/13 3.202n ± 0% 3.009n ± 0% -6.03% (p=0.000 n=20) Memmove/14 3.202n ± 0% 3.009n ± 0% -6.03% (p=0.000 n=20) Memmove/15 3.202n ± 0% 3.010n ± 0% -6.01% (p=0.000 n=20) Memmove/16 3.202n ± 0% 3.009n ± 0% -6.03% (p=0.000 n=20) Memmove/32 3.602n ± 0% 3.603n ± 0% +0.03% (p=0.000 n=20) Memmove/64 4.202n ± 0% 4.204n ± 0% +0.05% (p=0.000 n=20) Memmove/128 8.005n ± 0% 8.007n ± 0% +0.02% (p=0.000 n=20) Memmove/256 11.21n ± 0% 10.81n ± 0% -3.57% (p=0.000 n=20) Memmove/512 17.65n ± 0% 17.96n ± 0% +1.73% (p=0.000 n=20) Memmove/1024 30.48n ± 0% 30.46n ± 0% -0.07% (p=0.000 n=20) Memmove/2048 56.43n ± 0% 56.30n ± 0% -0.24% (p=0.000 n=20) Memmove/4096 107.7n ± 0% 107.6n ± 0% -0.09% (p=0.000 n=20) MemmoveOverlap/32 4.002n ± 0% 4.003n ± 0% +0.02% (p=0.002 n=20) MemmoveOverlap/64 4.603n ± 0% 4.603n ± 0% ~ (p=0.286 n=20) MemmoveOverlap/128 8.704n ± 0% 8.699n ± 0% ~ (p=0.180 n=20) MemmoveOverlap/256 12.01n ± 0% 11.76n ± 0% -2.08% (p=0.000 n=20) MemmoveOverlap/512 18.42n ± 0% 18.36n ± 0% -0.33% (p=0.000 n=20) MemmoveOverlap/1024 31.23n ± 0% 31.16n ± 0% -0.21% (p=0.000 n=20) MemmoveOverlap/2048 57.42n ± 0% 56.82n ± 0% -1.04% (p=0.000 
n=20) MemmoveOverlap/4096 108.5n ± 0% 108.0n ± 0% -0.46% (p=0.000 n=20) MemmoveUnalignedDst/0 2.804n ± 0% 2.447n ± 0% -12.70% (p=0.000 n=20) MemmoveUnalignedDst/1 2.802n ± 0% 2.491n ± 0% -11.12% (p=0.000 n=20) MemmoveUnalignedDst/2 3.202n ± 0% 2.808n ± 0% -12.29% (p=0.000 n=20) MemmoveUnalignedDst/3 3.202n ± 0% 2.814n ± 0% -12.12% (p=0.000 n=20) MemmoveUnalignedDst/4 3.602n ± 0% 3.202n ± 0% -11.10% (p=0.000 n=20) MemmoveUnalignedDst/5 3.202n ± 0% 3.203n ± 0% +0.03% (p=0.014 n=20) MemmoveUnalignedDst/6 3.202n ± 0% 3.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedDst/7 3.202n ± 0% 3.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedDst/8 3.602n ± 0% 3.202n ± 0% -11.10% (p=0.000 n=20) MemmoveUnalignedDst/9 3.602n ± 0% 3.602n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedDst/10 3.602n ± 0% 3.602n ± 0% ~ (p=0.091 n=20) MemmoveUnalignedDst/11 3.602n ± 0% 3.602n ± 0% ~ (p=0.613 n=20) MemmoveUnalignedDst/12 3.602n ± 0% 3.602n ± 0% ~ (p=0.165 n=20) MemmoveUnalignedDst/13 3.602n ± 0% 3.602n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedDst/14 3.602n ± 0% 3.602n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedDst/15 3.602n ± 0% 3.602n ± 0% 0.00% (p=0.027 n=20) MemmoveUnalignedDst/16 3.602n ± 0% 3.602n ± 0% ~ (p=0.661 n=20) MemmoveUnalignedDst/32 4.002n ± 0% 4.002n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedDst/64 6.804n ± 0% 6.804n ± 0% ~ (p=0.204 n=20) MemmoveUnalignedDst/128 12.61n ± 0% 12.61n ± 0% ~ (p=1.000 n=20) ¹ MemmoveUnalignedDst/256 16.33n ± 2% 16.32n ± 2% ~ (p=0.839 n=20) MemmoveUnalignedDst/512 25.61n ± 0% 24.71n ± 0% -3.51% (p=0.000 n=20) MemmoveUnalignedDst/1024 42.81n ± 0% 42.82n ± 0% ~ (p=0.973 n=20) MemmoveUnalignedDst/2048 74.86n ± 0% 76.03n ± 0% +1.56% (p=0.000 n=20) MemmoveUnalignedDst/4096 152.0n ± 11% 152.0n ± 0% 0.00% (p=0.013 n=20) MemmoveUnalignedDstOverlap/32 5.319n ± 0% 5.558n ± 1% +4.50% (p=0.000 n=20) MemmoveUnalignedDstOverlap/64 8.006n ± 0% 8.025n ± 0% +0.24% (p=0.000 n=20) MemmoveUnalignedDstOverlap/128 9.631n ± 0% 9.601n ± 0% -0.31% (p=0.000 n=20) MemmoveUnalignedDstOverlap/256 13.79n 
± 2% 13.58n ± 1% ~ (p=0.234 n=20) MemmoveUnalignedDstOverlap/512 21.38n ± 0% 21.30n ± 0% -0.37% (p=0.000 n=20) MemmoveUnalignedDstOverlap/1024 41.71n ± 0% 41.70n ± 0% ~ (p=0.887 n=20) MemmoveUnalignedDstOverlap/2048 81.63n ± 0% 81.61n ± 0% ~ (p=0.481 n=20) MemmoveUnalignedDstOverlap/4096 162.6n ± 0% 162.6n ± 0% ~ (p=0.171 n=20) MemmoveUnalignedSrc/0 2.808n ± 0% 2.482n ± 0% -11.61% (p=0.000 n=20) MemmoveUnalignedSrc/1 2.804n ± 0% 2.577n ± 0% -8.08% (p=0.000 n=20) MemmoveUnalignedSrc/2 3.202n ± 0% 2.806n ± 0% -12.37% (p=0.000 n=20) MemmoveUnalignedSrc/3 3.202n ± 0% 2.808n ± 0% -12.30% (p=0.000 n=20) MemmoveUnalignedSrc/4 3.602n ± 0% 3.202n ± 0% -11.10% (p=0.000 n=20) MemmoveUnalignedSrc/5 3.202n ± 0% 3.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrc/6 3.202n ± 0% 3.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrc/7 3.202n ± 0% 3.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrc/8 3.602n ± 0% 3.202n ± 0% -11.10% (p=0.000 n=20) MemmoveUnalignedSrc/9 3.602n ± 0% 3.602n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrc/10 3.602n ± 0% 3.602n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrc/11 3.602n ± 0% 3.602n ± 0% ~ (p=0.746 n=20) MemmoveUnalignedSrc/12 3.602n ± 0% 3.602n ± 0% ~ (p=0.407 n=20) MemmoveUnalignedSrc/13 3.603n ± 0% 3.602n ± 0% -0.03% (p=0.001 n=20) MemmoveUnalignedSrc/14 3.603n ± 0% 3.602n ± 0% -0.01% (p=0.013 n=20) MemmoveUnalignedSrc/15 3.602n ± 0% 3.602n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrc/16 3.602n ± 0% 3.602n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrc/32 4.002n ± 0% 4.002n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrc/64 4.803n ± 0% 4.803n ± 0% 0.00% (p=0.008 n=20) MemmoveUnalignedSrc/128 8.405n ± 0% 8.405n ± 0% 0.00% (p=0.003 n=20) MemmoveUnalignedSrc/256 12.04n ± 3% 12.20n ± 2% ~ (p=0.151 n=20) MemmoveUnalignedSrc/512 19.11n ± 0% 19.10n ± 3% ~ (p=0.621 n=20) MemmoveUnalignedSrc/1024 35.62n ± 0% 35.62n ± 0% ~ (p=0.407 n=20) MemmoveUnalignedSrc/2048 68.04n ± 0% 68.35n ± 0% +0.46% (p=0.000 n=20) MemmoveUnalignedSrc/4096 133.2n ± 1% 133.3n ± 0% ~ (p=0.131 n=20) 
MemmoveUnalignedSrcDst/f_16_0 4.202n ± 0% 4.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/b_16_0 4.202n ± 0% 4.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/f_16_1 4.202n ± 0% 4.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/b_16_1 4.202n ± 0% 4.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/f_16_4 4.202n ± 0% 4.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/b_16_4 4.202n ± 0% 4.202n ± 0% ~ (p=0.661 n=20) MemmoveUnalignedSrcDst/f_16_7 4.202n ± 0% 4.202n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/b_16_7 4.203n ± 0% 4.202n ± 0% -0.02% (p=0.008 n=20) MemmoveUnalignedSrcDst/f_64_0 6.103n ± 0% 6.100n ± 0% ~ (p=0.595 n=20) MemmoveUnalignedSrcDst/b_64_0 6.103n ± 0% 6.102n ± 0% ~ (p=0.973 n=20) MemmoveUnalignedSrcDst/f_64_1 7.419n ± 0% 7.226n ± 0% -2.59% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_64_1 6.745n ± 0% 6.941n ± 0% +2.89% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_64_4 7.420n ± 0% 7.223n ± 0% -2.65% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_64_4 6.753n ± 0% 6.941n ± 0% +2.79% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_64_7 7.423n ± 0% 7.204n ± 0% -2.96% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_64_7 6.750n ± 0% 6.941n ± 0% +2.83% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_256_0 12.96n ± 0% 12.99n ± 0% +0.27% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_256_0 12.91n ± 0% 12.94n ± 0% +0.23% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_256_1 17.21n ± 0% 17.21n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/b_256_1 17.61n ± 0% 17.61n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/f_256_4 16.21n ± 0% 16.21n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/b_256_4 16.41n ± 0% 16.41n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/f_256_7 14.12n ± 0% 14.10n ± 0% ~ (p=0.307 n=20) MemmoveUnalignedSrcDst/b_256_7 14.81n ± 0% 14.81n ± 0% ~ (p=1.000 n=20) ¹ MemmoveUnalignedSrcDst/f_4096_0 109.3n ± 0% 109.4n ± 0% +0.09% (p=0.004 n=20) MemmoveUnalignedSrcDst/b_4096_0 109.6n ± 0% 109.6n ± 0% ~ (p=1.000 n=20) MemmoveUnalignedSrcDst/f_4096_1 113.5n ± 0% 113.5n ± 0% ~ (p=1.000 n=20) 
MemmoveUnalignedSrcDst/b_4096_1 113.7n ± 0% 113.7n ± 0% ~ (p=1.000 n=20) ¹ MemmoveUnalignedSrcDst/f_4096_4 112.3n ± 0% 112.3n ± 0% ~ (p=0.763 n=20) MemmoveUnalignedSrcDst/b_4096_4 112.6n ± 0% 112.9n ± 1% +0.31% (p=0.032 n=20) MemmoveUnalignedSrcDst/f_4096_7 110.6n ± 0% 110.6n ± 0% ~ (p=1.000 n=20) ¹ MemmoveUnalignedSrcDst/b_4096_7 111.1n ± 0% 111.1n ± 0% ~ (p=1.000 n=20) ¹ MemmoveUnalignedSrcDst/f_65536_0 4.801µ ± 0% 4.818µ ± 0% +0.34% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_65536_0 5.027µ ± 0% 5.036µ ± 0% +0.19% (p=0.007 n=20) MemmoveUnalignedSrcDst/f_65536_1 4.815µ ± 0% 4.729µ ± 0% -1.78% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_65536_1 4.659µ ± 0% 4.737µ ± 1% +1.69% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_65536_4 4.807µ ± 0% 4.721µ ± 0% -1.78% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_65536_4 4.659µ ± 0% 4.601µ ± 0% -1.23% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_65536_7 4.868µ ± 0% 4.759µ ± 0% -2.23% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_65536_7 4.665µ ± 0% 4.709µ ± 0% +0.93% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/32 6.804n ± 0% 6.810n ± 0% +0.09% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/64 10.41n ± 0% 10.42n ± 0% +0.10% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/128 11.59n ± 0% 11.58n ± 0% ~ (p=0.414 n=20) MemmoveUnalignedSrcOverlap/256 14.22n ± 0% 14.29n ± 0% +0.46% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/512 23.11n ± 0% 23.04n ± 0% -0.28% (p=0.001 n=20) MemmoveUnalignedSrcOverlap/1024 41.44n ± 0% 41.47n ± 0% ~ (p=0.693 n=20) MemmoveUnalignedSrcOverlap/2048 81.25n ± 0% 81.25n ± 0% ~ (p=0.405 n=20) MemmoveUnalignedSrcOverlap/4096 166.1n ± 0% 166.1n ± 0% ~ (p=0.451 n=20) geomean 13.02n 12.69n -2.51% ¹ all samples are equal Change-Id: I712adc7670f6ae360714ec5a770d00d76c8700ed Reviewed-on: https://go-review.googlesource.com/c/go/+/618815 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc 
<chenguoqi@loongson.cn>
This commit is contained in:
parent
47a48ebf34
commit
5f88755f43
@ -796,3 +796,13 @@
|
||||
(GTZ (MOVVconst [c]) yes no) && c <= 0 => (First no yes)
|
||||
(GEZ (MOVVconst [c]) yes no) && c >= 0 => (First yes no)
|
||||
(GEZ (MOVVconst [c]) yes no) && c < 0 => (First no yes)
|
||||
|
||||
// Arch-specific inlining for small or disjoint runtime.memmove
// Replaces a static call to runtime.memmove whose size argument is a
// non-negative constant with a Move of that size, provided
// isInlinableMemmove(dst, src, sz, config) accepts it.
|
||||
// Match post-lowering calls, register version.
// The pattern requires the matched call to have exactly one use
// (call.Uses == 1); clobber(call) is listed as the final condition.
|
||||
(SelectN [0] call:(CALLstatic {sym} dst src (MOVVconst [sz]) mem))
|
||||
&& sz >= 0
|
||||
&& isSameCall(sym, "runtime.memmove")
|
||||
&& call.Uses == 1
|
||||
&& isInlinableMemmove(dst, src, sz, config)
|
||||
&& clobber(call)
|
||||
=> (Move [sz] dst src mem)
|
||||
|
@ -652,6 +652,8 @@ func rewriteValueLOONG64(v *Value) bool {
|
||||
return rewriteValueLOONG64_OpSelect0(v)
|
||||
case OpSelect1:
|
||||
return rewriteValueLOONG64_OpSelect1(v)
|
||||
case OpSelectN:
|
||||
return rewriteValueLOONG64_OpSelectN(v)
|
||||
case OpSignExt16to32:
|
||||
v.Op = OpLOONG64MOVHreg
|
||||
return true
|
||||
@ -8933,6 +8935,40 @@ func rewriteValueLOONG64_OpSelect1(v *Value) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueLOONG64_OpSelectN(v *Value) bool {
|
||||
v_0 := v.Args[0]
|
||||
b := v.Block
|
||||
config := b.Func.Config
|
||||
// match: (SelectN [0] call:(CALLstatic {sym} dst src (MOVVconst [sz]) mem))
|
||||
// cond: sz >= 0 && isSameCall(sym, "runtime.memmove") && call.Uses == 1 && isInlinableMemmove(dst, src, sz, config) && clobber(call)
|
||||
// result: (Move [sz] dst src mem)
|
||||
for {
|
||||
if auxIntToInt64(v.AuxInt) != 0 {
|
||||
break
|
||||
}
|
||||
call := v_0
|
||||
if call.Op != OpLOONG64CALLstatic || len(call.Args) != 4 {
|
||||
break
|
||||
}
|
||||
sym := auxToCall(call.Aux)
|
||||
mem := call.Args[3]
|
||||
dst := call.Args[0]
|
||||
src := call.Args[1]
|
||||
call_2 := call.Args[2]
|
||||
if call_2.Op != OpLOONG64MOVVconst {
|
||||
break
|
||||
}
|
||||
sz := auxIntToInt64(call_2.AuxInt)
|
||||
if !(sz >= 0 && isSameCall(sym, "runtime.memmove") && call.Uses == 1 && isInlinableMemmove(dst, src, sz, config) && clobber(call)) {
|
||||
break
|
||||
}
|
||||
v.reset(OpMove)
|
||||
v.AuxInt = int64ToAuxInt(sz)
|
||||
v.AddArg3(dst, src, mem)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueLOONG64_OpSlicemask(v *Value) bool {
|
||||
v_0 := v.Args[0]
|
||||
b := v.Block
|
||||
|
@ -95,6 +95,7 @@ func moveArchLowering1(b []byte, x *[1]byte) {
|
||||
_ = b[1]
|
||||
// amd64:-".*memmove"
|
||||
// arm64:-".*memmove"
|
||||
// loong64:-".*memmove"
|
||||
// ppc64x:-".*memmove"
|
||||
copy(b, x[:])
|
||||
}
|
||||
@ -103,6 +104,7 @@ func moveArchLowering2(b []byte, x *[2]byte) {
|
||||
_ = b[2]
|
||||
// amd64:-".*memmove"
|
||||
// arm64:-".*memmove"
|
||||
// loong64:-".*memmove"
|
||||
// ppc64x:-".*memmove"
|
||||
copy(b, x[:])
|
||||
}
|
||||
@ -111,6 +113,7 @@ func moveArchLowering4(b []byte, x *[4]byte) {
|
||||
_ = b[4]
|
||||
// amd64:-".*memmove"
|
||||
// arm64:-".*memmove"
|
||||
// loong64:-".*memmove"
|
||||
// ppc64x:-".*memmove"
|
||||
copy(b, x[:])
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user