mirror of
https://github.com/golang/go
synced 2024-11-25 00:17:58 -07:00
cmd/internal/obj: optimize the function stacksplit on loong64
In the process of stack split checking, loong64 uses the following logic:

    if SP > stackguard then goto done, else morestack

The possible problem here is that the probability of morestack execution is much lower than done, while static branch prediction is more inclined to obtain morestack, which will cause a certain probability of branch prediction error.

Change the logic here to:

    if SP <= stackguard then goto morestack, else done

benchmarks on 3A6000:

goos: linux
goarch: loong64
pkg: fmt
cpu: Loongson-3A6000 @ 2500.00MHz
                              │  bench.old   │  bench.new                          │
                              │    sec/op    │    sec/op      vs base              │
SprintfPadding                  418.3n ± 1%    387.0n ± 0%    -7.49% (p=0.000 n=20)
SprintfEmpty                    35.95n ± 0%    35.86n ± 0%    -0.25% (p=0.000 n=20)
SprintfString                   75.02n ± 1%    72.24n ± 0%    -3.71% (p=0.000 n=20)
SprintfTruncateString           165.7n ± 3%    139.9n ± 1%   -15.58% (p=0.000 n=20)
SprintfTruncateBytes            171.0n ± 0%    147.3n ± 0%   -13.83% (p=0.000 n=20)
SprintfSlowParsingPath          90.56n ± 0%    80.85n ± 0%   -10.72% (p=0.000 n=20)
SprintfQuoteString              560.2n ± 0%    509.7n ± 0%    -9.01% (p=0.000 n=20)
SprintfInt                      58.62n ± 0%    56.45n ± 0%    -3.70% (p=0.000 n=20)
SprintfIntInt                   141.7n ± 0%    122.2n ± 0%   -13.73% (p=0.000 n=20)
SprintfPrefixedInt              210.6n ± 0%    208.8n ± 0%    -0.88% (p=0.000 n=20)
SprintfFloat                    282.3n ± 0%    251.8n ± 1%   -10.80% (p=0.000 n=20)
SprintfComplex                  854.1n ± 0%    813.8n ± 0%    -4.71% (p=0.000 n=20)
SprintfBoolean                  76.32n ± 0%    71.14n ± 1%    -6.79% (p=0.000 n=20)
SprintfHexString                218.5n ± 0%    193.4n ± 0%   -11.51% (p=0.000 n=20)
SprintfHexBytes                 321.3n ± 0%    275.0n ± 0%   -14.42% (p=0.000 n=20)
SprintfBytes                    573.5n ± 0%    553.2n ± 1%    -3.54% (p=0.000 n=20)
SprintfStringer                 501.1n ± 1%    446.6n ± 0%   -10.86% (p=0.000 n=20)
SprintfStructure                1.793µ ± 0%    1.683µ ± 0%    -6.16% (p=0.000 n=20)
ManyArgs                        500.0n ± 0%    470.4n ± 0%    -5.92% (p=0.000 n=20)
FprintInt                       67.51n ± 0%    65.71n ± 0%    -2.66% (p=0.000 n=20)
FprintfBytes                    130.9n ± 0%    129.5n ± 1%    -1.11% (p=0.000 n=20)
FprintIntNoAlloc                67.55n ± 0%    65.80n ± 0%    -2.58% (p=0.000 n=20)
ScanInts                        386.3µ ± 0%    346.5µ ± 0%   -10.29% (p=0.000 n=20)
ScanRecursiveInt                25.97m ± 0%    25.93m ± 0%    -0.15% (p=0.038 n=20)
ScanRecursiveIntReaderWrapper   26.07m ± 0%    25.93m ± 0%    -0.53% (p=0.001 n=20)
geomean                         702.6n         653.7n         -6.96%

goos: linux
goarch: loong64
pkg: test/bench/go1
cpu: Loongson-3A6000 @ 2500.00MHz
                      │  bench.old    │  bench.new                           │
                      │    sec/op     │    sec/op       vs base              │
BinaryTree17             7.688 ± 1%      7.724 ± 0%     +0.47% (p=0.040 n=20)
Fannkuch11               2.670 ± 0%      2.645 ± 0%     -0.94% (p=0.000 n=20)
FmtFprintfEmpty         35.93n ± 0%     37.50n ± 0%     +4.37% (p=0.000 n=20)
FmtFprintfString        56.32n ± 0%     59.74n ± 0%     +6.08% (p=0.000 n=20)
FmtFprintfInt           64.47n ± 0%     61.26n ± 0%     -4.98% (p=0.000 n=20)
FmtFprintfIntInt       100.30n ± 0%     99.67n ± 0%     -0.63% (p=0.000 n=20)
FmtFprintfPrefixedInt   116.7n ± 0%     119.3n ± 0%     +2.23% (p=0.000 n=20)
FmtFprintfFloat         234.1n ± 0%     203.4n ± 0%    -13.11% (p=0.000 n=20)
FmtManyArgs             503.0n ± 0%     467.9n ± 0%     -6.96% (p=0.000 n=20)
GobDecode               8.125m ± 0%     7.299m ± 0%    -10.17% (p=0.000 n=20)
GobEncode               8.930m ± 1%     8.581m ± 1%     -3.91% (p=0.000 n=20)
Gzip                    280.0m ± 0%     279.8m ± 0%     -0.10% (p=0.000 n=20)
Gunzip                  33.30m ± 0%     32.48m ± 0%     -2.49% (p=0.000 n=20)
HTTPClientServer        55.43µ ± 0%     54.10µ ± 1%     -2.41% (p=0.000 n=20)
JSONEncode             10.086m ± 0%     9.055m ± 0%    -10.22% (p=0.000 n=20)
JSONDecode              49.37m ± 1%     46.22m ± 1%     -6.40% (p=0.000 n=20)
Mandelbrot200           4.606m ± 0%     4.606m ± 0%          ~ (p=0.280 n=20)
GoParse                 5.010m ± 0%     4.855m ± 0%     -3.09% (p=0.000 n=20)
RegexpMatchEasy0_32     59.09n ± 0%     59.32n ± 0%     +0.39% (p=0.000 n=20)
RegexpMatchEasy0_1K     455.2n ± 0%     453.8n ± 0%     -0.31% (p=0.000 n=20)
RegexpMatchEasy1_32     59.24n ± 0%     60.11n ± 0%     +1.47% (p=0.000 n=20)
RegexpMatchEasy1_1K     555.2n ± 0%     553.9n ± 0%     -0.23% (p=0.000 n=20)
RegexpMatchMedium_32    845.7n ± 0%     775.6n ± 0%     -8.28% (p=0.000 n=20)
RegexpMatchMedium_1K    26.68µ ± 0%     26.48µ ± 0%     -0.78% (p=0.000 n=20)
RegexpMatchHard_32      1.317µ ± 0%     1.326µ ± 0%     +0.68% (p=0.000 n=20)
RegexpMatchHard_1K      41.35µ ± 0%     40.95µ ± 0%     -0.97% (p=0.000 n=20)
Revcomp                 463.0m ± 0%     473.0m ± 0%     +2.15% (p=0.000 n=20)
Template                83.80m ± 0%     76.26m ± 1%     -9.00% (p=0.000 n=20)
TimeParse               283.3n ± 0%     260.8n ± 0%     -7.96% (p=0.000 n=20)
TimeFormat              307.2n ± 0%     290.5n ± 0%     -5.45% (p=0.000 n=20)
geomean                 53.16µ          51.67µ          -2.79%

Change-Id: Iaec2f50db18e9a2b405605f8b92af3683114ea34
Reviewed-on: https://go-review.googlesource.com/c/go/+/616035
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
356ba0f065
commit
5923a97f43
@ -726,7 +726,7 @@ func (c *ctxt0) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
|
|||||||
var q *obj.Prog
|
var q *obj.Prog
|
||||||
if framesize <= abi.StackSmall {
|
if framesize <= abi.StackSmall {
|
||||||
// small stack: SP < stackguard
|
// small stack: SP < stackguard
|
||||||
// AGTU SP, stackguard, R20
|
// SGTU SP, stackguard, R20
|
||||||
p = obj.Appendp(p, c.newprog)
|
p = obj.Appendp(p, c.newprog)
|
||||||
|
|
||||||
p.As = ASGTU
|
p.As = ASGTU
|
||||||
@ -784,19 +784,41 @@ func (c *ctxt0) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
|
|||||||
p.To.Reg = REG_R20
|
p.To.Reg = REG_R20
|
||||||
}
|
}
|
||||||
|
|
||||||
// q1: BNE R20, done
|
// q1: BEQ R20, morestack
|
||||||
p = obj.Appendp(p, c.newprog)
|
p = obj.Appendp(p, c.newprog)
|
||||||
q1 := p
|
q1 := p
|
||||||
|
|
||||||
p.As = ABNE
|
p.As = ABEQ
|
||||||
p.From.Type = obj.TYPE_REG
|
p.From.Type = obj.TYPE_REG
|
||||||
p.From.Reg = REG_R20
|
p.From.Reg = REG_R20
|
||||||
p.To.Type = obj.TYPE_BRANCH
|
p.To.Type = obj.TYPE_BRANCH
|
||||||
p.Mark |= BRANCH
|
p.Mark |= BRANCH
|
||||||
|
|
||||||
// MOV LINK, R31
|
end := c.ctxt.EndUnsafePoint(p, c.newprog, -1)
|
||||||
p = obj.Appendp(p, c.newprog)
|
|
||||||
|
|
||||||
|
var last *obj.Prog
|
||||||
|
for last = c.cursym.Func().Text; last.Link != nil; last = last.Link {
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now we are at the end of the function, but logically
|
||||||
|
// we are still in function prologue. We need to fix the
|
||||||
|
// SP data and PCDATA.
|
||||||
|
spfix := obj.Appendp(last, c.newprog)
|
||||||
|
spfix.As = obj.ANOP
|
||||||
|
spfix.Spadj = -framesize
|
||||||
|
|
||||||
|
pcdata := c.ctxt.EmitEntryStackMap(c.cursym, spfix, c.newprog)
|
||||||
|
pcdata = c.ctxt.StartUnsafePoint(pcdata, c.newprog)
|
||||||
|
|
||||||
|
if q != nil {
|
||||||
|
q.To.SetTarget(pcdata)
|
||||||
|
}
|
||||||
|
q1.To.SetTarget(pcdata)
|
||||||
|
|
||||||
|
p = c.cursym.Func().SpillRegisterArgs(pcdata, c.newprog)
|
||||||
|
|
||||||
|
// MOV LINK, R31
|
||||||
|
p = obj.Appendp(p, c.newprog)
|
||||||
p.As = mov
|
p.As = mov
|
||||||
p.From.Type = obj.TYPE_REG
|
p.From.Type = obj.TYPE_REG
|
||||||
p.From.Reg = REGLINK
|
p.From.Reg = REGLINK
|
||||||
@ -807,45 +829,32 @@ func (c *ctxt0) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
|
|||||||
p.Mark |= LABEL
|
p.Mark |= LABEL
|
||||||
}
|
}
|
||||||
|
|
||||||
p = c.ctxt.EmitEntryStackMap(c.cursym, p, c.newprog)
|
// JAL runtime.morestack(SB)
|
||||||
|
call := obj.Appendp(p, c.newprog)
|
||||||
|
call.As = AJAL
|
||||||
|
call.To.Type = obj.TYPE_BRANCH
|
||||||
|
|
||||||
// Spill the register args that could be clobbered by the
|
|
||||||
// morestack code
|
|
||||||
p = c.cursym.Func().SpillRegisterArgs(p, c.newprog)
|
|
||||||
|
|
||||||
// JAL runtime.morestack(SB)
|
|
||||||
p = obj.Appendp(p, c.newprog)
|
|
||||||
|
|
||||||
p.As = AJAL
|
|
||||||
p.To.Type = obj.TYPE_BRANCH
|
|
||||||
if c.cursym.CFunc() {
|
if c.cursym.CFunc() {
|
||||||
p.To.Sym = c.ctxt.Lookup("runtime.morestackc")
|
call.To.Sym = c.ctxt.Lookup("runtime.morestackc")
|
||||||
} else if !c.cursym.Func().Text.From.Sym.NeedCtxt() {
|
} else if !c.cursym.Func().Text.From.Sym.NeedCtxt() {
|
||||||
p.To.Sym = c.ctxt.Lookup("runtime.morestack_noctxt")
|
call.To.Sym = c.ctxt.Lookup("runtime.morestack_noctxt")
|
||||||
} else {
|
} else {
|
||||||
p.To.Sym = c.ctxt.Lookup("runtime.morestack")
|
call.To.Sym = c.ctxt.Lookup("runtime.morestack")
|
||||||
}
|
}
|
||||||
p.Mark |= BRANCH
|
call.Mark |= BRANCH
|
||||||
|
|
||||||
p = c.cursym.Func().UnspillRegisterArgs(p, c.newprog)
|
// The instructions which unspill regs should be preemptible.
|
||||||
p = c.ctxt.EndUnsafePoint(p, c.newprog, -1)
|
pcdata = c.ctxt.EndUnsafePoint(call, c.newprog, -1)
|
||||||
|
unspill := c.cursym.Func().UnspillRegisterArgs(pcdata, c.newprog)
|
||||||
|
|
||||||
// JMP start
|
// JMP start
|
||||||
p = obj.Appendp(p, c.newprog)
|
jmp := obj.Appendp(unspill, c.newprog)
|
||||||
|
jmp.As = AJMP
|
||||||
|
jmp.To.Type = obj.TYPE_BRANCH
|
||||||
|
jmp.To.SetTarget(startPred.Link)
|
||||||
|
jmp.Spadj = +framesize
|
||||||
|
|
||||||
p.As = AJMP
|
return end
|
||||||
p.To.Type = obj.TYPE_BRANCH
|
|
||||||
p.To.SetTarget(startPred.Link)
|
|
||||||
startPred.Link.Mark |= LABEL
|
|
||||||
p.Mark |= BRANCH
|
|
||||||
|
|
||||||
// placeholder for q1's jump target
|
|
||||||
p = obj.Appendp(p, c.newprog)
|
|
||||||
|
|
||||||
p.As = obj.ANOP // zero-width place holder
|
|
||||||
q1.To.SetTarget(p)
|
|
||||||
|
|
||||||
return p
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *ctxt0) addnop(p *obj.Prog) {
|
func (c *ctxt0) addnop(p *obj.Prog) {
|
||||||
|
Loading…
Reference in New Issue
Block a user