mirror of
https://github.com/golang/go
synced 2024-11-18 12:04:57 -07:00
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue
This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
This commit is contained in:
parent
a2cc8b20fd
commit
a143f5d646
@ -341,8 +341,6 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
|
|||||||
q = p
|
q = p
|
||||||
}
|
}
|
||||||
|
|
||||||
var p1 *obj.Prog
|
|
||||||
var p2 *obj.Prog
|
|
||||||
var q2 *obj.Prog
|
var q2 *obj.Prog
|
||||||
for p := cursym.Text; p != nil; p = p.Link {
|
for p := cursym.Text; p != nil; p = p.Link {
|
||||||
o := p.As
|
o := p.As
|
||||||
@ -391,22 +389,24 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
|
|||||||
// if(g->panic != nil && g->panic->argp == FP) g->panic->argp = bottom-of-frame
|
// if(g->panic != nil && g->panic->argp == FP) g->panic->argp = bottom-of-frame
|
||||||
//
|
//
|
||||||
// MOVW g_panic(g), R1
|
// MOVW g_panic(g), R1
|
||||||
// CMP $0, R1
|
// CMP $0, R1
|
||||||
// B.EQ end
|
// B.NE checkargp
|
||||||
// MOVW panic_argp(R1), R2
|
|
||||||
// ADD $(autosize+4), R13, R3
|
|
||||||
// CMP R2, R3
|
|
||||||
// B.NE end
|
|
||||||
// ADD $4, R13, R4
|
|
||||||
// MOVW R4, panic_argp(R1)
|
|
||||||
// end:
|
// end:
|
||||||
// NOP
|
// NOP
|
||||||
|
// ... function ...
|
||||||
|
// checkargp:
|
||||||
|
// MOVW panic_argp(R1), R2
|
||||||
|
// ADD $(autosize+4), R13, R3
|
||||||
|
// CMP R2, R3
|
||||||
|
// B.NE end
|
||||||
|
// ADD $4, R13, R4
|
||||||
|
// MOVW R4, panic_argp(R1)
|
||||||
|
// B end
|
||||||
//
|
//
|
||||||
// The NOP is needed to give the jumps somewhere to land.
|
// The NOP is needed to give the jumps somewhere to land.
|
||||||
// It is a liblink NOP, not an ARM NOP: it encodes to 0 instruction bytes.
|
// It is a liblink NOP, not an ARM NOP: it encodes to 0 instruction bytes.
|
||||||
|
|
||||||
p = obj.Appendp(ctxt, p)
|
p = obj.Appendp(ctxt, p)
|
||||||
|
|
||||||
p.As = AMOVW
|
p.As = AMOVW
|
||||||
p.From.Type = obj.TYPE_MEM
|
p.From.Type = obj.TYPE_MEM
|
||||||
p.From.Reg = REGG
|
p.From.Reg = REGG
|
||||||
@ -420,20 +420,34 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
|
|||||||
p.From.Offset = 0
|
p.From.Offset = 0
|
||||||
p.Reg = REG_R1
|
p.Reg = REG_R1
|
||||||
|
|
||||||
p = obj.Appendp(ctxt, p)
|
// B.NE checkargp
|
||||||
p.As = ABEQ
|
bne := obj.Appendp(ctxt, p)
|
||||||
p.To.Type = obj.TYPE_BRANCH
|
bne.As = ABNE
|
||||||
p1 = p
|
bne.To.Type = obj.TYPE_BRANCH
|
||||||
|
|
||||||
p = obj.Appendp(ctxt, p)
|
// end: NOP
|
||||||
p.As = AMOVW
|
end := obj.Appendp(ctxt, bne)
|
||||||
p.From.Type = obj.TYPE_MEM
|
end.As = obj.ANOP
|
||||||
p.From.Reg = REG_R1
|
|
||||||
p.From.Offset = 0 // Panic.argp
|
|
||||||
p.To.Type = obj.TYPE_REG
|
|
||||||
p.To.Reg = REG_R2
|
|
||||||
|
|
||||||
p = obj.Appendp(ctxt, p)
|
// find end of function
|
||||||
|
var last *obj.Prog
|
||||||
|
for last = end; last.Link != nil; last = last.Link {
|
||||||
|
}
|
||||||
|
|
||||||
|
// MOVW panic_argp(R1), R2
|
||||||
|
mov := obj.Appendp(ctxt, last)
|
||||||
|
mov.As = AMOVW
|
||||||
|
mov.From.Type = obj.TYPE_MEM
|
||||||
|
mov.From.Reg = REG_R1
|
||||||
|
mov.From.Offset = 0 // Panic.argp
|
||||||
|
mov.To.Type = obj.TYPE_REG
|
||||||
|
mov.To.Reg = REG_R2
|
||||||
|
|
||||||
|
// B.NE branch target is MOVW above
|
||||||
|
bne.Pcond = mov
|
||||||
|
|
||||||
|
// ADD $(autosize+4), R13, R3
|
||||||
|
p = obj.Appendp(ctxt, mov)
|
||||||
p.As = AADD
|
p.As = AADD
|
||||||
p.From.Type = obj.TYPE_CONST
|
p.From.Type = obj.TYPE_CONST
|
||||||
p.From.Offset = int64(autosize) + 4
|
p.From.Offset = int64(autosize) + 4
|
||||||
@ -441,17 +455,20 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
|
|||||||
p.To.Type = obj.TYPE_REG
|
p.To.Type = obj.TYPE_REG
|
||||||
p.To.Reg = REG_R3
|
p.To.Reg = REG_R3
|
||||||
|
|
||||||
|
// CMP R2, R3
|
||||||
p = obj.Appendp(ctxt, p)
|
p = obj.Appendp(ctxt, p)
|
||||||
p.As = ACMP
|
p.As = ACMP
|
||||||
p.From.Type = obj.TYPE_REG
|
p.From.Type = obj.TYPE_REG
|
||||||
p.From.Reg = REG_R2
|
p.From.Reg = REG_R2
|
||||||
p.Reg = REG_R3
|
p.Reg = REG_R3
|
||||||
|
|
||||||
|
// B.NE end
|
||||||
p = obj.Appendp(ctxt, p)
|
p = obj.Appendp(ctxt, p)
|
||||||
p.As = ABNE
|
p.As = ABNE
|
||||||
p.To.Type = obj.TYPE_BRANCH
|
p.To.Type = obj.TYPE_BRANCH
|
||||||
p2 = p
|
p.Pcond = end
|
||||||
|
|
||||||
|
// ADD $4, R13, R4
|
||||||
p = obj.Appendp(ctxt, p)
|
p = obj.Appendp(ctxt, p)
|
||||||
p.As = AADD
|
p.As = AADD
|
||||||
p.From.Type = obj.TYPE_CONST
|
p.From.Type = obj.TYPE_CONST
|
||||||
@ -460,6 +477,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
|
|||||||
p.To.Type = obj.TYPE_REG
|
p.To.Type = obj.TYPE_REG
|
||||||
p.To.Reg = REG_R4
|
p.To.Reg = REG_R4
|
||||||
|
|
||||||
|
// MOVW R4, panic_argp(R1)
|
||||||
p = obj.Appendp(ctxt, p)
|
p = obj.Appendp(ctxt, p)
|
||||||
p.As = AMOVW
|
p.As = AMOVW
|
||||||
p.From.Type = obj.TYPE_REG
|
p.From.Type = obj.TYPE_REG
|
||||||
@ -468,11 +486,14 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
|
|||||||
p.To.Reg = REG_R1
|
p.To.Reg = REG_R1
|
||||||
p.To.Offset = 0 // Panic.argp
|
p.To.Offset = 0 // Panic.argp
|
||||||
|
|
||||||
|
// B end
|
||||||
p = obj.Appendp(ctxt, p)
|
p = obj.Appendp(ctxt, p)
|
||||||
|
p.As = AB
|
||||||
|
p.To.Type = obj.TYPE_BRANCH
|
||||||
|
p.Pcond = end
|
||||||
|
|
||||||
p.As = obj.ANOP
|
// reset for subsequent passes
|
||||||
p1.Pcond = p
|
p = end
|
||||||
p2.Pcond = p
|
|
||||||
}
|
}
|
||||||
|
|
||||||
case obj.ARET:
|
case obj.ARET:
|
||||||
@ -702,7 +723,7 @@ func softfloat(ctxt *obj.Link, cursym *obj.LSym) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
|
func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
|
||||||
// MOVW g_stackguard(g), R1
|
// MOVW g_stackguard(g), R1
|
||||||
p = obj.Appendp(ctxt, p)
|
p = obj.Appendp(ctxt, p)
|
||||||
|
|
||||||
p.As = AMOVW
|
p.As = AMOVW
|
||||||
@ -748,11 +769,11 @@ func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
|
|||||||
// SP-stackguard+StackGuard < framesize + (StackGuard-StackSmall)
|
// SP-stackguard+StackGuard < framesize + (StackGuard-StackSmall)
|
||||||
// The +StackGuard on both sides is required to keep the left side positive:
|
// The +StackGuard on both sides is required to keep the left side positive:
|
||||||
// SP is allowed to be slightly below stackguard. See stack.h.
|
// SP is allowed to be slightly below stackguard. See stack.h.
|
||||||
// CMP $StackPreempt, R1
|
// CMP $StackPreempt, R1
|
||||||
// MOVW.NE $StackGuard(SP), R2
|
// MOVW.NE $StackGuard(SP), R2
|
||||||
// SUB.NE R1, R2
|
// SUB.NE R1, R2
|
||||||
// MOVW.NE $(framesize+(StackGuard-StackSmall)), R3
|
// MOVW.NE $(framesize+(StackGuard-StackSmall)), R3
|
||||||
// CMP.NE R3, R2
|
// CMP.NE R3, R2
|
||||||
p = obj.Appendp(ctxt, p)
|
p = obj.Appendp(ctxt, p)
|
||||||
|
|
||||||
p.As = ACMP
|
p.As = ACMP
|
||||||
|
Loading…
Reference in New Issue
Block a user