mirror of
https://github.com/golang/go
synced 2024-11-18 06:14:46 -07:00
runtime, cmd/internal/obj/arm: improve arm function prologue
When stack growth is not needed, as it usually is not, execute only a single conditional branch rather than three conditional instructions. This adds 4 bytes to every function, but might speed up execution in the common case. Sample disassembly for func f() { _ = [128]byte{} } Before: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb028 MOVW 0x28(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 91a0300e MOVW.LS R14, R3 x.go:3 0x2014 9b0118a9 BL.LS runtime.morestack_noctxt(SB) x.go:3 0x2018 9afffff8 B.LS main.f(SB) x.go:3 0x201c e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2020 e28d1004 ADD $4, R13, R1 x.go:4 0x2024 e3a00000 MOVW $0, R0 x.go:4 0x2028 eb012255 BL 0x4a984 x.go:5 0x202c e49df084 RET #132 x.go:5 0x2030 eafffffe B 0x2030 x.go:5 0x2034 ffffff7c ? After: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb02c MOVW 0x2c(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 9a000004 B.LS 0x2028 x.go:3 0x2014 e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2018 e28d1004 ADD $4, R13, R1 x.go:4 0x201c e3a00000 MOVW $0, R0 x.go:4 0x2020 eb0124dc BL 0x4b398 x.go:5 0x2024 e49df084 RET #132 x.go:5 0x2028 e1a0300e MOVW R14, R3 x.go:5 0x202c eb011b0d BL runtime.morestack_noctxt(SB) x.go:5 0x2030 eafffff2 B main.f(SB) x.go:5 0x2034 eafffffe B 0x2034 x.go:5 0x2038 ffffff7c ? Updates #10587. package sort benchmarks on an iPhone 6: name old time/op new time/op delta SortString1K 569µs ± 0% 565µs ± 1% -0.75% (p=0.000 n=23+24) StableString1K 872µs ± 1% 870µs ± 1% -0.16% (p=0.009 n=23+24) SortInt1K 317µs ± 2% 316µs ± 2% ~ (p=0.410 n=26+26) StableInt1K 343µs ± 1% 339µs ± 1% -1.07% (p=0.000 n=22+23) SortInt64K 30.0ms ± 1% 30.0ms ± 1% ~ (p=0.091 n=25+24) StableInt64K 30.2ms ± 0% 30.0ms ± 0% -0.69% (p=0.000 n=22+22) Sort1e2 147µs ± 1% 146µs ± 0% -0.48% (p=0.000 n=25+24) Stable1e2 290µs ± 1% 286µs ± 1% -1.30% (p=0.000 n=23+24) Sort1e4 29.5ms ± 2% 29.7ms ± 1% +0.71% (p=0.000 n=23+23) Stable1e4 88.7ms ± 4% 88.6ms ± 8% -0.07% (p=0.022 n=26+26) Sort1e6 4.81s ± 7% 4.83s ± 7% ~ (p=0.192 n=26+26) Stable1e6 18.3s ± 1% 18.1s ± 1% -0.76% (p=0.000 n=25+23) SearchWrappers 318ns ± 1% 344ns ± 1% +8.14% (p=0.000 n=23+26) package sort benchmarks on a first generation rpi: name old time/op new time/op delta SearchWrappers 4.13µs ± 0% 3.95µs ± 0% -4.42% (p=0.000 n=15+13) SortString1K 5.81ms ± 1% 5.82ms ± 2% ~ (p=0.400 n=14+15) StableString1K 9.69ms ± 1% 9.73ms ± 0% ~ (p=0.121 n=15+11) SortInt1K 3.30ms ± 2% 3.66ms ±19% +10.82% (p=0.000 n=15+14) StableInt1K 5.97ms ±15% 4.17ms ± 8% -30.05% (p=0.000 n=15+15) SortInt64K 319ms ± 1% 295ms ± 1% -7.65% (p=0.000 n=15+15) StableInt64K 343ms ± 0% 332ms ± 0% -3.26% (p=0.000 n=12+13) Sort1e2 3.36ms ± 2% 3.22ms ± 4% -4.10% (p=0.000 n=15+15) Stable1e2 6.74ms ± 1% 6.43ms ± 2% -4.67% (p=0.000 n=15+15) Sort1e4 247ms ± 1% 247ms ± 1% ~ (p=0.331 n=15+14) Stable1e4 864ms ± 0% 820ms ± 0% -5.15% (p=0.000 n=14+15) Sort1e6 41.2s ± 0% 41.2s ± 0% +0.15% (p=0.000 n=13+14) Stable1e6 192s ± 0% 182s ± 0% -5.07% (p=0.000 n=14+14) Change-Id: I8a9db77e1d4ea1956575895893bc9d04bd81204b Reviewed-on: https://go-review.googlesource.com/10497 Reviewed-by: Russ Cox <rsc@golang.org>
This commit is contained in:
parent
f4b48de3ad
commit
5353cde080
@ -795,38 +795,45 @@ func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
|
||||
p.Scond = C_SCOND_NE
|
||||
}
|
||||
|
||||
// MOVW.LS R14, R3
|
||||
p = obj.Appendp(ctxt, p)
|
||||
// BLS call-to-morestack
|
||||
bls := obj.Appendp(ctxt, p)
|
||||
bls.As = ABLS
|
||||
bls.To.Type = obj.TYPE_BRANCH
|
||||
|
||||
p.As = AMOVW
|
||||
p.Scond = C_SCOND_LS
|
||||
p.From.Type = obj.TYPE_REG
|
||||
p.From.Reg = REGLINK
|
||||
p.To.Type = obj.TYPE_REG
|
||||
p.To.Reg = REG_R3
|
||||
|
||||
// BL.LS runtime.morestack(SB) // modifies LR, returns with LO still asserted
|
||||
p = obj.Appendp(ctxt, p)
|
||||
|
||||
p.As = ABL
|
||||
p.Scond = C_SCOND_LS
|
||||
p.To.Type = obj.TYPE_BRANCH
|
||||
if ctxt.Cursym.Cfunc != 0 {
|
||||
p.To.Sym = obj.Linklookup(ctxt, "runtime.morestackc", 0)
|
||||
} else if ctxt.Cursym.Text.From3.Offset&obj.NEEDCTXT == 0 {
|
||||
p.To.Sym = obj.Linklookup(ctxt, "runtime.morestack_noctxt", 0)
|
||||
} else {
|
||||
p.To.Sym = obj.Linklookup(ctxt, "runtime.morestack", 0)
|
||||
var last *obj.Prog
|
||||
for last = ctxt.Cursym.Text; last.Link != nil; last = last.Link {
|
||||
}
|
||||
|
||||
// BLS start
|
||||
p = obj.Appendp(ctxt, p)
|
||||
// MOVW LR, R3
|
||||
movw := obj.Appendp(ctxt, last)
|
||||
movw.As = AMOVW
|
||||
movw.From.Type = obj.TYPE_REG
|
||||
movw.From.Reg = REGLINK
|
||||
movw.To.Type = obj.TYPE_REG
|
||||
movw.To.Reg = REG_R3
|
||||
|
||||
p.As = ABLS
|
||||
p.To.Type = obj.TYPE_BRANCH
|
||||
p.Pcond = ctxt.Cursym.Text.Link
|
||||
bls.Pcond = movw
|
||||
|
||||
return p
|
||||
// BL runtime.morestack
|
||||
call := obj.Appendp(ctxt, movw)
|
||||
call.As = obj.ACALL
|
||||
call.To.Type = obj.TYPE_BRANCH
|
||||
morestack := "runtime.morestack"
|
||||
switch {
|
||||
case ctxt.Cursym.Cfunc != 0:
|
||||
morestack = "runtime.morestackc"
|
||||
case ctxt.Cursym.Text.From3.Offset&obj.NEEDCTXT == 0:
|
||||
morestack = "runtime.morestack_noctxt"
|
||||
}
|
||||
call.To.Sym = obj.Linklookup(ctxt, morestack, 0)
|
||||
|
||||
// B start
|
||||
b := obj.Appendp(ctxt, call)
|
||||
b.As = obj.AJMP
|
||||
b.To.Type = obj.TYPE_BRANCH
|
||||
b.Pcond = ctxt.Cursym.Text.Link
|
||||
|
||||
return bls
|
||||
}
|
||||
|
||||
func initdiv(ctxt *obj.Link) {
|
||||
|
@ -259,7 +259,6 @@ noswitch:
|
||||
|
||||
// Called during function prolog when more stack is needed.
|
||||
// R1 frame size
|
||||
// R2 arg size
|
||||
// R3 prolog's LR
|
||||
// NB. we do not save R0 because we've forced 5c to pass all arguments
|
||||
// on the stack.
|
||||
|
@ -24,7 +24,7 @@ func rewindmorestack(buf *gobuf) {
|
||||
var inst uint32
|
||||
if buf.pc&3 == 0 && buf.pc != 0 {
|
||||
inst = *(*uint32)(unsafe.Pointer(buf.pc))
|
||||
if inst>>24 == 0x9a {
|
||||
if inst>>24 == 0x9a || inst>>24 == 0xea {
|
||||
buf.pc += uintptr(int32(inst<<8)>>6) + 8
|
||||
return
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user