mirror of
https://github.com/golang/go
synced 2024-09-29 04:24:36 -06:00
runtime/trace: enable frame pointer unwinding on amd64
Change tracer to use frame pointer unwinding by default on amd64. The expansion of inline frames is delayed until the stack table is dumped at the end of the trace. This requires storing the skip argument in the stack table, which now resides in pcBuf[0]. For stacks that are not produced by traceStackID (e.g. CPU samples), a logicalStackSentinel value in pcBuf[0] indicates that no inline expansion is needed. Add new GODEBUG=tracefpunwindoff=1 option to use the old unwinder if needed. Benchmarks show a considerable decrease in CPU overhead when using frame pointer unwinding for trace events: GODEBUG=tracefpunwindoff=1 ../bin/go test -run '^$' -bench '.+PingPong' -count 20 -v -trace /dev/null ./runtime | tee tracefpunwindoff1.txt GODEBUG=tracefpunwindoff=0 ../bin/go test -run '^$' -bench '.+PingPong' -count 20 -v -trace /dev/null ./runtime | tee tracefpunwindoff0.txt goos: linux goarch: amd64 pkg: runtime cpu: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz │ tracefpunwindoff1.txt │ tracefpunwindoff0.txt │ │ sec/op │ sec/op vs base │ PingPongHog-32 3782.5n ± 0% 740.7n ± 2% -80.42% (p=0.000 n=20) For #16638 Change-Id: I2928a2fcd8779a31c45ce0f2fbcc0179641190bb Reviewed-on: https://go-review.googlesource.com/c/go/+/463835 Reviewed-by: Michael Pratt <mpratt@google.com> Run-TryBot: Felix Geisendörfer <felix.geisendoerfer@datadoghq.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
This commit is contained in:
parent
b52f6d3721
commit
ba71817390
@ -2087,3 +2087,7 @@ TEXT runtime·retpolineR12(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(12)
|
||||
TEXT runtime·retpolineR13(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(13)
|
||||
TEXT runtime·retpolineR14(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(14)
|
||||
TEXT runtime·retpolineR15(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(15)
|
||||
|
||||
TEXT ·getcallerfp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
|
||||
MOVQ BP, AX
|
||||
RET
|
||||
|
@ -1566,3 +1566,7 @@ TEXT runtime·panicSliceConvert<ABIInternal>(SB),NOSPLIT,$0-16
|
||||
MOVD R2, R0
|
||||
MOVD R3, R1
|
||||
JMP runtime·goPanicSliceConvert<ABIInternal>(SB)
|
||||
|
||||
TEXT ·getcallerfp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
|
||||
MOVD R29, R0
|
||||
RET
|
||||
|
@ -179,6 +179,11 @@ It is a comma-separated list of name=val pairs setting these named variables:
|
||||
IDs will refer to the ID of the goroutine at the time of creation; it's possible for this
|
||||
ID to be reused for another goroutine. Setting N to 0 will report no ancestry information.
|
||||
|
||||
tracefpunwindoff: setting tracefpunwindoff=1 forces the execution tracer to
|
||||
use the runtime's default stack unwinder instead of frame pointer unwinding.
|
||||
This increases tracer overhead, but could be helpful as a workaround or for
|
||||
debugging unexpected regressions caused by frame pointer unwinding.
|
||||
|
||||
asyncpreemptoff: asyncpreemptoff=1 disables signal-based
|
||||
asynchronous goroutine preemption. This makes some loops
|
||||
non-preemptible for long periods, which may delay GC and
|
||||
|
@ -169,3 +169,6 @@ const preemptMSupported = false
|
||||
func preemptM(mp *m) {
|
||||
// No threads, so nothing to do.
|
||||
}
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
func getcallerfp() uintptr { return 0 }
|
||||
|
@ -325,6 +325,7 @@ var debug struct {
|
||||
asyncpreemptoff int32
|
||||
harddecommit int32
|
||||
adaptivestackstart int32
|
||||
tracefpunwindoff int32
|
||||
|
||||
// debug.malloc is used as a combined debug check
|
||||
// in the malloc function and should be set
|
||||
@ -359,6 +360,7 @@ var dbgvars = []*dbgVar{
|
||||
{name: "inittrace", value: &debug.inittrace},
|
||||
{name: "harddecommit", value: &debug.harddecommit},
|
||||
{name: "adaptivestackstart", value: &debug.adaptivestackstart},
|
||||
{name: "tracefpunwindoff", value: &debug.tracefpunwindoff},
|
||||
{name: "panicnil", atomic: &debug.panicnil},
|
||||
}
|
||||
|
||||
|
@ -18,3 +18,6 @@ func emptyfunc()
|
||||
|
||||
//go:noescape
|
||||
func asmcgocall_no_g(fn, arg unsafe.Pointer)
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
func getcallerfp() uintptr { return 0 }
|
||||
|
@ -47,3 +47,7 @@ func asmcgocall_no_g(fn, arg unsafe.Pointer)
|
||||
// respectively. Does not follow the Go ABI.
|
||||
func spillArgs()
|
||||
func unspillArgs()
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
// TODO: Make this a compiler intrinsic
|
||||
func getcallerfp() uintptr
|
||||
|
@ -23,3 +23,6 @@ func read_tls_fallback()
|
||||
|
||||
//go:noescape
|
||||
func asmcgocall_no_g(fn, arg unsafe.Pointer)
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
func getcallerfp() uintptr { return 0 }
|
||||
|
@ -21,3 +21,7 @@ func emptyfunc()
|
||||
// respectively. Does not follow the Go ABI.
|
||||
func spillArgs()
|
||||
func unspillArgs()
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
// TODO: Make this a compiler intrinsic
|
||||
func getcallerfp() uintptr
|
||||
|
@ -9,3 +9,6 @@ package runtime
|
||||
// Called from assembly only; declared for go vet.
|
||||
func load_g()
|
||||
func save_g()
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
func getcallerfp() uintptr { return 0 }
|
||||
|
@ -14,3 +14,6 @@ func save_g()
|
||||
|
||||
//go:noescape
|
||||
func asmcgocall_no_g(fn, arg unsafe.Pointer)
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
func getcallerfp() uintptr { return 0 }
|
||||
|
@ -9,3 +9,6 @@ package runtime
|
||||
// Called from assembly only; declared for go vet.
|
||||
func load_g()
|
||||
func save_g()
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
func getcallerfp() uintptr { return 0 }
|
||||
|
@ -15,3 +15,6 @@ func reginit()
|
||||
// respectively. Does not follow the Go ABI.
|
||||
func spillArgs()
|
||||
func unspillArgs()
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
func getcallerfp() uintptr { return 0 }
|
||||
|
@ -14,3 +14,6 @@ func save_g()
|
||||
// respectively. Does not follow the Go ABI.
|
||||
func spillArgs()
|
||||
func unspillArgs()
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
func getcallerfp() uintptr { return 0 }
|
||||
|
@ -7,3 +7,6 @@ package runtime
|
||||
// Called from assembly only; declared for go vet.
|
||||
func load_g()
|
||||
func save_g()
|
||||
|
||||
// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
|
||||
func getcallerfp() uintptr { return 0 }
|
||||
|
@ -260,7 +260,7 @@ func StartTrace() error {
|
||||
gp.traceseq = 0
|
||||
gp.tracelastp = getg().m.p
|
||||
// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
|
||||
id := trace.stackTab.put([]uintptr{startPCforTrace(gp.startpc) + sys.PCQuantum})
|
||||
id := trace.stackTab.put([]uintptr{logicalStackSentinel, startPCforTrace(gp.startpc) + sys.PCQuantum})
|
||||
traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
|
||||
}
|
||||
if status == _Gwaiting {
|
||||
@ -278,7 +278,7 @@ func StartTrace() error {
|
||||
gp.traceseq = 0
|
||||
gp.tracelastp = getg().m.p
|
||||
// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
|
||||
id := trace.stackTab.put([]uintptr{startPCforTrace(0) + sys.PCQuantum}) // no start pc
|
||||
id := trace.stackTab.put([]uintptr{logicalStackSentinel, startPCforTrace(0) + sys.PCQuantum}) // no start pc
|
||||
traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
|
||||
gp.traceseq++
|
||||
traceEvent(traceEvGoInSyscall, -1, gp.goid)
|
||||
@ -862,27 +862,55 @@ func traceReadCPU() {
|
||||
})
|
||||
buf = bufp.ptr()
|
||||
}
|
||||
for i := range stk {
|
||||
if i >= len(buf.stk) {
|
||||
break
|
||||
}
|
||||
buf.stk[i] = uintptr(stk[i])
|
||||
nstk := 1
|
||||
buf.stk[0] = logicalStackSentinel
|
||||
for ; nstk < len(buf.stk) && nstk-1 < len(stk); nstk++ {
|
||||
buf.stk[nstk] = uintptr(stk[nstk-1])
|
||||
}
|
||||
stackID := trace.stackTab.put(buf.stk[:len(stk)])
|
||||
stackID := trace.stackTab.put(buf.stk[:nstk])
|
||||
|
||||
traceEventLocked(0, nil, 0, bufp, traceEvCPUSample, stackID, 1, timestamp/traceTickDiv, ppid, goid)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func traceStackID(mp *m, buf []uintptr, skip int) uint64 {
|
||||
// logicalStackSentinel is a sentinel value at pcBuf[0] signifying that
|
||||
// pcBuf[1:] holds a logical stack requiring no further processing. Any other
|
||||
// value at pcBuf[0] represents a skip value to apply to the physical stack in
|
||||
// pcBuf[1:] after inline expansion.
|
||||
const logicalStackSentinel = ^uintptr(0)
|
||||
|
||||
// traceStackID captures a stack trace into pcBuf, registers it in the trace
|
||||
// stack table, and returns its unique ID. pcBuf should have a length equal to
|
||||
// traceStackSize. skip controls the number of leaf frames to omit in order to
|
||||
// hide tracer internals from stack traces, see CL 5523.
|
||||
func traceStackID(mp *m, pcBuf []uintptr, skip int) uint64 {
|
||||
gp := getg()
|
||||
curgp := mp.curg
|
||||
var nstk int
|
||||
if curgp == gp {
|
||||
nstk = callers(skip+1, buf)
|
||||
} else if curgp != nil {
|
||||
nstk = gcallers(curgp, skip, buf)
|
||||
nstk := 1
|
||||
if tracefpunwindoff() {
|
||||
// Slow path: Unwind using default unwinder. Used when frame pointer
|
||||
// unwinding is unavailable or disabled.
|
||||
pcBuf[0] = logicalStackSentinel
|
||||
if curgp == gp {
|
||||
nstk += callers(skip+1, pcBuf[1:])
|
||||
} else if curgp != nil {
|
||||
nstk += gcallers(curgp, skip, pcBuf[1:])
|
||||
}
|
||||
} else {
|
||||
// Fast path: Unwind using frame pointers.
|
||||
pcBuf[0] = uintptr(skip)
|
||||
if curgp == gp {
|
||||
nstk += fpTracebackPCs(unsafe.Pointer(getcallerfp()), skip, pcBuf[1:])
|
||||
} else if curgp != nil {
|
||||
// We're called on the g0 stack through mcall(fn) or systemstack(fn). To
|
||||
// behave like gcallers above, we start unwinding from sched.bp, which
|
||||
// points to the caller frame of the leaf frame on g's stack. The return
|
||||
// address of the leaf frame is stored in sched.pc, which we manually
|
||||
// capture here.
|
||||
pcBuf[1] = curgp.sched.pc
|
||||
nstk += 1 + fpTracebackPCs(unsafe.Pointer(curgp.sched.bp), skip, pcBuf[2:])
|
||||
}
|
||||
}
|
||||
if nstk > 0 {
|
||||
nstk-- // skip runtime.goexit
|
||||
@ -890,10 +918,32 @@ func traceStackID(mp *m, buf []uintptr, skip int) uint64 {
|
||||
if nstk > 0 && curgp.goid == 1 {
|
||||
nstk-- // skip runtime.main
|
||||
}
|
||||
id := trace.stackTab.put(buf[:nstk])
|
||||
id := trace.stackTab.put(pcBuf[:nstk])
|
||||
return uint64(id)
|
||||
}
|
||||
|
||||
// tracefpunwindoff returns false if frame pointer unwinding for the tracer is
|
||||
// disabled via GODEBUG or not supported by the architecture.
|
||||
func tracefpunwindoff() bool {
|
||||
// compiler emits frame pointers for amd64 and arm64, but issue 58432 blocks
|
||||
// arm64 support for now.
|
||||
return debug.tracefpunwindoff != 0 || goarch.ArchFamily != goarch.AMD64
|
||||
}
|
||||
|
||||
// fpTracebackPCs populates pcBuf with the return addresses for each frame and
|
||||
// returns the number of PCs written to pcBuf. The returned PCs correspond to
|
||||
// "physical frames" rather than "logical frames"; that is if A is inlined into
|
||||
// B, this will return a PC for only B.
|
||||
func fpTracebackPCs(fp unsafe.Pointer, skip int, pcBuf []uintptr) (i int) {
|
||||
for i = 0; i < len(pcBuf) && fp != nil; i++ {
|
||||
// return addr sits one word above the frame pointer
|
||||
pcBuf[i] = *(*uintptr)(unsafe.Pointer(uintptr(fp) + goarch.PtrSize))
|
||||
// follow the frame pointer to the next one
|
||||
fp = unsafe.Pointer(*(*uintptr)(fp))
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// traceAcquireBuffer returns trace buffer to use and, if necessary, locks it.
|
||||
func traceAcquireBuffer() (mp *m, pid int32, bufp *traceBufPtr) {
|
||||
// Any time we acquire a buffer, we may end up flushing it,
|
||||
@ -1178,7 +1228,7 @@ func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
|
||||
stk := tab.tab[i].ptr()
|
||||
for ; stk != nil; stk = stk.link.ptr() {
|
||||
var frames []traceFrame
|
||||
frames, bufp = traceFrames(bufp, stk.stack())
|
||||
frames, bufp = traceFrames(bufp, fpunwindExpand(stk.stack()))
|
||||
|
||||
// Estimate the size of this record. This
|
||||
// bound is pretty loose, but avoids counting
|
||||
@ -1218,6 +1268,62 @@ func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
|
||||
return bufp
|
||||
}
|
||||
|
||||
// fpunwindExpand checks if pcBuf contains logical frames (which include inlined
|
||||
// frames) or physical frames (produced by frame pointer unwinding) using a
|
||||
// sentinel value in pcBuf[0]. Logical frames are simply returned without the
|
||||
// sentinel. Physical frames are turned into logical frames via inline unwinding
|
||||
// and by applying the skip value that's stored in pcBuf[0].
|
||||
func fpunwindExpand(pcBuf []uintptr) []uintptr {
|
||||
if len(pcBuf) > 0 && pcBuf[0] == logicalStackSentinel {
|
||||
// pcBuf contains logical rather than inlined frames, skip has already been
|
||||
// applied, just return it without the sentinel value in pcBuf[0].
|
||||
return pcBuf[1:]
|
||||
}
|
||||
|
||||
var (
|
||||
cache pcvalueCache
|
||||
lastFuncID = funcID_normal
|
||||
newPCBuf = make([]uintptr, 0, traceStackSize)
|
||||
skip = pcBuf[0]
|
||||
// skipOrAdd skips or appends retPC to newPCBuf and returns true if more
|
||||
// pcs can be added.
|
||||
skipOrAdd = func(retPC uintptr) bool {
|
||||
if skip > 0 {
|
||||
skip--
|
||||
} else {
|
||||
newPCBuf = append(newPCBuf, retPC)
|
||||
}
|
||||
return len(newPCBuf) < cap(newPCBuf)
|
||||
}
|
||||
)
|
||||
|
||||
outer:
|
||||
for _, retPC := range pcBuf[1:] {
|
||||
callPC := retPC - 1
|
||||
fi := findfunc(callPC)
|
||||
if !fi.valid() {
|
||||
// There is no funcInfo if callPC belongs to a C function. In this case
|
||||
// we still keep the pc, but don't attempt to expand inlined frames.
|
||||
if more := skipOrAdd(retPC); !more {
|
||||
break outer
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
u, uf := newInlineUnwinder(fi, callPC, &cache)
|
||||
for ; uf.valid(); uf = u.next(uf) {
|
||||
sf := u.srcFunc(uf)
|
||||
if sf.funcID == funcID_wrapper && elideWrapperCalling(lastFuncID) {
|
||||
// ignore wrappers
|
||||
} else if more := skipOrAdd(uf.pc + 1); !more {
|
||||
break outer
|
||||
}
|
||||
lastFuncID = sf.funcID
|
||||
}
|
||||
}
|
||||
return newPCBuf
|
||||
}
|
||||
|
||||
type traceFrame struct {
|
||||
PC uintptr
|
||||
funcID uint64
|
||||
@ -1390,7 +1496,7 @@ func traceGoCreate(newg *g, pc uintptr) {
|
||||
newg.traceseq = 0
|
||||
newg.tracelastp = getg().m.p
|
||||
// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
|
||||
id := trace.stackTab.put([]uintptr{startPCforTrace(pc) + sys.PCQuantum})
|
||||
id := trace.stackTab.put([]uintptr{logicalStackSentinel, startPCforTrace(pc) + sys.PCQuantum})
|
||||
traceEvent(traceEvGoCreate, 2, newg.goid, uint64(id))
|
||||
}
|
||||
|
||||
@ -1443,7 +1549,16 @@ func traceGoUnpark(gp *g, skip int) {
|
||||
}
|
||||
|
||||
func traceGoSysCall() {
|
||||
traceEvent(traceEvGoSysCall, 1)
|
||||
if tracefpunwindoff() {
|
||||
traceEvent(traceEvGoSysCall, 1)
|
||||
} else {
|
||||
// The default unwinder starts unwinding from gp.syscallsp
|
||||
// which is captured 3 frames above this frame. We could
|
||||
// capture gp.syscallbp to allow frame pointer unwinding to
|
||||
// behave the same, but skipping 3 more frames here is
|
||||
// simpler.
|
||||
traceEvent(traceEvGoSysCall, 4)
|
||||
}
|
||||
}
|
||||
|
||||
func traceGoSysExit(ts int64) {
|
||||
|
Loading…
Reference in New Issue
Block a user