
runtime/trace: enable frame pointer unwinding on amd64

Change tracer to use frame pointer unwinding by default on amd64. The
expansion of inline frames is delayed until the stack table is dumped at
the end of the trace. This requires storing the skip argument in the
stack table; the skip value now resides in pcBuf[0]. For stacks that are not
produced by traceStackID (e.g. CPU samples), a logicalStackSentinel
value in pcBuf[0] indicates that no inline expansion is needed.
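
As a rough sketch, the pcBuf layout under this scheme looks as follows
(illustrative Go only; logicalStackSentinel mirrors the constant added in
the runtime diff below, while the encode helpers are hypothetical and exist
purely for this example):

package main

import "fmt"

// ^uintptr(0) in pcBuf[0] marks pcBuf[1:] as an already-logical stack that
// needs no inline expansion; any other value in pcBuf[0] is the skip count
// to apply after inline expansion of the physical stack in pcBuf[1:].
const logicalStackSentinel = ^uintptr(0)

// encodePhysical stores the skip value in pcBuf[0], ahead of the PCs
// gathered by frame pointer unwinding (hypothetical helper).
func encodePhysical(skip int, pcs []uintptr) []uintptr {
	return append([]uintptr{uintptr(skip)}, pcs...)
}

// encodeLogical marks an already-expanded stack, e.g. a CPU sample, so the
// dump step leaves it alone (hypothetical helper).
func encodeLogical(pcs []uintptr) []uintptr {
	return append([]uintptr{logicalStackSentinel}, pcs...)
}

func main() {
	phys := encodePhysical(2, []uintptr{0x100, 0x200})
	fmt.Println(phys[0]) // 2: expand inlines, then skip 2 frames
	logical := encodeLogical([]uintptr{0x100})
	fmt.Println(logical[0] == logicalStackSentinel) // true: use as-is
}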

Add new GODEBUG=tracefpunwindoff=1 option to use the old unwinder if
needed.

Benchmarks show a considerable decrease in CPU overhead when using frame
pointer unwinding for trace events:

GODEBUG=tracefpunwindoff=1 ../bin/go test -run '^$' -bench '.+PingPong' -count 20 -v -trace /dev/null ./runtime | tee tracefpunwindoff1.txt
GODEBUG=tracefpunwindoff=0 ../bin/go test -run '^$' -bench '.+PingPong' -count 20 -v -trace /dev/null ./runtime | tee tracefpunwindoff0.txt

goos: linux
goarch: amd64
pkg: runtime
cpu: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
               │ tracefpunwindoff1.txt │        tracefpunwindoff0.txt        │
               │        sec/op         │   sec/op     vs base                │
PingPongHog-32            3782.5n ± 0%   740.7n ± 2%  -80.42% (p=0.000 n=20)

For #16638

Change-Id: I2928a2fcd8779a31c45ce0f2fbcc0179641190bb
Reviewed-on: https://go-review.googlesource.com/c/go/+/463835
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Felix Geisendörfer <felix.geisendoerfer@datadoghq.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Authored by Felix Geisendörfer on 2023-03-09 07:54:51 +00:00; committed by Michael Pratt
parent b52f6d3721
commit ba71817390
16 changed files with 183 additions and 18 deletions

@@ -2087,3 +2087,7 @@ TEXT runtime·retpolineR12(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(12)
 TEXT runtime·retpolineR13(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(13)
 TEXT runtime·retpolineR14(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(14)
 TEXT runtime·retpolineR15(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(15)
+
+TEXT ·getcallerfp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
+	MOVQ BP, AX
+	RET

@@ -1566,3 +1566,7 @@ TEXT runtime·panicSliceConvert<ABIInternal>(SB),NOSPLIT,$0-16
 	MOVD R2, R0
 	MOVD R3, R1
 	JMP  runtime·goPanicSliceConvert<ABIInternal>(SB)
+
+TEXT ·getcallerfp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
+	MOVD R29, R0
+	RET
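
Both stubs simply return the frame pointer register (BP on amd64, R29 on
arm64). To illustrate the walk that the new fpTracebackPCs helper shown
further down performs over such a chain, here is a self-contained sketch
that builds synthetic frames in ordinary memory instead of reading the real
stack (illustrative only):

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	// Each synthetic frame mimics the amd64 layout: *fp holds the parent's
	// frame pointer and the word above it holds the return address.
	frame0 := [2]uintptr{0, 0x100} // parent fp of 0 terminates the walk
	frame1 := [2]uintptr{uintptr(unsafe.Pointer(&frame0)), 0x200}
	frame2 := [2]uintptr{uintptr(unsafe.Pointer(&frame1)), 0x300}

	var pcs []uintptr
	fp := unsafe.Pointer(&frame2)
	for fp != nil && len(pcs) < 32 {
		// return addr sits one word above the frame pointer
		pcs = append(pcs, *(*uintptr)(unsafe.Pointer(uintptr(fp) + unsafe.Sizeof(uintptr(0)))))
		// follow the frame pointer to the next one
		fp = unsafe.Pointer(*(*uintptr)(fp))
	}
	fmt.Printf("%#x\n", pcs) // [0x300 0x200 0x100], leaf to root
}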

@@ -179,6 +179,11 @@ It is a comma-separated list of name=val pairs setting these named variables:
 	IDs will refer to the ID of the goroutine at the time of creation; it's possible for this
 	ID to be reused for another goroutine. Setting N to 0 will report no ancestry information.
+	tracefpunwindoff: setting tracefpunwindoff=1 forces the execution tracer to
+	use the runtime's default stack unwinder instead of frame pointer unwinding.
+	This increases tracer overhead, but could be helpful as a workaround or for
+	debugging unexpected regressions caused by frame pointer unwinding.
 	asyncpreemptoff: asyncpreemptoff=1 disables signal-based
 	asynchronous goroutine preemption. This makes some loops
 	non-preemptible for long periods, which may delay GC and

@@ -169,3 +169,6 @@ const preemptMSupported = false
 func preemptM(mp *m) {
 	// No threads, so nothing to do.
 }
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }

@@ -325,6 +325,7 @@ var debug struct {
 	asyncpreemptoff    int32
 	harddecommit       int32
 	adaptivestackstart int32
+	tracefpunwindoff   int32
 
 	// debug.malloc is used as a combined debug check
 	// in the malloc function and should be set
@@ -359,6 +360,7 @@ var dbgvars = []*dbgVar{
 	{name: "inittrace", value: &debug.inittrace},
 	{name: "harddecommit", value: &debug.harddecommit},
 	{name: "adaptivestackstart", value: &debug.adaptivestackstart},
+	{name: "tracefpunwindoff", value: &debug.tracefpunwindoff},
 	{name: "panicnil", atomic: &debug.panicnil},
 }

@@ -18,3 +18,6 @@ func emptyfunc()
 
 //go:noescape
 func asmcgocall_no_g(fn, arg unsafe.Pointer)
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }

@@ -47,3 +47,7 @@ func asmcgocall_no_g(fn, arg unsafe.Pointer)
 // respectively. Does not follow the Go ABI.
 func spillArgs()
 func unspillArgs()
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+// TODO: Make this a compiler intrinsic
+func getcallerfp() uintptr

@@ -23,3 +23,6 @@ func read_tls_fallback()
 
 //go:noescape
 func asmcgocall_no_g(fn, arg unsafe.Pointer)
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }

@@ -21,3 +21,7 @@ func emptyfunc()
 // respectively. Does not follow the Go ABI.
 func spillArgs()
 func unspillArgs()
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+// TODO: Make this a compiler intrinsic
+func getcallerfp() uintptr

@@ -9,3 +9,6 @@ package runtime
 // Called from assembly only; declared for go vet.
 func load_g()
 func save_g()
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }

@@ -14,3 +14,6 @@ func save_g()
 
 //go:noescape
 func asmcgocall_no_g(fn, arg unsafe.Pointer)
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }

@@ -9,3 +9,6 @@ package runtime
 // Called from assembly only; declared for go vet.
 func load_g()
 func save_g()
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }

@@ -15,3 +15,6 @@ func reginit()
 // respectively. Does not follow the Go ABI.
 func spillArgs()
 func unspillArgs()
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }

@@ -14,3 +14,6 @@ func save_g()
 // respectively. Does not follow the Go ABI.
 func spillArgs()
 func unspillArgs()
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }

@@ -7,3 +7,6 @@ package runtime
 // Called from assembly only; declared for go vet.
 func load_g()
 func save_g()
+
+// getcallerfp returns the address of the frame pointer in the callers frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }

@@ -260,7 +260,7 @@ func StartTrace() error {
 			gp.traceseq = 0
 			gp.tracelastp = getg().m.p
 			// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
-			id := trace.stackTab.put([]uintptr{startPCforTrace(gp.startpc) + sys.PCQuantum})
+			id := trace.stackTab.put([]uintptr{logicalStackSentinel, startPCforTrace(gp.startpc) + sys.PCQuantum})
 			traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
 		}
 		if status == _Gwaiting {
@@ -278,7 +278,7 @@ func StartTrace() error {
 			gp.traceseq = 0
 			gp.tracelastp = getg().m.p
 			// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
-			id := trace.stackTab.put([]uintptr{startPCforTrace(0) + sys.PCQuantum}) // no start pc
+			id := trace.stackTab.put([]uintptr{logicalStackSentinel, startPCforTrace(0) + sys.PCQuantum}) // no start pc
 			traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
 			gp.traceseq++
 			traceEvent(traceEvGoInSyscall, -1, gp.goid)
@@ -862,27 +862,55 @@ func traceReadCPU() {
 				})
 				buf = bufp.ptr()
 			}
-			for i := range stk {
-				if i >= len(buf.stk) {
-					break
-				}
-				buf.stk[i] = uintptr(stk[i])
+			nstk := 1
+			buf.stk[0] = logicalStackSentinel
+			for ; nstk < len(buf.stk) && nstk-1 < len(stk); nstk++ {
+				buf.stk[nstk] = uintptr(stk[nstk-1])
 			}
-			stackID := trace.stackTab.put(buf.stk[:len(stk)])
+			stackID := trace.stackTab.put(buf.stk[:nstk])
 
 			traceEventLocked(0, nil, 0, bufp, traceEvCPUSample, stackID, 1, timestamp/traceTickDiv, ppid, goid)
 		}
 	}
 }
 
-func traceStackID(mp *m, buf []uintptr, skip int) uint64 {
+// logicalStackSentinel is a sentinel value at pcBuf[0] signifying that
+// pcBuf[1:] holds a logical stack requiring no further processing. Any other
+// value at pcBuf[0] represents a skip value to apply to the physical stack in
+// pcBuf[1:] after inline expansion.
+const logicalStackSentinel = ^uintptr(0)
+
+// traceStackID captures a stack trace into pcBuf, registers it in the trace
+// stack table, and returns its unique ID. pcBuf should have a length equal to
+// traceStackSize. skip controls the number of leaf frames to omit in order to
+// hide tracer internals from stack traces, see CL 5523.
+func traceStackID(mp *m, pcBuf []uintptr, skip int) uint64 {
 	gp := getg()
 	curgp := mp.curg
-	var nstk int
-	if curgp == gp {
-		nstk = callers(skip+1, buf)
-	} else if curgp != nil {
-		nstk = gcallers(curgp, skip, buf)
+	nstk := 1
+	if tracefpunwindoff() {
+		// Slow path: Unwind using default unwinder. Used when frame pointer
+		// unwinding is unavailable or disabled.
+		pcBuf[0] = logicalStackSentinel
+		if curgp == gp {
+			nstk += callers(skip+1, pcBuf[1:])
+		} else if curgp != nil {
+			nstk += gcallers(curgp, skip, pcBuf[1:])
+		}
+	} else {
+		// Fast path: Unwind using frame pointers.
+		pcBuf[0] = uintptr(skip)
+		if curgp == gp {
+			nstk += fpTracebackPCs(unsafe.Pointer(getcallerfp()), skip, pcBuf[1:])
+		} else if curgp != nil {
+			// We're called on the g0 stack through mcall(fn) or systemstack(fn). To
+			// behave like gcallers above, we start unwinding from sched.bp, which
+			// points to the caller frame of the leaf frame on g's stack. The return
+			// address of the leaf frame is stored in sched.pc, which we manually
+			// capture here.
+			pcBuf[1] = curgp.sched.pc
+			nstk += 1 + fpTracebackPCs(unsafe.Pointer(curgp.sched.bp), skip, pcBuf[2:])
+		}
 	}
 	if nstk > 0 {
 		nstk-- // skip runtime.goexit
@@ -890,10 +918,32 @@ func traceStackID(mp *m, buf []uintptr, skip int) uint64 {
 	if nstk > 0 && curgp.goid == 1 {
 		nstk-- // skip runtime.main
 	}
-	id := trace.stackTab.put(buf[:nstk])
+	id := trace.stackTab.put(pcBuf[:nstk])
 	return uint64(id)
 }
 
+// tracefpunwindoff returns true if frame pointer unwinding for the tracer is
+// disabled via GODEBUG or not supported by the architecture.
+func tracefpunwindoff() bool {
+	// compiler emits frame pointers for amd64 and arm64, but issue 58432 blocks
+	// arm64 support for now.
+	return debug.tracefpunwindoff != 0 || goarch.ArchFamily != goarch.AMD64
+}
+
+// fpTracebackPCs populates pcBuf with the return addresses for each frame and
+// returns the number of PCs written to pcBuf. The returned PCs correspond to
+// "physical frames" rather than "logical frames"; that is, if A is inlined into
+// B, this will return a PC for only B.
+func fpTracebackPCs(fp unsafe.Pointer, skip int, pcBuf []uintptr) (i int) {
+	for i = 0; i < len(pcBuf) && fp != nil; i++ {
+		// return addr sits one word above the frame pointer
+		pcBuf[i] = *(*uintptr)(unsafe.Pointer(uintptr(fp) + goarch.PtrSize))
+		// follow the frame pointer to the next one
+		fp = unsafe.Pointer(*(*uintptr)(fp))
+	}
+	return i
+}
+
 // traceAcquireBuffer returns trace buffer to use and, if necessary, locks it.
 func traceAcquireBuffer() (mp *m, pid int32, bufp *traceBufPtr) {
 	// Any time we acquire a buffer, we may end up flushing it,
@@ -1178,7 +1228,7 @@ func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
 		stk := tab.tab[i].ptr()
 		for ; stk != nil; stk = stk.link.ptr() {
 			var frames []traceFrame
-			frames, bufp = traceFrames(bufp, stk.stack())
+			frames, bufp = traceFrames(bufp, fpunwindExpand(stk.stack()))
 
 			// Estimate the size of this record. This
 			// bound is pretty loose, but avoids counting
@@ -1218,6 +1268,62 @@ func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
 	return bufp
 }
 
+// fpunwindExpand checks if pcBuf contains logical frames (which include inlined
+// frames) or physical frames (produced by frame pointer unwinding) using a
+// sentinel value in pcBuf[0]. Logical frames are simply returned without the
+// sentinel. Physical frames are turned into logical frames via inline unwinding
+// and by applying the skip value that's stored in pcBuf[0].
+func fpunwindExpand(pcBuf []uintptr) []uintptr {
+	if len(pcBuf) > 0 && pcBuf[0] == logicalStackSentinel {
+		// pcBuf contains logical rather than inlined frames, skip has already been
+		// applied, just return it without the sentinel value in pcBuf[0].
+		return pcBuf[1:]
+	}
+
+	var (
+		cache      pcvalueCache
+		lastFuncID = funcID_normal
+		newPCBuf   = make([]uintptr, 0, traceStackSize)
+		skip       = pcBuf[0]
+		// skipOrAdd skips or appends retPC to newPCBuf and returns true if more
+		// pcs can be added.
+		skipOrAdd = func(retPC uintptr) bool {
+			if skip > 0 {
+				skip--
+			} else {
+				newPCBuf = append(newPCBuf, retPC)
+			}
+			return len(newPCBuf) < cap(newPCBuf)
+		}
+	)
+
+outer:
+	for _, retPC := range pcBuf[1:] {
+		callPC := retPC - 1
+		fi := findfunc(callPC)
+		if !fi.valid() {
+			// There is no funcInfo if callPC belongs to a C function. In this case
+			// we still keep the pc, but don't attempt to expand inlined frames.
+			if more := skipOrAdd(retPC); !more {
+				break outer
+			}
+			continue
+		}
+
+		u, uf := newInlineUnwinder(fi, callPC, &cache)
+		for ; uf.valid(); uf = u.next(uf) {
+			sf := u.srcFunc(uf)
+			if sf.funcID == funcID_wrapper && elideWrapperCalling(lastFuncID) {
+				// ignore wrappers
+			} else if more := skipOrAdd(uf.pc + 1); !more {
+				break outer
+			}
+			lastFuncID = sf.funcID
+		}
+	}
+	return newPCBuf
+}
+
 type traceFrame struct {
 	PC     uintptr
 	funcID uint64
@@ -1390,7 +1496,7 @@ func traceGoCreate(newg *g, pc uintptr) {
 	newg.traceseq = 0
 	newg.tracelastp = getg().m.p
 	// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
-	id := trace.stackTab.put([]uintptr{startPCforTrace(pc) + sys.PCQuantum})
+	id := trace.stackTab.put([]uintptr{logicalStackSentinel, startPCforTrace(pc) + sys.PCQuantum})
 	traceEvent(traceEvGoCreate, 2, newg.goid, uint64(id))
 }
 
@@ -1443,7 +1549,16 @@ func traceGoUnpark(gp *g, skip int) {
 }
 
 func traceGoSysCall() {
-	traceEvent(traceEvGoSysCall, 1)
+	if tracefpunwindoff() {
+		traceEvent(traceEvGoSysCall, 1)
+	} else {
+		// The default unwinder starts unwinding from gp.syscallsp
+		// which is captured 3 frames above this frame. We could
+		// capture gp.syscallbp to allow frame pointer unwinding to
+		// behave the same, but skipping 3 more frames here is
+		// simpler.
+		traceEvent(traceEvGoSysCall, 4)
+	}
 }
 
 func traceGoSysExit(ts int64) {
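
Nothing changes for users of the runtime/trace package; a typical setup
like the one below simply records stacks more cheaply on amd64, and setting
GODEBUG=tracefpunwindoff=1 before starting the program falls back to the
default unwinder (a minimal usage sketch):

package main

import (
	"log"
	"os"
	"runtime/trace"
)

func main() {
	f, err := os.Create("out.trace")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Stacks for events recorded between Start and Stop are now captured
	// via frame pointer unwinding by default on amd64.
	if err := trace.Start(f); err != nil {
		log.Fatal(err)
	}
	defer trace.Stop()

	// ... workload to be traced ...
}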