From e038c7e4189a33ae25eb2c446f608b6e2f946823 Mon Sep 17 00:00:00 2001 From: "Hana (Hyang-Ah) Kim" Date: Sat, 2 Nov 2019 17:16:36 +0900 Subject: [PATCH] runtime/pprof: correctly encode inlined functions in CPU profile The pprof profile proto message expects inlined functions of a PC to be encoded in one Location entry using multiple Line entries. https://github.com/google/pprof/blob/5e96527/proto/profile.proto#L177-L184 runtime/pprof has encoded the symbolization information by creating a Location for each PC found in the stack trace and including info from all the frames expanded from the PC using runtime.CallersFrames. This assumes inlined functions are represented as a single PC in the stack trace. (https://go-review.googlesource.com/41256) In the recent years, behavior around inlining and the traceback changed significantly (e.g. https://golang.org/cl/152537, https://golang.org/issue/29582, and many changes). Now the PCs in the stack trace represent user frames even including inline marks. As a result, the profile proto started to allocate a Location entry for each user frame, lose the inline information (so pprof presented incorrect results when inlined functions are involved), and confuse the pprof tool with those PCs made up for inline marks. This CL attempts to detect inlined call frames from the stack traces of CPU profiles, and organize the Location information as intended. Currently, runtime does not provide a reliable and convenient way to detect inlined call frames and expand user frames from a given externally recognizable PCs. So we use heuristics to recover the groups - inlined call frames have nil Func field - inlined call frames will have the same Entry point - but must be careful with recursive functions that have the same Entry point by definition, and non-Go functions that may lack most of the fields of Frame. The followup CL will address the issue with other profile types. Change-Id: I0c9667ab016a3e898d648f31c3f82d84c15398db Reviewed-on: https://go-review.googlesource.com/c/go/+/204636 Reviewed-by: Keith Randall --- src/runtime/pprof/pprof_test.go | 97 +++++-- src/runtime/pprof/proto.go | 251 ++++++++++++++++-- src/runtime/pprof/proto_test.go | 11 + .../pprof/testdata/mappingtest/main.go | 13 +- 4 files changed, 318 insertions(+), 54 deletions(-) diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go index ed04fe06acf..b553baf3a95 100644 --- a/src/runtime/pprof/pprof_test.go +++ b/src/runtime/pprof/pprof_test.go @@ -49,8 +49,12 @@ var ( // Must not call other functions nor access heap/globals in the loop, // otherwise under race detector the samples will be in the race runtime. func cpuHog1(x int) int { + return cpuHog0(x, 1e5) +} + +func cpuHog0(x, n int) int { foo := x - for i := 0; i < 1e5; i++ { + for i := 0; i < n; i++ { if foo > 0 { foo *= foo } else { @@ -101,34 +105,69 @@ func TestCPUProfileMultithreaded(t *testing.T) { } func TestCPUProfileInlining(t *testing.T) { - testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) { + p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) { cpuHogger(inlinedCaller, &salt1, dur) }) + + // Check if inlined function locations are encoded correctly. The inlinedCalee and inlinedCaller should be in one location. + for _, loc := range p.Location { + hasInlinedCallerAfterInlinedCallee, hasInlinedCallee := false, false + for _, line := range loc.Line { + if line.Function.Name == "runtime/pprof.inlinedCallee" { + hasInlinedCallee = true + } + if hasInlinedCallee && line.Function.Name == "runtime/pprof.inlinedCaller" { + hasInlinedCallerAfterInlinedCallee = true + } + } + if hasInlinedCallee != hasInlinedCallerAfterInlinedCallee { + t.Fatalf("want inlinedCallee followed by inlinedCaller, got separate Location entries:\n%v", p) + } + } } func inlinedCaller(x int) int { - x = inlinedCallee(x) + x = inlinedCallee(x, 1e5) return x } -func inlinedCallee(x int) int { - // We could just use cpuHog1, but for loops prevent inlining - // right now. :( - foo := x - i := 0 -loop: - if foo > 0 { - foo *= foo - } else { - foo *= foo + 1 - } - if i++; i < 1e5 { - goto loop - } - return foo +func inlinedCallee(x, n int) int { + return cpuHog0(x, n) } -func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) { +func TestCPUProfileRecursion(t *testing.T) { + p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.recursionCallee", "runtime/pprof.recursionCaller"}, avoidFunctions(), func(dur time.Duration) { + cpuHogger(recursionCaller, &salt1, dur) + }) + + // check the Location encoding was not confused by recursive calls. + for i, loc := range p.Location { + recursionFunc := 0 + for _, line := range loc.Line { + if name := line.Function.Name; name == "runtime/pprof.recursionCaller" || name == "runtime/pprof.recursionCallee" { + recursionFunc++ + } + } + if recursionFunc > 1 { + t.Fatalf("want at most one recursionCaller or recursionCallee in one Location, got a violating Location (index: %d):\n%v", i, p) + } + } +} + +func recursionCaller(x int) int { + y := recursionCallee(3, x) + return y +} + +func recursionCallee(n, x int) int { + if n == 0 { + return 1 + } + y := inlinedCallee(x, 1e4) + return y * recursionCallee(n-1, x) +} + +func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) *profile.Profile { p, err := profile.Parse(bytes.NewReader(valBytes)) if err != nil { t.Fatal(err) @@ -137,11 +176,12 @@ func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Loca count := uintptr(sample.Value[0]) f(count, sample.Location, sample.Label) } + return p } // testCPUProfile runs f under the CPU profiler, checking for some conditions specified by need, -// as interpreted by matches. -func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) { +// as interpreted by matches, and returns the parsed profile. +func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) *profile.Profile { switch runtime.GOOS { case "darwin": switch runtime.GOARCH { @@ -195,8 +235,8 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri f(duration) StopCPUProfile() - if profileOk(t, matches, need, avoid, prof, duration) { - return + if p, ok := profileOk(t, matches, need, avoid, prof, duration); ok { + return p } duration *= 2 @@ -217,6 +257,7 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri t.Skip("ignore the failure in QEMU; see golang.org/issue/9605") } t.FailNow() + return nil } func contains(slice []string, s string) bool { @@ -242,7 +283,7 @@ func stackContains(spec string, count uintptr, stk []*profile.Location, labels m type matchFunc func(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool -func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (ok bool) { +func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (_ *profile.Profile, ok bool) { ok = true // Check that profile is well formed, contains 'need', and does not contain @@ -251,7 +292,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p avoidSamples := make([]uintptr, len(avoid)) var samples uintptr var buf bytes.Buffer - parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) { + p := parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) { fmt.Fprintf(&buf, "%d:", count) fprintStack(&buf, stk) samples += count @@ -278,7 +319,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p // not enough samples due to coarse timer // resolution. Let it go. t.Log("too few samples on Windows (golang.org/issue/10842)") - return false + return p, false } // Check that we got a reasonable number of samples. @@ -300,7 +341,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p } if len(need) == 0 { - return ok + return p, ok } var total uintptr @@ -323,7 +364,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p ok = false } } - return ok + return p, ok } // Fork can hang if preempted with signals frequently enough (see issue 5517). diff --git a/src/runtime/pprof/proto.go b/src/runtime/pprof/proto.go index 7864dd79ad0..688df4b942f 100644 --- a/src/runtime/pprof/proto.go +++ b/src/runtime/pprof/proto.go @@ -41,8 +41,8 @@ type profileBuilder struct { pb protobuf strings []string stringMap map[string]int - locs map[uintptr]int - funcs map[string]int // Package path-qualified function name to Function.ID + locs map[uintptr]locInfo // list of locInfo starting with the given PC. + funcs map[string]int // Package path-qualified function name to Function.ID mem []memMap } @@ -207,13 +207,43 @@ func (b *profileBuilder) pbMapping(tag int, id, base, limit, offset uint64, file b.pb.endMessage(tag, start) } +func allFrames(addr uintptr) ([]runtime.Frame, symbolizeFlag) { + // Expand this one address using CallersFrames so we can cache + // each expansion. In general, CallersFrames takes a whole + // stack, but in this case we know there will be no skips in + // the stack and we have return PCs anyway. + frames := runtime.CallersFrames([]uintptr{addr}) + frame, more := frames.Next() + if frame.Function == "runtime.goexit" { + // Short-circuit if we see runtime.goexit so the loop + // below doesn't allocate a useless empty location. + return nil, 0 + } + + symbolizeResult := lookupTried + if frame.PC == 0 || frame.Function == "" || frame.File == "" || frame.Line == 0 { + symbolizeResult |= lookupFailed + } + + if frame.PC == 0 { + // If we failed to resolve the frame, at least make up + // a reasonable call PC. This mostly happens in tests. + frame.PC = addr - 1 + } + ret := []runtime.Frame{frame} + for frame.Function != "runtime.goexit" && more == true { + frame, more = frames.Next() + ret = append(ret, frame) + } + return ret, symbolizeResult +} + // locForPC returns the location ID for addr. // addr must a return PC or 1 + the PC of an inline marker. This returns the location of the corresponding call. // It may emit to b.pb, so there must be no message encoding in progress. func (b *profileBuilder) locForPC(addr uintptr) uint64 { - id := uint64(b.locs[addr]) - if id != 0 { - return id + if loc, ok := b.locs[addr]; ok { + return loc.id } // Expand this one address using CallersFrames so we can cache @@ -248,8 +278,8 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 { } newFuncs := make([]newFunc, 0, 8) - id = uint64(len(b.locs)) + 1 - b.locs[addr] = int(id) + id := uint64(len(b.locs)) + 1 + b.locs[addr] = locInfo{id: id, pcs: []uintptr{addr}} start := b.pb.startMessage() b.pb.uint64Opt(tagLocation_ID, id) b.pb.uint64Opt(tagLocation_Address, uint64(frame.PC)) @@ -293,6 +323,16 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 { return id } +type locInfo struct { + // location id assigned by the profileBuilder + id uint64 + + // sequence of PCs, including the fake PCs returned by the traceback + // to represent inlined functions + // https://github.com/golang/go/blob/d6f2f833c93a41ec1c68e49804b8387a06b131c5/src/runtime/traceback.go#L347-L368 + pcs []uintptr +} + // newProfileBuilder returns a new profileBuilder. // CPU profiling data obtained from the runtime can be added // by calling b.addCPUData, and then the eventual profile @@ -305,7 +345,7 @@ func newProfileBuilder(w io.Writer) *profileBuilder { start: time.Now(), strings: []string{""}, stringMap: map[string]int{"": 0}, - locs: map[uintptr]int{}, + locs: map[uintptr]locInfo{}, funcs: map[string]int{}, } b.readMapping() @@ -388,7 +428,10 @@ func (b *profileBuilder) build() { } values := []int64{0, 0} + + var deck = &pcDeck{} var locs []uint64 + for e := b.m.all; e != nil; e = e.nextAll { values[0] = e.count values[1] = e.count * b.period @@ -402,23 +445,62 @@ func (b *profileBuilder) build() { } } + deck.reset() locs = locs[:0] - for i, addr := range e.stk { - // Addresses from stack traces point to the - // next instruction after each call, except - // for the leaf, which points to where the - // signal occurred. locForPC expects return - // PCs, so increment the leaf address to look - // like a return PC. - if i == 0 { - addr++ - } - l := b.locForPC(addr) - if l == 0 { // runtime.goexit + + // Addresses from stack traces point to the next instruction after each call, + // except for the leaf, which points to where the signal occurred. + // deck.add+emitLocation expects return PCs so increment the leaf address to + // look like a return PC. + e.stk[0] += 1 + for stk := e.stk; len(stk) > 0; { + addr := stk[0] + if l, ok := b.locs[addr]; ok { + // first record the location if there is any pending accumulated info. + if id := b.emitLocation(deck); id > 0 { + locs = append(locs, id) + } + + // then, record the cached location. + locs = append(locs, l.id) + stk = stk[len(l.pcs):] // skip the matching pcs. continue } - locs = append(locs, l) + + frames, symbolizeResult := allFrames(addr) + if len(frames) == 0 { // runtime.goexit. + if id := b.emitLocation(deck); id > 0 { + locs = append(locs, id) + } + stk = stk[1:] + continue + } + + if added := deck.tryAdd(addr, frames, symbolizeResult); added { + stk = stk[1:] + continue + } + // add failed because this addr is not inlined with + // the existing PCs in the deck. Flush the deck and retry to + // handle this pc. + if id := b.emitLocation(deck); id > 0 { + locs = append(locs, id) + } + + // check cache again - previous emitLocation added a new entry + if l, ok := b.locs[addr]; ok { + locs = append(locs, l.id) + stk = stk[len(l.pcs):] // skip the matching pcs. + } else { + deck.tryAdd(addr, frames, symbolizeResult) // must succeed. + stk = stk[1:] + } } + if id := b.emitLocation(deck); id > 0 { // emit remaining location. + locs = append(locs, id) + } + e.stk[0] -= 1 // undo the adjustment on the leaf done before the loop. + b.pbSample(values, locs, labels) } @@ -435,6 +517,133 @@ func (b *profileBuilder) build() { b.zw.Close() } +// pcDeck is a helper to detect a sequence of inlined functions from +// a stack trace returned by the runtime. +// +// The stack traces returned by runtime's trackback functions are fully +// expanded (at least for Go functions) and include the fake pcs representing +// inlined functions. The profile proto expects the inlined functions to be +// encoded in one Location message. +// https://github.com/google/pprof/blob/5e965273ee43930341d897407202dd5e10e952cb/proto/profile.proto#L177-L184 +// +// Runtime does not directly expose whether a frame is for an inlined function +// and looking up debug info is not ideal, so we use a heuristic to filter +// the fake pcs and restore the inlined and entry functions. Inlined functions +// have the following properties: +// Frame's Func is nil (note: also true for non-Go functions), and +// Frame's Entry matches its entry function frame's Entry. (note: could also be true for recursive calls and non-Go functions), +// Frame's Name does not match its entry function frame's name. +// +// As reading and processing the pcs in a stack trace one by one (from leaf to the root), +// we use pcDeck to temporarily hold the observed pcs and their expanded frames +// until we observe the entry function frame. +type pcDeck struct { + pcs []uintptr + frames []runtime.Frame + symbolizeResult symbolizeFlag +} + +func (d *pcDeck) reset() { + d.pcs = d.pcs[:0] + d.frames = d.frames[:0] + d.symbolizeResult = 0 +} + +// tryAdd tries to add the pc and Frames expanded from it (most likely one, +// since the stack trace is already fully expanded) and the symbolizeResult +// to the deck. If it fails the caller needs to flush the deck and retry. +func (d *pcDeck) tryAdd(pc uintptr, frames []runtime.Frame, symbolizeResult symbolizeFlag) (success bool) { + if existing := len(d.pcs); existing > 0 { + // 'frames' are all expanded from one 'pc' and represent all inlined functions + // so we check only the first one. + newFrame := frames[0] + last := d.frames[existing-1] + if last.Func != nil && newFrame.Func != nil { // Can't be an inlined frame. + return false + } + + if last.Entry == 0 || newFrame.Entry == 0 { // Possibly not a Go function. Don't try to merge. + return false + } + + if last.Entry != newFrame.Entry { // newFrame is for a different function. + return false + } + if last.Function == newFrame.Function { // maybe recursion. + return false + } + } + d.pcs = append(d.pcs, pc) + d.frames = append(d.frames, frames...) + d.symbolizeResult |= symbolizeResult + return true +} + +// emitLocation emits the new location and function information recorded in the deck +// and returns the location ID encoded in the profile protobuf. +// It emits to b.pb, so there must be no message encoding in progress. +// It resets the deck. +func (b *profileBuilder) emitLocation(deck *pcDeck) uint64 { + defer deck.reset() + + if len(deck.pcs) == 0 { + return 0 + } + + addr := deck.pcs[0] + firstFrame := deck.frames[0] + + // We can't write out functions while in the middle of the + // Location message, so record new functions we encounter and + // write them out after the Location. + type newFunc struct { + id uint64 + name, file string + } + newFuncs := make([]newFunc, 0, 8) + + id := uint64(len(b.locs)) + 1 + b.locs[addr] = locInfo{id: id, pcs: append([]uintptr{}, deck.pcs...)} + + start := b.pb.startMessage() + b.pb.uint64Opt(tagLocation_ID, id) + b.pb.uint64Opt(tagLocation_Address, uint64(firstFrame.PC)) + for _, frame := range deck.frames { + // Write out each line in frame expansion. + funcID := uint64(b.funcs[frame.Function]) + if funcID == 0 { + funcID = uint64(len(b.funcs)) + 1 + b.funcs[frame.Function] = int(funcID) + newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File}) + } + b.pbLine(tagLocation_Line, funcID, int64(frame.Line)) + } + for i := range b.mem { + if b.mem[i].start <= addr && addr < b.mem[i].end || b.mem[i].fake { + b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1)) + + m := b.mem[i] + m.funcs |= deck.symbolizeResult + b.mem[i] = m + break + } + } + b.pb.endMessage(tagProfile_Location, start) + + // Write out functions we found during frame expansion. + for _, fn := range newFuncs { + start := b.pb.startMessage() + b.pb.uint64Opt(tagFunction_ID, fn.id) + b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name)) + b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name)) + b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file)) + b.pb.endMessage(tagProfile_Function, start) + } + + b.flush() + return id +} + // readMapping reads /proc/self/maps and writes mappings to b.pb. // It saves the address ranges of the mappings in b.mem for use // when emitting locations. diff --git a/src/runtime/pprof/proto_test.go b/src/runtime/pprof/proto_test.go index bcb4d3386d5..eda2b003ad4 100644 --- a/src/runtime/pprof/proto_test.go +++ b/src/runtime/pprof/proto_test.go @@ -358,6 +358,17 @@ func TestMapping(t *testing.T) { continue } } + + if traceback == "Go+C" { + // The test code was arranged to have PCs from C and + // they are not symbolized. + // Check no Location containing those unsymbolized PCs contains multiple lines. + for i, loc := range prof.Location { + if !symbolized(loc) && len(loc.Line) > 1 { + t.Errorf("Location[%d] contains unsymbolized PCs and multiple lines: %v", i, loc) + } + } + } }) } } diff --git a/src/runtime/pprof/testdata/mappingtest/main.go b/src/runtime/pprof/testdata/mappingtest/main.go index 476b9e88a32..484b7f9d06c 100644 --- a/src/runtime/pprof/testdata/mappingtest/main.go +++ b/src/runtime/pprof/testdata/mappingtest/main.go @@ -17,8 +17,7 @@ package main int cpuHogCSalt1 = 0; int cpuHogCSalt2 = 0; -void CPUHogCFunction() { - int foo = cpuHogCSalt1; +void CPUHogCFunction0(int foo) { int i; for (i = 0; i < 100000; i++) { if (foo > 0) { @@ -30,6 +29,10 @@ void CPUHogCFunction() { } } +void CPUHogCFunction() { + CPUHogCFunction0(cpuHogCSalt1); +} + struct CgoTracebackArg { uintptr_t context; uintptr_t sigContext; @@ -39,8 +42,9 @@ struct CgoTracebackArg { void CollectCgoTraceback(void* parg) { struct CgoTracebackArg* arg = (struct CgoTracebackArg*)(parg); - arg->buf[0] = (uintptr_t)(CPUHogCFunction); - arg->buf[1] = 0; + arg->buf[0] = (uintptr_t)(CPUHogCFunction0); + arg->buf[1] = (uintptr_t)(CPUHogCFunction); + arg->buf[2] = 0; }; */ import "C" @@ -81,7 +85,6 @@ var salt1 int var salt2 int func cpuHogGoFunction() { - // Generates CPU profile samples including a Go call path. for { foo := salt1 for i := 0; i < 1e5; i++ {