diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index e5620aec11d..b4fe0e5549c 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -9,7 +9,6 @@ package runtime
 
 import (
 	"internal/abi"
-	"internal/goarch"
 	"internal/profilerecord"
 	"internal/runtime/atomic"
 	"runtime/internal/sys"
@@ -543,14 +542,16 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 	gp := getg()
 	mp := acquirem() // we must not be preempted while accessing profstack
 
-	var nstk int
+	nstk := 1
 	if tracefpunwindoff() || gp.m.hasCgoOnStack() {
+		mp.profStack[0] = logicalStackSentinel
 		if gp.m.curg == nil || gp.m.curg == gp {
-			nstk = callers(skip, mp.profStack)
+			nstk = callers(skip, mp.profStack[1:])
 		} else {
-			nstk = gcallers(gp.m.curg, skip, mp.profStack)
+			nstk = gcallers(gp.m.curg, skip, mp.profStack[1:])
 		}
 	} else {
+		mp.profStack[0] = uintptr(skip)
 		if gp.m.curg == nil || gp.m.curg == gp {
 			if skip > 0 {
 				// We skip one fewer frame than the provided value for frame
@@ -558,12 +559,12 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 				// frame, whereas the saved frame pointer will give us the
 				// caller's return address first (so, not including
 				// saveblockevent)
-				skip -= 1
+				mp.profStack[0] -= 1
 			}
-			nstk = fpTracebackPartialExpand(skip, unsafe.Pointer(getfp()), mp.profStack)
+			nstk += fpTracebackPCs(unsafe.Pointer(getfp()), mp.profStack[1:])
 		} else {
-			mp.profStack[0] = gp.m.curg.sched.pc
-			nstk = 1 + fpTracebackPartialExpand(skip, unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[1:])
+			mp.profStack[1] = gp.m.curg.sched.pc
+			nstk += 1 + fpTracebackPCs(unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[2:])
 		}
 	}
 
@@ -571,52 +572,6 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 	releasem(mp)
 }
 
-// fpTracebackPartialExpand records a call stack obtained starting from fp.
-// This function will skip the given number of frames, properly accounting for
-// inlining, and save remaining frames as "physical" return addresses. The
-// consumer should later use CallersFrames or similar to expand inline frames.
-func fpTracebackPartialExpand(skip int, fp unsafe.Pointer, pcBuf []uintptr) int {
-	var n int
-	lastFuncID := abi.FuncIDNormal
-	skipOrAdd := func(retPC uintptr) bool {
-		if skip > 0 {
-			skip--
-		} else if n < len(pcBuf) {
-			pcBuf[n] = retPC
-			n++
-		}
-		return n < len(pcBuf)
-	}
-	for n < len(pcBuf) && fp != nil {
-		// return addr sits one word above the frame pointer
-		pc := *(*uintptr)(unsafe.Pointer(uintptr(fp) + goarch.PtrSize))
-
-		if skip > 0 {
-			callPC := pc - 1
-			fi := findfunc(callPC)
-			u, uf := newInlineUnwinder(fi, callPC)
-			for ; uf.valid(); uf = u.next(uf) {
-				sf := u.srcFunc(uf)
-				if sf.funcID == abi.FuncIDWrapper && elideWrapperCalling(lastFuncID) {
-					// ignore wrappers
-				} else if more := skipOrAdd(uf.pc + 1); !more {
-					return n
-				}
-				lastFuncID = sf.funcID
-			}
-		} else {
-			// We've skipped the desired number of frames, so no need
-			// to perform further inline expansion now.
-			pcBuf[n] = pc
-			n++
-		}
-
-		// follow the frame pointer to the next one
-		fp = unsafe.Pointer(*(*uintptr)(fp))
-	}
-	return n
-}
-
 // lockTimer assists with profiling contention on runtime-internal locks.
 //
 // There are several steps between the time that an M experiences contention and
@@ -1120,34 +1075,10 @@ type BlockProfileRecord struct {
 // the [testing] package's -test.blockprofile flag instead
 // of calling BlockProfile directly.
 func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
-	n, ok = blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
+	return blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
 		copyBlockProfileRecord(&p[0], r)
 		p = p[1:]
 	})
-	if !ok {
-		return
-	}
-	expandFrames(p[:n])
-	return
-}
-
-func expandFrames(p []BlockProfileRecord) {
-	expandedStack := makeProfStack()
-	for i := range p {
-		cf := CallersFrames(p[i].Stack())
-		j := 0
-		for ; j < len(expandedStack); j++ {
-			f, more := cf.Next()
-			// f.PC is a "call PC", but later consumers will expect
-			// "return PCs"
-			expandedStack[i] = f.PC + 1
-			if !more {
-				break
-			}
-		}
-		k := copy(p[i].Stack0[:], expandedStack[:j])
-		clear(p[i].Stack0[k:])
-	}
-}
 }
 
 // blockProfileInternal returns the number of records n in the profile. If there
@@ -1180,9 +1111,6 @@ func blockProfileInternal(size int, copyFn func(profilerecord.BlockProfileRecord
 	return
 }
 
-// copyBlockProfileRecord copies the sample values and call stack from src to dst.
-// The call stack is copied as-is. The caller is responsible for handling inline
-// expansion, needed when the call stack was collected with frame pointer unwinding.
 func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProfileRecord) {
 	dst.Count = src.Count
 	dst.Cycles = src.Cycles
@@ -1195,11 +1123,7 @@ func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProf
 	if asanenabled {
 		asanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
 	}
-	// We just copy the stack here without inline expansion
-	// (needed if frame pointer unwinding is used)
-	// since this function is called under the profile lock,
-	// and doing something that might allocate can violate lock ordering.
-	i := copy(dst.Stack0[:], src.Stack)
+	i := fpunwindExpand(dst.Stack0[:], src.Stack)
 	clear(dst.Stack0[i:])
 }
 
@@ -1218,15 +1142,10 @@ func pprof_blockProfileInternal(p []profilerecord.BlockProfileRecord) (n int, ok
 // Most clients should use the [runtime/pprof] package
 // instead of calling MutexProfile directly.
 func MutexProfile(p []BlockProfileRecord) (n int, ok bool) {
-	n, ok = mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
+	return mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
 		copyBlockProfileRecord(&p[0], r)
 		p = p[1:]
 	})
-	if !ok {
-		return
-	}
-	expandFrames(p[:n])
-	return
 }
 
 // mutexProfileInternal returns the number of records n in the profile. If there
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index 4b7a9f63c65..d3af5bba914 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -404,25 +404,6 @@ type countProfile interface {
 	Label(i int) *labelMap
 }
 
-// expandInlinedFrames copies the call stack from pcs into dst, expanding any
-// PCs corresponding to inlined calls into the corresponding PCs for the inlined
-// functions. Returns the number of frames copied to dst.
-func expandInlinedFrames(dst, pcs []uintptr) int {
-	cf := runtime.CallersFrames(pcs)
-	var n int
-	for n < len(dst) {
-		f, more := cf.Next()
-		// f.PC is a "call PC", but later consumers will expect
-		// "return PCs"
-		dst[n] = f.PC + 1
-		n++
-		if !more {
-			break
-		}
-	}
-	return n
-}
-
 // printCountCycleProfile outputs block profile records (for block or mutex profiles)
 // as the pprof-proto format output. Translations from cycle count to time duration
 // are done because The proto expects count and time (nanoseconds) instead of count
@@ -445,7 +426,7 @@ func printCountCycleProfile(w io.Writer, countName, cycleName string, records []
 		values[1] = int64(float64(r.Cycles) / cpuGHz)
 		// For count profiles, all stack addresses are
 		// return PCs, which is what appendLocsForStack expects.
-		n := expandInlinedFrames(expandedStack, r.Stack)
+		n := pprof_fpunwindExpand(expandedStack[:], r.Stack)
 		locs = b.appendLocsForStack(locs[:0], expandedStack[:n])
 		b.pbSample(values, locs, nil)
 	}
@@ -954,7 +935,7 @@ func writeProfileInternal(w io.Writer, debug int, name string, runtimeProfile fu
 	for i := range p {
 		r := &p[i]
 		fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count)
-		n := expandInlinedFrames(expandedStack, r.Stack)
+		n := pprof_fpunwindExpand(expandedStack, r.Stack)
 		stack := expandedStack[:n]
 		for _, pc := range stack {
 			fmt.Fprintf(w, " %#x", pc)
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index bd11914544f..09abbb31ae6 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -2578,81 +2578,3 @@ func produceProfileEvents(t *testing.T, depth int) {
 	runtime.GC()
 	goroutineDeep(t, depth-4) // -4 for produceProfileEvents, **, chanrecv1, chanrev, gopark
 }
-
-func TestMutexBlockFullAggregation(t *testing.T) {
-	// This regression test is adapted from
-	// https://github.com/grafana/pyroscope-go/issues/103,
-	// authored by Tolya Korniltsev
-
-	var m sync.Mutex
-
-	prev := runtime.SetMutexProfileFraction(-1)
-	defer runtime.SetMutexProfileFraction(prev)
-
-	const fraction = 1
-	const iters = 100
-	const workers = 2
-
-	runtime.SetMutexProfileFraction(fraction)
-	runtime.SetBlockProfileRate(1)
-	defer runtime.SetBlockProfileRate(0)
-
-	wg := sync.WaitGroup{}
-	wg.Add(workers)
-	for j := 0; j < workers; j++ {
-		go func() {
-			for i := 0; i < iters; i++ {
-				m.Lock()
-				// Wait at least 1 millisecond to pass the
-				// starvation threshold for the mutex
-				time.Sleep(time.Millisecond)
-				m.Unlock()
-			}
-			wg.Done()
-		}()
-	}
-	wg.Wait()
-
-	assertNoDuplicates := func(name string, collect func([]runtime.BlockProfileRecord) (int, bool)) {
-		var p []runtime.BlockProfileRecord
-		n, ok := collect(nil)
-		for {
-			p = make([]runtime.BlockProfileRecord, n+50)
-			n, ok = collect(p)
-			if ok {
-				p = p[:n]
-				break
-			}
-		}
-		seen := make(map[string]struct{})
-		for _, r := range p {
-			cf := runtime.CallersFrames(r.Stack())
-			var stack strings.Builder
-			for {
-				f, more := cf.Next()
-				stack.WriteString(f.Func.Name())
-				if !more {
-					break
-				}
-				stack.WriteByte('\n')
-			}
-			s := stack.String()
-			if !strings.Contains(s, "TestMutexBlockFullAggregation") {
-				continue
-			}
-			if _, ok := seen[s]; ok {
-				t.Errorf("saw duplicate entry in %s profile with stack:\n%s", name, s)
-			}
-			seen[s] = struct{}{}
-		}
-		if len(seen) == 0 {
-			t.Errorf("did not see any samples in %s profile for this test", name)
-		}
-	}
-	t.Run("mutex", func(t *testing.T) {
-		assertNoDuplicates("mutex", runtime.MutexProfile)
-	})
-	t.Run("block", func(t *testing.T) {
-		assertNoDuplicates("block", runtime.BlockProfile)
-	})
-}
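
Reviewer note (not part of the patch): both helpers removed above, expandFrames in runtime/mprof.go and expandInlinedFrames in runtime/pprof/pprof.go, do the same thing: walk runtime.CallersFrames over a slice of return PCs and re-emit one PC per logical (inline-expanded) frame, adding 1 to each frame's call PC because downstream consumers expect "return PCs". The sketch below reproduces that loop as a standalone program using only the public runtime API; it assumes an ordinary stack captured with runtime.Callers, and the name expandReturnPCs is illustrative only and does not exist in this patch.

package main

import (
	"fmt"
	"runtime"
)

// expandReturnPCs expands pcs (return PCs, as produced by runtime.Callers)
// into one PC per logical frame, including inlined frames, and converts each
// frame's call PC back into a return PC by adding 1. This mirrors the loop in
// the removed expandInlinedFrames helper.
func expandReturnPCs(dst, pcs []uintptr) int {
	cf := runtime.CallersFrames(pcs)
	n := 0
	for n < len(dst) {
		f, more := cf.Next()
		// f.PC is a "call PC"; later consumers expect "return PCs".
		dst[n] = f.PC + 1
		n++
		if !more {
			break
		}
	}
	return n
}

func main() {
	// Capture the current stack; skip=1 skips the runtime.Callers frame itself.
	pcs := make([]uintptr, 32)
	pcs = pcs[:runtime.Callers(1, pcs)]

	expanded := make([]uintptr, 64)
	expanded = expanded[:expandReturnPCs(expanded, pcs)]

	// Symbolize the expanded stack to show one entry per logical frame.
	frames := runtime.CallersFrames(expanded)
	for {
		f, more := frames.Next()
		fmt.Printf("%#x %s (%s:%d)\n", f.PC, f.Function, f.File, f.Line)
		if !more {
			break
		}
	}
}

Because the expanded slice still holds return PCs (call PC + 1), it can be fed back into runtime.CallersFrames or other PC-consuming APIs unchanged, which is the reason the removed helpers stored f.PC + 1 rather than f.PC.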