diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index b4fe0e5549c..e5620aec11d 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -9,6 +9,7 @@ package runtime
 
 import (
 	"internal/abi"
+	"internal/goarch"
 	"internal/profilerecord"
 	"internal/runtime/atomic"
 	"runtime/internal/sys"
@@ -542,16 +543,14 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 	gp := getg()
 	mp := acquirem() // we must not be preempted while accessing profstack
 
-	nstk := 1
+	var nstk int
 	if tracefpunwindoff() || gp.m.hasCgoOnStack() {
-		mp.profStack[0] = logicalStackSentinel
 		if gp.m.curg == nil || gp.m.curg == gp {
-			nstk = callers(skip, mp.profStack[1:])
+			nstk = callers(skip, mp.profStack)
 		} else {
-			nstk = gcallers(gp.m.curg, skip, mp.profStack[1:])
+			nstk = gcallers(gp.m.curg, skip, mp.profStack)
 		}
 	} else {
-		mp.profStack[0] = uintptr(skip)
 		if gp.m.curg == nil || gp.m.curg == gp {
 			if skip > 0 {
 				// We skip one fewer frame than the provided value for frame
@@ -559,12 +558,12 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 				// frame, whereas the saved frame pointer will give us the
 				// caller's return address first (so, not including
 				// saveblockevent)
-				mp.profStack[0] -= 1
+				skip -= 1
 			}
-			nstk += fpTracebackPCs(unsafe.Pointer(getfp()), mp.profStack[1:])
+			nstk = fpTracebackPartialExpand(skip, unsafe.Pointer(getfp()), mp.profStack)
 		} else {
-			mp.profStack[1] = gp.m.curg.sched.pc
-			nstk += 1 + fpTracebackPCs(unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[2:])
+			mp.profStack[0] = gp.m.curg.sched.pc
+			nstk = 1 + fpTracebackPartialExpand(skip, unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[1:])
 		}
 	}
 
@@ -572,6 +571,52 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 	releasem(mp)
 }
 
+// fpTracebackPartialExpand records a call stack obtained starting from fp.
+// This function will skip the given number of frames, properly accounting for
+// inlining, and save remaining frames as "physical" return addresses. The
+// consumer should later use CallersFrames or similar to expand inline frames.
+func fpTracebackPartialExpand(skip int, fp unsafe.Pointer, pcBuf []uintptr) int {
+	var n int
+	lastFuncID := abi.FuncIDNormal
+	skipOrAdd := func(retPC uintptr) bool {
+		if skip > 0 {
+			skip--
+		} else if n < len(pcBuf) {
+			pcBuf[n] = retPC
+			n++
+		}
+		return n < len(pcBuf)
+	}
+	for n < len(pcBuf) && fp != nil {
+		// return addr sits one word above the frame pointer
+		pc := *(*uintptr)(unsafe.Pointer(uintptr(fp) + goarch.PtrSize))
+
+		if skip > 0 {
+			callPC := pc - 1
+			fi := findfunc(callPC)
+			u, uf := newInlineUnwinder(fi, callPC)
+			for ; uf.valid(); uf = u.next(uf) {
+				sf := u.srcFunc(uf)
+				if sf.funcID == abi.FuncIDWrapper && elideWrapperCalling(lastFuncID) {
+					// ignore wrappers
+				} else if more := skipOrAdd(uf.pc + 1); !more {
+					return n
+				}
+				lastFuncID = sf.funcID
+			}
+		} else {
+			// We've skipped the desired number of frames, so no need
+			// to perform further inline expansion now.
+			pcBuf[n] = pc
+			n++
+		}
+
+		// follow the frame pointer to the next one
+		fp = unsafe.Pointer(*(*uintptr)(fp))
+	}
+	return n
+}
+
 // lockTimer assists with profiling contention on runtime-internal locks.
 //
 // There are several steps between the time that an M experiences contention and
@@ -1075,10 +1120,34 @@ type BlockProfileRecord struct {
 // the [testing] package's -test.blockprofile flag instead
 // of calling BlockProfile directly.
 func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
-	return blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
+	n, ok = blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
 		copyBlockProfileRecord(&p[0], r)
 		p = p[1:]
 	})
+	if !ok {
+		return
+	}
+	expandFrames(p[:n])
+	return
+}
+
+func expandFrames(p []BlockProfileRecord) {
+	expandedStack := makeProfStack()
+	for i := range p {
+		cf := CallersFrames(p[i].Stack())
+		j := 0
+		for ; j < len(expandedStack); j++ {
+			f, more := cf.Next()
+			// f.PC is a "call PC", but later consumers will expect
+			// "return PCs"
+			expandedStack[j] = f.PC + 1
+			if !more {
+				break
+			}
+		}
+		k := copy(p[i].Stack0[:], expandedStack[:j])
+		clear(p[i].Stack0[k:])
+	}
 }
 
 // blockProfileInternal returns the number of records n in the profile. If there
@@ -1111,6 +1180,9 @@ func blockProfileInternal(size int, copyFn func(profilerecord.BlockProfileRecord
 	return
 }
 
+// copyBlockProfileRecord copies the sample values and call stack from src to dst.
+// The call stack is copied as-is. The caller is responsible for handling inline
+// expansion, needed when the call stack was collected with frame pointer unwinding.
 func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProfileRecord) {
 	dst.Count = src.Count
 	dst.Cycles = src.Cycles
@@ -1123,7 +1195,11 @@ func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProf
 	if asanenabled {
 		asanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
 	}
-	i := fpunwindExpand(dst.Stack0[:], src.Stack)
+	// We just copy the stack here without inline expansion
+	// (needed if frame pointer unwinding is used)
+	// since this function is called under the profile lock,
+	// and doing something that might allocate can violate lock ordering.
+	i := copy(dst.Stack0[:], src.Stack)
 	clear(dst.Stack0[i:])
 }
 
@@ -1142,10 +1218,15 @@ func pprof_blockProfileInternal(p []profilerecord.BlockProfileRecord) (n int, ok
 // Most clients should use the [runtime/pprof] package
 // instead of calling MutexProfile directly.
 func MutexProfile(p []BlockProfileRecord) (n int, ok bool) {
-	return mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
+	n, ok = mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
 		copyBlockProfileRecord(&p[0], r)
 		p = p[1:]
 	})
+	if !ok {
+		return
+	}
+	expandFrames(p[:n])
+	return
 }
 
 // mutexProfileInternal returns the number of records n in the profile. If there
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index d3af5bba914..4b7a9f63c65 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -404,6 +404,25 @@ type countProfile interface {
 	Label(i int) *labelMap
 }
 
+// expandInlinedFrames copies the call stack from pcs into dst, expanding any
+// PCs corresponding to inlined calls into the corresponding PCs for the inlined
+// functions. Returns the number of frames copied to dst.
+func expandInlinedFrames(dst, pcs []uintptr) int {
+	cf := runtime.CallersFrames(pcs)
+	var n int
+	for n < len(dst) {
+		f, more := cf.Next()
+		// f.PC is a "call PC", but later consumers will expect
+		// "return PCs"
+		dst[n] = f.PC + 1
+		n++
+		if !more {
+			break
+		}
+	}
+	return n
+}
+
 // printCountCycleProfile outputs block profile records (for block or mutex profiles)
 // as the pprof-proto format output. Translations from cycle count to time duration
 // are done because The proto expects count and time (nanoseconds) instead of count
@@ -426,7 +445,7 @@ func printCountCycleProfile(w io.Writer, countName, cycleName string, records []
 		values[1] = int64(float64(r.Cycles) / cpuGHz)
 		// For count profiles, all stack addresses are
 		// return PCs, which is what appendLocsForStack expects.
-		n := pprof_fpunwindExpand(expandedStack[:], r.Stack)
+		n := expandInlinedFrames(expandedStack, r.Stack)
 		locs = b.appendLocsForStack(locs[:0], expandedStack[:n])
 		b.pbSample(values, locs, nil)
 	}
@@ -935,7 +954,7 @@ func writeProfileInternal(w io.Writer, debug int, name string, runtimeProfile fu
 	for i := range p {
 		r := &p[i]
 		fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count)
-		n := pprof_fpunwindExpand(expandedStack, r.Stack)
+		n := expandInlinedFrames(expandedStack, r.Stack)
 		stack := expandedStack[:n]
 		for _, pc := range stack {
 			fmt.Fprintf(w, " %#x", pc)
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index 09abbb31ae6..bd11914544f 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -2578,3 +2578,81 @@ func produceProfileEvents(t *testing.T, depth int) {
 	runtime.GC()
 	goroutineDeep(t, depth-4) // -4 for produceProfileEvents, **, chanrecv1, chanrev, gopark
 }
+
+func TestMutexBlockFullAggregation(t *testing.T) {
+	// This regression test is adapted from
+	// https://github.com/grafana/pyroscope-go/issues/103,
+	// authored by Tolya Korniltsev
+
+	var m sync.Mutex
+
+	prev := runtime.SetMutexProfileFraction(-1)
+	defer runtime.SetMutexProfileFraction(prev)
+
+	const fraction = 1
+	const iters = 100
+	const workers = 2
+
+	runtime.SetMutexProfileFraction(fraction)
+	runtime.SetBlockProfileRate(1)
+	defer runtime.SetBlockProfileRate(0)
+
+	wg := sync.WaitGroup{}
+	wg.Add(workers)
+	for j := 0; j < workers; j++ {
+		go func() {
+			for i := 0; i < iters; i++ {
+				m.Lock()
+				// Wait at least 1 millisecond to pass the
+				// starvation threshold for the mutex
+				time.Sleep(time.Millisecond)
+				m.Unlock()
+			}
+			wg.Done()
+		}()
+	}
+	wg.Wait()
+
+	assertNoDuplicates := func(name string, collect func([]runtime.BlockProfileRecord) (int, bool)) {
+		var p []runtime.BlockProfileRecord
+		n, ok := collect(nil)
+		for {
+			p = make([]runtime.BlockProfileRecord, n+50)
+			n, ok = collect(p)
+			if ok {
+				p = p[:n]
+				break
+			}
+		}
+		seen := make(map[string]struct{})
+		for _, r := range p {
+			cf := runtime.CallersFrames(r.Stack())
+			var stack strings.Builder
+			for {
+				f, more := cf.Next()
+				stack.WriteString(f.Func.Name())
+				if !more {
+					break
+				}
+				stack.WriteByte('\n')
+			}
+			s := stack.String()
+			if !strings.Contains(s, "TestMutexBlockFullAggregation") {
+				continue
+			}
+			if _, ok := seen[s]; ok {
+				t.Errorf("saw duplicate entry in %s profile with stack:\n%s", name, s)
+			}
+			seen[s] = struct{}{}
+		}
+		if len(seen) == 0 {
+			t.Errorf("did not see any samples in %s profile for this test", name)
+		}
+	}
+	t.Run("mutex", func(t *testing.T) {
+		assertNoDuplicates("mutex", runtime.MutexProfile)
+	})
+	t.Run("block", func(t *testing.T) {
+		assertNoDuplicates("block", runtime.BlockProfile)
+	})
+}
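
Illustrative usage, not part of the patch above: a minimal sketch of how code outside the runtime might consume the records returned by runtime.MutexProfile after this change. The stacks in each record are already inline-expanded return PCs, so runtime.CallersFrames can symbolize them directly, and identical call stacks now collapse into a single record. The helper name dumpMutexProfile, the buffer-growing loop, and the output format are assumptions made for this example, mirroring the collection pattern used in the test above.

package main

import (
	"fmt"
	"runtime"
	"strings"
)

// dumpMutexProfile is a hypothetical helper (not part of the patch). It grows
// its buffer until runtime.MutexProfile reports ok=true, then symbolizes each
// record's stack with runtime.CallersFrames, which also walks inlined frames.
func dumpMutexProfile() {
	var records []runtime.BlockProfileRecord
	n, ok := runtime.MutexProfile(nil)
	for !ok {
		records = make([]runtime.BlockProfileRecord, n+50)
		n, ok = runtime.MutexProfile(records)
	}
	records = records[:n]
	for _, r := range records {
		frames := runtime.CallersFrames(r.Stack())
		var names []string
		for {
			f, more := frames.Next()
			names = append(names, f.Function)
			if !more {
				break
			}
		}
		fmt.Printf("%d contentions, %d cycles\n\t%s\n", r.Count, r.Cycles, strings.Join(names, "\n\t"))
	}
}

func main() {
	prev := runtime.SetMutexProfileFraction(1) // sample every contention event
	defer runtime.SetMutexProfileFraction(prev)
	// ... run some workload that contends on a sync.Mutex ...
	dumpMutexProfile()
}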