Mirror of https://github.com/golang/go, synced 2024-11-17 17:54:48 -07:00
Revert "runtime: profile contended lock calls"
This reverts commit go.dev/cl/528657.

Reason for revert: broke a lot of builders.

Change-Id: I70c33062020e997c4df67b3eaa2e886cf0da961e
Reviewed-on: https://go-review.googlesource.com/c/go/+/543660
Reviewed-by: Than McIntosh <thanm@google.com>
Auto-Submit: Matthew Dempsky <mdempsky@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent 237715cf7c
commit 468bc94188
@@ -52,17 +52,6 @@ func NumCgoCall() int64 {
	return n
}

func totalMutexWaitTimeNanos() int64 {
	total := sched.totalMutexWaitTime.Load()

	total += sched.totalRuntimeLockWaitTime.Load()
	for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
		total += mp.mLockProfile.waitTime.Load()
	}

	return total
}

// NumGoroutine returns the number of goroutines that currently exist.
func NumGoroutine() int {
	return int(gcount())
@@ -1340,27 +1340,6 @@ func PageCachePagesLeaked() (leaked uintptr) {
	return
}

type Mutex = mutex

var Lock = lock
var Unlock = unlock

func MutexContended(l *mutex) bool {
	switch atomic.Loaduintptr(&l.key) {
	case 0: // unlocked
		return false
	case 1: // locked
		return false
	default: // an M is sleeping
		return true
	}
}

func SemRootLock(addr *uint32) *mutex {
	root := semtable.rootFor(addr)
	return &root.lock
}

var Semacquire = semacquire
var Semrelease1 = semrelease1
@@ -145,14 +145,6 @@ It is a comma-separated list of name=val pairs setting these named variables:
	risk in that scenario. Currently not supported on Windows, plan9 or js/wasm. Setting this
	option for some applications can produce large traces, so use with care.

	profileruntimelocks: setting profileruntimelocks=1 includes call stacks related to
	contention on runtime-internal locks in the "mutex" profile, subject to the
	MutexProfileFraction setting. The call stacks will correspond to the unlock call that
	released the lock. But instead of the value corresponding to the amount of contention that
	call stack caused, it corresponds to the amount of time the caller of unlock had to wait
	in its original call to lock. A future release is expected to align those and remove this
	setting.

	invalidptr: invalidptr=1 (the default) causes the garbage collector and stack
	copier to crash the program if an invalid pointer value (for example, 1)
	is found in a pointer-typed location. Setting invalidptr=0 disables this check.
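For context, a minimal sketch (not part of this change) of collecting the user-facing "mutex" profile with the stable runtime and runtime/pprof APIs, which are unaffected by this revert; the contending workload below is purely illustrative:

package main

import (
	"os"
	"runtime"
	"runtime/pprof"
	"sync"
	"time"
)

func main() {
	// Record every contention event (0 disables, N records roughly 1 in N).
	prev := runtime.SetMutexProfileFraction(1)
	defer runtime.SetMutexProfileFraction(prev)

	// Illustrative workload: several goroutines contend on one sync.Mutex.
	var mu sync.Mutex
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			mu.Lock()
			time.Sleep(time.Millisecond) // hold the lock so the others wait
			mu.Unlock()
		}()
	}
	wg.Wait()

	// Dump the "mutex" profile; debug=0 writes the binary pprof format.
	pprof.Lookup("mutex").WriteTo(os.Stdout, 0)
}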
@@ -71,8 +71,6 @@ func lock2(l *mutex) {
	// its wakeup call.
	wait := v

	timer := &lockTimer{lock: l}
	timer.begin()
	// On uniprocessors, no point spinning.
	// On multiprocessors, spin for ACTIVE_SPIN attempts.
	spin := 0
@@ -84,7 +82,6 @@ func lock2(l *mutex) {
		for i := 0; i < spin; i++ {
			for l.key == mutex_unlocked {
				if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
					timer.end()
					return
				}
			}
@@ -95,7 +92,6 @@ func lock2(l *mutex) {
		for i := 0; i < passive_spin; i++ {
			for l.key == mutex_unlocked {
				if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
					timer.end()
					return
				}
			}
@@ -105,7 +101,6 @@ func lock2(l *mutex) {
		// Sleep.
		v = atomic.Xchg(key32(&l.key), mutex_sleeping)
		if v == mutex_unlocked {
			timer.end()
			return
		}
		wait = mutex_sleeping
@@ -127,7 +122,6 @@ func unlock2(l *mutex) {
	}

	gp := getg()
	gp.m.mLockProfile.recordUnlock(l)
	gp.m.locks--
	if gp.m.locks < 0 {
		throw("runtime·unlock: lock count")
@@ -48,8 +48,6 @@ func lock2(l *mutex) {
	}
	semacreate(gp.m)

	timer := &lockTimer{lock: l}
	timer.begin()
	// On uniprocessor's, no point spinning.
	// On multiprocessors, spin for ACTIVE_SPIN attempts.
	spin := 0
@@ -62,7 +60,6 @@ Loop:
		if v&locked == 0 {
			// Unlocked. Try to lock.
			if atomic.Casuintptr(&l.key, v, v|locked) {
				timer.end()
				return
			}
			i = 0
@@ -122,7 +119,6 @@ func unlock2(l *mutex) {
			}
		}
	}
	gp.m.mLockProfile.recordUnlock(l)
	gp.m.locks--
	if gp.m.locks < 0 {
		throw("runtime·unlock: lock count")
@@ -470,7 +470,7 @@ func initMetrics() {
		"/sync/mutex/wait/total:seconds": {
			compute: func(_ *statAggregate, out *metricValue) {
				out.kind = metricKindFloat64
				out.scalar = float64bits(nsToSec(totalMutexWaitTimeNanos()))
				out.scalar = float64bits(nsToSec(sched.totalMutexWaitTime.Load()))
			},
		},
	}
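For reference, a minimal sketch (not part of this change) of how the metric registered above is consumed through the public runtime/metrics API:

package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	// Ask for the total time goroutines have spent blocked on mutexes.
	samples := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
	metrics.Read(samples)
	if samples[0].Value.Kind() == metrics.KindFloat64 {
		fmt.Printf("total mutex wait: %fs\n", samples[0].Value.Float64())
	}
}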
@@ -6,21 +6,15 @@ package runtime_test

import (
	"bytes"
	"fmt"
	"internal/goexperiment"
	"internal/profile"
	"os"
	"reflect"
	"runtime"
	"runtime/debug"
	"runtime/metrics"
	"runtime/pprof"
	"runtime/trace"
	"slices"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"testing"
	"time"
	"unsafe"
@@ -945,305 +939,3 @@ func TestSchedPauseMetrics(t *testing.T) {
		})
	}
}

func TestRuntimeLockMetricsAndProfile(t *testing.T) {
	old := runtime.SetMutexProfileFraction(0) // enabled during sub-tests
	defer runtime.SetMutexProfileFraction(old)
	if old != 0 {
		t.Fatalf("need MutexProfileRate 0, got %d", old)
	}

	{
		before := os.Getenv("GODEBUG")
		for _, s := range strings.Split(before, ",") {
			if strings.HasPrefix(s, "profileruntimelocks=") {
				t.Logf("GODEBUG includes explicit setting %q", s)
			}
		}
		defer func() { os.Setenv("GODEBUG", before) }()
		os.Setenv("GODEBUG", fmt.Sprintf("%s,profileruntimelocks=1", before))
	}

	loadProfile := func(t *testing.T) *profile.Profile {
		var w bytes.Buffer
		pprof.Lookup("mutex").WriteTo(&w, 0)
		p, err := profile.Parse(&w)
		if err != nil {
			t.Fatalf("failed to parse profile: %v", err)
		}
		if err := p.CheckValid(); err != nil {
			t.Fatalf("invalid profile: %v", err)
		}
		return p
	}

	measureDelta := func(t *testing.T, fn func()) (metricGrowth, profileGrowth float64, p *profile.Profile) {
		beforeProfile := loadProfile(t)
		beforeMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
		metrics.Read(beforeMetrics)

		fn()

		afterProfile := loadProfile(t)
		afterMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
		metrics.Read(afterMetrics)

		sumSamples := func(p *profile.Profile, i int) int64 {
			var sum int64
			for _, s := range p.Sample {
				sum += s.Value[i]
			}
			return sum
		}

		metricGrowth = afterMetrics[0].Value.Float64() - beforeMetrics[0].Value.Float64()
		profileGrowth = float64(sumSamples(afterProfile, 1)-sumSamples(beforeProfile, 1)) * time.Nanosecond.Seconds()

		// The internal/profile package does not support compaction; this delta
		// profile will include separate positive and negative entries.
		p = afterProfile.Copy()
		if len(beforeProfile.Sample) > 0 {
			err := p.Merge(beforeProfile, -1)
			if err != nil {
				t.Fatalf("Merge profiles: %v", err)
			}
		}

		return metricGrowth, profileGrowth, p
	}

	testcase := func(stk []string, workers int, fn func() bool) func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
		return func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
			metricGrowth, profileGrowth, p := measureDelta(t, func() {
				var started, stopped sync.WaitGroup
				started.Add(workers)
				stopped.Add(workers)
				for i := 0; i < workers; i++ {
					w := &contentionWorker{
						before: func() {
							started.Done()
							started.Wait()
						},
						after: func() {
							stopped.Done()
						},
						fn: fn,
					}
					go w.run()
				}
				stopped.Wait()
			})

			if profileGrowth == 0 {
				t.Errorf("no increase in mutex profile")
			}
			if metricGrowth == 0 {
				t.Errorf("no increase in /sync/mutex/wait/total:seconds metric")
			}
			// This comparison is possible because the time measurements in support of
			// runtime/pprof and runtime/metrics for runtime-internal locks are so close
			// together. It doesn't work as well for user-space contention, where the
			// involved goroutines are not _Grunnable the whole time and so need to pass
			// through the scheduler.
			t.Logf("lock contention growth in runtime/pprof's view (%fs)", profileGrowth)
			t.Logf("lock contention growth in runtime/metrics' view (%fs)", metricGrowth)

			if goexperiment.StaticLockRanking {
				if !slices.ContainsFunc(stk, func(s string) bool {
					return s == "runtime.systemstack" || s == "runtime.mcall" || s == "runtime.mstart"
				}) {
					// stk is a call stack that is still on the user stack when
					// it calls runtime.unlock. Add the extra function that
					// we'll see, when the static lock ranking implementation of
					// runtime.unlockWithRank switches to the system stack.
					stk = append([]string{"runtime.unlockWithRank"}, stk...)
				}
			}

			var stks [][]string
			for _, s := range p.Sample {
				var have []string
				for _, loc := range s.Location {
					for _, line := range loc.Line {
						have = append(have, line.Function.Name)
					}
				}
				stks = append(stks, have)
				if slices.Equal(have, stk) {
					n += s.Value[0]
					value += s.Value[1]
				}
			}
			t.Logf("stack %v has samples totaling n=%d value=%d", stk, n, value)
			if n == 0 && value == 0 {
				t.Logf("profile:\n%s", p)
				for _, have := range stks {
					t.Logf("have stack %v", have)
				}
				t.Errorf("want stack %v", stk)
			}

			return metricGrowth, profileGrowth, n, value
		}
	}

	name := t.Name()

	t.Run("runtime.lock", func(t *testing.T) {
		mus := make([]runtime.Mutex, 100)
		var needContention atomic.Int64
		delay := 10 * time.Microsecond
		delayMicros := delay.Microseconds()

		// The goroutine that acquires the lock will only proceed when it
		// detects that its partner is contended for the lock. That will lead to
		// live-lock if anything (such as a STW) prevents the partner goroutine
		// from running. Allowing the contention workers to pause and restart
		// (to allow a STW to proceed) makes it harder to confirm that we're
		// counting the correct number of contention events, since some locks
		// will end up contended twice. Instead, disable the GC.
		defer debug.SetGCPercent(debug.SetGCPercent(-1))

		const workers = 2
		if runtime.GOMAXPROCS(0) < workers {
			t.Skipf("contention on runtime-internal locks requires GOMAXPROCS >= %d", workers)
		}

		fn := func() bool {
			n := int(needContention.Load())
			if n < 0 {
				return false
			}
			mu := &mus[n]

			runtime.Lock(mu)
			for int(needContention.Load()) == n {
				if runtime.MutexContended(mu) {
					// make them wait a little while
					for start := runtime.Nanotime(); (runtime.Nanotime()-start)/1000 < delayMicros; {
						runtime.Usleep(uint32(delayMicros))
					}
					break
				}
			}
			runtime.Unlock(mu)
			needContention.Store(int64(n - 1))

			return true
		}

		stk := []string{
			"runtime.unlock",
			"runtime_test." + name + ".func5.1",
			"runtime_test.(*contentionWorker).run",
		}

		t.Run("sample-1", func(t *testing.T) {
			old := runtime.SetMutexProfileFraction(1)
			defer runtime.SetMutexProfileFraction(old)

			needContention.Store(int64(len(mus) - 1))
			metricGrowth, profileGrowth, n, _ := testcase(stk, workers, fn)(t)

			if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); have < want {
				// The test imposes a delay with usleep, verified with calls to
				// nanotime. Compare against the runtime/metrics package's view
				// (based on nanotime) rather than runtime/pprof's view (based
				// on cputicks).
				t.Errorf("runtime/metrics reported less than the known minimum contention duration (%fs < %fs)", have, want)
			}
			if have, want := n, int64(len(mus)); have != want {
				t.Errorf("mutex profile reported contention count different from the known true count (%d != %d)", have, want)
			}

			const slop = 1.5 // account for nanotime vs cputicks
			if profileGrowth > slop*metricGrowth || metricGrowth > slop*profileGrowth {
				t.Errorf("views differ by more than %fx", slop)
			}
		})

		t.Run("sample-2", func(t *testing.T) {
			old := runtime.SetMutexProfileFraction(2)
			defer runtime.SetMutexProfileFraction(old)

			needContention.Store(int64(len(mus) - 1))
			metricGrowth, profileGrowth, n, _ := testcase(stk, workers, fn)(t)

			// With 100 trials and profile fraction of 2, we expect to capture
			// 50 samples. Allow the test to pass if we get at least 20 samples;
			// the CDF of the binomial distribution says there's less than a
			// 1e-9 chance of that, which is an acceptably low flakiness rate.
			const samplingSlop = 2.5

			if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); samplingSlop*have < want {
				// The test imposes a delay with usleep, verified with calls to
				// nanotime. Compare against the runtime/metrics package's view
				// (based on nanotime) rather than runtime/pprof's view (based
				// on cputicks).
				t.Errorf("runtime/metrics reported less than the known minimum contention duration (%f * %fs < %fs)", samplingSlop, have, want)
			}
			if have, want := n, int64(len(mus)); float64(have) > float64(want)*samplingSlop || float64(want) > float64(have)*samplingSlop {
				t.Errorf("mutex profile reported contention count too different from the expected count (%d far from %d)", have, want)
			}

			const timerSlop = 1.5 * samplingSlop // account for nanotime vs cputicks, plus the two views' independent sampling
			if profileGrowth > timerSlop*metricGrowth || metricGrowth > timerSlop*profileGrowth {
				t.Errorf("views differ by more than %fx", timerSlop)
			}
		})
	})

	t.Run("runtime.semrelease", func(t *testing.T) {
		old := runtime.SetMutexProfileFraction(1)
		defer runtime.SetMutexProfileFraction(old)

		const workers = 3
		if runtime.GOMAXPROCS(0) < workers {
			t.Skipf("creating and observing contention on runtime-internal semaphores requires GOMAXPROCS >= %d", workers)
		}

		var sem uint32 = 1
		var sawContention atomic.Int32
		var need int32 = 1000 // counteract low timer resolution by requiring more samples
		fn := func() bool {
			if sawContention.Load() >= need {
				return false
			}
			runtime.Semacquire(&sem)
			runtime.Semrelease1(&sem, false, 0)
			if runtime.MutexContended(runtime.SemRootLock(&sem)) {
				sawContention.Add(1)
			}
			return true
		}

		stk := []string{
			"runtime.unlock",
			"runtime.semrelease1",
			"runtime_test.TestRuntimeLockMetricsAndProfile.func6.1",
			"runtime_test.(*contentionWorker).run",
		}

		// Verify that we get call stack we expect, with anything more than zero
		// nanoseconds / zero samples. The duration of each contention event is
		// too small relative to the expected overhead for us to verify its
		// value more directly. Leave that to the explicit lock/unlock test.

		testcase(stk, workers, fn)(t)
	})
}

// contentionWorker provides cleaner call stacks for lock contention profile tests
type contentionWorker struct {
	before func()
	fn     func() bool
	after  func()
}

func (w *contentionWorker) run() {
	defer w.after()
	w.before()

	for w.fn() {
	}
}
@@ -234,10 +234,6 @@ func newBucket(typ bucketType, nstk int) *bucket {
// stk returns the slice in b holding the stack.
func (b *bucket) stk() []uintptr {
	stk := (*[maxStack]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b)))
	if b.nstk > maxStack {
		// prove that slicing works; otherwise a failure requires a P
		throw("bad profile stack count")
	}
	return stk[:b.nstk:b.nstk]
}

@@ -513,237 +509,7 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
	} else {
		nstk = gcallers(gp.m.curg, skip, stk[:])
	}

	saveBlockEventStack(cycles, rate, stk[:nstk], which)
}

// lockTimer assists with profiling contention on runtime-internal locks.
//
// There are several steps between the time that an M experiences contention and
// when that contention may be added to the profile. This comes from our
// constraints: We need to keep the critical section of each lock small,
// especially when those locks are contended. The reporting code cannot acquire
// new locks until the M has released all other locks, which means no memory
// allocations and encourages use of (temporary) M-local storage.
//
// The M will have space for storing one call stack that caused contention, and
// for the magnitude of that contention. It will also have space to store the
// magnitude of additional contention the M caused, since it only has space to
// remember one call stack and might encounter several contention events before
// it releases all of its locks and is thus able to transfer the local buffer
// into the profile.
//
// The M will collect the call stack when it unlocks the contended lock. That
// minimizes the impact on the critical section of the contended lock, and
// matches the mutex profile's behavior for contention in sync.Mutex: measured
// at the Unlock method.
//
// The profile for contention on sync.Mutex blames the caller of Unlock for the
// amount of contention experienced by the callers of Lock which had to wait.
// When there are several critical sections, this allows identifying which of
// them is responsible.
//
// Matching that behavior for runtime-internal locks will require identifying
// which Ms are blocked on the mutex. The semaphore-based implementation is
// ready to allow that, but the futex-based implementation will require a bit
// more work. Until then, we report contention on runtime-internal locks with a
// call stack taken from the unlock call (like the rest of the user-space
// "mutex" profile), but assign it a duration value based on how long the
// previous lock call took (like the user-space "block" profile).
//
// Thus, reporting the call stacks of runtime-internal lock contention is
// guarded by GODEBUG for now. Set GODEBUG=profileruntimelocks=1 to enable.
//
// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
//
// The M will track this by storing a pointer to the lock; lock/unlock pairs for
// runtime-internal locks are always on the same M.
//
// Together, that demands several steps for recording contention. First, when
// finally acquiring a contended lock, the M decides whether it should plan to
// profile that event by storing a pointer to the lock in its "to be profiled
// upon unlock" field. If that field is already set, it uses the relative
// magnitudes to weight a random choice between itself and the other lock, with
// the loser's time being added to the "additional contention" field. Otherwise
// if the M's call stack buffer is occupied, it does the comparison against that
// sample's magnitude.
//
// Second, having unlocked a mutex the M checks to see if it should capture the
// call stack into its local buffer. Finally, when the M unlocks its last mutex,
// it transfers the local buffer into the profile. As part of that step, it also
// transfers any "additional contention" time to the profile. Any lock
// contention that it experiences while adding samples to the profile will be
// recorded later as "additional contention" and not include a call stack, to
// avoid an echo.
type lockTimer struct {
	lock      *mutex
	timeRate  int64
	timeStart int64
	tickStart int64
}

func (lt *lockTimer) begin() {
	rate := int64(atomic.Load64(&mutexprofilerate))

	lt.timeRate = gTrackingPeriod
	if rate != 0 && rate < lt.timeRate {
		lt.timeRate = rate
	}
	if int64(fastrand())%lt.timeRate == 0 {
		lt.timeStart = nanotime()
	}

	if rate > 0 && int64(fastrand())%rate == 0 {
		lt.tickStart = cputicks()
	}
}

func (lt *lockTimer) end() {
	gp := getg()

	if lt.timeStart != 0 {
		nowTime := nanotime()
		gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
	}

	if lt.tickStart != 0 {
		nowTick := cputicks()
		gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
	}
}

type mLockProfile struct {
	waitTime   atomic.Int64      // total nanoseconds spent waiting in runtime.lockWithRank
	stack      [maxStack]uintptr // stack that experienced contention in runtime.lockWithRank
	pending    uintptr           // *mutex that experienced contention (to be traceback-ed)
	cycles     int64             // cycles attributable to "pending" (if set), otherwise to "stack"
	cyclesLost int64             // contention for which we weren't able to record a call stack
	disabled   bool              // attribute all time to "lost"
}

func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
	if cycles <= 0 {
		return
	}

	if prof.disabled {
		// We're experiencing contention while attempting to report contention.
		// Make a note of its magnitude, but don't allow it to be the sole cause
		// of another contention report.
		prof.cyclesLost += cycles
		return
	}

	if uintptr(unsafe.Pointer(l)) == prof.pending {
		// Optimization: we'd already planned to profile this same lock (though
		// possibly from a different unlock site).
		prof.cycles += cycles
		return
	}

	if prev := prof.cycles; prev > 0 {
		// We can only store one call stack for runtime-internal lock contention
		// on this M, and we've already got one. Decide which should stay, and
		// add the other to the report for runtime._LostContendedLock.
		prevScore := fastrand64() % uint64(prev)
		thisScore := fastrand64() % uint64(cycles)
		if prevScore > thisScore {
			prof.cyclesLost += cycles
			return
		} else {
			prof.cyclesLost += prev
		}
	}
	// Saving the *mutex as a uintptr is safe because:
	// - lockrank_on.go does this too, which gives it regular exercise
	// - the lock would only move if it's stack allocated, which means it
	//   cannot experience multi-M contention
	prof.pending = uintptr(unsafe.Pointer(l))
	prof.cycles = cycles
}

// From unlock2, we might not be holding a p in this code.
//
//go:nowritebarrierrec
func (prof *mLockProfile) recordUnlock(l *mutex) {
	if uintptr(unsafe.Pointer(l)) == prof.pending {
		prof.captureStack()
	}
	if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
		prof.store()
	}
}

func (prof *mLockProfile) captureStack() {
	skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
	if staticLockRanking {
		// When static lock ranking is enabled, we'll always be on the system
		// stack at this point. There will be a runtime.unlockWithRank.func1
		// frame, and if the call to runtime.unlock took place on a user stack
		// then there'll also be a runtime.systemstack frame. To keep stack
		// traces somewhat consistent whether or not static lock ranking is
		// enabled, we'd like to skip those. But it's hard to tell how long
		// we've been on the system stack so accept an extra frame in that case,
		// with a leaf of "runtime.unlockWithRank runtime.unlock" instead of
		// "runtime.unlock".
		skip += 1 // runtime.unlockWithRank.func1
	}
	prof.pending = 0

	if debug.profileruntimelocks.Load() == 0 {
		prof.stack[0] = abi.FuncPCABIInternal(_LostContendedLock) + sys.PCQuantum
		prof.stack[1] = 0
		return
	}

	var nstk int
	gp := getg()
	sp := getcallersp()
	pc := getcallerpc()
	systemstack(func() {
		var u unwinder
		u.initAt(pc, sp, 0, gp, unwindSilentErrors|unwindJumpStack)
		nstk = tracebackPCs(&u, skip, prof.stack[:])
	})
	if nstk < len(prof.stack) {
		prof.stack[nstk] = 0
	}
}

func (prof *mLockProfile) store() {
	// Report any contention we experience within this function as "lost"; it's
	// important that the act of reporting a contention event not lead to a
	// reportable contention event. This also means we can use prof.stack
	// without copying, since it won't change during this function.
	mp := acquirem()
	prof.disabled = true

	nstk := maxStack
	for i := 0; i < nstk; i++ {
		if pc := prof.stack[i]; pc == 0 {
			nstk = i
			break
		}
	}

	cycles, lost := prof.cycles, prof.cyclesLost
	prof.cycles, prof.cyclesLost = 0, 0

	rate := int64(atomic.Load64(&mutexprofilerate))
	saveBlockEventStack(cycles, rate, prof.stack[:nstk], mutexProfile)
	if lost > 0 {
		lostStk := [...]uintptr{
			abi.FuncPCABIInternal(_LostContendedLock) + sys.PCQuantum,
		}
		saveBlockEventStack(lost, rate, lostStk[:], mutexProfile)
	}

	prof.disabled = false
	releasem(mp)
}

func saveBlockEventStack(cycles, rate int64, stk []uintptr, which bucketType) {
	b := stkbucket(which, 0, stk, true)
	b := stkbucket(which, 0, stk[:nstk], true)
	bp := b.bp()

	lock(&profBlockLock)
@@ -919,7 +919,7 @@ func mcommoninit(mp *m, id int64) {
	// when it is just in a register or thread-local storage.
	mp.alllink = allm

	// NumCgoCall() and others iterate over allm w/o schedlock,
	// NumCgoCall() iterates over allm w/o schedlock,
	// so we need to publish it safely.
	atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
	unlock(&sched.lock)
@@ -1852,7 +1852,6 @@ found:
	unlock(&sched.lock)

	atomic.Xadd64(&ncgocall, int64(mp.ncgocall))
	sched.totalRuntimeLockWaitTime.Add(mp.mLockProfile.waitTime.Load())

	// Release the P.
	handoffp(releasep())
@@ -5271,7 +5270,6 @@ func _ExternalCode() { _ExternalCode() }
func _LostExternalCode() { _LostExternalCode() }
func _GC() { _GC() }
func _LostSIGPROFDuringAtomic64() { _LostSIGPROFDuringAtomic64() }
func _LostContendedLock() { _LostContendedLock() }
func _VDSO() { _VDSO() }

// Called if we receive a SIGPROF signal.
@@ -307,27 +307,26 @@ type dbgVar struct {
// existing int var for that value, which may
// already have an initial value.
var debug struct {
	cgocheck int32
	clobberfree int32
	dontfreezetheworld int32
	efence int32
	gccheckmark int32
	gcpacertrace int32
	gcshrinkstackoff int32
	gcstoptheworld int32
	gctrace int32
	invalidptr int32
	madvdontneed int32 // for Linux; issue 28466
	profileruntimelocks atomic.Int32
	scavtrace int32
	scheddetail int32
	schedtrace int32
	tracebackancestors int32
	asyncpreemptoff int32
	harddecommit int32
	adaptivestackstart int32
	tracefpunwindoff int32
	traceadvanceperiod int32
	cgocheck int32
	clobberfree int32
	dontfreezetheworld int32
	efence int32
	gccheckmark int32
	gcpacertrace int32
	gcshrinkstackoff int32
	gcstoptheworld int32
	gctrace int32
	invalidptr int32
	madvdontneed int32 // for Linux; issue 28466
	scavtrace int32
	scheddetail int32
	schedtrace int32
	tracebackancestors int32
	asyncpreemptoff int32
	harddecommit int32
	adaptivestackstart int32
	tracefpunwindoff int32
	traceadvanceperiod int32

	// debug.malloc is used as a combined debug check
	// in the malloc function and should be set
@@ -353,7 +352,6 @@ var dbgvars = []*dbgVar{
	{name: "gctrace", value: &debug.gctrace},
	{name: "invalidptr", value: &debug.invalidptr},
	{name: "madvdontneed", value: &debug.madvdontneed},
	{name: "profileruntimelocks", atomic: &debug.profileruntimelocks},
	{name: "sbrk", value: &debug.sbrk},
	{name: "scavtrace", value: &debug.scavtrace},
	{name: "scheddetail", value: &debug.scheddetail},
@@ -593,8 +593,6 @@ type m struct {
	lockedInt uint32 // tracking for internal lockOSThread
	nextwaitm muintptr // next m waiting for lock

	mLockProfile mLockProfile // fields relating to runtime.lock contention

	// wait* are used to carry arguments from gopark into park_m, because
	// there's no stack to put them on. That is their sole purpose.
	waitunlockf func(*g, unsafe.Pointer) bool
@@ -902,12 +900,6 @@ type schedt struct {
	// stwTotalTimeOther covers the others.
	stwTotalTimeGC timeHistogram
	stwTotalTimeOther timeHistogram

	// totalRuntimeLockWaitTime (plus the value of lockWaitTime on each M in
	// allm) is the sum of time goroutines have spent in _Grunnable and with an
	// M, but waiting for locks within the runtime. This field stores the value
	// for Ms that have exited.
	totalRuntimeLockWaitTime atomic.Int64
}

// Values for the flags field of a sigTabT.
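The totalRuntimeLockWaitTime comment above describes the same aggregation scheme that totalMutexWaitTimeNanos (removed at the top of this diff) implements: live Ms carry per-M counters, a scheduler-global counter absorbs the totals of Ms that exit, and a reader sums both. A standalone sketch of that pattern, with hypothetical names and outside the runtime:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// worker plays the role of an M with its own wait-time counter.
type worker struct {
	waitNanos atomic.Int64 // analogous to m.mLockProfile.waitTime
}

// pool plays the role of the scheduler-global state.
type pool struct {
	mu          sync.Mutex
	live        []*worker
	exitedNanos atomic.Int64 // analogous to sched.totalRuntimeLockWaitTime
}

// retire folds an exiting worker's total into the global counter so the
// value is not lost once the worker is gone.
func (p *pool) retire(w *worker) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.exitedNanos.Add(w.waitNanos.Load())
	for i, x := range p.live {
		if x == w {
			p.live = append(p.live[:i], p.live[i+1:]...)
			break
		}
	}
}

// totalWaitNanos sums the global counter plus every live worker's counter,
// mirroring what a metrics reader does.
func (p *pool) totalWaitNanos() int64 {
	p.mu.Lock()
	defer p.mu.Unlock()
	total := p.exitedNanos.Load()
	for _, w := range p.live {
		total += w.waitNanos.Load()
	}
	return total
}

func main() {
	p := &pool{}
	w := &worker{}
	p.live = append(p.live, w)
	w.waitNanos.Add(1500)
	p.retire(w)
	fmt.Println(p.totalWaitNanos()) // 1500
}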