From c8ccbcdde595bd04056398bec72532a8582f7442 Mon Sep 17 00:00:00 2001
From: Rhys Hiltner <rhys.hiltner@gmail.com>
Date: Fri, 2 Aug 2024 15:31:22 -0700
Subject: [PATCH] runtime: add direct benchmark of mutex contention

Measure throughput of a single mutex with all threads contending. Do not
attempt to measure fairness/starvation.

The ChanContended benchmark works somewhat well for this (interacting
with the mutex is a large contributor to its results), but it's better
to be clear about what we're attempting to measure.

For #68578

Change-Id: Ie397b4c363bfcd5afddf796a81cd6c34ebf8551b
Reviewed-on: https://go-review.googlesource.com/c/go/+/604375
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Rhys Hiltner <rhys.hiltner@gmail.com>
---
 src/runtime/runtime_test.go | 48 ++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/src/runtime/runtime_test.go b/src/runtime/runtime_test.go
index 6004649ee92..c24f725c0e6 100644
--- a/src/runtime/runtime_test.go
+++ b/src/runtime/runtime_test.go
@@ -564,6 +564,48 @@ func BenchmarkOSYield(b *testing.B) {
 	}
 }
 
+func BenchmarkMutexContention(b *testing.B) {
+	// Measure throughput of a single mutex with all threads contending
+	//
+	// Share a single counter across all threads. Progress from any thread is
+	// progress for the benchmark as a whole. We don't measure or give points
+	// for fairness here, arbitrary delay to any given thread's progress is
+	// invisible and allowed.
+	//
+	// The cache line that holds the count value will need to move between
+	// processors, but not as often as the cache line that holds the mutex. The
+	// mutex protects access to the count value, which limits contention on that
+	// cache line. This is a simple design, but it helps to make the behavior of
+	// the benchmark clear. Most real uses of mutex will protect some number of
+	// cache lines anyway.
+
+	var state struct {
+		_     cpu.CacheLinePad
+		lock  Mutex
+		_     cpu.CacheLinePad
+		count atomic.Int64
+		_     cpu.CacheLinePad
+	}
+
+	procs := GOMAXPROCS(0)
+	var wg sync.WaitGroup
+	for range procs {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for {
+				Lock(&state.lock)
+				ours := state.count.Add(1)
+				Unlock(&state.lock)
+				if ours >= int64(b.N) {
+					return
+				}
+			}
+		}()
+	}
+	wg.Wait()
+}
+
 func BenchmarkMutexHandoff(b *testing.B) {
 	testcase := func(delay func(l *Mutex)) func(b *testing.B) {
 		return func(b *testing.B) {
@@ -590,11 +632,11 @@ func BenchmarkMutexHandoff(b *testing.B) {
 			// each other in a non-blocking way via the "turn" state.
 
 			var state struct {
-				_    [cpu.CacheLinePadSize]byte
+				_    cpu.CacheLinePad
 				lock Mutex
-				_    [cpu.CacheLinePadSize]byte
+				_    cpu.CacheLinePad
 				turn atomic.Int64
-				_    [cpu.CacheLinePadSize]byte
+				_    cpu.CacheLinePad
 			}
 
 			var delta atomic.Int64