runtime: benchmark mutex handoffs

The speed of handing off a mutex to a waiting thread is sensitive to the configuration of the spinning section of lock2. Measure that latency directly, to complement our existing benchmarks of mutex throughput. For #68578 Change-Id: I7637684bcff62eb05cc008491f095f653d13af4b Reviewed-on: https://go-review.googlesource.com/c/go/+/602176 Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> Auto-Submit: Rhys Hiltner <rhys.hiltner@gmail.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-11-11 19:21:37 -07:00 · 2024-07-31 13:45:53 -07:00 · 2024-07-31 13:45:53 -07:00 · e8776e19b9
commit e8776e19b9
parent aac7106cb9
1 changed files with 110 additions and 0 deletions
--- a/src/runtime/runtime_test.go
+++ b/src/runtime/runtime_test.go
@ -7,6 +7,8 @@ package runtime_test
 import (
 	"flag"
 	"fmt"
 	"internal/cpu"
 	"internal/runtime/atomic"
 	"io"
 	. "runtime"
 	"runtime/debug"
@ -561,3 +563,111 @@ func BenchmarkOSYield(b *testing.B) {
 		OSYield()
 	}
 }
 func BenchmarkMutexHandoff(b *testing.B) {
 	testcase := func(delay func(l *Mutex)) func(b *testing.B) {
 		return func(b *testing.B) {
 			if workers := 2; GOMAXPROCS(0) < workers {
 				b.Skipf("requires GOMAXPROCS >= %d", workers)
 			}
 			// Measure latency of mutex handoff between threads.
 			//
 			// Hand off a runtime.mutex between two threads, one running a
 			// "coordinator" goroutine and the other running a "worker"
 			// goroutine. We don't override the runtime's typical
 			// goroutine/thread mapping behavior.
 			//
 			// Measure the latency, starting when the coordinator enters a call
 			// to runtime.unlock and ending when the worker's call to
 			// runtime.lock returns. The benchmark can specify a "delay"
 			// function to simulate the length of the mutex-holder's critical
 			// section, including to arrange for the worker's thread to be in
 			// either the "spinning" or "sleeping" portions of the runtime.lock2
 			// implementation. Measurement starts after any such "delay".
 			//
 			// The two threads' goroutines communicate their current position to
 			// each other in a non-blocking way via the "turn" state.
 			var state struct {
 				_    [cpu.CacheLinePadSize]byte
 				lock Mutex
 				_    [cpu.CacheLinePadSize]byte
 				turn atomic.Int64
 				_    [cpu.CacheLinePadSize]byte
 			}
 			var delta atomic.Int64
 			var wg sync.WaitGroup
 			// coordinator:
 			//  - acquire the mutex
 			//  - set the turn to 2 mod 4, instructing the worker to begin its Lock call
 			//  - wait until the mutex is contended
 			//  - wait a bit more so the worker can commit to its sleep
 			//  - release the mutex and wait for it to be our turn (0 mod 4) again
 			wg.Add(1)
 			go func() {
 				defer wg.Done()
 				var t int64
 				for range b.N {
 					Lock(&state.lock)
 					state.turn.Add(2)
 					delay(&state.lock)
 					t -= Nanotime() // start the timer
 					Unlock(&state.lock)
 					for state.turn.Load()&0x2 != 0 {
 					}
 				}
 				state.turn.Add(1)
 				delta.Add(t)
 			}()
 			// worker:
 			//  - wait until its our turn (2 mod 4)
 			//  - acquire and release the mutex
 			//  - switch the turn counter back to the coordinator (0 mod 4)
 			wg.Add(1)
 			go func() {
 				defer wg.Done()
 				var t int64
 				for {
 					switch state.turn.Load() & 0x3 {
 					case 0:
 					case 1, 3:
 						delta.Add(t)
 						return
 					case 2:
 						Lock(&state.lock)
 						t += Nanotime() // stop the timer
 						Unlock(&state.lock)
 						state.turn.Add(2)
 					}
 				}
 			}()
 			wg.Wait()
 			b.ReportMetric(float64(delta.Load())/float64(b.N), "ns/op")
 		}
 	}
 	b.Run("Solo", func(b *testing.B) {
 		var lock Mutex
 		for range b.N {
 			Lock(&lock)
 			Unlock(&lock)
 		}
 	})
 	b.Run("FastPingPong", testcase(func(l *Mutex) {}))
 	b.Run("SlowPingPong", testcase(func(l *Mutex) {
 		// Wait for the worker to stop spinning and prepare to sleep
 		for !MutexContended(l) {
 		}
 		// Wait a bit longer so the OS can finish committing the worker to its
 		// sleep. Balance consistency against getting enough iterations.
 		const extraNs = 10e3
 		for t0 := Nanotime(); Nanotime()-t0 < extraNs; {
 		}
 	}))
 }