sync: add active spinning to Mutex

Currently sync.Mutex is fully cooperative. That is, once contention is discovered, the goroutine calls into scheduler. This is suboptimal as the resource can become free soon after (especially if critical sections are short). Server software usually runs at ~~50% CPU utilization, that is, switching to other goroutines is not necessary profitable. This change adds limited active spinning to sync.Mutex if: 1. running on a multicore machine and 2. GOMAXPROCS>1 and 3. there is at least one other running P and 4. local runq is empty. As opposed to runtime mutex we don't do passive spinning, because there can be work on global runq on on other Ps. benchmark old ns/op new ns/op delta BenchmarkMutexNoSpin 1271 1272 +0.08% BenchmarkMutexNoSpin-2 702 683 -2.71% BenchmarkMutexNoSpin-4 377 372 -1.33% BenchmarkMutexNoSpin-8 197 190 -3.55% BenchmarkMutexNoSpin-16 131 122 -6.87% BenchmarkMutexNoSpin-32 170 164 -3.53% BenchmarkMutexSpin 4724 4728 +0.08% BenchmarkMutexSpin-2 2501 2491 -0.40% BenchmarkMutexSpin-4 1330 1325 -0.38% BenchmarkMutexSpin-8 684 684 +0.00% BenchmarkMutexSpin-16 414 372 -10.14% BenchmarkMutexSpin-32 559 469 -16.10% BenchmarkMutex 19.1 19.1 +0.00% BenchmarkMutex-2 81.6 54.3 -33.46% BenchmarkMutex-4 143 100 -30.07% BenchmarkMutex-8 154 156 +1.30% BenchmarkMutex-16 140 159 +13.57% BenchmarkMutex-32 141 163 +15.60% BenchmarkMutexSlack 33.3 31.2 -6.31% BenchmarkMutexSlack-2 122 97.7 -19.92% BenchmarkMutexSlack-4 168 158 -5.95% BenchmarkMutexSlack-8 152 158 +3.95% BenchmarkMutexSlack-16 140 159 +13.57% BenchmarkMutexSlack-32 146 162 +10.96% BenchmarkMutexWork 154 154 +0.00% BenchmarkMutexWork-2 89.2 89.9 +0.78% BenchmarkMutexWork-4 139 86.1 -38.06% BenchmarkMutexWork-8 177 162 -8.47% BenchmarkMutexWork-16 170 173 +1.76% BenchmarkMutexWork-32 176 176 +0.00% BenchmarkMutexWorkSlack 160 160 +0.00% BenchmarkMutexWorkSlack-2 103 99.1 -3.79% BenchmarkMutexWorkSlack-4 155 148 -4.52% BenchmarkMutexWorkSlack-8 176 170 -3.41% BenchmarkMutexWorkSlack-16 170 173 +1.76% BenchmarkMutexWorkSlack-32 175 176 +0.57% "No work" benchmarks are not very interesting (BenchmarkMutex and BenchmarkMutexSlack), as they are absolutely not realistic. Fixes #8889 Change-Id: I6f14f42af1fa48f73a776fdd11f0af6dd2bb428b Reviewed-on: https://go-review.googlesource.com/5430 Reviewed-by: Rick Hudson <rlh@golang.org> Run-TryBot: Dmitry Vyukov <dvyukov@google.com>
2024-11-18 11:55:01 -07:00 · 2015-02-20 11:50:56 +03:00 · 2015-02-20 11:50:56 +03:00 · edcad8639a
commit edcad8639a
parent 3dd029aa7e
4 changed files with 103 additions and 0 deletions
--- a/src/runtime/proc1.go
+++ b/src/runtime/proc1.go
@ -3332,3 +3332,27 @@ func sync_atomic_runtime_procPin() int {
 func sync_atomic_runtime_procUnpin() {
 	procUnpin()
 }
 // Active spinning for sync.Mutex.
 //go:linkname sync_runtime_canSpin sync.runtime_canSpin
 //go:nosplit
 func sync_runtime_canSpin(i int) bool {
 	// sync.Mutex is cooperative, so we are conservative with spinning.
 	// Spin only few times and only if running on a multicore machine and
 	// GOMAXPROCS>1 and there is at least one other running P and local runq is empty.
 	// As opposed to runtime mutex we don't do passive spinning here,
 	// because there can be work on global runq on on other Ps.
 	if i >= active_spin || ncpu <= 1 || gomaxprocs <= int32(sched.npidle+sched.nmspinning)+1 {
 		return false
 	}
 	if p := getg().m.p; p.runqhead != p.runqtail {
 		return false
 	}
 	return true
 }
 //go:linkname sync_runtime_doSpin sync.runtime_doSpin
 //go:nosplit
 func sync_runtime_doSpin() {
 	procyield(active_spin_cnt)
 }
--- a/src/sync/mutex.go
+++ b/src/sync/mutex.go
@ -48,15 +48,31 @@ func (m *Mutex) Lock() {
 	}
 	awoke := false
 	iter := 0
 	for {
 		old := m.state
 		new := old | mutexLocked
 		if old&mutexLocked != 0 {
 			if runtime_canSpin(iter) {
 				// Active spinning makes sense.
 				// Try to set mutexWoken flag to inform Unlock
 				// to not wake other blocked goroutines.
 				if !awoke && old&mutexWoken == 0 && old>>mutexWaiterShift != 0 &&
 					atomic.CompareAndSwapInt32(&m.state, old, old|mutexWoken) {
 					awoke = true
 				}
 				runtime_doSpin()
 				iter++
 				continue
 			}
 			new = old + 1<<mutexWaiterShift
 		}
 		if awoke {
 			// The goroutine has been woken from sleep,
 			// so we need to reset the flag in either case.
 			if new&mutexWoken == 0 {
 				panic("sync: inconsistent mutex state")
 			}
 			new &^= mutexWoken
 		}
 		if atomic.CompareAndSwapInt32(&m.state, old, new) {
@ -65,6 +81,7 @@ func (m *Mutex) Lock() {
 			}
 			runtime_Semacquire(&m.sema)
 			awoke = true
 			iter = 0
 		}
 	}
--- a/src/sync/mutex_test.go
+++ b/src/sync/mutex_test.go
@ -134,3 +134,58 @@ func BenchmarkMutexWork(b *testing.B) {
 func BenchmarkMutexWorkSlack(b *testing.B) {
 	benchmarkMutex(b, true, true)
 }
 func BenchmarkMutexNoSpin(b *testing.B) {
 	// This benchmark models a situation where spinning in the mutex should be
 	// non-profitable and allows to confirm that spinning does not do harm.
 	// To achieve this we create excess of goroutines most of which do local work.
 	// These goroutines yield during local work, so that switching from
 	// a blocked goroutine to other goroutines is profitable.
 	// As a matter of fact, this benchmark still triggers some spinning in the mutex.
 	var m Mutex
 	var acc0, acc1 uint64
 	b.SetParallelism(4)
 	b.RunParallel(func(pb *testing.PB) {
 		c := make(chan bool)
 		var data [4 << 10]uint64
 		for i := 0; pb.Next(); i++ {
 			if i%4 == 0 {
 				m.Lock()
 				acc0 -= 100
 				acc1 += 100
 				m.Unlock()
 			} else {
 				for i := 0; i < len(data); i += 4 {
 					data[i]++
 				}
 				// Elaborate way to say runtime.Gosched
 				// that does not put the goroutine onto global runq.
 				go func() {
 					c <- true
 				}()
 				<-c
 			}
 		}
 	})
 }
 func BenchmarkMutexSpin(b *testing.B) {
 	// This benchmark models a situation where spinning in the mutex should be
 	// profitable. To achieve this we create a goroutine per-proc.
 	// These goroutines access considerable amount of local data so that
 	// unnecessary rescheduling is penalized by cache misses.
 	var m Mutex
 	var acc0, acc1 uint64
 	b.RunParallel(func(pb *testing.PB) {
 		var data [16 << 10]uint64
 		for i := 0; pb.Next(); i++ {
 			m.Lock()
 			acc0 -= 100
 			acc1 += 100
 			m.Unlock()
 			for i := 0; i < len(data); i += 4 {
 				data[i]++
 			}
 		}
 	})
 }
--- a/src/sync/runtime.go
+++ b/src/sync/runtime.go
@ -38,3 +38,10 @@ func init() {
 	var s syncSema
 	runtime_Syncsemcheck(unsafe.Sizeof(s))
 }
 // Active spinning runtime support.
 // runtime_canSpin returns true is spinning makes sense at the moment.
 func runtime_canSpin(i int) bool
 // runtime_doSpin does active spinning.
 func runtime_doSpin()