runtime: only sleep before stealing work from a running P
The sleep in question does not make sense if the stolen-from P cannot run
the stolen G. The usleep(3) has been observed delaying execution of woken
G's by ~60us; skipping it reduces the wakeup-to-execution latency to ~7us
in these cases, improving CPU utilization.

Benchmarks added by this change:

name                             old time/op  new time/op  delta
WakeupParallelSpinning/0s-12     14.4µs ± 1%  14.3µs ± 1%     ~     (p=0.227 n=19+20)
WakeupParallelSpinning/1µs-12    18.3µs ± 0%  18.3µs ± 1%     ~     (p=0.950 n=20+19)
WakeupParallelSpinning/2µs-12    22.3µs ± 1%  22.3µs ± 1%     ~     (p=0.670 n=20+18)
WakeupParallelSpinning/5µs-12    31.7µs ± 0%  31.7µs ± 0%     ~     (p=0.460 n=20+17)
WakeupParallelSpinning/10µs-12   51.8µs ± 0%  51.8µs ± 0%     ~     (p=0.883 n=20+20)
WakeupParallelSpinning/20µs-12   91.9µs ± 0%  91.9µs ± 0%     ~     (p=0.245 n=20+20)
WakeupParallelSpinning/50µs-12    214µs ± 0%   214µs ± 0%     ~     (p=0.509 n=19+20)
WakeupParallelSpinning/100µs-12   335µs ± 0%   335µs ± 0%   -0.05%  (p=0.006 n=17+15)
WakeupParallelSyscall/0s-12       228µs ± 2%   129µs ± 1%  -43.32%  (p=0.000 n=20+19)
WakeupParallelSyscall/1µs-12      232µs ± 1%   131µs ± 1%  -43.60%  (p=0.000 n=19+20)
WakeupParallelSyscall/2µs-12      236µs ± 1%   133µs ± 1%  -43.44%  (p=0.000 n=18+19)
WakeupParallelSyscall/5µs-12      248µs ± 2%   139µs ± 1%  -43.68%  (p=0.000 n=18+19)
WakeupParallelSyscall/10µs-12     263µs ± 3%   150µs ± 2%  -42.97%  (p=0.000 n=18+20)
WakeupParallelSyscall/20µs-12     281µs ± 2%   170µs ± 1%  -39.43%  (p=0.000 n=19+19)
WakeupParallelSyscall/50µs-12     345µs ± 4%   246µs ± 7%  -28.85%  (p=0.000 n=20+20)
WakeupParallelSyscall/100µs-12    460µs ± 5%   350µs ± 4%  -23.85%  (p=0.000 n=20+20)

Benchmarks associated with the change that originally added this sleep
(see https://golang.org/s/go15gomaxprocs):

name        old time/op  new time/op  delta
Chain       19.4µs ± 2%  19.3µs ± 1%    ~     (p=0.101 n=19+20)
ChainBuf    19.5µs ± 2%  19.4µs ± 2%    ~     (p=0.840 n=19+19)
Chain-2     19.9µs ± 1%  19.9µs ± 2%    ~     (p=0.734 n=19+19)
ChainBuf-2  20.0µs ± 2%  20.0µs ± 2%    ~     (p=0.175 n=19+17)
Chain-4     20.3µs ± 1%  20.1µs ± 1%  -0.62%  (p=0.010 n=19+18)
ChainBuf-4  20.3µs ± 1%  20.2µs ± 1%  -0.52%  (p=0.023 n=19+19)
Powser       2.09s ± 1%   2.10s ± 3%    ~     (p=0.908 n=19+19)
Powser-2     2.21s ± 1%   2.20s ± 1%  -0.35%  (p=0.010 n=19+18)
Powser-4     2.31s ± 2%   2.31s ± 2%    ~     (p=0.578 n=18+19)
Sieve        13.6s ± 1%   13.6s ± 1%    ~     (p=0.909 n=17+18)
Sieve-2      8.02s ±52%   7.28s ±15%    ~     (p=0.336 n=20+16)
Sieve-4      4.00s ±35%   3.98s ±26%    ~     (p=0.654 n=20+18)

Change-Id: I58edd8ce01075859d871e2348fc0833e9c01f70f
Reviewed-on: https://go-review.googlesource.com/78538
Reviewed-by: Austin Clements <austin@google.com>
This commit is contained in:
parent 2951f909ea
commit 868c8b374d
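
The latency figures in the commit message can be pictured with a small standalone program. This is only an illustrative sketch, not part of the commit: the identifier names and the 50µs sleep are arbitrary, and syscall.Nanosleep assumes a Unix-like system (the added benchmark below makes the same assumption). One goroutine readies another and then immediately blocks its thread in a syscall, so the readied goroutine has to be picked up and run elsewhere; the printed duration roughly approximates the wakeup-to-execution latency described above.

package main

import (
	"fmt"
	"syscall"
	"time"
)

func main() {
	readied := make(chan time.Time, 1) // buffered: the send readies the receiver without blocking
	measured := make(chan time.Duration)

	go func() {
		sent := <-readied
		measured <- time.Since(sent) // time from being readied to actually running
	}()
	time.Sleep(time.Millisecond) // crude, but makes it likely the goroutine is blocked on <-readied

	readied <- time.Now() // ready the other goroutine...
	ts := syscall.NsecToTimespec((50 * time.Microsecond).Nanoseconds())
	syscall.Nanosleep(&ts, &ts) // ...then immediately block this thread in a syscall

	fmt.Println("approximate wakeup-to-execution latency:", <-measured)
}

With spare Ps available, the printed value is dominated by how quickly another P picks up and runs the readied goroutine, which is the path this change shortens.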
@@ -4773,22 +4773,25 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool
 			if stealRunNextG {
 				// Try to steal from _p_.runnext.
 				if next := _p_.runnext; next != 0 {
-					// Sleep to ensure that _p_ isn't about to run the g we
-					// are about to steal.
-					// The important use case here is when the g running on _p_
-					// ready()s another g and then almost immediately blocks.
-					// Instead of stealing runnext in this window, back off
-					// to give _p_ a chance to schedule runnext. This will avoid
-					// thrashing gs between different Ps.
-					// A sync chan send/recv takes ~50ns as of time of writing,
-					// so 3us gives ~50x overshoot.
-					if GOOS != "windows" {
-						usleep(3)
-					} else {
-						// On windows system timer granularity is 1-15ms,
-						// which is way too much for this optimization.
-						// So just yield.
-						osyield()
+					if _p_.status == _Prunning {
+						// Sleep to ensure that _p_ isn't about to run the g
+						// we are about to steal.
+						// The important use case here is when the g running
+						// on _p_ ready()s another g and then almost
+						// immediately blocks. Instead of stealing runnext
+						// in this window, back off to give _p_ a chance to
+						// schedule runnext. This will avoid thrashing gs
+						// between different Ps.
+						// A sync chan send/recv takes ~50ns as of time of
+						// writing, so 3us gives ~50x overshoot.
+						if GOOS != "windows" {
+							usleep(3)
+						} else {
+							// On windows system timer granularity is
+							// 1-15ms, which is way too much for this
+							// optimization. So just yield.
+							osyield()
+						}
 					}
 					if !_p_.runnext.cas(next, 0) {
 						continue
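
The comment above calibrates the 3µs backoff against the cost of a synchronous channel send/recv (~50ns at the time of writing). Below is a minimal sketch of how that figure can be estimated on a given machine, assuming a hypothetical chanbench_test.go file that is not part of this change; each iteration is a full round trip, i.e. two send/recv pairs plus the associated goroutine wakeups, so the per-pair cost is at most roughly half of the reported ns/op.

package chanbench_test

import "testing"

// BenchmarkSyncChanPingPong bounces a token between two goroutines over
// unbuffered channels: one send/recv pair in each direction per iteration.
func BenchmarkSyncChanPingPong(b *testing.B) {
	ping, pong := make(chan struct{}), make(chan struct{})
	go func() {
		for range ping {
			pong <- struct{}{}
		}
	}()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		ping <- struct{}{}
		<-pong
	}
	close(ping) // lets the echoing goroutine exit
}

Run it with go test -bench=SyncChanPingPong in that package.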
@@ -655,6 +655,114 @@ func BenchmarkClosureCall(b *testing.B) {
 	_ = sum
 }
 
+func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) {
+	if runtime.GOMAXPROCS(0) == 1 {
+		b.Skip("skipping: GOMAXPROCS=1")
+	}
+
+	wakeDelay := 5 * time.Microsecond
+	for _, delay := range []time.Duration{
+		0,
+		1 * time.Microsecond,
+		2 * time.Microsecond,
+		5 * time.Microsecond,
+		10 * time.Microsecond,
+		20 * time.Microsecond,
+		50 * time.Microsecond,
+		100 * time.Microsecond,
+	} {
+		b.Run(delay.String(), func(b *testing.B) {
+			if b.N == 0 {
+				return
+			}
+			// Start two goroutines, which alternate between being
+			// sender and receiver in the following protocol:
+			//
+			// - The receiver spins for `delay` and then does a
+			// blocking receive on a channel.
+			//
+			// - The sender spins for `delay+wakeDelay` and then
+			// sends to the same channel. (The addition of
+			// `wakeDelay` improves the probability that the
+			// receiver will be blocking when the send occurs when
+			// the goroutines execute in parallel.)
+			//
+			// In each iteration of the benchmark, each goroutine
+			// acts once as sender and once as receiver, so each
+			// goroutine spins for delay twice.
+			//
+			// BenchmarkWakeupParallel is used to estimate how
+			// efficiently the scheduler parallelizes goroutines in
+			// the presence of blocking:
+			//
+			// - If both goroutines are executed on the same core,
+			// an increase in delay by N will increase the time per
+			// iteration by 4*N, because all 4 delays are
+			// serialized.
+			//
+			// - Otherwise, an increase in delay by N will increase
+			// the time per iteration by 2*N, and the time per
+			// iteration is 2 * (runtime overhead + chan
+			// send/receive pair + delay + wakeDelay). This allows
+			// the runtime overhead, including the time it takes
+			// for the unblocked goroutine to be scheduled, to be
+			// estimated.
+			ping, pong := make(chan struct{}), make(chan struct{})
+			start := make(chan struct{})
+			done := make(chan struct{})
+			go func() {
+				<-start
+				for i := 0; i < b.N; i++ {
+					// sender
+					spin(delay + wakeDelay)
+					ping <- struct{}{}
+					// receiver
+					spin(delay)
+					<-pong
+				}
+				done <- struct{}{}
+			}()
+			go func() {
+				for i := 0; i < b.N; i++ {
+					// receiver
+					spin(delay)
+					<-ping
+					// sender
+					spin(delay + wakeDelay)
+					pong <- struct{}{}
+				}
+				done <- struct{}{}
+			}()
+			b.ResetTimer()
+			start <- struct{}{}
+			<-done
+			<-done
+		})
+	}
+}
+
+func BenchmarkWakeupParallelSpinning(b *testing.B) {
+	benchmarkWakeupParallel(b, func(d time.Duration) {
+		end := time.Now().Add(d)
+		for time.Now().Before(end) {
+			// do nothing
+		}
+	})
+}
+
+func BenchmarkWakeupParallelSyscall(b *testing.B) {
+	benchmarkWakeupParallel(b, func(d time.Duration) {
+		// Invoke a blocking syscall directly; calling time.Sleep()
+		// would deschedule the goroutine instead.
+		ts := syscall.NsecToTimespec(d.Nanoseconds())
+		for {
+			if err := syscall.Nanosleep(&ts, &ts); err != syscall.EINTR {
+				return
+			}
+		}
+	})
+}
+
 type Matrix [][]float64
 
 func BenchmarkMatmult(b *testing.B) {
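
The added benchmarks live in the runtime test suite and can be run with go test -run=NONE -bench=WakeupParallel runtime; the tables in the commit message are in benchstat's old/new comparison format. The same wakeup-parallel pattern can also be driven programmatically with testing.Benchmark, for example to gauge scheduler wakeup overhead in an application's own environment. The sketch below is not from the Go tree; spinFor and the 5µs delay and wakeDelay values are arbitrary choices, and only the spinning variant is shown.

package main

import (
	"fmt"
	"testing"
	"time"
)

// spinFor busy-waits for roughly d without yielding the processor.
func spinFor(d time.Duration) {
	end := time.Now().Add(d)
	for time.Now().Before(end) {
	}
}

func main() {
	const delay = 5 * time.Microsecond
	const wakeDelay = 5 * time.Microsecond
	res := testing.Benchmark(func(b *testing.B) {
		ping, pong := make(chan struct{}), make(chan struct{})
		done := make(chan struct{})
		go func() {
			// Sender first, then receiver, mirroring the benchmark above.
			for i := 0; i < b.N; i++ {
				spinFor(delay + wakeDelay)
				ping <- struct{}{}
				spinFor(delay)
				<-pong
			}
			done <- struct{}{}
		}()
		b.ResetTimer()
		// Receiver first, then sender.
		for i := 0; i < b.N; i++ {
			spinFor(delay)
			<-ping
			spinFor(delay + wakeDelay)
			pong <- struct{}{}
		}
		<-done
	})
	// Per iteration: two spins on each side plus one send/recv pair each way.
	fmt.Println(res.NsPerOp(), "ns/op")
}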