1
0
mirror of https://github.com/golang/go synced 2024-11-06 22:46:14 -07:00

runtime: don't hold scheduler lock when calling timeSleepUntil

Otherwise, we can get into a deadlock: sysmon takes the scheduler lock
and calls timeSleepUntil which takes each P's timer lock. Simultaneously,
some P calls runtimer (holding the P's own timer lock) which wakes up
the scavenger, calling goready, calling wakep, calling startm, getting
the scheduler lock. Now the sysmon thread is holding the scheduler lock
and trying to get a P's timer lock, while some other thread running on
that P is holding the P's timer lock and trying to get the scheduler lock.

So change sysmon to call timeSleepUntil without holding the scheduler
lock, and change timeSleepUntil to use allpLock, which is only held for
limited periods of time and should never compete with timer locks.

This hopefully

Fixes #35375

At least it should fix the linux-arm64-packet builder problems,
which occurred more reliably as that system has GOMAXPROCS == 96,
giving a lot more scope for this deadlock.

Change-Id: I7a7917daf7a4882e0b27ca416e4f6300cfaaa774
Reviewed-on: https://go-review.googlesource.com/c/go/+/205558
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
This commit is contained in:
Ian Lance Taylor 2019-11-05 16:05:09 -08:00
parent cf3be9bbca
commit b50fcc88e9
2 changed files with 14 additions and 2 deletions

View File

@ -4452,10 +4452,10 @@ func sysmon() {
} }
usleep(delay) usleep(delay)
now := nanotime() now := nanotime()
next := timeSleepUntil()
if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) { if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
lock(&sched.lock) lock(&sched.lock)
if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) { if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
next := timeSleepUntil()
if next > now { if next > now {
atomic.Store(&sched.sysmonwait, 1) atomic.Store(&sched.sysmonwait, 1)
unlock(&sched.lock) unlock(&sched.lock)
@ -4474,6 +4474,7 @@ func sysmon() {
osRelax(false) osRelax(false)
} }
now = nanotime() now = nanotime()
next = timeSleepUntil()
lock(&sched.lock) lock(&sched.lock)
atomic.Store(&sched.sysmonwait, 0) atomic.Store(&sched.sysmonwait, 0)
noteclear(&sched.sysmonnote) noteclear(&sched.sysmonnote)
@ -4505,7 +4506,7 @@ func sysmon() {
incidlelocked(1) incidlelocked(1)
} }
} }
if timeSleepUntil() < now { if next < now {
// There are timers that should have already run, // There are timers that should have already run,
// perhaps because there is an unpreemptible P. // perhaps because there is an unpreemptible P.
// Try to start an M to run them. // Try to start an M to run them.

View File

@ -1234,6 +1234,8 @@ func timejumpLocked() *g {
return tb.gp return tb.gp
} }
// timeSleepUntil returns the time when the next timer should fire.
// This is only called by sysmon.
func timeSleepUntil() int64 { func timeSleepUntil() int64 {
if oldTimers { if oldTimers {
return timeSleepUntilOld() return timeSleepUntilOld()
@ -1241,7 +1243,15 @@ func timeSleepUntil() int64 {
next := int64(maxWhen) next := int64(maxWhen)
// Prevent allp slice changes. This is like retake.
lock(&allpLock)
for _, pp := range allp { for _, pp := range allp {
if pp == nil {
// This can happen if procresize has grown
// allp but not yet created new Ps.
continue
}
lock(&pp.timersLock) lock(&pp.timersLock)
c := atomic.Load(&pp.adjustTimers) c := atomic.Load(&pp.adjustTimers)
for _, t := range pp.timers { for _, t := range pp.timers {
@ -1276,6 +1286,7 @@ func timeSleepUntil() int64 {
} }
unlock(&pp.timersLock) unlock(&pp.timersLock)
} }
unlock(&allpLock)
return next return next
} }