diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index 0afe5d962b..859f36c914 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -250,6 +250,11 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
 	gp.waiting = mysg
 	gp.param = nil
 	c.sendq.enqueue(mysg)
+	// Signal to anyone trying to shrink our stack that we're about
+	// to park on a channel. The window between when this G's status
+	// changes and when we set gp.activeStackChans is not safe for
+	// stack shrinking.
+	atomic.Store8(&gp.parkingOnChan, 1)
 	gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanSend, traceEvGoBlockSend, 2)
 	// Ensure the value being sent is kept alive until the
 	// receiver copies it out. The sudog has a pointer to the
@@ -568,6 +573,11 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
 	mysg.c = c
 	gp.param = nil
 	c.recvq.enqueue(mysg)
+	// Signal to anyone trying to shrink our stack that we're about
+	// to park on a channel. The window between when this G's status
+	// changes and when we set gp.activeStackChans is not safe for
+	// stack shrinking.
+	atomic.Store8(&gp.parkingOnChan, 1)
 	gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanReceive, traceEvGoBlockRecv, 2)
 
 	// someone woke us up
@@ -646,7 +656,19 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
 func chanparkcommit(gp *g, chanLock unsafe.Pointer) bool {
 	// There are unlocked sudogs that point into gp's stack. Stack
 	// copying must lock the channels of those sudogs.
+	// Set activeStackChans here instead of before we try parking
+	// because we could self-deadlock in stack growth on the
+	// channel lock.
 	gp.activeStackChans = true
+	// Mark that it's safe for stack shrinking to occur now,
+	// because any thread acquiring this G's stack for shrinking
+	// is guaranteed to observe activeStackChans after this store.
+	atomic.Store8(&gp.parkingOnChan, 0)
+	// Make sure we unlock after setting activeStackChans and
+	// unsetting parkingOnChan. The moment we unlock chanLock
+	// we risk gp getting readied by a channel operation and
+	// so gp could continue running before everything before
+	// the unlock is visible (even to gp itself).
 	unlock((*mutex)(chanLock))
 	return true
 }
diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go
index 039a086e9b..756bbbeccf 100644
--- a/src/runtime/chan_test.go
+++ b/src/runtime/chan_test.go
@@ -623,6 +623,62 @@ func TestShrinkStackDuringBlockedSend(t *testing.T) {
 	<-done
 }
 
+func TestNoShrinkStackWhileParking(t *testing.T) {
+	// The goal of this test is to trigger a "racy sudog adjustment"
+	// throw. Basically, there's a window between when a goroutine
+	// becomes available for preemption for stack scanning (and thus,
+	// stack shrinking) and when the goroutine has fully parked on a
+	// channel. See issue 40641 for more details on the problem.
+	//
+	// The way we try to induce this failure is to set up two
+	// goroutines: a sender and a receiver that communicate across
+	// a channel. We try to set up a situation where the sender
+	// grows its stack temporarily then *fully* blocks on a channel
+	// often. Meanwhile a GC is triggered so that we try to get a
+	// mark worker to shrink the sender's stack and race with the
+	// sender parking.
+	//
+	// Unfortunately the race window here is so small that we
+	// either need a ridiculous number of iterations, or we add
+	// "usleep(1000)" to park_m, just before the unlockf call.
+	const n = 10
+	send := func(c chan<- int, done chan struct{}) {
+		for i := 0; i < n; i++ {
+			c <- i
+			// Use lots of stack briefly so that
+			// the GC is going to want to shrink us
+			// when it scans us. Make sure not to
+			// do any function calls otherwise
+			// in order to avoid us shrinking ourselves
+			// when we're preempted.
+			stackGrowthRecursive(20)
+		}
+		done <- struct{}{}
+	}
+	recv := func(c <-chan int, done chan struct{}) {
+		for i := 0; i < n; i++ {
+			// Sleep here so that the sender always
+			// fully blocks.
+			time.Sleep(10 * time.Microsecond)
+			<-c
+		}
+		done <- struct{}{}
+	}
+	for i := 0; i < n*20; i++ {
+		c := make(chan int)
+		done := make(chan struct{})
+		go recv(c, done)
+		go send(c, done)
+		// Wait a little bit before triggering
+		// the GC to make sure the sender and
+		// receiver have gotten into their groove.
+		time.Sleep(50 * time.Microsecond)
+		runtime.GC()
+		<-done
+		<-done
+	}
+}
+
 func TestSelectDuplicateChannel(t *testing.T) {
 	// This test makes sure we can queue a G on
 	// the same channel multiple times.
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index de4dec36ce..767bde15b4 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -523,9 +523,17 @@ func BenchmarkPingPongHog(b *testing.B) {
 	<-done
 }
 
+var padData [128]uint64
+
 func stackGrowthRecursive(i int) {
 	var pad [128]uint64
-	if i != 0 && pad[0] == 0 {
+	pad = padData
+	for j := range pad {
+		if pad[j] != 0 {
+			return
+		}
+	}
+	if i != 0 {
 		stackGrowthRecursive(i - 1)
 	}
 }
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index a3157037e7..9652b6a5a4 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -453,6 +453,10 @@ type g struct {
 	// copying needs to acquire channel locks to protect these
 	// areas of the stack.
 	activeStackChans bool
+	// parkingOnChan indicates that the goroutine is about to
+	// park on a chansend or chanrecv. Used to signal an unsafe point
+	// for stack shrinking. It's a boolean value, but is updated atomically.
+	parkingOnChan uint8
 
 	raceignore     int8     // ignore race detection events
 	sysblocktraced bool     // StartTrace has emitted EvGoInSyscall about this goroutine
diff --git a/src/runtime/select.go b/src/runtime/select.go
index a506747910..41e68a3746 100644
--- a/src/runtime/select.go
+++ b/src/runtime/select.go
@@ -7,6 +7,7 @@ package runtime
 // This file contains the implementation of Go select statements.
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -61,7 +62,20 @@ func selunlock(scases []scase, lockorder []uint16) {
 func selparkcommit(gp *g, _ unsafe.Pointer) bool {
 	// There are unlocked sudogs that point into gp's stack. Stack
 	// copying must lock the channels of those sudogs.
+	// Set activeStackChans here instead of before we try parking
+	// because we could self-deadlock in stack growth on a
+	// channel lock.
 	gp.activeStackChans = true
+	// Mark that it's safe for stack shrinking to occur now,
+	// because any thread acquiring this G's stack for shrinking
+	// is guaranteed to observe activeStackChans after this store.
+	atomic.Store8(&gp.parkingOnChan, 0)
+	// Make sure we unlock after setting activeStackChans and
+	// unsetting parkingOnChan. The moment we unlock any of the
+	// channel locks we risk gp getting readied by a channel operation
+	// and so gp could continue running before everything before the
+	// unlock is visible (even to gp itself).
+
 	// This must not access gp's stack (see gopark). In
 	// particular, it must not access the *hselect. That's okay,
 	// because by the time this is called, gp.waiting has all
@@ -305,6 +319,11 @@ func selectgo(cas0 *scase, order0 *uint16, pc0 *uintptr, nsends, nrecvs int, blo
 
 	// wait for someone to wake us up
 	gp.param = nil
+	// Signal to anyone trying to shrink our stack that we're about
+	// to park on a channel. The window between when this G's status
+	// changes and when we set gp.activeStackChans is not safe for
+	// stack shrinking.
+	atomic.Store8(&gp.parkingOnChan, 1)
 	gopark(selparkcommit, nil, waitReasonSelect, traceEvGoBlockSelect, 1)
 	gp.activeStackChans = false
 
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 613126c708..adcef071da 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -862,6 +862,13 @@ func copystack(gp *g, newsize uintptr) {
 	// Adjust sudogs, synchronizing with channel ops if necessary.
 	ncopy := used
 	if !gp.activeStackChans {
+		if newsize < old.hi-old.lo && atomic.Load8(&gp.parkingOnChan) != 0 {
+			// It's not safe for someone to shrink this stack while we're actively
+			// parking on a channel, but it is safe to grow since we do that
+			// ourselves and explicitly don't want to synchronize with channels
+			// since we could self-deadlock.
+			throw("racy sudog adjustment due to parking on channel")
+		}
 		adjustsudogs(gp, &adjinfo)
 	} else {
 		// sudogs may be pointing in to the stack and gp has
@@ -1105,7 +1112,11 @@ func isShrinkStackSafe(gp *g) bool {
 	// We also can't copy the stack if we're at an asynchronous
 	// safe-point because we don't have precise pointer maps for
 	// all frames.
-	return gp.syscallsp == 0 && !gp.asyncSafePoint
+	//
+	// We also can't *shrink* the stack in the window between the
+	// goroutine calling gopark to park on a channel and
+	// gp.activeStackChans being set.
+	return gp.syscallsp == 0 && !gp.asyncSafePoint && atomic.Load8(&gp.parkingOnChan) == 0
 }
 
 // Maybe shrink the stack being used by gp.
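
The ordering the patch relies on can be summarized as: gp.parkingOnChan is set just before gopark, and it is cleared in the park-commit callback only after gp.activeStackChans has been stored, so a shrinker that observes parkingOnChan == 0 either sees activeStackChans set (and locks the sudog channels first) or finds the goroutine outside the parking path entirely. The sketch below is a minimal user-space model of that ordering using sync/atomic; the gState type, its field names, and the park/shrinkStackSafe helpers are illustrative stand-ins, not the runtime's actual code.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// gState stands in for the two g fields the patch coordinates.
// Both are atomic here purely to keep this model race-free.
type gState struct {
	parkingOnChan    atomic.Bool // analogue of gp.parkingOnChan
	activeStackChans atomic.Bool // analogue of gp.activeStackChans
}

// park models chansend/chanrecv/selectgo plus the park-commit callback:
// open the unsafe window before blocking, publish activeStackChans,
// and only then close the window.
func (g *gState) park() {
	g.parkingOnChan.Store(true)    // window opens: sudogs queued, status about to change
	g.activeStackChans.Store(true) // park commit: stack copiers must lock sudog channels
	g.parkingOnChan.Store(false)   // window closes: shrinkers now observe activeStackChans
	// ... the goroutine blocks, is woken, and clears the flag on return ...
	g.activeStackChans.Store(false)
}

// shrinkStackSafe models the isShrinkStackSafe change: never shrink
// while the goroutine is inside the parking window.
func (g *gState) shrinkStackSafe() bool {
	return !g.parkingOnChan.Load()
}

func main() {
	var g gState
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		for i := 0; i < 1000; i++ {
			g.park()
		}
	}()
	safe, deferred := 0, 0
	for i := 0; i < 1000; i++ {
		if g.shrinkStackSafe() {
			safe++
		} else {
			deferred++
		}
	}
	wg.Wait()
	fmt.Printf("shrink checks: %d safe, %d deferred\n", safe, deferred)
}

In the runtime itself the flag is a plain uint8 updated with runtime/internal/atomic Store8/Load8, and the "shrinker" is the GC's stack scan consulting isShrinkStackSafe before copystack, with the copystack throw acting as a backstop if the race is ever observed anyway.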