From bf79f91d3dc424b2e61d5a48fc6864b8aea65ed3 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 10 Aug 2020 20:02:22 +0000 Subject: [PATCH] [release-branch.go1.15] runtime: disable stack shrinking in activeStackChans race window Currently activeStackChans is set before a goroutine blocks on a channel operation in an unlockf passed to gopark. The trouble is that the unlockf is called *after* the G's status is changed, and the G's status is what is used by a concurrent mark worker (calling suspendG) to determine that a G has successfully been suspended. In this window between the status change and unlockf, the mark worker could try to shrink the G's stack, and in particular observe that activeStackChans is false. This observation will cause the mark worker to *not* synchronize with concurrent channel operations when it should, and so updating pointers in the sudog for the blocked goroutine (which may point to the goroutine's stack) races with channel operations which may also manipulate the pointer (read it, dereference it, update it, etc.). Fix the problem by adding a new atomically-updated flag to the g struct called parkingOnChan, which is non-zero in the race window above. Then, in isShrinkStackSafe, check if parkingOnChan is zero. The race is resolved like so: * Blocking G sets parkingOnChan, then changes status in gopark. * Mark worker successfully suspends blocking G. * If the mark worker observes parkingOnChan is non-zero when checking isShrinkStackSafe, then it's not safe to shrink (we're in the race window). * If the mark worker observes parkingOnChan as zero, then because the mark worker observed the G status change, it can be sure that gopark's unlockf completed, and gp.activeStackChans will be correct. The risk of this change is low, since although it reduces the number of places that stack shrinking is allowed, the window here is incredibly small. Essentially, every place that it might crash now is replaced with no shrink. This change adds a test, but the race window is so small that it's hard to trigger without a well-placed sleep in park_m. Also, this change fixes stackGrowRecursive in proc_test.go to actually allocate a 128-byte stack frame. It turns out the compiler was destructuring the "pad" field and only allocating one uint64 on the stack. For #40641. Fixes #40643. Change-Id: I7dfbe7d460f6972b8956116b137bc13bc24464e8 Reviewed-on: https://go-review.googlesource.com/c/go/+/247050 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt Trust: Michael Knyszek (cherry picked from commit eb3c6a93c3236bbde5dee6cc5bd4ca9f8ab1647a) Reviewed-on: https://go-review.googlesource.com/c/go/+/256300 Reviewed-by: Austin Clements --- src/runtime/chan.go | 22 ++++++++++++++++ src/runtime/chan_test.go | 56 ++++++++++++++++++++++++++++++++++++++++ src/runtime/proc_test.go | 10 ++++++- src/runtime/runtime2.go | 4 +++ src/runtime/select.go | 19 ++++++++++++++ src/runtime/stack.go | 13 +++++++++- 6 files changed, 122 insertions(+), 2 deletions(-) diff --git a/src/runtime/chan.go b/src/runtime/chan.go index f6f4ffd02e..d5daa4b13d 100644 --- a/src/runtime/chan.go +++ b/src/runtime/chan.go @@ -250,6 +250,11 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { gp.waiting = mysg gp.param = nil c.sendq.enqueue(mysg) + // Signal to anyone trying to shrink our stack that we're about + // to park on a channel. The window between when this G's status + // changes and when we set gp.activeStackChans is not safe for + // stack shrinking. + atomic.Store8(&gp.parkingOnChan, 1) gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanSend, traceEvGoBlockSend, 2) // Ensure the value being sent is kept alive until the // receiver copies it out. The sudog has a pointer to the @@ -564,6 +569,11 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) mysg.c = c gp.param = nil c.recvq.enqueue(mysg) + // Signal to anyone trying to shrink our stack that we're about + // to park on a channel. The window between when this G's status + // changes and when we set gp.activeStackChans is not safe for + // stack shrinking. + atomic.Store8(&gp.parkingOnChan, 1) gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanReceive, traceEvGoBlockRecv, 2) // someone woke us up @@ -641,7 +651,19 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { func chanparkcommit(gp *g, chanLock unsafe.Pointer) bool { // There are unlocked sudogs that point into gp's stack. Stack // copying must lock the channels of those sudogs. + // Set activeStackChans here instead of before we try parking + // because we could self-deadlock in stack growth on the + // channel lock. gp.activeStackChans = true + // Mark that it's safe for stack shrinking to occur now, + // because any thread acquiring this G's stack for shrinking + // is guaranteed to observe activeStackChans after this store. + atomic.Store8(&gp.parkingOnChan, 0) + // Make sure we unlock after setting activeStackChans and + // unsetting parkingOnChan. The moment we unlock chanLock + // we risk gp getting readied by a channel operation and + // so gp could continue running before everything before + // the unlock is visible (even to gp itself). unlock((*mutex)(chanLock)) return true } diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go index 039a086e9b..756bbbeccf 100644 --- a/src/runtime/chan_test.go +++ b/src/runtime/chan_test.go @@ -623,6 +623,62 @@ func TestShrinkStackDuringBlockedSend(t *testing.T) { <-done } +func TestNoShrinkStackWhileParking(t *testing.T) { + // The goal of this test is to trigger a "racy sudog adjustment" + // throw. Basically, there's a window between when a goroutine + // becomes available for preemption for stack scanning (and thus, + // stack shrinking) but before the goroutine has fully parked on a + // channel. See issue 40641 for more details on the problem. + // + // The way we try to induce this failure is to set up two + // goroutines: a sender and a reciever that communicate across + // a channel. We try to set up a situation where the sender + // grows its stack temporarily then *fully* blocks on a channel + // often. Meanwhile a GC is triggered so that we try to get a + // mark worker to shrink the sender's stack and race with the + // sender parking. + // + // Unfortunately the race window here is so small that we + // either need a ridiculous number of iterations, or we add + // "usleep(1000)" to park_m, just before the unlockf call. + const n = 10 + send := func(c chan<- int, done chan struct{}) { + for i := 0; i < n; i++ { + c <- i + // Use lots of stack briefly so that + // the GC is going to want to shrink us + // when it scans us. Make sure not to + // do any function calls otherwise + // in order to avoid us shrinking ourselves + // when we're preempted. + stackGrowthRecursive(20) + } + done <- struct{}{} + } + recv := func(c <-chan int, done chan struct{}) { + for i := 0; i < n; i++ { + // Sleep here so that the sender always + // fully blocks. + time.Sleep(10 * time.Microsecond) + <-c + } + done <- struct{}{} + } + for i := 0; i < n*20; i++ { + c := make(chan int) + done := make(chan struct{}) + go recv(c, done) + go send(c, done) + // Wait a little bit before triggering + // the GC to make sure the sender and + // reciever have gotten into their groove. + time.Sleep(50 * time.Microsecond) + runtime.GC() + <-done + <-done + } +} + func TestSelectDuplicateChannel(t *testing.T) { // This test makes sure we can queue a G on // the same channel multiple times. diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index de4dec36ce..767bde15b4 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -523,9 +523,17 @@ func BenchmarkPingPongHog(b *testing.B) { <-done } +var padData [128]uint64 + func stackGrowthRecursive(i int) { var pad [128]uint64 - if i != 0 && pad[0] == 0 { + pad = padData + for j := range pad { + if pad[j] != 0 { + return + } + } + if i != 0 { stackGrowthRecursive(i - 1) } } diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index cffdb0bf27..0d0cac240b 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -447,6 +447,10 @@ type g struct { // copying needs to acquire channel locks to protect these // areas of the stack. activeStackChans bool + // parkingOnChan indicates that the goroutine is about to + // park on a chansend or chanrecv. Used to signal an unsafe point + // for stack shrinking. It's a boolean value, but is updated atomically. + parkingOnChan uint8 raceignore int8 // ignore race detection events sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine diff --git a/src/runtime/select.go b/src/runtime/select.go index a069e3e050..69d255712a 100644 --- a/src/runtime/select.go +++ b/src/runtime/select.go @@ -7,6 +7,7 @@ package runtime // This file contains the implementation of Go select statements. import ( + "runtime/internal/atomic" "unsafe" ) @@ -77,7 +78,20 @@ func selunlock(scases []scase, lockorder []uint16) { func selparkcommit(gp *g, _ unsafe.Pointer) bool { // There are unlocked sudogs that point into gp's stack. Stack // copying must lock the channels of those sudogs. + // Set activeStackChans here instead of before we try parking + // because we could self-deadlock in stack growth on a + // channel lock. gp.activeStackChans = true + // Mark that it's safe for stack shrinking to occur now, + // because any thread acquiring this G's stack for shrinking + // is guaranteed to observe activeStackChans after this store. + atomic.Store8(&gp.parkingOnChan, 0) + // Make sure we unlock after setting activeStackChans and + // unsetting parkingOnChan. The moment we unlock any of the + // channel locks we risk gp getting readied by a channel operation + // and so gp could continue running before everything before the + // unlock is visible (even to gp itself). + // This must not access gp's stack (see gopark). In // particular, it must not access the *hselect. That's okay, // because by the time this is called, gp.waiting has all @@ -316,6 +330,11 @@ loop: // wait for someone to wake us up gp.param = nil + // Signal to anyone trying to shrink our stack that we're about + // to park on a channel. The window between when this G's status + // changes and when we set gp.activeStackChans is not safe for + // stack shrinking. + atomic.Store8(&gp.parkingOnChan, 1) gopark(selparkcommit, nil, waitReasonSelect, traceEvGoBlockSelect, 1) gp.activeStackChans = false diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 0e930f60db..f38ba56c80 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -864,6 +864,13 @@ func copystack(gp *g, newsize uintptr) { // Adjust sudogs, synchronizing with channel ops if necessary. ncopy := used if !gp.activeStackChans { + if newsize < old.hi-old.lo && atomic.Load8(&gp.parkingOnChan) != 0 { + // It's not safe for someone to shrink this stack while we're actively + // parking on a channel, but it is safe to grow since we do that + // ourselves and explicitly don't want to synchronize with channels + // since we could self-deadlock. + throw("racy sudog adjustment due to parking on channel") + } adjustsudogs(gp, &adjinfo) } else { // sudogs may be pointing in to the stack and gp has @@ -1103,7 +1110,11 @@ func isShrinkStackSafe(gp *g) bool { // We also can't copy the stack if we're at an asynchronous // safe-point because we don't have precise pointer maps for // all frames. - return gp.syscallsp == 0 && !gp.asyncSafePoint + // + // We also can't *shrink* the stack in the window between the + // goroutine calling gopark to park on a channel and + // gp.activeStackChans being set. + return gp.syscallsp == 0 && !gp.asyncSafePoint && atomic.Load8(&gp.parkingOnChan) == 0 } // Maybe shrink the stack being used by gp.