1
0
mirror of https://github.com/golang/go synced 2024-11-14 23:00:29 -07:00

[release-branch.go1.15] runtime: disable stack shrinking in activeStackChans race window

Currently activeStackChans is set before a goroutine blocks on a channel
operation in an unlockf passed to gopark. The trouble is that the
unlockf is called *after* the G's status is changed, and the G's status
is what is used by a concurrent mark worker (calling suspendG) to
determine that a G has successfully been suspended. In this window
between the status change and unlockf, the mark worker could try to
shrink the G's stack, and in particular observe that activeStackChans is
false. This observation will cause the mark worker to *not* synchronize
with concurrent channel operations when it should, and so updating
pointers in the sudog for the blocked goroutine (which may point to the
goroutine's stack) races with channel operations which may also
manipulate the pointer (read it, dereference it, update it, etc.).

Fix the problem by adding a new atomically-updated flag to the g struct
called parkingOnChan, which is non-zero in the race window above. Then,
in isShrinkStackSafe, check if parkingOnChan is zero. The race is
resolved like so:

* Blocking G sets parkingOnChan, then changes status in gopark.
* Mark worker successfully suspends blocking G.
* If the mark worker observes parkingOnChan is non-zero when checking
  isShrinkStackSafe, then it's not safe to shrink (we're in the race
  window).
* If the mark worker observes parkingOnChan as zero, then because
  the mark worker observed the G status change, it can be sure that
  gopark's unlockf completed, and gp.activeStackChans will be correct.

The risk of this change is low, since although it reduces the number of
places that stack shrinking is allowed, the window here is incredibly
small. Essentially, every place that it might crash now is replaced with
no shrink.

This change adds a test, but the race window is so small that it's hard
to trigger without a well-placed sleep in park_m. Also, this change
fixes stackGrowRecursive in proc_test.go to actually allocate a 128-byte
stack frame. It turns out the compiler was destructuring the "pad" field
and only allocating one uint64 on the stack.

For #40641.
Fixes #40643.

Change-Id: I7dfbe7d460f6972b8956116b137bc13bc24464e8
Reviewed-on: https://go-review.googlesource.com/c/go/+/247050
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
Trust: Michael Knyszek <mknyszek@google.com>
(cherry picked from commit eb3c6a93c3)
Reviewed-on: https://go-review.googlesource.com/c/go/+/256300
Reviewed-by: Austin Clements <austin@google.com>
This commit is contained in:
Michael Anthony Knyszek 2020-08-10 20:02:22 +00:00 committed by Dmitri Shuralyov
parent 400c68a04d
commit bf79f91d3d
6 changed files with 122 additions and 2 deletions

View File

@ -250,6 +250,11 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
gp.waiting = mysg
gp.param = nil
c.sendq.enqueue(mysg)
// Signal to anyone trying to shrink our stack that we're about
// to park on a channel. The window between when this G's status
// changes and when we set gp.activeStackChans is not safe for
// stack shrinking.
atomic.Store8(&gp.parkingOnChan, 1)
gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanSend, traceEvGoBlockSend, 2)
// Ensure the value being sent is kept alive until the
// receiver copies it out. The sudog has a pointer to the
@ -564,6 +569,11 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
mysg.c = c
gp.param = nil
c.recvq.enqueue(mysg)
// Signal to anyone trying to shrink our stack that we're about
// to park on a channel. The window between when this G's status
// changes and when we set gp.activeStackChans is not safe for
// stack shrinking.
atomic.Store8(&gp.parkingOnChan, 1)
gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanReceive, traceEvGoBlockRecv, 2)
// someone woke us up
@ -641,7 +651,19 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
func chanparkcommit(gp *g, chanLock unsafe.Pointer) bool {
// There are unlocked sudogs that point into gp's stack. Stack
// copying must lock the channels of those sudogs.
// Set activeStackChans here instead of before we try parking
// because we could self-deadlock in stack growth on the
// channel lock.
gp.activeStackChans = true
// Mark that it's safe for stack shrinking to occur now,
// because any thread acquiring this G's stack for shrinking
// is guaranteed to observe activeStackChans after this store.
atomic.Store8(&gp.parkingOnChan, 0)
// Make sure we unlock after setting activeStackChans and
// unsetting parkingOnChan. The moment we unlock chanLock
// we risk gp getting readied by a channel operation and
// so gp could continue running before everything before
// the unlock is visible (even to gp itself).
unlock((*mutex)(chanLock))
return true
}

View File

@ -623,6 +623,62 @@ func TestShrinkStackDuringBlockedSend(t *testing.T) {
<-done
}
func TestNoShrinkStackWhileParking(t *testing.T) {
// The goal of this test is to trigger a "racy sudog adjustment"
// throw. Basically, there's a window between when a goroutine
// becomes available for preemption for stack scanning (and thus,
// stack shrinking) but before the goroutine has fully parked on a
// channel. See issue 40641 for more details on the problem.
//
// The way we try to induce this failure is to set up two
// goroutines: a sender and a reciever that communicate across
// a channel. We try to set up a situation where the sender
// grows its stack temporarily then *fully* blocks on a channel
// often. Meanwhile a GC is triggered so that we try to get a
// mark worker to shrink the sender's stack and race with the
// sender parking.
//
// Unfortunately the race window here is so small that we
// either need a ridiculous number of iterations, or we add
// "usleep(1000)" to park_m, just before the unlockf call.
const n = 10
send := func(c chan<- int, done chan struct{}) {
for i := 0; i < n; i++ {
c <- i
// Use lots of stack briefly so that
// the GC is going to want to shrink us
// when it scans us. Make sure not to
// do any function calls otherwise
// in order to avoid us shrinking ourselves
// when we're preempted.
stackGrowthRecursive(20)
}
done <- struct{}{}
}
recv := func(c <-chan int, done chan struct{}) {
for i := 0; i < n; i++ {
// Sleep here so that the sender always
// fully blocks.
time.Sleep(10 * time.Microsecond)
<-c
}
done <- struct{}{}
}
for i := 0; i < n*20; i++ {
c := make(chan int)
done := make(chan struct{})
go recv(c, done)
go send(c, done)
// Wait a little bit before triggering
// the GC to make sure the sender and
// reciever have gotten into their groove.
time.Sleep(50 * time.Microsecond)
runtime.GC()
<-done
<-done
}
}
func TestSelectDuplicateChannel(t *testing.T) {
// This test makes sure we can queue a G on
// the same channel multiple times.

View File

@ -523,9 +523,17 @@ func BenchmarkPingPongHog(b *testing.B) {
<-done
}
var padData [128]uint64
func stackGrowthRecursive(i int) {
var pad [128]uint64
if i != 0 && pad[0] == 0 {
pad = padData
for j := range pad {
if pad[j] != 0 {
return
}
}
if i != 0 {
stackGrowthRecursive(i - 1)
}
}

View File

@ -447,6 +447,10 @@ type g struct {
// copying needs to acquire channel locks to protect these
// areas of the stack.
activeStackChans bool
// parkingOnChan indicates that the goroutine is about to
// park on a chansend or chanrecv. Used to signal an unsafe point
// for stack shrinking. It's a boolean value, but is updated atomically.
parkingOnChan uint8
raceignore int8 // ignore race detection events
sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine

View File

@ -7,6 +7,7 @@ package runtime
// This file contains the implementation of Go select statements.
import (
"runtime/internal/atomic"
"unsafe"
)
@ -77,7 +78,20 @@ func selunlock(scases []scase, lockorder []uint16) {
func selparkcommit(gp *g, _ unsafe.Pointer) bool {
// There are unlocked sudogs that point into gp's stack. Stack
// copying must lock the channels of those sudogs.
// Set activeStackChans here instead of before we try parking
// because we could self-deadlock in stack growth on a
// channel lock.
gp.activeStackChans = true
// Mark that it's safe for stack shrinking to occur now,
// because any thread acquiring this G's stack for shrinking
// is guaranteed to observe activeStackChans after this store.
atomic.Store8(&gp.parkingOnChan, 0)
// Make sure we unlock after setting activeStackChans and
// unsetting parkingOnChan. The moment we unlock any of the
// channel locks we risk gp getting readied by a channel operation
// and so gp could continue running before everything before the
// unlock is visible (even to gp itself).
// This must not access gp's stack (see gopark). In
// particular, it must not access the *hselect. That's okay,
// because by the time this is called, gp.waiting has all
@ -316,6 +330,11 @@ loop:
// wait for someone to wake us up
gp.param = nil
// Signal to anyone trying to shrink our stack that we're about
// to park on a channel. The window between when this G's status
// changes and when we set gp.activeStackChans is not safe for
// stack shrinking.
atomic.Store8(&gp.parkingOnChan, 1)
gopark(selparkcommit, nil, waitReasonSelect, traceEvGoBlockSelect, 1)
gp.activeStackChans = false

View File

@ -864,6 +864,13 @@ func copystack(gp *g, newsize uintptr) {
// Adjust sudogs, synchronizing with channel ops if necessary.
ncopy := used
if !gp.activeStackChans {
if newsize < old.hi-old.lo && atomic.Load8(&gp.parkingOnChan) != 0 {
// It's not safe for someone to shrink this stack while we're actively
// parking on a channel, but it is safe to grow since we do that
// ourselves and explicitly don't want to synchronize with channels
// since we could self-deadlock.
throw("racy sudog adjustment due to parking on channel")
}
adjustsudogs(gp, &adjinfo)
} else {
// sudogs may be pointing in to the stack and gp has
@ -1103,7 +1110,11 @@ func isShrinkStackSafe(gp *g) bool {
// We also can't copy the stack if we're at an asynchronous
// safe-point because we don't have precise pointer maps for
// all frames.
return gp.syscallsp == 0 && !gp.asyncSafePoint
//
// We also can't *shrink* the stack in the window between the
// goroutine calling gopark to park on a channel and
// gp.activeStackChans being set.
return gp.syscallsp == 0 && !gp.asyncSafePoint && atomic.Load8(&gp.parkingOnChan) == 0
}
// Maybe shrink the stack being used by gp.