// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

var buildVersion = sys.TheVersion

// Goroutine scheduler
// The scheduler's job is to distribute ready-to-run goroutines over worker threads.
//
// The main concepts are:
// G - goroutine.
// M - worker thread, or machine.
// P - processor, a resource that is required to execute Go code.
//     An M must have an associated P to execute Go code; however, it can be
//     blocked or in a syscall without an associated P.
//
// Design doc at https://golang.org/s/go11sched.

// Worker thread parking/unparking.
// We need to balance between keeping enough running worker threads to utilize
// available hardware parallelism and parking excessive running worker threads
// to conserve CPU resources and power. This is not simple for two reasons:
// (1) scheduler state is intentionally distributed (in particular, per-P work
// queues), so it is not possible to compute global predicates on fast paths;
// (2) for optimal thread management we would need to know the future (don't park
// a worker thread when a new goroutine will be readied in the near future).
//
// Three rejected approaches that would work badly:
// 1. Centralize all scheduler state (would inhibit scalability).
// 2. Direct goroutine handoff. That is, when we ready a new goroutine and there
//    is a spare P, unpark a thread and hand it the P and the goroutine.
//    This would lead to thread state thrashing, as the thread that readied the
//    goroutine can be out of work the very next moment, and we will then need to park it.
//    Also, it would destroy locality of computation, since we want to keep
//    dependent goroutines on the same thread, and it would introduce additional latency.
// 3. Unpark an additional thread whenever we ready a goroutine and there is an
//    idle P, but don't do handoff. This would lead to excessive thread parking/
//    unparking as the additional threads will instantly park without discovering
//    any work to do.
//
// The current approach:
// We unpark an additional thread when we ready a goroutine if (1) there is an
// idle P and (2) there are no "spinning" worker threads. A worker thread is considered
// spinning if it is out of local work and did not find work in the global run queue/
// netpoller; the spinning state is denoted in m.spinning and in sched.nmspinning.
// Threads unparked this way are also considered spinning; we don't do goroutine
// handoff so such threads are out of work initially. Spinning threads do some
// spinning looking for work in per-P run queues before parking. If a spinning
// thread finds work it takes itself out of the spinning state and proceeds to
// execution. If it does not find work it takes itself out of the spinning state
// and then parks.
// If there is at least one spinning thread (sched.nmspinning>0), we don't unpark
// new threads when readying goroutines. To compensate for that, if the last spinning
// thread finds work and stops spinning, it must unpark a new spinning thread.
// This approach smooths out unjustified spikes of thread unparking,
// but at the same time guarantees eventual maximal CPU parallelism utilization.
//
// The main implementation complication is that we need to be very careful during
// spinning->non-spinning thread transition. This transition can race with submission
// of a new goroutine, and either one part or another needs to unpark another worker
// thread. If they both fail to do that, we can end up with semi-persistent CPU
// underutilization. The general pattern for goroutine readying is: submit a goroutine
// to the local work queue, #StoreLoad-style memory barrier, check sched.nmspinning.
// The general pattern for the spinning->non-spinning transition is: decrement nmspinning,
// #StoreLoad-style memory barrier, check all per-P work queues for new work.
// Note that all this complexity does not apply to the global run queue as we are not
// sloppy about thread unparking when submitting to the global queue. Also see comments
// for nmspinning manipulation.
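//
// An illustrative sketch of the two racing code paths described above (not the
// actual runtime code; publish and checkAllPRunQueues are hypothetical helpers
// standing in for the real runqput/runqsteal machinery):
//
//	// Readying a goroutine:
//	publish(gp)                      // submit gp to the local work queue
//	// #StoreLoad-style memory barrier
//	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
//		wakep()                  // unpark a new spinning M
//	}
//
//	// A spinning M giving up:
//	atomic.Xadd(&sched.nmspinning, -1)
//	// #StoreLoad-style memory barrier
//	if checkAllPRunQueues() {
//		// Work appeared after all; keep running (or unpark another spinning M).
//	}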

var (
	m0           m
	g0           g
	raceprocctx0 uintptr
)

//go:linkname runtime_init runtime.init
func runtime_init()

//go:linkname main_init main.init
func main_init()

// main_init_done is a signal used by cgocallbackg that initialization
// has been completed. It is made before _cgo_notify_runtime_init_done,
// so all cgo calls can rely on it existing. When main_init is complete,
// it is closed, meaning cgocallbackg can reliably receive from it.
var main_init_done chan bool
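
// For illustration only (not runtime code): because receiving from a closed
// channel never blocks, a cgo callback path can simply do
//
//	<-main_init_done
//
// to wait until main's init has finished; once runtime.main closes the channel,
// every such receive returns immediately.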

//go:linkname main_main main.main
func main_main()

// runtimeInitTime is the nanotime() at which the runtime started.
var runtimeInitTime int64

// Value to use for signal mask for newly created M's.
var initSigmask sigset

// The main goroutine.
func main() {
	g := getg()

	// Racectx of m0->g0 is used only as the parent of the main goroutine.
	// It must not be used for anything else.
	g.m.g0.racectx = 0

	// Max stack size is 1 GB on 64-bit, 250 MB on 32-bit.
	// Using decimal instead of binary GB and MB because
	// they look nicer in the stack overflow failure message.
	if sys.PtrSize == 8 {
		maxstacksize = 1000000000
	} else {
		maxstacksize = 250000000
	}

	// Record when the world started.
	runtimeInitTime = nanotime()

	systemstack(func() {
		newm(sysmon, nil)
	})

	// Lock the main goroutine onto this, the main OS thread,
	// during initialization. Most programs won't care, but a few
	// do require certain calls to be made by the main thread.
	// Those can arrange for main.main to run in the main thread
	// by calling runtime.LockOSThread during initialization
	// to preserve the lock.
	lockOSThread()

	if g.m != &m0 {
		throw("runtime.main not on m0")
	}

	runtime_init() // must be before defer

	// Defer unlock so that runtime.Goexit during init does the unlock too.
	needUnlock := true
	defer func() {
		if needUnlock {
			unlockOSThread()
		}
	}()

	gcenable()

	main_init_done = make(chan bool)
	if iscgo {
		if _cgo_thread_start == nil {
			throw("_cgo_thread_start missing")
		}
		if GOOS != "windows" {
			if _cgo_setenv == nil {
				throw("_cgo_setenv missing")
			}
			if _cgo_unsetenv == nil {
				throw("_cgo_unsetenv missing")
			}
		}
		if _cgo_notify_runtime_init_done == nil {
			throw("_cgo_notify_runtime_init_done missing")
		}
		cgocall(_cgo_notify_runtime_init_done, nil)
	}

	fn := main_init // make an indirect call, as the linker doesn't know the address of the main package when laying down the runtime
	fn()
	close(main_init_done)

	needUnlock = false
	unlockOSThread()

	if isarchive || islibrary {
		// A program compiled with -buildmode=c-archive or c-shared
		// has a main, but it is not executed.
		return
	}
	fn = main_main // make an indirect call, as the linker doesn't know the address of the main package when laying down the runtime
	fn()
	if raceenabled {
		racefini()
	}

	// Make racy client program work: if panicking on
	// another goroutine at the same time as main returns,
	// let the other goroutine finish printing the panic trace.
	// Once it does, it will exit. See issue 3934.
	if panicking != 0 {
		gopark(nil, nil, "panicwait", traceEvGoStop, 1)
	}

	exit(0)
	for {
		var x *int32
		*x = 0
	}
}

// os_beforeExit is called from os.Exit(0).
//go:linkname os_beforeExit os.runtime_beforeExit
func os_beforeExit() {
	if raceenabled {
		racefini()
	}
}

// start forcegc helper goroutine
func init() {
	go forcegchelper()
}

func forcegchelper() {
	forcegc.g = getg()
	for {
		lock(&forcegc.lock)
		if forcegc.idle != 0 {
			throw("forcegc: phase error")
		}
		atomic.Store(&forcegc.idle, 1)
		goparkunlock(&forcegc.lock, "force gc (idle)", traceEvGoBlock, 1)
		// this goroutine is explicitly resumed by sysmon
		if debug.gctrace > 0 {
			println("GC forced")
		}
		gcStart(gcBackgroundMode, true)
	}
}

//go:nosplit

// Gosched yields the processor, allowing other goroutines to run. It does not
// suspend the current goroutine, so execution resumes automatically.
func Gosched() {
	mcall(gosched_m)
}
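
// A minimal user-level example of Gosched (illustrative only, not part of the
// runtime): a tight polling loop can yield the processor so other goroutines
// get a chance to run even without a blocking operation. done is a
// hypothetical polling function.
//
//	for !done() {
//		runtime.Gosched()
//	}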

var alwaysFalse bool

// goschedguarded does nothing, but is written in a way that guarantees a preemption check in its prologue.
// Calls to this function are inserted by the compiler in otherwise uninterruptible loops (see insertLoopReschedChecks).
func goschedguarded() {
	if alwaysFalse {
		goschedguarded()
	}
}
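
// Conceptually (an illustrative sketch, not literally what the compiler
// emits), a loop with no calls in its body, such as
//
//	for i := 0; i < n; i++ {
//		sum += a[i]
//	}
//
// gains a call on its backedge, so the callee's prologue stack check provides
// a preemption point:
//
//	for i := 0; i < n; i++ {
//		sum += a[i]
//		goschedguarded()
//	}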

// Puts the current goroutine into a waiting state and calls unlockf.
// If unlockf returns false, the goroutine is resumed.
// unlockf must not access this G's stack, as it may be moved between
// the call to gopark and the call to unlockf.
func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason string, traceEv byte, traceskip int) {
	mp := acquirem()
	gp := mp.curg
	status := readgstatus(gp)
	if status != _Grunning && status != _Gscanrunning {
		throw("gopark: bad g status")
	}
	mp.waitlock = lock
	mp.waitunlockf = *(*unsafe.Pointer)(unsafe.Pointer(&unlockf))
	gp.waitreason = reason
	mp.waittraceev = traceEv
	mp.waittraceskip = traceskip
	releasem(mp)
	// can't do anything that might move the G between Ms here.
	mcall(park_m)
}

// Puts the current goroutine into a waiting state and unlocks the lock.
// The goroutine can be made runnable again by calling goready(gp).
func goparkunlock(lock *mutex, reason string, traceEv byte, traceskip int) {
	gopark(parkunlock_c, unsafe.Pointer(lock), reason, traceEv, traceskip)
}
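
// The usual park/ready pairing, as an illustrative sketch only (forcegchelper
// and sysmon follow the same shape); l and waiterG are hypothetical:
//
//	// waiter
//	lock(&l)
//	waiterG = getg()
//	goparkunlock(&l, "example wait", traceEvGoBlock, 1)
//
//	// waker
//	lock(&l)
//	gp := waiterG
//	waiterG = nil
//	unlock(&l)
//	goready(gp, 1)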

func goready(gp *g, traceskip int) {
	systemstack(func() {
		ready(gp, traceskip, true)
	})
}

//go:nosplit
func acquireSudog() *sudog {
	// Delicate dance: the semaphore implementation calls
	// acquireSudog, acquireSudog calls new(sudog),
	// new calls malloc, malloc can call the garbage collector,
	// and the garbage collector calls the semaphore implementation
	// in stopTheWorld.
	// Break the cycle by doing acquirem/releasem around new(sudog).
	// The acquirem/releasem increments m.locks during new(sudog),
	// which keeps the garbage collector from being invoked.
	mp := acquirem()
	pp := mp.p.ptr()
	if len(pp.sudogcache) == 0 {
		lock(&sched.sudoglock)
		// First, try to grab a batch from central cache.
		for len(pp.sudogcache) < cap(pp.sudogcache)/2 && sched.sudogcache != nil {
			s := sched.sudogcache
			sched.sudogcache = s.next
			s.next = nil
			pp.sudogcache = append(pp.sudogcache, s)
		}
		unlock(&sched.sudoglock)
		// If the central cache is empty, allocate a new one.
		if len(pp.sudogcache) == 0 {
			pp.sudogcache = append(pp.sudogcache, new(sudog))
		}
	}
	n := len(pp.sudogcache)
	s := pp.sudogcache[n-1]
	pp.sudogcache[n-1] = nil
	pp.sudogcache = pp.sudogcache[:n-1]
	if s.elem != nil {
		throw("acquireSudog: found s.elem != nil in cache")
	}
	releasem(mp)
	return s
}

//go:nosplit
func releaseSudog(s *sudog) {
	if s.elem != nil {
		throw("runtime: sudog with non-nil elem")
	}
	if s.selectdone != nil {
		throw("runtime: sudog with non-nil selectdone")
	}
	if s.next != nil {
		throw("runtime: sudog with non-nil next")
	}
	if s.prev != nil {
		throw("runtime: sudog with non-nil prev")
	}
	if s.waitlink != nil {
		throw("runtime: sudog with non-nil waitlink")
	}
	if s.c != nil {
		throw("runtime: sudog with non-nil c")
	}
	gp := getg()
	if gp.param != nil {
		throw("runtime: releaseSudog with non-nil gp.param")
	}
	mp := acquirem() // avoid rescheduling to another P
	pp := mp.p.ptr()
	if len(pp.sudogcache) == cap(pp.sudogcache) {
		// Transfer half of local cache to the central cache.
		var first, last *sudog
		for len(pp.sudogcache) > cap(pp.sudogcache)/2 {
			n := len(pp.sudogcache)
			p := pp.sudogcache[n-1]
			pp.sudogcache[n-1] = nil
			pp.sudogcache = pp.sudogcache[:n-1]
			if first == nil {
				first = p
			} else {
				last.next = p
			}
			last = p
		}
		lock(&sched.sudoglock)
		last.next = sched.sudogcache
		sched.sudogcache = first
		unlock(&sched.sudoglock)
	}
	pp.sudogcache = append(pp.sudogcache, s)
	releasem(mp)
}
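
// Typical usage of the sudog cache by channel and semaphore code, as an
// illustrative sketch only (the real callers also fill in the wait-list links
// before blocking); v is a hypothetical value being sent or received:
//
//	s := acquireSudog()
//	s.g = getg()
//	s.elem = unsafe.Pointer(&v)
//	// ... enqueue s on the channel or semaphore wait list and park ...
//	s.elem = nil // must be cleared before the sudog is returned
//	releaseSudog(s)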

// funcPC returns the entry PC of the function f.
// It assumes that f is a func value. Otherwise the behavior is undefined.
//go:nosplit
func funcPC(f interface{}) uintptr {
	return **(**uintptr)(add(unsafe.Pointer(&f), sys.PtrSize))
}
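
// Note on the expression above (explanatory only, nothing here is new API):
// the interface value f holds a type word and a data word; the data word holds
// the func value itself, which is a pointer to a funcval whose first word is
// the entry PC. add(unsafe.Pointer(&f), sys.PtrSize) addresses the data word,
// and the double dereference then yields that PC. Typical use elsewhere in the
// runtime looks like, for example,
//
//	pc := funcPC(goexit)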

// called from assembly
func badmcall(fn func(*g)) {
	throw("runtime: mcall called on m->g0 stack")
}

func badmcall2(fn func(*g)) {
	throw("runtime: mcall function returned")
}

func badreflectcall() {
	panic(plainError("arg size to reflect.call more than 1GB"))
}

var badmorestackg0Msg = "fatal: morestack on g0\n"

//go:nosplit
//go:nowritebarrierrec
func badmorestackg0() {
	sp := stringStructOf(&badmorestackg0Msg)
	write(2, sp.str, int32(sp.len))
}

var badmorestackgsignalMsg = "fatal: morestack on gsignal\n"

//go:nosplit
//go:nowritebarrierrec
func badmorestackgsignal() {
	sp := stringStructOf(&badmorestackgsignalMsg)
	write(2, sp.str, int32(sp.len))
}

//go:nosplit
func badctxt() {
	throw("ctxt != 0")
}

func lockedOSThread() bool {
	gp := getg()
	return gp.lockedm != nil && gp.m.lockedg != nil
}

var (
	allgs    []*g
	allglock mutex
)

func allgadd(gp *g) {
	if readgstatus(gp) == _Gidle {
		throw("allgadd: bad status Gidle")
	}

	lock(&allglock)
	allgs = append(allgs, gp)
	allglen = uintptr(len(allgs))

	// Grow GC rescan list if necessary.
	if len(allgs) > cap(work.rescan.list) {
		lock(&work.rescan.lock)
		l := work.rescan.list
		// Let append do the heavy lifting, but keep the
		// length the same.
		work.rescan.list = append(l[:cap(l)], 0)[:len(l)]
		unlock(&work.rescan.lock)
	}
	unlock(&allglock)
}

const (
	// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
	// 16 seems to provide enough amortization, but other than that it's a mostly arbitrary number.
	_GoidCacheBatch = 16
)
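
// How the batch is consumed (an illustrative sketch, close to but not verbatim
// the newproc path): a P refills its private goid window from sched.goidgen
// with one atomic add per _GoidCacheBatch goroutines, then hands out ids
// without further synchronization.
//
//	if _p_.goidcache == _p_.goidcacheend {
//		_p_.goidcache = atomic.Xadd64(&sched.goidgen, _GoidCacheBatch)
//		_p_.goidcache -= _GoidCacheBatch - 1
//		_p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
//	}
//	newg.goid = int64(_p_.goidcache)
//	_p_.goidcache++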

// The bootstrap sequence is:
//
//	call osinit
//	call schedinit
//	make & queue new G
//	call runtime·mstart
//
// The new G calls runtime·main.
func schedinit() {
	// raceinit must be the first call to race detector.
	// In particular, it must be done before mallocinit below calls racemapshadow.
	_g_ := getg()
	if raceenabled {
		_g_.racectx, raceprocctx0 = raceinit()
	}

	sched.maxmcount = 10000

	tracebackinit()
	moduledataverify()
	stackinit()
	mallocinit()
	mcommoninit(_g_.m)
	alginit()       // maps must not be used before this call
	modulesinit()   // provides activeModules
	typelinksinit() // uses maps, activeModules
	itabsinit()     // uses activeModules

	msigsave(_g_.m)
	initSigmask = _g_.m.sigmask

	goargs()
	goenvs()
	parsedebugvars()
	gcinit()

	sched.lastpoll = uint64(nanotime())
	procs := ncpu
	if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 {
		procs = n
	}
	if procs > _MaxGomaxprocs {
		procs = _MaxGomaxprocs
	}
	if procresize(procs) != nil {
		throw("unknown runnable goroutine during bootstrap")
	}

	if buildVersion == "" {
		// Condition should never trigger. This code just serves
		// to ensure runtime·buildVersion is kept in the resulting binary.
		buildVersion = "unknown"
	}
}

func dumpgstatus(gp *g) {
	_g_ := getg()
	print("runtime: gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
	print("runtime: g: g=", _g_, ", goid=", _g_.goid, ", g->atomicstatus=", readgstatus(_g_), "\n")
}

func checkmcount() {
	// sched lock is held
	if sched.mcount > sched.maxmcount {
		print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n")
		throw("thread exhaustion")
	}
}

func mcommoninit(mp *m) {
	_g_ := getg()

	// g0 stack won't make sense for user (and is not necessarily unwindable).
	if _g_ != _g_.m.g0 {
		callers(1, mp.createstack[:])
	}

	mp.fastrand = 0x49f6428a + uint32(mp.id) + uint32(cputicks())
	if mp.fastrand == 0 {
		mp.fastrand = 0x49f6428a
	}

	lock(&sched.lock)
	mp.id = sched.mcount
	sched.mcount++
	checkmcount()
	mpreinit(mp)
	if mp.gsignal != nil {
		mp.gsignal.stackguard1 = mp.gsignal.stack.lo + _StackGuard
	}

	// Add to allm so garbage collector doesn't free g->m
	// when it is just in a register or thread-local storage.
	mp.alllink = allm

	// NumCgoCall() iterates over allm w/o schedlock,
	// so we need to publish it safely.
	atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
	unlock(&sched.lock)

	// Allocate memory to hold a cgo traceback if the cgo call crashes.
	if iscgo || GOOS == "solaris" || GOOS == "windows" {
		mp.cgoCallers = new(cgoCallers)
	}
}

// Mark gp ready to run.
func ready(gp *g, traceskip int, next bool) {
	if trace.enabled {
		traceGoUnpark(gp, traceskip)
	}

	status := readgstatus(gp)

	// Mark runnable.
	_g_ := getg()
	_g_.m.locks++ // disable preemption because it can be holding p in a local var
	if status&^_Gscan != _Gwaiting {
		dumpgstatus(gp)
		throw("bad g->status in ready")
	}

	// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
	casgstatus(gp, _Gwaiting, _Grunnable)
	runqput(_g_.m.p.ptr(), gp, next)
	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
		wakep()
	}
	_g_.m.locks--
	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
		_g_.stackguard0 = stackPreempt
	}
}
|
|
|
|
|
|
|
|
func gcprocs() int32 {
|
|
|
|
// Figure out how many CPUs to use during GC.
|
|
|
|
// Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
|
|
|
|
lock(&sched.lock)
|
|
|
|
n := gomaxprocs
|
|
|
|
if n > ncpu {
|
|
|
|
n = ncpu
|
|
|
|
}
|
|
|
|
if n > _MaxGcproc {
|
|
|
|
n = _MaxGcproc
|
|
|
|
}
|
|
|
|
if n > sched.nmidle+1 { // one M is currently running
|
|
|
|
n = sched.nmidle + 1
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
func needaddgcproc() bool {
|
|
|
|
lock(&sched.lock)
|
|
|
|
n := gomaxprocs
|
|
|
|
if n > ncpu {
|
|
|
|
n = ncpu
|
|
|
|
}
|
|
|
|
if n > _MaxGcproc {
|
|
|
|
n = _MaxGcproc
|
|
|
|
}
|
|
|
|
n -= sched.nmidle + 1 // one M is currently running
|
|
|
|
unlock(&sched.lock)
|
|
|
|
return n > 0
|
|
|
|
}
|
|
|
|
|
|
|
|
func helpgc(nproc int32) {
|
|
|
|
_g_ := getg()
|
|
|
|
lock(&sched.lock)
|
|
|
|
pos := 0
|
|
|
|
for n := int32(1); n < nproc; n++ { // one M is currently running
|
|
|
|
if allp[pos].mcache == _g_.m.mcache {
|
|
|
|
pos++
|
|
|
|
}
|
|
|
|
mp := mget()
|
|
|
|
if mp == nil {
|
|
|
|
throw("gcprocs inconsistency")
|
|
|
|
}
|
|
|
|
mp.helpgc = n
|
|
|
|
mp.p.set(allp[pos])
|
|
|
|
mp.mcache = allp[pos].mcache
|
|
|
|
pos++
|
|
|
|
notewakeup(&mp.park)
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// freezeStopWait is a large value that freezetheworld sets
|
|
|
|
// sched.stopwait to in order to request that all Gs permanently stop.
|
|
|
|
const freezeStopWait = 0x7fffffff
|
|
|
|
|
2016-12-19 20:43:38 -07:00
|
|
|
// freezing is set to non-zero if the runtime is trying to freeze the
|
|
|
|
// world.
|
|
|
|
var freezing uint32
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
// Similar to stopTheWorld but best-effort and can be called several times.
|
|
|
|
// There is no reverse operation, used during crashing.
|
|
|
|
// This function must not lock any mutexes.
|
|
|
|
func freezetheworld() {
|
2016-12-19 20:43:38 -07:00
|
|
|
atomic.Store(&freezing, 1)
|
2015-10-18 18:04:05 -06:00
|
|
|
// stopwait and preemption requests can be lost
|
|
|
|
// due to races with concurrently executing threads,
|
|
|
|
// so try several times
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
// this should tell the scheduler to not start any new goroutines
|
|
|
|
sched.stopwait = freezeStopWait
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store(&sched.gcwaiting, 1)
|
2015-10-18 18:04:05 -06:00
|
|
|
// this should stop running goroutines
|
|
|
|
if !preemptall() {
|
|
|
|
break // no running goroutines
|
|
|
|
}
|
|
|
|
usleep(1000)
|
|
|
|
}
|
|
|
|
// to be sure
|
|
|
|
usleep(1000)
|
|
|
|
preemptall()
|
|
|
|
usleep(1000)
|
|
|
|
}
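// A minimal illustrative sketch of the shape of freezetheworld above: the
// stop and preemption requests are best-effort and can be lost to races, so
// it sets the request, pokes every thread, sleeps briefly, and repeats a few
// times. exampleBestEffortStop is a hypothetical helper, not the runtime's API.
func exampleBestEffortStop(request func(), anyRunning func() bool) {
	for i := 0; i < 5; i++ {
		request()
		if !anyRunning() {
			break
		}
		usleep(1000)
	}
}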
|
|
|
|
|
|
|
|
func isscanstatus(status uint32) bool {
|
|
|
|
if status == _Gscan {
|
|
|
|
throw("isscanstatus: Bad status Gscan")
|
|
|
|
}
|
|
|
|
return status&_Gscan == _Gscan
|
|
|
|
}
|
|
|
|
|
|
|
|
// All reads and writes of g's status go through readgstatus, casgstatus,
|
|
|
|
// castogscanstatus, casfrom_Gscanstatus.
|
|
|
|
//go:nosplit
|
|
|
|
func readgstatus(gp *g) uint32 {
|
2015-11-02 12:09:24 -07:00
|
|
|
return atomic.Load(&gp.atomicstatus)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
2016-04-18 16:28:36 -06:00
|
|
|
// Ownership of gcscanvalid:
|
2015-10-18 18:04:05 -06:00
|
|
|
//
|
|
|
|
// If gp is running (meaning status == _Grunning or _Grunning|_Gscan),
|
2016-04-18 16:28:36 -06:00
|
|
|
// then gp owns gp.gcscanvalid, and other goroutines must not modify it.
|
2015-10-18 18:04:05 -06:00
|
|
|
//
|
|
|
|
// Otherwise, a second goroutine can lock the scan state by setting _Gscan
|
2016-04-18 16:28:36 -06:00
|
|
|
// in the status bit and then modify gcscanvalid, and then unlock the scan state.
|
2015-10-18 18:04:05 -06:00
|
|
|
//
|
|
|
|
// Note that the first condition implies an exception to the second:
|
|
|
|
// if a second goroutine changes gp's status to _Grunning|_Gscan,
|
2016-04-18 16:28:36 -06:00
|
|
|
// that second goroutine still does not have the right to modify gcscanvalid.
|
2015-10-18 18:04:05 -06:00
|
|
|
|
|
|
|
// The Gscanstatuses are acting like locks and this releases them.
|
|
|
|
// If it proves to be a performance hit we should be able to make these
|
|
|
|
// simple atomic stores but for now we are going to throw if
|
|
|
|
// we see an inconsistent state.
|
|
|
|
func casfrom_Gscanstatus(gp *g, oldval, newval uint32) {
|
|
|
|
success := false
|
|
|
|
|
|
|
|
// Check that transition is valid.
|
|
|
|
switch oldval {
|
|
|
|
default:
|
|
|
|
print("runtime: casfrom_Gscanstatus bad oldval gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
|
|
|
|
dumpgstatus(gp)
|
|
|
|
throw("casfrom_Gscanstatus:top gp->status is not in scan state")
|
|
|
|
case _Gscanrunnable,
|
|
|
|
_Gscanwaiting,
|
|
|
|
_Gscanrunning,
|
|
|
|
_Gscansyscall:
|
|
|
|
if newval == oldval&^_Gscan {
|
2015-11-02 12:09:24 -07:00
|
|
|
success = atomic.Cas(&gp.atomicstatus, oldval, newval)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if !success {
|
|
|
|
print("runtime: casfrom_Gscanstatus failed gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
|
|
|
|
dumpgstatus(gp)
|
|
|
|
throw("casfrom_Gscanstatus: gp->status is not in scan state")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// This will return false if the gp is not in the expected status and the cas fails.
|
|
|
|
// This acts like a lock acquire while the casfromgstatus acts like a lock release.
|
|
|
|
func castogscanstatus(gp *g, oldval, newval uint32) bool {
|
|
|
|
switch oldval {
|
|
|
|
case _Grunnable,
|
2016-02-18 07:38:49 -07:00
|
|
|
_Grunning,
|
2015-10-18 18:04:05 -06:00
|
|
|
_Gwaiting,
|
|
|
|
_Gsyscall:
|
|
|
|
if newval == oldval|_Gscan {
|
2015-11-02 12:09:24 -07:00
|
|
|
return atomic.Cas(&gp.atomicstatus, oldval, newval)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n")
|
|
|
|
throw("castogscanstatus")
|
|
|
|
panic("not reached")
|
|
|
|
}
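// A minimal illustrative sketch of the acquire/release pairing of the two
// helpers above: castogscanstatus locks a g's status by setting the _Gscan
// bit and casfrom_Gscanstatus releases it. exampleWithScanLock is a
// hypothetical helper; real callers such as scang must also retry when the
// status changes between the read and the CAS.
func exampleWithScanLock(gp *g, body func()) bool {
	status := readgstatus(gp) &^ _Gscan
	switch status {
	case _Grunnable, _Grunning, _Gwaiting, _Gsyscall:
		// these are the only states castogscanstatus accepts
	default:
		return false
	}
	if !castogscanstatus(gp, status, status|_Gscan) {
		return false // status changed underneath us; the caller should retry
	}
	body() // gp's status is held in a _Gscan state here
	casfrom_Gscanstatus(gp, status|_Gscan, status)
	return true
}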
|
|
|
|
|
|
|
|
// If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus
|
|
|
|
// and casfrom_Gscanstatus instead.
|
|
|
|
// casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that
|
|
|
|
// put it in the Gscan state is finished.
|
|
|
|
//go:nosplit
|
|
|
|
func casgstatus(gp *g, oldval, newval uint32) {
|
|
|
|
if (oldval&_Gscan != 0) || (newval&_Gscan != 0) || oldval == newval {
|
|
|
|
systemstack(func() {
|
|
|
|
print("runtime: casgstatus: oldval=", hex(oldval), " newval=", hex(newval), "\n")
|
|
|
|
throw("casgstatus: bad incoming values")
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
if oldval == _Grunning && gp.gcscanvalid {
|
|
|
|
// If oldval == _Grunning, then the actual status must be
|
|
|
|
// _Grunning or _Grunning|_Gscan; either way,
|
|
|
|
// we own gp.gcscanvalid, so it's safe to read.
|
|
|
|
// gp.gcscanvalid must not be true when we are running.
|
|
|
|
print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n")
|
|
|
|
throw("casgstatus")
|
|
|
|
}
|
|
|
|
|
runtime: don't burn CPU unnecessarily
Two GC-related functions, scang and casgstatus, wait in an active spin loop.
Active spinning is never a good idea in user-space. Once we wait several
times more than the expected wait time, something unexpected is happening
(e.g. the thread we are waiting for is descheduled or handling a page fault)
and we need to yield to OS scheduler. Moreover, the expected wait time is
very high for these functions: scang wait time can be tens of milliseconds,
casgstatus can be hundreds of microseconds. It does not make sense to spin
even for that time.
go install -a std profile on a 4-core machine shows that 11% of time is spent
in the active spin in scang:
6.12% compile compile [.] runtime.scang
3.27% compile compile [.] runtime.readgstatus
1.72% compile compile [.] runtime/internal/atomic.Load
The active spin also increases tail latency in the case of the slightest
oversubscription: GC goroutines spend whole quantum in the loop instead of
executing user code.
Here is scang wait time histogram during go install -a std:
13707.0000 - 1815442.7667 [ 118]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎...
1815442.7667 - 3617178.5333 [ 9]: ∎∎∎∎∎∎∎∎∎
3617178.5333 - 5418914.3000 [ 11]: ∎∎∎∎∎∎∎∎∎∎∎
5418914.3000 - 7220650.0667 [ 5]: ∎∎∎∎∎
7220650.0667 - 9022385.8333 [ 12]: ∎∎∎∎∎∎∎∎∎∎∎∎
9022385.8333 - 10824121.6000 [ 13]: ∎∎∎∎∎∎∎∎∎∎∎∎∎
10824121.6000 - 12625857.3667 [ 15]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
12625857.3667 - 14427593.1333 [ 18]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
14427593.1333 - 16229328.9000 [ 18]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
16229328.9000 - 18031064.6667 [ 32]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
18031064.6667 - 19832800.4333 [ 28]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
19832800.4333 - 21634536.2000 [ 6]: ∎∎∎∎∎∎
21634536.2000 - 23436271.9667 [ 15]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
23436271.9667 - 25238007.7333 [ 11]: ∎∎∎∎∎∎∎∎∎∎∎
25238007.7333 - 27039743.5000 [ 27]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
27039743.5000 - 28841479.2667 [ 20]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
28841479.2667 - 30643215.0333 [ 10]: ∎∎∎∎∎∎∎∎∎∎
30643215.0333 - 32444950.8000 [ 7]: ∎∎∎∎∎∎∎
32444950.8000 - 34246686.5667 [ 4]: ∎∎∎∎
34246686.5667 - 36048422.3333 [ 4]: ∎∎∎∎
36048422.3333 - 37850158.1000 [ 1]: ∎
37850158.1000 - 39651893.8667 [ 5]: ∎∎∎∎∎
39651893.8667 - 41453629.6333 [ 2]: ∎∎
41453629.6333 - 43255365.4000 [ 2]: ∎∎
43255365.4000 - 45057101.1667 [ 2]: ∎∎
45057101.1667 - 46858836.9333 [ 1]: ∎
46858836.9333 - 48660572.7000 [ 2]: ∎∎
48660572.7000 - 50462308.4667 [ 3]: ∎∎∎
50462308.4667 - 52264044.2333 [ 2]: ∎∎
52264044.2333 - 54065780.0000 [ 2]: ∎∎
and the zoomed-in first part:
13707.0000 - 19916.7667 [ 2]: ∎∎
19916.7667 - 26126.5333 [ 2]: ∎∎
26126.5333 - 32336.3000 [ 9]: ∎∎∎∎∎∎∎∎∎
32336.3000 - 38546.0667 [ 8]: ∎∎∎∎∎∎∎∎
38546.0667 - 44755.8333 [ 12]: ∎∎∎∎∎∎∎∎∎∎∎∎
44755.8333 - 50965.6000 [ 10]: ∎∎∎∎∎∎∎∎∎∎
50965.6000 - 57175.3667 [ 5]: ∎∎∎∎∎
57175.3667 - 63385.1333 [ 6]: ∎∎∎∎∎∎
63385.1333 - 69594.9000 [ 5]: ∎∎∎∎∎
69594.9000 - 75804.6667 [ 6]: ∎∎∎∎∎∎
75804.6667 - 82014.4333 [ 6]: ∎∎∎∎∎∎
82014.4333 - 88224.2000 [ 4]: ∎∎∎∎
88224.2000 - 94433.9667 [ 1]: ∎
94433.9667 - 100643.7333 [ 1]: ∎
100643.7333 - 106853.5000 [ 2]: ∎∎
106853.5000 - 113063.2667 [ 0]:
113063.2667 - 119273.0333 [ 2]: ∎∎
119273.0333 - 125482.8000 [ 2]: ∎∎
125482.8000 - 131692.5667 [ 1]: ∎
131692.5667 - 137902.3333 [ 1]: ∎
137902.3333 - 144112.1000 [ 0]:
144112.1000 - 150321.8667 [ 2]: ∎∎
150321.8667 - 156531.6333 [ 1]: ∎
156531.6333 - 162741.4000 [ 1]: ∎
162741.4000 - 168951.1667 [ 0]:
168951.1667 - 175160.9333 [ 0]:
175160.9333 - 181370.7000 [ 1]: ∎
181370.7000 - 187580.4667 [ 1]: ∎
187580.4667 - 193790.2333 [ 2]: ∎∎
193790.2333 - 200000.0000 [ 0]:
Here is casgstatus wait time histogram:
631.0000 - 5276.6333 [ 3]: ∎∎∎
5276.6333 - 9922.2667 [ 5]: ∎∎∎∎∎
9922.2667 - 14567.9000 [ 2]: ∎∎
14567.9000 - 19213.5333 [ 6]: ∎∎∎∎∎∎
19213.5333 - 23859.1667 [ 5]: ∎∎∎∎∎
23859.1667 - 28504.8000 [ 6]: ∎∎∎∎∎∎
28504.8000 - 33150.4333 [ 6]: ∎∎∎∎∎∎
33150.4333 - 37796.0667 [ 2]: ∎∎
37796.0667 - 42441.7000 [ 1]: ∎
42441.7000 - 47087.3333 [ 3]: ∎∎∎
47087.3333 - 51732.9667 [ 0]:
51732.9667 - 56378.6000 [ 1]: ∎
56378.6000 - 61024.2333 [ 0]:
61024.2333 - 65669.8667 [ 0]:
65669.8667 - 70315.5000 [ 0]:
70315.5000 - 74961.1333 [ 1]: ∎
74961.1333 - 79606.7667 [ 0]:
79606.7667 - 84252.4000 [ 0]:
84252.4000 - 88898.0333 [ 0]:
88898.0333 - 93543.6667 [ 0]:
93543.6667 - 98189.3000 [ 0]:
98189.3000 - 102834.9333 [ 0]:
102834.9333 - 107480.5667 [ 1]: ∎
107480.5667 - 112126.2000 [ 0]:
112126.2000 - 116771.8333 [ 0]:
116771.8333 - 121417.4667 [ 0]:
121417.4667 - 126063.1000 [ 0]:
126063.1000 - 130708.7333 [ 0]:
130708.7333 - 135354.3667 [ 0]:
135354.3667 - 140000.0000 [ 1]: ∎
Ideally we eliminate the waiting by switching to async
state machine for GC, but for now just yield to OS scheduler
after a reasonable wait time.
To choose yielding parameters I've measured
golang.org/x/benchmarks/http tail latencies with different yield
delays and oversubscription levels.
With no oversubscription (to the degree possible):
scang yield delay = 1, casgstatus yield delay = 1
Latency-50 1.41ms ±15% 1.41ms ± 5% ~ (p=0.611 n=13+12)
Latency-95 5.21ms ± 2% 5.15ms ± 2% -1.15% (p=0.012 n=13+13)
Latency-99 7.16ms ± 2% 7.05ms ± 2% -1.54% (p=0.002 n=13+13)
Latency-999 10.7ms ± 9% 10.2ms ±10% -5.46% (p=0.004 n=12+13)
scang yield delay = 5000, casgstatus yield delay = 3000
Latency-50 1.41ms ±15% 1.41ms ± 8% ~ (p=0.511 n=13+13)
Latency-95 5.21ms ± 2% 5.14ms ± 2% -1.23% (p=0.006 n=13+13)
Latency-99 7.16ms ± 2% 7.02ms ± 2% -1.94% (p=0.000 n=13+13)
Latency-999 10.7ms ± 9% 10.1ms ± 8% -6.14% (p=0.000 n=12+13)
scang yield delay = 10000, casgstatus yield delay = 5000
Latency-50 1.41ms ±15% 1.45ms ± 6% ~ (p=0.724 n=13+13)
Latency-95 5.21ms ± 2% 5.18ms ± 1% ~ (p=0.287 n=13+13)
Latency-99 7.16ms ± 2% 7.05ms ± 2% -1.64% (p=0.002 n=13+13)
Latency-999 10.7ms ± 9% 10.0ms ± 5% -6.72% (p=0.000 n=12+13)
scang yield delay = 30000, casgstatus yield delay = 10000
Latency-50 1.41ms ±15% 1.51ms ± 7% +6.57% (p=0.002 n=13+13)
Latency-95 5.21ms ± 2% 5.21ms ± 2% ~ (p=0.960 n=13+13)
Latency-99 7.16ms ± 2% 7.06ms ± 2% -1.50% (p=0.012 n=13+13)
Latency-999 10.7ms ± 9% 10.0ms ± 6% -6.49% (p=0.000 n=12+13)
scang yield delay = 100000, casgstatus yield delay = 50000
Latency-50 1.41ms ±15% 1.53ms ± 6% +8.48% (p=0.000 n=13+12)
Latency-95 5.21ms ± 2% 5.23ms ± 2% ~ (p=0.287 n=13+13)
Latency-99 7.16ms ± 2% 7.08ms ± 2% -1.21% (p=0.004 n=13+13)
Latency-999 10.7ms ± 9% 9.9ms ± 3% -7.99% (p=0.000 n=12+12)
scang yield delay = 200000, casgstatus yield delay = 100000
Latency-50 1.41ms ±15% 1.47ms ± 5% ~ (p=0.072 n=13+13)
Latency-95 5.21ms ± 2% 5.17ms ± 2% ~ (p=0.091 n=13+13)
Latency-99 7.16ms ± 2% 7.02ms ± 2% -1.99% (p=0.000 n=13+13)
Latency-999 10.7ms ± 9% 9.9ms ± 5% -7.86% (p=0.000 n=12+13)
With slight oversubscription (another instance of http benchmark
was running in background with reduced GOMAXPROCS):
scang yield delay = 1, casgstatus yield delay = 1
Latency-50 840µs ± 3% 804µs ± 3% -4.37% (p=0.000 n=15+18)
Latency-95 6.52ms ± 4% 6.03ms ± 4% -7.51% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 10.0ms ± 4% -7.33% (p=0.000 n=18+14)
Latency-999 18.0ms ± 9% 16.8ms ± 7% -6.84% (p=0.000 n=18+18)
scang yield delay = 5000, casgstatus yield delay = 3000
Latency-50 840µs ± 3% 809µs ± 3% -3.71% (p=0.000 n=15+17)
Latency-95 6.52ms ± 4% 6.11ms ± 4% -6.29% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 9.9ms ± 6% -7.55% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.5ms ±11% -8.49% (p=0.000 n=18+18)
scang yield delay = 10000, casgstatus yield delay = 5000
Latency-50 840µs ± 3% 823µs ± 5% -2.06% (p=0.002 n=15+18)
Latency-95 6.52ms ± 4% 6.32ms ± 3% -3.05% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 4% -5.22% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.7ms ±10% -7.09% (p=0.000 n=18+18)
scang yield delay = 30000, casgstatus yield delay = 10000
Latency-50 840µs ± 3% 836µs ± 5% ~ (p=0.442 n=15+18)
Latency-95 6.52ms ± 4% 6.39ms ± 3% -2.00% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 6% -5.15% (p=0.000 n=18+17)
Latency-999 18.0ms ± 9% 16.6ms ± 8% -7.48% (p=0.000 n=18+18)
scang yield delay = 100000, casgstatus yield delay = 50000
Latency-50 840µs ± 3% 836µs ± 6% ~ (p=0.401 n=15+18)
Latency-95 6.52ms ± 4% 6.40ms ± 4% -1.79% (p=0.010 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 5% -4.95% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.5ms ±14% -8.17% (p=0.000 n=18+18)
scang yield delay = 200000, casgstatus yield delay = 100000
Latency-50 840µs ± 3% 828µs ± 2% -1.49% (p=0.001 n=15+17)
Latency-95 6.52ms ± 4% 6.38ms ± 4% -2.04% (p=0.001 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 4% -4.77% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.9ms ± 9% -6.23% (p=0.000 n=18+18)
With significant oversubscription (background http benchmark
was running with full GOMAXPROCS):
scang yield delay = 1, casgstatus yield delay = 1
Latency-50 1.32ms ±12% 1.30ms ±13% ~ (p=0.454 n=14+14)
Latency-95 16.3ms ±10% 15.3ms ± 7% -6.29% (p=0.001 n=14+14)
Latency-99 29.4ms ±10% 27.9ms ± 5% -5.04% (p=0.001 n=14+12)
Latency-999 49.9ms ±19% 45.9ms ± 5% -8.00% (p=0.008 n=14+13)
scang yield delay = 5000, casgstatus yield delay = 3000
Latency-50 1.32ms ±12% 1.29ms ± 9% ~ (p=0.227 n=14+14)
Latency-95 16.3ms ±10% 15.4ms ± 5% -5.27% (p=0.002 n=14+14)
Latency-99 29.4ms ±10% 27.9ms ± 6% -5.16% (p=0.001 n=14+14)
Latency-999 49.9ms ±19% 46.8ms ± 8% -6.21% (p=0.050 n=14+14)
scang yield delay = 10000, casgstatus yield delay = 5000
Latency-50 1.32ms ±12% 1.35ms ± 9% ~ (p=0.401 n=14+14)
Latency-95 16.3ms ±10% 15.0ms ± 4% -7.67% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.4ms ± 5% -6.98% (p=0.000 n=14+14)
Latency-999 49.9ms ±19% 44.7ms ± 5% -10.56% (p=0.000 n=14+11)
scang yield delay = 30000, casgstatus yield delay = 10000
Latency-50 1.32ms ±12% 1.36ms ±10% ~ (p=0.246 n=14+14)
Latency-95 16.3ms ±10% 14.9ms ± 5% -8.31% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.4ms ± 7% -6.70% (p=0.000 n=14+14)
Latency-999 49.9ms ±19% 44.9ms ±15% -10.13% (p=0.003 n=14+14)
scang yield delay = 100000, casgstatus yield delay = 50000
Latency-50 1.32ms ±12% 1.41ms ± 9% +6.37% (p=0.008 n=14+13)
Latency-95 16.3ms ±10% 15.1ms ± 8% -7.45% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.5ms ±12% -6.67% (p=0.002 n=14+14)
Latency-999 49.9ms ±19% 45.9ms ±16% -8.06% (p=0.019 n=14+14)
scang yield delay = 200000, casgstatus yield delay = 100000
Latency-50 1.32ms ±12% 1.42ms ±10% +7.21% (p=0.003 n=14+14)
Latency-95 16.3ms ±10% 15.0ms ± 7% -7.59% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.3ms ± 8% -7.20% (p=0.000 n=14+14)
Latency-999 49.9ms ±19% 44.8ms ± 8% -10.21% (p=0.001 n=14+13)
All numbers are on 8 cores and with GOGC=10 (http benchmark has
tiny heap, few goroutines and low allocation rate, so by default
GC barely affects tail latency).
10us/5us yield delays seem to provide a reasonable compromise
and give 5-10% tail latency reduction. That's what is used in this change.
go install -a std results on 4 core machine:
name old time/op new time/op delta
Time 8.39s ± 2% 7.94s ± 2% -5.34% (p=0.000 n=47+49)
UserTime 24.6s ± 2% 22.9s ± 2% -6.76% (p=0.000 n=49+49)
SysTime 1.77s ± 9% 1.89s ±11% +7.00% (p=0.000 n=49+49)
CpuLoad 315ns ± 2% 313ns ± 1% -0.59% (p=0.000 n=49+48) # %CPU
MaxRSS 97.1ms ± 4% 97.5ms ± 9% ~ (p=0.838 n=46+49) # bytes
Update #14396
Update #14189
Change-Id: I3f4109bf8f7fd79b39c466576690a778232055a2
Reviewed-on: https://go-review.googlesource.com/21503
Run-TryBot: Dmitry Vyukov <dvyukov@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
2016-04-04 08:22:38 -06:00
|
|
|
// See http://golang.org/cl/21503 for justification of the yield delay.
|
|
|
|
const yieldDelay = 5 * 1000
|
|
|
|
var nextYield int64
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
// loop if gp->atomicstatus is in a scan state giving
|
|
|
|
// GC time to finish and change the state to oldval.
|
2016-04-04 08:22:38 -06:00
|
|
|
for i := 0; !atomic.Cas(&gp.atomicstatus, oldval, newval); i++ {
|
2015-10-18 18:04:05 -06:00
|
|
|
if oldval == _Gwaiting && gp.atomicstatus == _Grunnable {
|
|
|
|
systemstack(func() {
|
|
|
|
throw("casgstatus: waiting for Gwaiting but is Grunnable")
|
|
|
|
})
|
|
|
|
}
|
|
|
|
// Help GC if needed.
|
|
|
|
// if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) {
|
|
|
|
// gp.preemptscan = false
|
|
|
|
// systemstack(func() {
|
|
|
|
// gcphasework(gp)
|
|
|
|
// })
|
|
|
|
// }
|
2016-04-04 08:22:38 -06:00
|
|
|
// But meanwhile just yield.
|
|
|
|
if i == 0 {
|
|
|
|
nextYield = nanotime() + yieldDelay
|
|
|
|
}
|
|
|
|
if nanotime() < nextYield {
|
|
|
|
for x := 0; x < 10 && gp.atomicstatus != oldval; x++ {
|
|
|
|
procyield(1)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
osyield()
|
|
|
|
nextYield = nanotime() + yieldDelay/2
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
2016-03-04 09:58:26 -07:00
|
|
|
if newval == _Grunning && gp.gcscanvalid {
|
|
|
|
// Run queueRescan on the system stack so it has more space.
|
|
|
|
systemstack(func() { queueRescan(gp) })
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
}
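// A minimal illustrative sketch of the spin-then-yield policy used by the
// wait loop in casgstatus above (and, with a larger budget, by scang below):
// busy-wait with procyield while the yieldDelay budget has not expired, then
// fall back to osyield so the OS can run the thread being waited for.
// exampleWaitFor is a hypothetical helper, not part of the runtime's API.
func exampleWaitFor(done func() bool, yieldDelay int64) {
	nextYield := nanotime() + yieldDelay
	for !done() {
		if nanotime() < nextYield {
			procyield(1) // brief busy wait; the condition should flip soon
		} else {
			osyield() // give up the CPU, then recheck with half the budget
			nextYield = nanotime() + yieldDelay/2
		}
	}
}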
|
|
|
|
|
|
|
|
// casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
|
|
|
|
// Returns old status. Cannot call casgstatus directly, because we are racing with an
|
|
|
|
// async wakeup that might come in from netpoll. If we see Gwaiting from the readgstatus,
|
|
|
|
// it might have become Grunnable by the time we get to the cas. If we called casgstatus,
|
|
|
|
// it would loop waiting for the status to go back to Gwaiting, which it never will.
|
|
|
|
//go:nosplit
|
|
|
|
func casgcopystack(gp *g) uint32 {
|
|
|
|
for {
|
|
|
|
oldstatus := readgstatus(gp) &^ _Gscan
|
|
|
|
if oldstatus != _Gwaiting && oldstatus != _Grunnable {
|
|
|
|
throw("copystack: bad status, not Gwaiting or Grunnable")
|
|
|
|
}
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Cas(&gp.atomicstatus, oldstatus, _Gcopystack) {
|
2015-10-18 18:04:05 -06:00
|
|
|
return oldstatus
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// scang blocks until gp's stack has been scanned.
|
|
|
|
// It might be scanned by scang or it might be scanned by the goroutine itself.
|
|
|
|
// Either way, the stack scan has completed when scang returns.
|
runtime: pass gcWork to scanstack
Currently scanstack obtains its own gcWork from the P for the duration
of the stack scan and then, if called during mark termination,
disposes the gcWork.
However, this means that the number of workbufs allocated will be at
least the number of stacks scanned during mark termination, which may
be very high (especially during a STW GC). This happens because, in
steady state, each scanstack will obtain a fresh workbuf (either from
the empty list or by allocating it), fill it with the scan results,
and then dispose it to the full list. Nothing is consuming from the
full list during this (and hence nothing is recycling them to the
empty list), so the length of the full list by the time mark
termination starts draining it is at least the number of stacks
scanned.
Fix this by pushing the gcWork acquisition up the stack to either the
gcDrain that calls markroot that calls scanstack (which batches across
many stack scans and is the path taken during STW GC) or to newstack
(which is still a single scanstack call, but this is roughly bounded
by the number of Ps).
This fix reduces the workbuf allocation for the test program from
issue #15319 from 213 MB (roughly 2KB * 1e5 goroutines) to 10 MB.
Fixes #15319.
Note that there's potentially a similar issue in write barriers during
mark 2. Fixing that will be more difficult since there's no broader
non-preemptible context, but it should also be less of a problem since
the full list is being drained during mark 2.
Some overall improvements in the go1 benchmarks, plus the usual noise.
No significant change in the garbage benchmark (time/op or GC memory).
name old time/op new time/op delta
BinaryTree17-12 2.54s ± 1% 2.51s ± 1% -1.09% (p=0.000 n=20+19)
Fannkuch11-12 2.12s ± 0% 2.17s ± 0% +2.18% (p=0.000 n=19+18)
FmtFprintfEmpty-12 45.1ns ± 1% 45.2ns ± 0% ~ (p=0.078 n=19+18)
FmtFprintfString-12 127ns ± 0% 128ns ± 0% +1.08% (p=0.000 n=19+16)
FmtFprintfInt-12 125ns ± 0% 122ns ± 1% -2.71% (p=0.000 n=14+18)
FmtFprintfIntInt-12 196ns ± 0% 190ns ± 1% -2.91% (p=0.000 n=12+20)
FmtFprintfPrefixedInt-12 196ns ± 0% 194ns ± 1% -0.94% (p=0.000 n=13+18)
FmtFprintfFloat-12 253ns ± 1% 251ns ± 1% -0.86% (p=0.000 n=19+20)
FmtManyArgs-12 807ns ± 1% 784ns ± 1% -2.85% (p=0.000 n=20+20)
GobDecode-12 7.13ms ± 1% 7.12ms ± 1% ~ (p=0.351 n=19+20)
GobEncode-12 5.89ms ± 0% 5.95ms ± 0% +0.94% (p=0.000 n=19+19)
Gzip-12 219ms ± 1% 221ms ± 1% +1.35% (p=0.000 n=18+20)
Gunzip-12 37.5ms ± 1% 37.4ms ± 0% ~ (p=0.057 n=20+19)
HTTPClientServer-12 81.4µs ± 4% 81.9µs ± 3% ~ (p=0.118 n=17+18)
JSONEncode-12 15.7ms ± 1% 15.8ms ± 1% +0.73% (p=0.000 n=17+18)
JSONDecode-12 57.9ms ± 1% 57.2ms ± 1% -1.34% (p=0.000 n=19+19)
Mandelbrot200-12 4.12ms ± 1% 4.10ms ± 0% -0.33% (p=0.000 n=19+17)
GoParse-12 3.22ms ± 2% 3.25ms ± 1% +0.72% (p=0.000 n=18+20)
RegexpMatchEasy0_32-12 70.6ns ± 1% 71.1ns ± 2% +0.63% (p=0.005 n=19+20)
RegexpMatchEasy0_1K-12 240ns ± 0% 239ns ± 1% -0.59% (p=0.000 n=19+20)
RegexpMatchEasy1_32-12 71.3ns ± 1% 71.3ns ± 1% ~ (p=0.844 n=17+17)
RegexpMatchEasy1_1K-12 384ns ± 2% 371ns ± 1% -3.45% (p=0.000 n=19+20)
RegexpMatchMedium_32-12 109ns ± 1% 108ns ± 2% -0.48% (p=0.029 n=19+19)
RegexpMatchMedium_1K-12 34.3µs ± 1% 34.5µs ± 2% ~ (p=0.160 n=18+20)
RegexpMatchHard_32-12 1.79µs ± 9% 1.72µs ± 2% -3.83% (p=0.000 n=19+19)
RegexpMatchHard_1K-12 53.3µs ± 4% 51.8µs ± 1% -2.82% (p=0.000 n=19+20)
Revcomp-12 386ms ± 0% 388ms ± 0% +0.72% (p=0.000 n=17+20)
Template-12 62.9ms ± 1% 62.5ms ± 1% -0.57% (p=0.010 n=18+19)
TimeParse-12 325ns ± 0% 331ns ± 0% +1.84% (p=0.000 n=18+19)
TimeFormat-12 338ns ± 0% 343ns ± 0% +1.34% (p=0.000 n=18+20)
[Geo mean] 52.7µs 52.5µs -0.42%
Change-Id: Ib2d34736c4ae2ec329605b0fbc44636038d8d018
Reviewed-on: https://go-review.googlesource.com/23391
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-05-23 20:14:53 -06:00
|
|
|
func scang(gp *g, gcw *gcWork) {
|
2015-10-18 18:04:05 -06:00
|
|
|
// Invariant: we (the caller, markroot for a specific goroutine) own gp.gcscandone.
|
|
|
|
// Nothing is racing with us now, but gcscandone might be set to true left over
|
|
|
|
// from an earlier round of stack scanning (we scan twice per GC).
|
|
|
|
// We use gcscandone to record whether the scan has been done during this round.
|
|
|
|
// It is important that the scan happens exactly once: if called twice,
|
|
|
|
// the installation of stack barriers will detect the double scan and die.
|
|
|
|
|
|
|
|
gp.gcscandone = false
|
|
|
|
|
runtime: don't burn CPU unnecessarily
Two GC-related functions, scang and casgstatus, wait in an active spin loop.
Active spinning is never a good idea in user-space. Once we wait several
times more than the expected wait time, something unexpected is happenning
(e.g. the thread we are waiting for is descheduled or handling a page fault)
and we need to yield to OS scheduler. Moreover, the expected wait time is
very high for these functions: scang wait time can be tens of milliseconds,
casgstatus can be hundreds of microseconds. It does not make sense to spin
even for that time.
go install -a std profile on a 4-core machine shows that 11% of time is spent
in the active spin in scang:
6.12% compile compile [.] runtime.scang
3.27% compile compile [.] runtime.readgstatus
1.72% compile compile [.] runtime/internal/atomic.Load
The active spin also increases tail latency in the case of the slightest
oversubscription: GC goroutines spend whole quantum in the loop instead of
executing user code.
Here is scang wait time histogram during go install -a std:
13707.0000 - 1815442.7667 [ 118]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎...
1815442.7667 - 3617178.5333 [ 9]: ∎∎∎∎∎∎∎∎∎
3617178.5333 - 5418914.3000 [ 11]: ∎∎∎∎∎∎∎∎∎∎∎
5418914.3000 - 7220650.0667 [ 5]: ∎∎∎∎∎
7220650.0667 - 9022385.8333 [ 12]: ∎∎∎∎∎∎∎∎∎∎∎∎
9022385.8333 - 10824121.6000 [ 13]: ∎∎∎∎∎∎∎∎∎∎∎∎∎
10824121.6000 - 12625857.3667 [ 15]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
12625857.3667 - 14427593.1333 [ 18]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
14427593.1333 - 16229328.9000 [ 18]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
16229328.9000 - 18031064.6667 [ 32]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
18031064.6667 - 19832800.4333 [ 28]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
19832800.4333 - 21634536.2000 [ 6]: ∎∎∎∎∎∎
21634536.2000 - 23436271.9667 [ 15]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
23436271.9667 - 25238007.7333 [ 11]: ∎∎∎∎∎∎∎∎∎∎∎
25238007.7333 - 27039743.5000 [ 27]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
27039743.5000 - 28841479.2667 [ 20]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
28841479.2667 - 30643215.0333 [ 10]: ∎∎∎∎∎∎∎∎∎∎
30643215.0333 - 32444950.8000 [ 7]: ∎∎∎∎∎∎∎
32444950.8000 - 34246686.5667 [ 4]: ∎∎∎∎
34246686.5667 - 36048422.3333 [ 4]: ∎∎∎∎
36048422.3333 - 37850158.1000 [ 1]: ∎
37850158.1000 - 39651893.8667 [ 5]: ∎∎∎∎∎
39651893.8667 - 41453629.6333 [ 2]: ∎∎
41453629.6333 - 43255365.4000 [ 2]: ∎∎
43255365.4000 - 45057101.1667 [ 2]: ∎∎
45057101.1667 - 46858836.9333 [ 1]: ∎
46858836.9333 - 48660572.7000 [ 2]: ∎∎
48660572.7000 - 50462308.4667 [ 3]: ∎∎∎
50462308.4667 - 52264044.2333 [ 2]: ∎∎
52264044.2333 - 54065780.0000 [ 2]: ∎∎
and the zoomed-in first part:
13707.0000 - 19916.7667 [ 2]: ∎∎
19916.7667 - 26126.5333 [ 2]: ∎∎
26126.5333 - 32336.3000 [ 9]: ∎∎∎∎∎∎∎∎∎
32336.3000 - 38546.0667 [ 8]: ∎∎∎∎∎∎∎∎
38546.0667 - 44755.8333 [ 12]: ∎∎∎∎∎∎∎∎∎∎∎∎
44755.8333 - 50965.6000 [ 10]: ∎∎∎∎∎∎∎∎∎∎
50965.6000 - 57175.3667 [ 5]: ∎∎∎∎∎
57175.3667 - 63385.1333 [ 6]: ∎∎∎∎∎∎
63385.1333 - 69594.9000 [ 5]: ∎∎∎∎∎
69594.9000 - 75804.6667 [ 6]: ∎∎∎∎∎∎
75804.6667 - 82014.4333 [ 6]: ∎∎∎∎∎∎
82014.4333 - 88224.2000 [ 4]: ∎∎∎∎
88224.2000 - 94433.9667 [ 1]: ∎
94433.9667 - 100643.7333 [ 1]: ∎
100643.7333 - 106853.5000 [ 2]: ∎∎
106853.5000 - 113063.2667 [ 0]:
113063.2667 - 119273.0333 [ 2]: ∎∎
119273.0333 - 125482.8000 [ 2]: ∎∎
125482.8000 - 131692.5667 [ 1]: ∎
131692.5667 - 137902.3333 [ 1]: ∎
137902.3333 - 144112.1000 [ 0]:
144112.1000 - 150321.8667 [ 2]: ∎∎
150321.8667 - 156531.6333 [ 1]: ∎
156531.6333 - 162741.4000 [ 1]: ∎
162741.4000 - 168951.1667 [ 0]:
168951.1667 - 175160.9333 [ 0]:
175160.9333 - 181370.7000 [ 1]: ∎
181370.7000 - 187580.4667 [ 1]: ∎
187580.4667 - 193790.2333 [ 2]: ∎∎
193790.2333 - 200000.0000 [ 0]:
Here is casgstatus wait time histogram:
631.0000 - 5276.6333 [ 3]: ∎∎∎
5276.6333 - 9922.2667 [ 5]: ∎∎∎∎∎
9922.2667 - 14567.9000 [ 2]: ∎∎
14567.9000 - 19213.5333 [ 6]: ∎∎∎∎∎∎
19213.5333 - 23859.1667 [ 5]: ∎∎∎∎∎
23859.1667 - 28504.8000 [ 6]: ∎∎∎∎∎∎
28504.8000 - 33150.4333 [ 6]: ∎∎∎∎∎∎
33150.4333 - 37796.0667 [ 2]: ∎∎
37796.0667 - 42441.7000 [ 1]: ∎
42441.7000 - 47087.3333 [ 3]: ∎∎∎
47087.3333 - 51732.9667 [ 0]:
51732.9667 - 56378.6000 [ 1]: ∎
56378.6000 - 61024.2333 [ 0]:
61024.2333 - 65669.8667 [ 0]:
65669.8667 - 70315.5000 [ 0]:
70315.5000 - 74961.1333 [ 1]: ∎
74961.1333 - 79606.7667 [ 0]:
79606.7667 - 84252.4000 [ 0]:
84252.4000 - 88898.0333 [ 0]:
88898.0333 - 93543.6667 [ 0]:
93543.6667 - 98189.3000 [ 0]:
98189.3000 - 102834.9333 [ 0]:
102834.9333 - 107480.5667 [ 1]: ∎
107480.5667 - 112126.2000 [ 0]:
112126.2000 - 116771.8333 [ 0]:
116771.8333 - 121417.4667 [ 0]:
121417.4667 - 126063.1000 [ 0]:
126063.1000 - 130708.7333 [ 0]:
130708.7333 - 135354.3667 [ 0]:
135354.3667 - 140000.0000 [ 1]: ∎
Ideally we eliminate the waiting by switching to async
state machine for GC, but for now just yield to OS scheduler
after a reasonable wait time.
To choose yielding parameters I've measured
golang.org/x/benchmarks/http tail latencies with different yield
delays and oversubscription levels.
With no oversubscription (to the degree possible):
scang yield delay = 1, casgstatus yield delay = 1
Latency-50 1.41ms ±15% 1.41ms ± 5% ~ (p=0.611 n=13+12)
Latency-95 5.21ms ± 2% 5.15ms ± 2% -1.15% (p=0.012 n=13+13)
Latency-99 7.16ms ± 2% 7.05ms ± 2% -1.54% (p=0.002 n=13+13)
Latency-999 10.7ms ± 9% 10.2ms ±10% -5.46% (p=0.004 n=12+13)
scang yield delay = 5000, casgstatus yield delay = 3000
Latency-50 1.41ms ±15% 1.41ms ± 8% ~ (p=0.511 n=13+13)
Latency-95 5.21ms ± 2% 5.14ms ± 2% -1.23% (p=0.006 n=13+13)
Latency-99 7.16ms ± 2% 7.02ms ± 2% -1.94% (p=0.000 n=13+13)
Latency-999 10.7ms ± 9% 10.1ms ± 8% -6.14% (p=0.000 n=12+13)
scang yield delay = 10000, casgstatus yield delay = 5000
Latency-50 1.41ms ±15% 1.45ms ± 6% ~ (p=0.724 n=13+13)
Latency-95 5.21ms ± 2% 5.18ms ± 1% ~ (p=0.287 n=13+13)
Latency-99 7.16ms ± 2% 7.05ms ± 2% -1.64% (p=0.002 n=13+13)
Latency-999 10.7ms ± 9% 10.0ms ± 5% -6.72% (p=0.000 n=12+13)
scang yield delay = 30000, casgstatus yield delay = 10000
Latency-50 1.41ms ±15% 1.51ms ± 7% +6.57% (p=0.002 n=13+13)
Latency-95 5.21ms ± 2% 5.21ms ± 2% ~ (p=0.960 n=13+13)
Latency-99 7.16ms ± 2% 7.06ms ± 2% -1.50% (p=0.012 n=13+13)
Latency-999 10.7ms ± 9% 10.0ms ± 6% -6.49% (p=0.000 n=12+13)
scang yield delay = 100000, casgstatus yield delay = 50000
Latency-50 1.41ms ±15% 1.53ms ± 6% +8.48% (p=0.000 n=13+12)
Latency-95 5.21ms ± 2% 5.23ms ± 2% ~ (p=0.287 n=13+13)
Latency-99 7.16ms ± 2% 7.08ms ± 2% -1.21% (p=0.004 n=13+13)
Latency-999 10.7ms ± 9% 9.9ms ± 3% -7.99% (p=0.000 n=12+12)
scang yield delay = 200000, casgstatus yield delay = 100000
Latency-50 1.41ms ±15% 1.47ms ± 5% ~ (p=0.072 n=13+13)
Latency-95 5.21ms ± 2% 5.17ms ± 2% ~ (p=0.091 n=13+13)
Latency-99 7.16ms ± 2% 7.02ms ± 2% -1.99% (p=0.000 n=13+13)
Latency-999 10.7ms ± 9% 9.9ms ± 5% -7.86% (p=0.000 n=12+13)
With slight oversubscription (another instance of http benchmark
was running in background with reduced GOMAXPROCS):
scang yield delay = 1, casgstatus yield delay = 1
Latency-50 840µs ± 3% 804µs ± 3% -4.37% (p=0.000 n=15+18)
Latency-95 6.52ms ± 4% 6.03ms ± 4% -7.51% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 10.0ms ± 4% -7.33% (p=0.000 n=18+14)
Latency-999 18.0ms ± 9% 16.8ms ± 7% -6.84% (p=0.000 n=18+18)
scang yield delay = 5000, casgstatus yield delay = 3000
Latency-50 840µs ± 3% 809µs ± 3% -3.71% (p=0.000 n=15+17)
Latency-95 6.52ms ± 4% 6.11ms ± 4% -6.29% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 9.9ms ± 6% -7.55% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.5ms ±11% -8.49% (p=0.000 n=18+18)
scang yield delay = 10000, casgstatus yield delay = 5000
Latency-50 840µs ± 3% 823µs ± 5% -2.06% (p=0.002 n=15+18)
Latency-95 6.52ms ± 4% 6.32ms ± 3% -3.05% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 4% -5.22% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.7ms ±10% -7.09% (p=0.000 n=18+18)
scang yield delay = 30000, casgstatus yield delay = 10000
Latency-50 840µs ± 3% 836µs ± 5% ~ (p=0.442 n=15+18)
Latency-95 6.52ms ± 4% 6.39ms ± 3% -2.00% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 6% -5.15% (p=0.000 n=18+17)
Latency-999 18.0ms ± 9% 16.6ms ± 8% -7.48% (p=0.000 n=18+18)
scang yield delay = 100000, casgstatus yield delay = 50000
Latency-50 840µs ± 3% 836µs ± 6% ~ (p=0.401 n=15+18)
Latency-95 6.52ms ± 4% 6.40ms ± 4% -1.79% (p=0.010 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 5% -4.95% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.5ms ±14% -8.17% (p=0.000 n=18+18)
scang yield delay = 200000, casgstatus yield delay = 100000
Latency-50 840µs ± 3% 828µs ± 2% -1.49% (p=0.001 n=15+17)
Latency-95 6.52ms ± 4% 6.38ms ± 4% -2.04% (p=0.001 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 4% -4.77% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.9ms ± 9% -6.23% (p=0.000 n=18+18)
With significant oversubscription (background http benchmark
was running with full GOMAXPROCS):
scang yield delay = 1, casgstatus yield delay = 1
Latency-50 1.32ms ±12% 1.30ms ±13% ~ (p=0.454 n=14+14)
Latency-95 16.3ms ±10% 15.3ms ± 7% -6.29% (p=0.001 n=14+14)
Latency-99 29.4ms ±10% 27.9ms ± 5% -5.04% (p=0.001 n=14+12)
Latency-999 49.9ms ±19% 45.9ms ± 5% -8.00% (p=0.008 n=14+13)
scang yield delay = 5000, casgstatus yield delay = 3000
Latency-50 1.32ms ±12% 1.29ms ± 9% ~ (p=0.227 n=14+14)
Latency-95 16.3ms ±10% 15.4ms ± 5% -5.27% (p=0.002 n=14+14)
Latency-99 29.4ms ±10% 27.9ms ± 6% -5.16% (p=0.001 n=14+14)
Latency-999 49.9ms ±19% 46.8ms ± 8% -6.21% (p=0.050 n=14+14)
scang yield delay = 10000, casgstatus yield delay = 5000
Latency-50 1.32ms ±12% 1.35ms ± 9% ~ (p=0.401 n=14+14)
Latency-95 16.3ms ±10% 15.0ms ± 4% -7.67% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.4ms ± 5% -6.98% (p=0.000 n=14+14)
Latency-999 49.9ms ±19% 44.7ms ± 5% -10.56% (p=0.000 n=14+11)
scang yield delay = 30000, casgstatus yield delay = 10000
Latency-50 1.32ms ±12% 1.36ms ±10% ~ (p=0.246 n=14+14)
Latency-95 16.3ms ±10% 14.9ms ± 5% -8.31% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.4ms ± 7% -6.70% (p=0.000 n=14+14)
Latency-999 49.9ms ±19% 44.9ms ±15% -10.13% (p=0.003 n=14+14)
scang yield delay = 100000, casgstatus yield delay = 50000
Latency-50 1.32ms ±12% 1.41ms ± 9% +6.37% (p=0.008 n=14+13)
Latency-95 16.3ms ±10% 15.1ms ± 8% -7.45% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.5ms ±12% -6.67% (p=0.002 n=14+14)
Latency-999 49.9ms ±19% 45.9ms ±16% -8.06% (p=0.019 n=14+14)
scang yield delay = 200000, casgstatus yield delay = 100000
Latency-50 1.32ms ±12% 1.42ms ±10% +7.21% (p=0.003 n=14+14)
Latency-95 16.3ms ±10% 15.0ms ± 7% -7.59% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.3ms ± 8% -7.20% (p=0.000 n=14+14)
Latency-999 49.9ms ±19% 44.8ms ± 8% -10.21% (p=0.001 n=14+13)
All numbers are on 8 cores and with GOGC=10 (the http benchmark has a
tiny heap, few goroutines, and a low allocation rate, so by default
GC barely affects tail latency).
10us/5us yield delays seem to provide a reasonable compromise
and give a 5-10% tail latency reduction; that is what is used in this change.
go install -a std results on a 4-core machine:
name old time/op new time/op delta
Time 8.39s ± 2% 7.94s ± 2% -5.34% (p=0.000 n=47+49)
UserTime 24.6s ± 2% 22.9s ± 2% -6.76% (p=0.000 n=49+49)
SysTime 1.77s ± 9% 1.89s ±11% +7.00% (p=0.000 n=49+49)
CpuLoad 315ns ± 2% 313ns ± 1% -0.59% (p=0.000 n=49+48) # %CPU
MaxRSS 97.1ms ± 4% 97.5ms ± 9% ~ (p=0.838 n=46+49) # bytes
Update #14396
Update #14189
Change-Id: I3f4109bf8f7fd79b39c466576690a778232055a2
Reviewed-on: https://go-review.googlesource.com/21503
Run-TryBot: Dmitry Vyukov <dvyukov@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
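The policy settled on above — spin briefly in user space, then start yielding the thread once the wait outlives the chosen delay — can be illustrated outside the runtime. This is a hedged, hypothetical sketch, not runtime code: waitUntilSet and the done flag are invented for the example, and time.Now and runtime.Gosched stand in for nanotime and osyield.

package main

import (
	"runtime"
	"sync/atomic"
	"time"
)

// waitUntilSet spins on done for up to yieldDelay, then starts yielding the
// processor between checks, mirroring the bounded-spin policy described above.
func waitUntilSet(done *uint32) {
	const yieldDelay = 10 * time.Microsecond
	nextYield := time.Now().Add(yieldDelay)
	for atomic.LoadUint32(done) == 0 {
		if time.Now().Before(nextYield) {
			continue // cheap spin while the wait is still expected to be short
		}
		runtime.Gosched() // past the budget: stop burning CPU and yield
		nextYield = time.Now().Add(yieldDelay / 2)
	}
}

func main() {
	var done uint32
	go func() {
		time.Sleep(time.Millisecond)
		atomic.StoreUint32(&done, 1)
	}()
	waitUntilSet(&done)
}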
	// See http://golang.org/cl/21503 for justification of the yield delay.
	const yieldDelay = 10 * 1000
	var nextYield int64

	// Endeavor to get gcscandone set to true,
	// either by doing the stack scan ourselves or by coercing gp to scan itself.
	// gp.gcscandone can transition from false to true when we're not looking
	// (if we asked for preemption), so any time we lock the status using
	// castogscanstatus we have to double-check that the scan is still not done.
loop:
	for i := 0; !gp.gcscandone; i++ {
		switch s := readgstatus(gp); s {
		default:
			dumpgstatus(gp)
			throw("stopg: invalid status")

		case _Gdead:
			// No stack.
			gp.gcscandone = true
			break loop

		case _Gcopystack:
			// Stack being switched. Go around again.

		case _Grunnable, _Gsyscall, _Gwaiting:
			// Claim goroutine by setting scan bit.
			// Racing with execution or readying of gp.
			// The scan bit keeps them from running
			// the goroutine until we're done.
			if castogscanstatus(gp, s, s|_Gscan) {
				if !gp.gcscandone {
runtime: pass gcWork to scanstack
Currently scanstack obtains its own gcWork from the P for the duration
of the stack scan and then, if called during mark termination,
disposes the gcWork.
However, this means that the number of workbufs allocated will be at
least the number of stacks scanned during mark termination, which may
be very high (especially during a STW GC). This happens because, in
steady state, each scanstack will obtain a fresh workbuf (either from
the empty list or by allocating it), fill it with the scan results,
and then dispose it to the full list. Nothing is consuming from the
full list during this (and hence nothing is recycling them to the
empty list), so the length of the full list by the time mark
termination starts draining it is at least the number of stacks
scanned.
Fix this by pushing the gcWork acquisition up the stack to either the
gcDrain that calls markroot that calls scanstack (which batches across
many stack scans and is the path taken during STW GC) or to newstack
(which is still a single scanstack call, but this is roughly bounded
by the number of Ps).
This fix reduces the workbuf allocation for the test program from
issue #15319 from 213 MB (roughly 2KB * 1e5 goroutines) to 10 MB.
Fixes #15319.
Note that there's potentially a similar issue in write barriers during
mark 2. Fixing that will be more difficult since there's no broader
non-preemptible context, but it should also be less of a problem since
the full list is being drained during mark 2.
Some overall improvements in the go1 benchmarks, plus the usual noise.
No significant change in the garbage benchmark (time/op or GC memory).
name old time/op new time/op delta
BinaryTree17-12 2.54s ± 1% 2.51s ± 1% -1.09% (p=0.000 n=20+19)
Fannkuch11-12 2.12s ± 0% 2.17s ± 0% +2.18% (p=0.000 n=19+18)
FmtFprintfEmpty-12 45.1ns ± 1% 45.2ns ± 0% ~ (p=0.078 n=19+18)
FmtFprintfString-12 127ns ± 0% 128ns ± 0% +1.08% (p=0.000 n=19+16)
FmtFprintfInt-12 125ns ± 0% 122ns ± 1% -2.71% (p=0.000 n=14+18)
FmtFprintfIntInt-12 196ns ± 0% 190ns ± 1% -2.91% (p=0.000 n=12+20)
FmtFprintfPrefixedInt-12 196ns ± 0% 194ns ± 1% -0.94% (p=0.000 n=13+18)
FmtFprintfFloat-12 253ns ± 1% 251ns ± 1% -0.86% (p=0.000 n=19+20)
FmtManyArgs-12 807ns ± 1% 784ns ± 1% -2.85% (p=0.000 n=20+20)
GobDecode-12 7.13ms ± 1% 7.12ms ± 1% ~ (p=0.351 n=19+20)
GobEncode-12 5.89ms ± 0% 5.95ms ± 0% +0.94% (p=0.000 n=19+19)
Gzip-12 219ms ± 1% 221ms ± 1% +1.35% (p=0.000 n=18+20)
Gunzip-12 37.5ms ± 1% 37.4ms ± 0% ~ (p=0.057 n=20+19)
HTTPClientServer-12 81.4µs ± 4% 81.9µs ± 3% ~ (p=0.118 n=17+18)
JSONEncode-12 15.7ms ± 1% 15.8ms ± 1% +0.73% (p=0.000 n=17+18)
JSONDecode-12 57.9ms ± 1% 57.2ms ± 1% -1.34% (p=0.000 n=19+19)
Mandelbrot200-12 4.12ms ± 1% 4.10ms ± 0% -0.33% (p=0.000 n=19+17)
GoParse-12 3.22ms ± 2% 3.25ms ± 1% +0.72% (p=0.000 n=18+20)
RegexpMatchEasy0_32-12 70.6ns ± 1% 71.1ns ± 2% +0.63% (p=0.005 n=19+20)
RegexpMatchEasy0_1K-12 240ns ± 0% 239ns ± 1% -0.59% (p=0.000 n=19+20)
RegexpMatchEasy1_32-12 71.3ns ± 1% 71.3ns ± 1% ~ (p=0.844 n=17+17)
RegexpMatchEasy1_1K-12 384ns ± 2% 371ns ± 1% -3.45% (p=0.000 n=19+20)
RegexpMatchMedium_32-12 109ns ± 1% 108ns ± 2% -0.48% (p=0.029 n=19+19)
RegexpMatchMedium_1K-12 34.3µs ± 1% 34.5µs ± 2% ~ (p=0.160 n=18+20)
RegexpMatchHard_32-12 1.79µs ± 9% 1.72µs ± 2% -3.83% (p=0.000 n=19+19)
RegexpMatchHard_1K-12 53.3µs ± 4% 51.8µs ± 1% -2.82% (p=0.000 n=19+20)
Revcomp-12 386ms ± 0% 388ms ± 0% +0.72% (p=0.000 n=17+20)
Template-12 62.9ms ± 1% 62.5ms ± 1% -0.57% (p=0.010 n=18+19)
TimeParse-12 325ns ± 0% 331ns ± 0% +1.84% (p=0.000 n=18+19)
TimeFormat-12 338ns ± 0% 343ns ± 0% +1.34% (p=0.000 n=18+20)
[Geo mean] 52.7µs 52.5µs -0.42%
Change-Id: Ib2d34736c4ae2ec329605b0fbc44636038d8d018
Reviewed-on: https://go-review.googlesource.com/23391
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-05-23 20:14:53 -06:00
					scanstack(gp, gcw)
					gp.gcscandone = true
				}
				restartg(gp)
				break loop
			}

		case _Gscanwaiting:
			// newstack is doing a scan for us right now. Wait.

		case _Grunning:
			// Goroutine running. Try to preempt execution so it can scan itself.
			// The preemption handler (in newstack) does the actual scan.

			// Optimization: if there is already a pending preemption request
			// (from the previous loop iteration), don't bother with the atomics.
			if gp.preemptscan && gp.preempt && gp.stackguard0 == stackPreempt {
				break
			}

			// Ask for preemption and self scan.
			if castogscanstatus(gp, _Grunning, _Gscanrunning) {
				if !gp.gcscandone {
					gp.preemptscan = true
					gp.preempt = true
					gp.stackguard0 = stackPreempt
				}
				casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
			}
		}
		if i == 0 {
			nextYield = nanotime() + yieldDelay
		}
		if nanotime() < nextYield {
			procyield(10)
		} else {
			osyield()
			nextYield = nanotime() + yieldDelay/2
		}
	}

	gp.preemptscan = false // cancel scan request if no longer needed
}
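The "runtime: pass gcWork to scanstack" change quoted above boils down to a batching pattern: the caller acquires one scratch buffer and threads it through many calls, instead of each call obtaining and disposing its own. A hedged, generic sketch outside the runtime (buffer, scanOne, and the item names are invented for illustration):

package main

import "fmt"

// buffer stands in for a gcWork-like per-worker scratch buffer.
type buffer struct{ results []string }

// scanOne appends into a caller-supplied buffer rather than acquiring and
// disposing its own, so one buffer is reused across many scans.
func scanOne(item string, buf *buffer) {
	buf.results = append(buf.results, "scanned "+item)
}

func main() {
	goroutines := []string{"g1", "g2", "g3"}
	var buf buffer // acquired once by the caller, as gcDrain/newstack now do
	for _, g := range goroutines {
		scanOne(g, &buf)
	}
	fmt.Println(len(buf.results), "results accumulated in one shared buffer")
}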
// The GC requests that this routine be moved from a scanmumble state to a mumble state.
func restartg(gp *g) {
	s := readgstatus(gp)
	switch s {
	default:
		dumpgstatus(gp)
		throw("restartg: unexpected status")

	case _Gdead:
		// ok

	case _Gscanrunnable,
		_Gscanwaiting,
		_Gscansyscall:
		casfrom_Gscanstatus(gp, s, s&^_Gscan)
	}
}
// stopTheWorld stops all P's from executing goroutines, interrupting
// all goroutines at GC safe points and recording reason as the reason
// for the stop. On return, only the current goroutine's P is running.
// stopTheWorld must not be called from a system stack and the caller
// must not hold worldsema. The caller must call startTheWorld when
// other P's should resume execution.
//
// stopTheWorld is safe for multiple goroutines to call at the
// same time. Each will execute its own stop, and the stops will
// be serialized.
//
// This is also used by routines that do stack dumps. If the system is
// in panic or being exited, this may not reliably stop all
// goroutines.
func stopTheWorld(reason string) {
	semacquire(&worldsema, 0)
	getg().m.preemptoff = reason
	systemstack(stopTheWorldWithSema)
}
// startTheWorld undoes the effects of stopTheWorld.
func startTheWorld() {
	systemstack(startTheWorldWithSema)
	// worldsema must be held over startTheWorldWithSema to ensure
	// gomaxprocs cannot change while worldsema is held.
	semrelease(&worldsema)
	getg().m.preemptoff = ""
}
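As a usage note, a hedged sketch of the intended pairing; exampleStopStart is a hypothetical illustration within this package, not an actual runtime function:

// exampleStopStart is illustrative only: every stopTheWorld must be matched
// by a startTheWorld once the caller no longer needs a quiescent world.
func exampleStopStart() {
	stopTheWorld("example: inspect scheduler state")
	// All other P's are stopped here, so state that must not change
	// concurrently can be read or mutated safely.
	startTheWorld()
}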
// Holding worldsema grants an M the right to try to stop the world
// and prevents gomaxprocs from changing concurrently.
var worldsema uint32 = 1
// stopTheWorldWithSema is the core implementation of stopTheWorld.
// The caller is responsible for acquiring worldsema and disabling
// preemption first, and then should call stopTheWorldWithSema on the
// system stack:
//
//	semacquire(&worldsema, 0)
//	m.preemptoff = "reason"
//	systemstack(stopTheWorldWithSema)
//
// When finished, the caller must either call startTheWorld or undo
// these three operations separately:
//
//	m.preemptoff = ""
//	systemstack(startTheWorldWithSema)
//	semrelease(&worldsema)
//
// It is allowed to acquire worldsema once and then execute multiple
// startTheWorldWithSema/stopTheWorldWithSema pairs.
// Other P's are able to execute between successive calls to
// startTheWorldWithSema and stopTheWorldWithSema.
// Holding worldsema causes any other goroutines invoking
// stopTheWorld to block.
func stopTheWorldWithSema() {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
// If we hold a lock, then we won't be able to stop another M
|
|
|
|
// that is blocked trying to acquire the lock.
|
|
|
|
if _g_.m.locks > 0 {
|
|
|
|
throw("stopTheWorld: holding locks")
|
|
|
|
}
|
|
|
|
|
|
|
|
lock(&sched.lock)
|
|
|
|
sched.stopwait = gomaxprocs
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store(&sched.gcwaiting, 1)
|
2015-10-18 18:04:05 -06:00
|
|
|
preemptall()
|
|
|
|
// stop current P
|
|
|
|
_g_.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
|
|
|
|
sched.stopwait--
|
|
|
|
// try to retake all P's in Psyscall status
|
|
|
|
for i := 0; i < int(gomaxprocs); i++ {
|
|
|
|
p := allp[i]
|
|
|
|
s := p.status
|
2015-11-02 12:09:24 -07:00
|
|
|
if s == _Psyscall && atomic.Cas(&p.status, s, _Pgcstop) {
|
2015-10-18 18:04:05 -06:00
|
|
|
if trace.enabled {
|
|
|
|
traceGoSysBlock(p)
|
|
|
|
traceProcStop(p)
|
|
|
|
}
|
|
|
|
p.syscalltick++
|
|
|
|
sched.stopwait--
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// stop idle P's
|
|
|
|
for {
|
|
|
|
p := pidleget()
|
|
|
|
if p == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
p.status = _Pgcstop
|
|
|
|
sched.stopwait--
|
|
|
|
}
|
|
|
|
wait := sched.stopwait > 0
|
|
|
|
unlock(&sched.lock)
|
|
|
|
|
|
|
|
// wait for remaining P's to stop voluntarily
|
|
|
|
if wait {
|
|
|
|
for {
|
|
|
|
// wait for 100us, then try to re-preempt in case of any races
|
|
|
|
if notetsleep(&sched.stopnote, 100*1000) {
|
|
|
|
noteclear(&sched.stopnote)
|
|
|
|
break
|
|
|
|
}
|
|
|
|
preemptall()
|
|
|
|
}
|
|
|
|
}
|
2016-12-19 20:43:38 -07:00
|
|
|
|
|
|
|
// sanity checks
|
|
|
|
bad := ""
|
2015-10-18 18:04:05 -06:00
|
|
|
if sched.stopwait != 0 {
|
2016-12-19 20:43:38 -07:00
|
|
|
bad = "stopTheWorld: not stopped (stopwait != 0)"
|
|
|
|
} else {
|
|
|
|
for i := 0; i < int(gomaxprocs); i++ {
|
|
|
|
p := allp[i]
|
|
|
|
if p.status != _Pgcstop {
|
|
|
|
bad = "stopTheWorld: not stopped (status != _Pgcstop)"
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
}
|
2016-12-19 20:43:38 -07:00
|
|
|
if atomic.Load(&freezing) != 0 {
|
|
|
|
// Some other thread is panicking. This can cause the
|
|
|
|
// sanity checks above to fail if the panic happens in
|
|
|
|
// the signal handler on a stopped thread. Either way,
|
|
|
|
// we should halt this thread.
|
|
|
|
lock(&deadlock)
|
|
|
|
lock(&deadlock)
|
|
|
|
}
|
|
|
|
if bad != "" {
|
|
|
|
throw(bad)
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func mhelpgc() {
|
|
|
|
_g_ := getg()
|
|
|
|
_g_.m.helpgc = -1
|
|
|
|
}
|
|
|
|
|
|
|
|
func startTheWorldWithSema() {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
_g_.m.locks++ // disable preemption because it can be holding p in a local var
|
|
|
|
gp := netpoll(false) // non-blocking
|
|
|
|
injectglist(gp)
|
|
|
|
add := needaddgcproc()
|
|
|
|
lock(&sched.lock)
|
|
|
|
|
|
|
|
procs := gomaxprocs
|
|
|
|
if newprocs != 0 {
|
|
|
|
procs = newprocs
|
|
|
|
newprocs = 0
|
|
|
|
}
|
|
|
|
p1 := procresize(procs)
|
|
|
|
sched.gcwaiting = 0
|
|
|
|
if sched.sysmonwait != 0 {
|
|
|
|
sched.sysmonwait = 0
|
|
|
|
notewakeup(&sched.sysmonnote)
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
|
|
|
|
for p1 != nil {
|
|
|
|
p := p1
|
|
|
|
p1 = p1.link.ptr()
|
|
|
|
if p.m != 0 {
|
|
|
|
mp := p.m.ptr()
|
|
|
|
p.m = 0
|
|
|
|
if mp.nextp != 0 {
|
|
|
|
throw("startTheWorld: inconsistent mp->nextp")
|
|
|
|
}
|
|
|
|
mp.nextp.set(p)
|
|
|
|
notewakeup(&mp.park)
|
|
|
|
} else {
|
|
|
|
// Start M to run P. Do not start another M below.
|
|
|
|
newm(nil, p)
|
|
|
|
add = false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wake up an additional proc in case we have excessive runnable goroutines
|
|
|
|
// in local queues or in the global queue. If we don't, the proc will park itself.
|
|
|
|
// If we have lots of excessive work, resetspinning will unpark additional procs as necessary.
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
wakep()
|
|
|
|
}
|
|
|
|
|
|
|
|
if add {
|
|
|
|
// If GC could have used another helper proc, start one now,
|
|
|
|
// in the hope that it will be available next time.
|
|
|
|
// It would have been even better to start it before the collection,
|
|
|
|
// but doing so requires allocating memory, so it's tricky to
|
2016-03-01 16:21:55 -07:00
|
|
|
// coordinate. This lazy approach works out in practice:
|
2015-10-18 18:04:05 -06:00
|
|
|
// we don't mind if the first couple of GC rounds don't have quite
|
|
|
|
// the maximum number of procs.
|
|
|
|
newm(mhelpgc, nil)
|
|
|
|
}
|
|
|
|
_g_.m.locks--
|
|
|
|
if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
|
|
|
|
_g_.stackguard0 = stackPreempt
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Called to start an M.
|
|
|
|
//go:nosplit
|
|
|
|
func mstart() {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if _g_.stack.lo == 0 {
|
|
|
|
// Initialize stack bounds from system stack.
|
|
|
|
// Cgo may have left stack size in stack.hi.
|
|
|
|
size := _g_.stack.hi
|
|
|
|
if size == 0 {
|
2015-11-11 10:39:30 -07:00
|
|
|
size = 8192 * sys.StackGuardMultiplier
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
_g_.stack.hi = uintptr(noescape(unsafe.Pointer(&size)))
|
|
|
|
_g_.stack.lo = _g_.stack.hi - size + 1024
|
|
|
|
}
|
|
|
|
// Initialize stack guards so that we can start calling
|
|
|
|
// both Go and C functions with stack growth prologues.
|
|
|
|
_g_.stackguard0 = _g_.stack.lo + _StackGuard
|
|
|
|
_g_.stackguard1 = _g_.stackguard0
|
|
|
|
mstart1()
|
|
|
|
}
|
|
|
|
|
|
|
|
func mstart1() {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if _g_ != _g_.m.g0 {
|
|
|
|
throw("bad runtime·mstart")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Record top of stack for use by mcall.
|
|
|
|
// Once we call schedule we're never coming back,
|
|
|
|
// so other calls can reuse this stack space.
|
|
|
|
gosave(&_g_.m.g0.sched)
|
|
|
|
_g_.m.g0.sched.pc = ^uintptr(0) // make sure it is never used
|
|
|
|
asminit()
|
|
|
|
minit()
|
|
|
|
|
|
|
|
// Install signal handlers; after minit so that minit can
|
|
|
|
// prepare the thread to be able to handle the signals.
|
|
|
|
if _g_.m == &m0 {
|
|
|
|
// Create an extra M for callbacks on threads not created by Go.
|
|
|
|
if iscgo && !cgoHasExtraM {
|
|
|
|
cgoHasExtraM = true
|
|
|
|
newextram()
|
|
|
|
}
|
2015-12-26 10:51:59 -07:00
|
|
|
initsig(false)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
if fn := _g_.m.mstartfn; fn != nil {
|
|
|
|
fn()
|
|
|
|
}
|
|
|
|
|
|
|
|
if _g_.m.helpgc != 0 {
|
|
|
|
_g_.m.helpgc = 0
|
|
|
|
stopm()
|
|
|
|
} else if _g_.m != &m0 {
|
|
|
|
acquirep(_g_.m.nextp.ptr())
|
|
|
|
_g_.m.nextp = 0
|
|
|
|
}
|
|
|
|
schedule()
|
|
|
|
}
|
|
|
|
|
|
|
|
// forEachP calls fn(p) for every P p when p reaches a GC safe point.
|
|
|
|
// If a P is currently executing code, this will bring the P to a GC
|
|
|
|
// safe point and execute fn on that P. If the P is not executing code
|
|
|
|
// (it is idle or in a syscall), this will call fn(p) directly while
|
|
|
|
// preventing the P from exiting its state. This does not ensure that
|
|
|
|
// fn will run on every CPU executing Go code, but it acts as a global
|
|
|
|
// memory barrier. GC uses this as a "ragged barrier."
|
|
|
|
//
|
|
|
|
// The caller must hold worldsema.
|
2015-10-26 09:27:37 -06:00
|
|
|
//
|
|
|
|
//go:systemstack
|
2015-10-18 18:04:05 -06:00
|
|
|
func forEachP(fn func(*p)) {
|
|
|
|
mp := acquirem()
|
|
|
|
_p_ := getg().m.p.ptr()
|
|
|
|
|
|
|
|
lock(&sched.lock)
|
|
|
|
if sched.safePointWait != 0 {
|
|
|
|
throw("forEachP: sched.safePointWait != 0")
|
|
|
|
}
|
|
|
|
sched.safePointWait = gomaxprocs - 1
|
|
|
|
sched.safePointFn = fn
|
|
|
|
|
|
|
|
// Ask all Ps to run the safe point function.
|
|
|
|
for _, p := range allp[:gomaxprocs] {
|
|
|
|
if p != _p_ {
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store(&p.runSafePointFn, 1)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
preemptall()
|
|
|
|
|
|
|
|
// Any P entering _Pidle or _Psyscall from now on will observe
|
|
|
|
// p.runSafePointFn == 1 and will call runSafePointFn when
|
|
|
|
// changing its status to _Pidle/_Psyscall.
|
|
|
|
|
|
|
|
// Run safe point function for all idle Ps. sched.pidle will
|
|
|
|
// not change because we hold sched.lock.
|
|
|
|
for p := sched.pidle.ptr(); p != nil; p = p.link.ptr() {
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Cas(&p.runSafePointFn, 1, 0) {
|
2015-10-18 18:04:05 -06:00
|
|
|
fn(p)
|
|
|
|
sched.safePointWait--
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
wait := sched.safePointWait > 0
|
|
|
|
unlock(&sched.lock)
|
|
|
|
|
|
|
|
// Run fn for the current P.
|
|
|
|
fn(_p_)
|
|
|
|
|
|
|
|
// Force Ps currently in _Psyscall into _Pidle and hand them
|
|
|
|
// off to induce safe point function execution.
|
|
|
|
for i := 0; i < int(gomaxprocs); i++ {
|
|
|
|
p := allp[i]
|
|
|
|
s := p.status
|
2015-11-02 12:09:24 -07:00
|
|
|
if s == _Psyscall && p.runSafePointFn == 1 && atomic.Cas(&p.status, s, _Pidle) {
|
2015-10-18 18:04:05 -06:00
|
|
|
if trace.enabled {
|
|
|
|
traceGoSysBlock(p)
|
|
|
|
traceProcStop(p)
|
|
|
|
}
|
|
|
|
p.syscalltick++
|
|
|
|
handoffp(p)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wait for remaining Ps to run fn.
|
|
|
|
if wait {
|
|
|
|
for {
|
|
|
|
// Wait for 100us, then try to re-preempt in
|
|
|
|
// case of any races.
|
2015-10-26 09:27:37 -06:00
|
|
|
//
|
|
|
|
// Requires system stack.
|
2015-10-18 18:04:05 -06:00
|
|
|
if notetsleep(&sched.safePointNote, 100*1000) {
|
|
|
|
noteclear(&sched.safePointNote)
|
|
|
|
break
|
|
|
|
}
|
|
|
|
preemptall()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if sched.safePointWait != 0 {
|
|
|
|
throw("forEachP: not done")
|
|
|
|
}
|
|
|
|
for i := 0; i < int(gomaxprocs); i++ {
|
|
|
|
p := allp[i]
|
|
|
|
if p.runSafePointFn != 0 {
|
|
|
|
throw("forEachP: P did not run fn")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
lock(&sched.lock)
|
|
|
|
sched.safePointFn = nil
|
|
|
|
unlock(&sched.lock)
|
|
|
|
releasem(mp)
|
|
|
|
}
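// exampleForEachP is an illustrative sketch (not part of the original
// source): forEachP used as a ragged barrier that runs a function on every
// P at a GC safe point. The call is wrapped in systemstack because forEachP
// is marked //go:systemstack, and per the doc comment above the caller must
// also hold worldsema. The per-P body here is just a placeholder.
func exampleForEachP() {
	systemstack(func() {
		forEachP(func(pp *p) {
			_ = pp // e.g. flush some per-P cache owned by pp
		})
	})
}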
|
|
|
|
|
|
|
|
// runSafePointFn runs the safe point function, if any, for this P.
|
|
|
|
// This should be called like
|
|
|
|
//
|
|
|
|
// if getg().m.p.runSafePointFn != 0 {
|
|
|
|
// runSafePointFn()
|
|
|
|
// }
|
|
|
|
//
|
|
|
|
// runSafePointFn must be checked on any transition in to _Pidle or
|
|
|
|
// _Psyscall to avoid a race where forEachP sees that the P is running
|
|
|
|
// just before the P goes into _Pidle/_Psyscall and neither forEachP
|
|
|
|
// nor the P run the safe-point function.
|
|
|
|
func runSafePointFn() {
|
|
|
|
p := getg().m.p.ptr()
|
|
|
|
// Resolve the race between forEachP running the safe-point
|
|
|
|
// function on this P's behalf and this P running the
|
|
|
|
// safe-point function directly.
|
2015-11-02 12:09:24 -07:00
|
|
|
if !atomic.Cas(&p.runSafePointFn, 1, 0) {
|
2015-10-18 18:04:05 -06:00
|
|
|
return
|
|
|
|
}
|
|
|
|
sched.safePointFn(p)
|
|
|
|
lock(&sched.lock)
|
|
|
|
sched.safePointWait--
|
|
|
|
if sched.safePointWait == 0 {
|
|
|
|
notewakeup(&sched.safePointNote)
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// When running with cgo, we call _cgo_thread_start
|
|
|
|
// to start threads for us so that we can play nicely with
|
|
|
|
// foreign code.
|
|
|
|
var cgoThreadStart unsafe.Pointer
|
|
|
|
|
|
|
|
type cgothreadstart struct {
|
|
|
|
g guintptr
|
|
|
|
tls *uint64
|
|
|
|
fn unsafe.Pointer
|
|
|
|
}
|
|
|
|
|
|
|
|
// Allocate a new m unassociated with any thread.
|
|
|
|
// Can use p for allocation context if needed.
|
|
|
|
// fn is recorded as the new m's m.mstartfn.
|
2015-11-17 15:28:35 -07:00
|
|
|
//
|
2016-10-10 14:46:28 -06:00
|
|
|
// This function is allowed to have write barriers even if the caller
|
|
|
|
// isn't because it borrows _p_.
|
|
|
|
//
|
|
|
|
//go:yeswritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func allocm(_p_ *p, fn func()) *m {
|
|
|
|
_g_ := getg()
|
|
|
|
_g_.m.locks++ // disable GC because it can be called from sysmon
|
|
|
|
if _g_.m.p == 0 {
|
|
|
|
acquirep(_p_) // temporarily borrow p for mallocs in this function
|
|
|
|
}
|
|
|
|
mp := new(m)
|
|
|
|
mp.mstartfn = fn
|
|
|
|
mcommoninit(mp)
|
|
|
|
|
|
|
|
// In case of cgo or Solaris, pthread_create will make us a stack.
|
|
|
|
// Windows and Plan 9 will lay out the sched stack on the OS stack.
|
|
|
|
if iscgo || GOOS == "solaris" || GOOS == "windows" || GOOS == "plan9" {
|
|
|
|
mp.g0 = malg(-1)
|
|
|
|
} else {
|
2015-11-11 10:39:30 -07:00
|
|
|
mp.g0 = malg(8192 * sys.StackGuardMultiplier)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
mp.g0.m = mp
|
|
|
|
|
|
|
|
if _p_ == _g_.m.p.ptr() {
|
|
|
|
releasep()
|
|
|
|
}
|
|
|
|
_g_.m.locks--
|
|
|
|
if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
|
|
|
|
_g_.stackguard0 = stackPreempt
|
|
|
|
}
|
|
|
|
|
|
|
|
return mp
|
|
|
|
}
|
|
|
|
|
|
|
|
// needm is called when a cgo callback happens on a
|
|
|
|
// thread without an m (a thread not created by Go).
|
|
|
|
// In this case, needm is expected to find an m to use
|
|
|
|
// and return with m, g initialized correctly.
|
|
|
|
// Since m and g are not set now (likely nil, but see below)
|
|
|
|
// needm is limited in what routines it can call. In particular
|
|
|
|
// it can only call nosplit functions (textflag 7) and cannot
|
|
|
|
// do any scheduling that requires an m.
|
|
|
|
//
|
|
|
|
// In order to avoid needing heavy lifting here, we adopt
|
|
|
|
// the following strategy: there is a stack of available m's
|
|
|
|
// that can be stolen. Using compare-and-swap
|
|
|
|
// to pop from the stack has ABA races, so we simulate
|
|
|
|
// a lock by doing an exchange (via casp) to steal the stack
|
|
|
|
// head and replace the top pointer with MLOCKED (1).
|
|
|
|
// This serves as a simple spin lock that we can use even
|
|
|
|
// without an m. The thread that locks the stack in this way
|
|
|
|
// unlocks the stack by storing a valid stack head pointer.
|
|
|
|
//
|
|
|
|
// In order to make sure that there is always an m structure
|
|
|
|
// available to be stolen, we maintain the invariant that there
|
|
|
|
// is always one more than needed. At the beginning of the
|
|
|
|
// program (if cgo is in use) the list is seeded with a single m.
|
|
|
|
// If needm finds that it has taken the last m off the list, its job
|
|
|
|
// is - once it has installed its own m so that it can do things like
|
|
|
|
// allocate memory - to create a spare m and put it on the list.
|
|
|
|
//
|
|
|
|
// Each of these extra m's also has a g0 and a curg that are
|
|
|
|
// pressed into service as the scheduling stack and current
|
|
|
|
// goroutine for the duration of the cgo callback.
|
|
|
|
//
|
|
|
|
// When the callback is done with the m, it calls dropm to
|
|
|
|
// put the m back on the list.
|
|
|
|
//go:nosplit
|
|
|
|
func needm(x byte) {
|
|
|
|
if iscgo && !cgoHasExtraM {
|
|
|
|
// Can happen if C/C++ code calls Go from a global ctor.
|
|
|
|
// Cannot throw, because the scheduler is not initialized yet.
|
|
|
|
write(2, unsafe.Pointer(&earlycgocallback[0]), int32(len(earlycgocallback)))
|
|
|
|
exit(1)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Lock extra list, take head, unlock popped list.
|
|
|
|
// nilokay=false is safe here because of the invariant above,
|
|
|
|
// that the extra list always contains or will soon contain
|
|
|
|
// at least one m.
|
|
|
|
mp := lockextra(false)
|
|
|
|
|
|
|
|
// Set needextram when we've just emptied the list,
|
|
|
|
// so that the eventual call into cgocallbackg will
|
|
|
|
// allocate a new m for the extra list. We delay the
|
|
|
|
// allocation until then so that it can be done
|
|
|
|
// after exitsyscall makes sure it is okay to be
|
|
|
|
// running at all (that is, there's no garbage collection
|
|
|
|
// running right now).
|
|
|
|
mp.needextram = mp.schedlink == 0
|
|
|
|
unlockextra(mp.schedlink.ptr())
|
|
|
|
|
2015-11-13 14:21:01 -07:00
|
|
|
// Save and block signals before installing g.
|
|
|
|
// Once g is installed, any incoming signals will try to execute,
|
|
|
|
// but we won't have the sigaltstack settings and other data
|
|
|
|
// set up appropriately until the end of minit, which will
|
|
|
|
// unblock the signals. This is the same dance as when
|
|
|
|
// starting a new m to run Go code via newosproc.
|
|
|
|
msigsave(mp)
|
|
|
|
sigblock()
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
// Install g (= m->g0) and set the stack bounds
|
|
|
|
// to match the current stack. We don't actually know
|
|
|
|
// how big the stack is, like we don't know how big any
|
|
|
|
// scheduling stack is, but we assume there's at least 32 kB,
|
|
|
|
// which is more than enough for us.
|
|
|
|
setg(mp.g0)
|
|
|
|
_g_ := getg()
|
|
|
|
_g_.stack.hi = uintptr(noescape(unsafe.Pointer(&x))) + 1024
|
|
|
|
_g_.stack.lo = uintptr(noescape(unsafe.Pointer(&x))) - 32*1024
|
|
|
|
_g_.stackguard0 = _g_.stack.lo + _StackGuard
|
|
|
|
|
|
|
|
// Initialize this thread to use the m.
|
|
|
|
asminit()
|
|
|
|
minit()
|
|
|
|
}
|
|
|
|
|
|
|
|
var earlycgocallback = []byte("fatal error: cgo callback before cgo call\n")
|
|
|
|
|
2016-07-19 00:00:43 -06:00
|
|
|
// newextram allocates m's and puts them on the extra list.
|
2015-10-18 18:04:05 -06:00
|
|
|
// It is called with a working local m, so that it can do things
|
|
|
|
// like call schedlock and allocate.
|
|
|
|
func newextram() {
|
2016-07-19 00:00:43 -06:00
|
|
|
c := atomic.Xchg(&extraMWaiters, 0)
|
|
|
|
if c > 0 {
|
|
|
|
for i := uint32(0); i < c; i++ {
|
|
|
|
oneNewExtraM()
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Make sure there is at least one extra M.
|
|
|
|
mp := lockextra(true)
|
|
|
|
unlockextra(mp)
|
|
|
|
if mp == nil {
|
|
|
|
oneNewExtraM()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// oneNewExtraM allocates an m and puts it on the extra list.
|
|
|
|
func oneNewExtraM() {
|
2015-10-18 18:04:05 -06:00
|
|
|
// Create extra goroutine locked to extra m.
|
|
|
|
// The goroutine is the context in which the cgo callback will run.
|
|
|
|
// The sched.pc will never be returned to, but setting it to
|
|
|
|
// goexit makes clear to the traceback routines where
|
|
|
|
// the goroutine stack ends.
|
|
|
|
mp := allocm(nil, nil)
|
|
|
|
gp := malg(4096)
|
2015-11-11 10:39:30 -07:00
|
|
|
gp.sched.pc = funcPC(goexit) + sys.PCQuantum
|
2015-10-18 18:04:05 -06:00
|
|
|
gp.sched.sp = gp.stack.hi
|
2015-11-11 10:39:30 -07:00
|
|
|
gp.sched.sp -= 4 * sys.RegSize // extra space in case of reads slightly beyond frame
|
2015-10-18 18:04:05 -06:00
|
|
|
gp.sched.lr = 0
|
|
|
|
gp.sched.g = guintptr(unsafe.Pointer(gp))
|
|
|
|
gp.syscallpc = gp.sched.pc
|
|
|
|
gp.syscallsp = gp.sched.sp
|
|
|
|
gp.stktopsp = gp.sched.sp
|
2016-03-04 09:58:26 -07:00
|
|
|
gp.gcscanvalid = true // fresh G, so no dequeueRescan necessary
|
2016-10-16 19:22:02 -06:00
|
|
|
gp.gcscandone = true
|
2016-03-04 09:58:26 -07:00
|
|
|
gp.gcRescan = -1
|
2015-10-18 18:04:05 -06:00
|
|
|
// malg returns status as Gidle, change to Gsyscall before adding to allg
|
|
|
|
// where GC will see it.
|
|
|
|
casgstatus(gp, _Gidle, _Gsyscall)
|
|
|
|
gp.m = mp
|
|
|
|
mp.curg = gp
|
|
|
|
mp.locked = _LockInternal
|
|
|
|
mp.lockedg = gp
|
|
|
|
gp.lockedm = mp
|
2015-11-02 12:09:24 -07:00
|
|
|
gp.goid = int64(atomic.Xadd64(&sched.goidgen, 1))
|
2015-10-18 18:04:05 -06:00
|
|
|
if raceenabled {
|
2016-09-24 08:59:23 -06:00
|
|
|
gp.racectx = racegostart(funcPC(newextram) + sys.PCQuantum)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
// put on allg for garbage collector
|
|
|
|
allgadd(gp)
|
|
|
|
|
|
|
|
// Add m to the extra list.
|
|
|
|
mnext := lockextra(true)
|
|
|
|
mp.schedlink.set(mnext)
|
|
|
|
unlockextra(mp)
|
|
|
|
}
|
|
|
|
|
|
|
|
// dropm is called when a cgo callback has called needm but is now
|
|
|
|
// done with the callback and returning back into the non-Go thread.
|
|
|
|
// It puts the current m back onto the extra list.
|
|
|
|
//
|
|
|
|
// The main expense here is the call to signalstack to release the
|
|
|
|
// m's signal stack, and then the call to needm on the next callback
|
|
|
|
// from this thread. It is tempting to try to save the m for next time,
|
|
|
|
// which would eliminate both these costs, but there might not be
|
|
|
|
// a next time: the current thread (which Go does not control) might exit.
|
|
|
|
// If we saved the m for that thread, there would be an m leak each time
|
|
|
|
// such a thread exited. Instead, we acquire and release an m on each
|
|
|
|
// call. These should typically not be scheduling operations, just a few
|
|
|
|
// atomics, so the cost should be small.
|
|
|
|
//
|
|
|
|
// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
|
|
|
|
// variable using pthread_key_create. Unlike the pthread keys we already use
|
|
|
|
// on OS X, this dummy key would never be read by Go code. It would exist
|
|
|
|
// only so that we could register a thread-exit-time destructor.
|
|
|
|
// That destructor would put the m back onto the extra list.
|
|
|
|
// This is purely a performance optimization. The current version,
|
|
|
|
// in which dropm happens on each cgo call, is still correct too.
|
|
|
|
// We may have to keep the current version on systems with cgo
|
|
|
|
// but without pthreads, like Windows.
|
|
|
|
func dropm() {
|
|
|
|
// Clear m and g, and return m to the extra list.
|
|
|
|
// After the call to setg we can only call nosplit functions
|
|
|
|
// with no pointer manipulation.
|
|
|
|
mp := getg().m
|
|
|
|
|
2015-11-13 14:21:01 -07:00
|
|
|
// Block signals before unminit.
|
|
|
|
// Unminit unregisters the signal handling stack (but needs g on some systems).
|
|
|
|
// Setg(nil) clears g, which is the signal handler's cue not to run Go handlers.
|
|
|
|
// It's important not to try to handle a signal between those two steps.
|
2016-01-12 16:34:03 -07:00
|
|
|
sigmask := mp.sigmask
|
2015-11-13 14:21:01 -07:00
|
|
|
sigblock()
|
|
|
|
unminit()
|
2016-01-12 16:34:03 -07:00
|
|
|
|
|
|
|
mnext := lockextra(true)
|
|
|
|
mp.schedlink.set(mnext)
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
setg(nil)
|
2015-11-13 14:21:01 -07:00
|
|
|
|
|
|
|
// Commit the release of mp.
|
2015-10-18 18:04:05 -06:00
|
|
|
unlockextra(mp)
|
2016-01-12 16:34:03 -07:00
|
|
|
|
|
|
|
msigrestore(sigmask)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
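// exampleCgoCallbackPairing is an illustrative sketch (not part of the
// original source): the needm/dropm pairing around a cgo callback on a
// thread not created by Go. The real entry points are in assembly and in
// cgocallbackg; this only shows the ordering, not how the runtime is
// actually entered.
func exampleCgoCallbackPairing(x byte) {
	needm(x) // borrow an extra M (its g0 and curg) for this foreign thread
	// ... the Go callback body runs here on the borrowed m ...
	dropm() // return the M to the extra list before returning to C
}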
|
|
|
|
|
2016-01-08 17:56:02 -07:00
|
|
|
// A helper function for EnsureDropM.
|
|
|
|
func getm() uintptr {
|
|
|
|
return uintptr(unsafe.Pointer(getg().m))
|
|
|
|
}
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
var extram uintptr
|
2016-07-19 00:00:43 -06:00
|
|
|
var extraMWaiters uint32
|
2015-10-18 18:04:05 -06:00
|
|
|
|
|
|
|
// lockextra locks the extra list and returns the list head.
|
|
|
|
// The caller must unlock the list by storing a new list head
|
|
|
|
// to extram. If nilokay is true, then lockextra will
|
|
|
|
// return a nil list head if that's what it finds. If nilokay is false,
|
|
|
|
// lockextra will keep waiting until the list head is no longer nil.
|
|
|
|
//go:nosplit
|
|
|
|
func lockextra(nilokay bool) *m {
|
|
|
|
const locked = 1
|
|
|
|
|
2016-07-19 00:00:43 -06:00
|
|
|
incr := false
|
2015-10-18 18:04:05 -06:00
|
|
|
for {
|
2015-11-02 12:09:24 -07:00
|
|
|
old := atomic.Loaduintptr(&extram)
|
2015-10-18 18:04:05 -06:00
|
|
|
if old == locked {
|
|
|
|
yield := osyield
|
|
|
|
yield()
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if old == 0 && !nilokay {
|
2016-07-19 00:00:43 -06:00
|
|
|
if !incr {
|
|
|
|
// Add 1 to the number of threads
|
|
|
|
// waiting for an M.
|
|
|
|
// This is cleared by newextram.
|
|
|
|
atomic.Xadd(&extraMWaiters, 1)
|
|
|
|
incr = true
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
usleep(1)
|
|
|
|
continue
|
|
|
|
}
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Casuintptr(&extram, old, locked) {
|
2015-10-18 18:04:05 -06:00
|
|
|
return (*m)(unsafe.Pointer(old))
|
|
|
|
}
|
|
|
|
yield := osyield
|
|
|
|
yield()
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
//go:nosplit
|
|
|
|
func unlockextra(mp *m) {
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Storeuintptr(&extram, uintptr(unsafe.Pointer(mp)))
|
2015-10-18 18:04:05 -06:00
|
|
|
}
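// examplePushExtraM is an illustrative sketch (not part of the original
// source): pushing an M onto the extra list uses the same lock-by-exchange
// protocol, mirroring the tail of oneNewExtraM and dropm above.
func examplePushExtraM(mp *m) {
	mnext := lockextra(true) // take the list; a nil head is fine when pushing
	mp.schedlink.set(mnext)  // link the new M in front of the old head
	unlockextra(mp)          // publish the new head, releasing the "lock"
}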
|
|
|
|
|
2016-03-01 16:21:55 -07:00
|
|
|
// Create a new m. It will start off with a call to fn, or else the scheduler.
|
2015-10-18 18:04:05 -06:00
|
|
|
// fn needs to be static and not a heap allocated closure.
|
|
|
|
// May run with m.p==nil, so write barriers are not allowed.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func newm(fn func(), _p_ *p) {
|
|
|
|
mp := allocm(_p_, fn)
|
|
|
|
mp.nextp.set(_p_)
|
2015-12-19 11:17:10 -07:00
|
|
|
mp.sigmask = initSigmask
|
2015-10-18 18:04:05 -06:00
|
|
|
if iscgo {
|
|
|
|
var ts cgothreadstart
|
|
|
|
if _cgo_thread_start == nil {
|
|
|
|
throw("_cgo_thread_start missing")
|
|
|
|
}
|
|
|
|
ts.g.set(mp.g0)
|
|
|
|
ts.tls = (*uint64)(unsafe.Pointer(&mp.tls[0]))
|
|
|
|
ts.fn = unsafe.Pointer(funcPC(mstart))
|
2016-01-05 15:06:58 -07:00
|
|
|
if msanenabled {
|
|
|
|
msanwrite(unsafe.Pointer(&ts), unsafe.Sizeof(ts))
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
asmcgocall(_cgo_thread_start, unsafe.Pointer(&ts))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
newosproc(mp, unsafe.Pointer(mp.g0.stack.hi))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stops execution of the current m until new work is available.
|
|
|
|
// Returns with acquired P.
|
|
|
|
func stopm() {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if _g_.m.locks != 0 {
|
|
|
|
throw("stopm holding locks")
|
|
|
|
}
|
|
|
|
if _g_.m.p != 0 {
|
|
|
|
throw("stopm holding p")
|
|
|
|
}
|
|
|
|
if _g_.m.spinning {
|
2015-12-08 07:11:27 -07:00
|
|
|
throw("stopm spinning")
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
retry:
|
|
|
|
lock(&sched.lock)
|
|
|
|
mput(_g_.m)
|
|
|
|
unlock(&sched.lock)
|
|
|
|
notesleep(&_g_.m.park)
|
|
|
|
noteclear(&_g_.m.park)
|
|
|
|
if _g_.m.helpgc != 0 {
|
|
|
|
gchelper()
|
|
|
|
_g_.m.helpgc = 0
|
|
|
|
_g_.m.mcache = nil
|
|
|
|
_g_.m.p = 0
|
|
|
|
goto retry
|
|
|
|
}
|
|
|
|
acquirep(_g_.m.nextp.ptr())
|
|
|
|
_g_.m.nextp = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
func mspinning() {
|
2015-12-08 07:11:27 -07:00
|
|
|
// startm's caller incremented nmspinning. Set the new M's spinning.
|
|
|
|
getg().m.spinning = true
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
// Schedules some M to run the p (creates an M if necessary).
|
|
|
|
// If p==nil, tries to get an idle P; if there are no idle P's, it does nothing.
|
|
|
|
// May run with m.p==nil, so write barriers are not allowed.
|
2015-12-08 07:11:27 -07:00
|
|
|
// If spinning is set, the caller has incremented nmspinning and startm will
|
|
|
|
// either decrement nmspinning or set m.spinning in the newly started M.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func startm(_p_ *p, spinning bool) {
|
|
|
|
lock(&sched.lock)
|
|
|
|
if _p_ == nil {
|
|
|
|
_p_ = pidleget()
|
|
|
|
if _p_ == nil {
|
|
|
|
unlock(&sched.lock)
|
|
|
|
if spinning {
|
2015-12-08 07:11:27 -07:00
|
|
|
// The caller incremented nmspinning, but there are no idle Ps,
|
|
|
|
// so it's okay to just undo the increment and give up.
|
|
|
|
if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
|
|
|
|
throw("startm: negative nmspinning")
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mp := mget()
|
|
|
|
unlock(&sched.lock)
|
|
|
|
if mp == nil {
|
|
|
|
var fn func()
|
|
|
|
if spinning {
|
2015-12-08 07:11:27 -07:00
|
|
|
// The caller incremented nmspinning, so set m.spinning in the new M.
|
2015-10-18 18:04:05 -06:00
|
|
|
fn = mspinning
|
|
|
|
}
|
|
|
|
newm(fn, _p_)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if mp.spinning {
|
|
|
|
throw("startm: m is spinning")
|
|
|
|
}
|
|
|
|
if mp.nextp != 0 {
|
|
|
|
throw("startm: m has p")
|
|
|
|
}
|
|
|
|
if spinning && !runqempty(_p_) {
|
|
|
|
throw("startm: p has runnable gs")
|
|
|
|
}
|
2015-12-08 07:11:27 -07:00
|
|
|
// The caller incremented nmspinning, so set m.spinning in the new M.
|
2015-10-18 18:04:05 -06:00
|
|
|
mp.spinning = spinning
|
|
|
|
mp.nextp.set(_p_)
|
|
|
|
notewakeup(&mp.park)
|
|
|
|
}
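// exampleWakeSpinning is an illustrative sketch (not part of the original
// source): the nmspinning contract for startm with spinning=true. The
// caller increments nmspinning first (here via a 0->1 CAS, as wakep and
// handoffp do); startm then either marks the chosen M as spinning or undoes
// the increment itself if no idle P is available.
func exampleWakeSpinning() {
	if atomic.Cas(&sched.nmspinning, 0, 1) {
		startm(nil, true)
	}
}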
|
|
|
|
|
|
|
|
// Hands off P from syscall or locked M.
|
|
|
|
// Always runs without a P, so write barriers are not allowed.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func handoffp(_p_ *p) {
|
2016-02-01 12:06:51 -07:00
|
|
|
// handoffp must start an M in any situation where
|
|
|
|
// findrunnable would return a G to run on _p_.
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
// if it has local work, start it straight away
|
|
|
|
if !runqempty(_p_) || sched.runqsize != 0 {
|
|
|
|
startm(_p_, false)
|
|
|
|
return
|
|
|
|
}
|
2016-02-01 12:06:51 -07:00
|
|
|
// if it has GC work, start it straight away
|
|
|
|
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) {
|
|
|
|
startm(_p_, false)
|
|
|
|
return
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
// no local work, check that there are no spinning/idle M's,
|
|
|
|
// otherwise our help is not required
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) == 0 && atomic.Cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic
|
2015-10-18 18:04:05 -06:00
|
|
|
startm(_p_, true)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
lock(&sched.lock)
|
|
|
|
if sched.gcwaiting != 0 {
|
|
|
|
_p_.status = _Pgcstop
|
|
|
|
sched.stopwait--
|
|
|
|
if sched.stopwait == 0 {
|
|
|
|
notewakeup(&sched.stopnote)
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
return
|
|
|
|
}
|
2015-11-02 12:09:24 -07:00
|
|
|
if _p_.runSafePointFn != 0 && atomic.Cas(&_p_.runSafePointFn, 1, 0) {
|
2015-10-18 18:04:05 -06:00
|
|
|
sched.safePointFn(_p_)
|
|
|
|
sched.safePointWait--
|
|
|
|
if sched.safePointWait == 0 {
|
|
|
|
notewakeup(&sched.safePointNote)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if sched.runqsize != 0 {
|
|
|
|
unlock(&sched.lock)
|
|
|
|
startm(_p_, false)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// If this is the last running P and nobody is polling network,
|
|
|
|
// we need to wake up another M to poll the network.
|
2015-11-02 12:09:24 -07:00
|
|
|
if sched.npidle == uint32(gomaxprocs-1) && atomic.Load64(&sched.lastpoll) != 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
unlock(&sched.lock)
|
|
|
|
startm(_p_, false)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
pidleput(_p_)
|
|
|
|
unlock(&sched.lock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tries to add one more P to execute G's.
|
|
|
|
// Called when a G is made runnable (newproc, ready).
|
|
|
|
func wakep() {
|
|
|
|
// be conservative about spinning threads
|
2015-11-02 12:09:24 -07:00
|
|
|
if !atomic.Cas(&sched.nmspinning, 0, 1) {
|
2015-10-18 18:04:05 -06:00
|
|
|
return
|
|
|
|
}
|
|
|
|
startm(nil, true)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stops execution of the current m that is locked to a g until the g is runnable again.
|
|
|
|
// Returns with acquired P.
|
|
|
|
func stoplockedm() {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if _g_.m.lockedg == nil || _g_.m.lockedg.lockedm != _g_.m {
|
|
|
|
throw("stoplockedm: inconsistent locking")
|
|
|
|
}
|
|
|
|
if _g_.m.p != 0 {
|
|
|
|
// Schedule another M to run this p.
|
|
|
|
_p_ := releasep()
|
|
|
|
handoffp(_p_)
|
|
|
|
}
|
|
|
|
incidlelocked(1)
|
|
|
|
// Wait until another thread schedules lockedg again.
|
|
|
|
notesleep(&_g_.m.park)
|
|
|
|
noteclear(&_g_.m.park)
|
|
|
|
status := readgstatus(_g_.m.lockedg)
|
|
|
|
if status&^_Gscan != _Grunnable {
|
|
|
|
print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n")
|
|
|
|
dumpgstatus(_g_)
|
|
|
|
throw("stoplockedm: not runnable")
|
|
|
|
}
|
|
|
|
acquirep(_g_.m.nextp.ptr())
|
|
|
|
_g_.m.nextp = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// Schedules the locked m to run the locked gp.
|
|
|
|
// May run during STW, so write barriers are not allowed.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func startlockedm(gp *g) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
mp := gp.lockedm
|
|
|
|
if mp == _g_.m {
|
|
|
|
throw("startlockedm: locked to me")
|
|
|
|
}
|
|
|
|
if mp.nextp != 0 {
|
|
|
|
throw("startlockedm: m has p")
|
|
|
|
}
|
|
|
|
// directly handoff current P to the locked m
|
|
|
|
incidlelocked(-1)
|
|
|
|
_p_ := releasep()
|
|
|
|
mp.nextp.set(_p_)
|
|
|
|
notewakeup(&mp.park)
|
|
|
|
stopm()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stops the current m for stopTheWorld.
|
|
|
|
// Returns when the world is restarted.
|
|
|
|
func gcstopm() {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if sched.gcwaiting == 0 {
|
|
|
|
throw("gcstopm: not waiting for gc")
|
|
|
|
}
|
|
|
|
if _g_.m.spinning {
|
|
|
|
_g_.m.spinning = false
|
2015-12-08 07:11:27 -07:00
|
|
|
// OK to just drop nmspinning here,
|
|
|
|
// startTheWorld will unpark threads as necessary.
|
|
|
|
if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
|
|
|
|
throw("gcstopm: negative nmspinning")
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
_p_ := releasep()
|
|
|
|
lock(&sched.lock)
|
|
|
|
_p_.status = _Pgcstop
|
|
|
|
sched.stopwait--
|
|
|
|
if sched.stopwait == 0 {
|
|
|
|
notewakeup(&sched.stopnote)
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
stopm()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Schedules gp to run on the current M.
|
|
|
|
// If inheritTime is true, gp inherits the remaining time in the
|
|
|
|
// current time slice. Otherwise, it starts a new time slice.
|
|
|
|
// Never returns.
|
2016-10-10 15:14:14 -06:00
|
|
|
//
|
|
|
|
// Write barriers are allowed because this is called immediately after
|
|
|
|
// acquiring a P in several places.
|
|
|
|
//
|
|
|
|
//go:yeswritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func execute(gp *g, inheritTime bool) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
casgstatus(gp, _Grunnable, _Grunning)
|
|
|
|
gp.waitsince = 0
|
|
|
|
gp.preempt = false
|
|
|
|
gp.stackguard0 = gp.stack.lo + _StackGuard
|
|
|
|
if !inheritTime {
|
|
|
|
_g_.m.p.ptr().schedtick++
|
|
|
|
}
|
|
|
|
_g_.m.curg = gp
|
|
|
|
gp.m = _g_.m
|
|
|
|
|
|
|
|
// Check whether the profiler needs to be turned on or off.
|
|
|
|
hz := sched.profilehz
|
|
|
|
if _g_.m.profilehz != hz {
|
|
|
|
resetcpuprofiler(hz)
|
|
|
|
}
|
|
|
|
|
|
|
|
if trace.enabled {
|
|
|
|
// GoSysExit has to happen when we have a P, but before GoStart.
|
|
|
|
// So we emit it here.
|
|
|
|
if gp.syscallsp != 0 && gp.sysblocktraced {
|
2016-04-05 07:29:14 -06:00
|
|
|
traceGoSysExit(gp.sysexitticks)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
traceGoStart()
|
|
|
|
}
|
|
|
|
|
|
|
|
gogo(&gp.sched)
|
|
|
|
}
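// exampleScheduleStep is an illustrative sketch (not part of the original
// source): the core shape of one scheduling step as performed by schedule
// below; find a runnable G (findrunnable is defined next) and hand it to
// execute, which gogo's into the G and never returns.
func exampleScheduleStep() {
	gp, inheritTime := findrunnable() // blocks until some work is available
	execute(gp, inheritTime)          // never returns
}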
|
|
|
|
|
|
|
|
// Finds a runnable goroutine to execute.
|
|
|
|
// Tries to steal from other P's, get g from global queue, poll network.
|
|
|
|
func findrunnable() (gp *g, inheritTime bool) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
2016-02-01 12:06:51 -07:00
|
|
|
// The conditions here and in handoffp must agree: if
|
|
|
|
// findrunnable would return a G to run, handoffp must start
|
|
|
|
// an M.
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
top:
|
2016-03-18 05:52:52 -06:00
|
|
|
_p_ := _g_.m.p.ptr()
|
2015-10-18 18:04:05 -06:00
|
|
|
if sched.gcwaiting != 0 {
|
|
|
|
gcstopm()
|
|
|
|
goto top
|
|
|
|
}
|
2016-03-18 05:52:52 -06:00
|
|
|
if _p_.runSafePointFn != 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
runSafePointFn()
|
|
|
|
}
|
|
|
|
if fingwait && fingwake {
|
|
|
|
if gp := wakefing(); gp != nil {
|
2016-05-17 16:21:54 -06:00
|
|
|
ready(gp, 0, true)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// local runq
|
2016-03-18 05:52:52 -06:00
|
|
|
if gp, inheritTime := runqget(_p_); gp != nil {
|
2015-10-18 18:04:05 -06:00
|
|
|
return gp, inheritTime
|
|
|
|
}
|
|
|
|
|
|
|
|
// global runq
|
|
|
|
if sched.runqsize != 0 {
|
|
|
|
lock(&sched.lock)
|
2016-03-18 05:52:52 -06:00
|
|
|
gp := globrunqget(_p_, 0)
|
2015-10-18 18:04:05 -06:00
|
|
|
unlock(&sched.lock)
|
|
|
|
if gp != nil {
|
|
|
|
return gp, false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Poll network.
|
|
|
|
// This netpoll is only an optimization before we resort to stealing.
|
|
|
|
// We can safely skip it if there is a thread blocked in netpoll already.
|
|
|
|
// If there is any kind of logical race with that blocked thread
|
|
|
|
// (e.g. it has already returned from netpoll, but does not set lastpoll yet),
|
|
|
|
// this thread will do blocking netpoll below anyway.
|
|
|
|
if netpollinited() && sched.lastpoll != 0 {
|
|
|
|
if gp := netpoll(false); gp != nil { // non-blocking
|
|
|
|
// netpoll returns list of goroutines linked by schedlink.
|
|
|
|
injectglist(gp.schedlink.ptr())
|
|
|
|
casgstatus(gp, _Gwaiting, _Grunnable)
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoUnpark(gp, 0)
|
|
|
|
}
|
|
|
|
return gp, false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-03-18 05:52:52 -06:00
|
|
|
// Steal work from other P's.
|
|
|
|
procs := uint32(gomaxprocs)
|
|
|
|
if atomic.Load(&sched.npidle) == procs-1 {
|
|
|
|
// Either GOMAXPROCS=1 or everybody, except for us, is idle already.
|
|
|
|
// New work can appear from returning syscall/cgocall, network or timers.
|
|
|
|
// None of that submits to local run queues, so there is no point in stealing.
|
|
|
|
goto stop
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
// If number of spinning M's >= number of busy P's, block.
|
|
|
|
// This is necessary to prevent excessive CPU consumption
|
|
|
|
// when GOMAXPROCS>>1 but the program parallelism is low.
|
2016-08-30 10:29:16 -06:00
|
|
|
if !_g_.m.spinning && 2*atomic.Load(&sched.nmspinning) >= procs-atomic.Load(&sched.npidle) {
|
2015-10-18 18:04:05 -06:00
|
|
|
goto stop
|
|
|
|
}
|
|
|
|
if !_g_.m.spinning {
|
|
|
|
_g_.m.spinning = true
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xadd(&sched.nmspinning, 1)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
2016-03-18 05:52:52 -06:00
|
|
|
for i := 0; i < 4; i++ {
|
2016-06-28 10:22:46 -06:00
|
|
|
for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
|
2016-03-18 05:52:52 -06:00
|
|
|
if sched.gcwaiting != 0 {
|
|
|
|
goto top
|
|
|
|
}
|
|
|
|
stealRunNextG := i > 2 // first look for ready queues with more than 1 g
|
|
|
|
if gp := runqsteal(_p_, allp[enum.position()], stealRunNextG); gp != nil {
|
|
|
|
return gp, false
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
stop:
|
|
|
|
|
2015-11-03 17:47:07 -07:00
|
|
|
// We have nothing to do. If we're in the GC mark phase, can
|
2015-10-26 09:27:37 -06:00
|
|
|
// safely scan and blacken objects, and have work to do, run
|
|
|
|
// idle-time marking rather than give up the P.
|
2016-03-18 05:52:52 -06:00
|
|
|
if gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != 0 && gcMarkWorkAvailable(_p_) {
|
2015-10-18 18:04:05 -06:00
|
|
|
_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
|
2016-01-26 12:44:58 -07:00
|
|
|
gp := _p_.gcBgMarkWorker.ptr()
|
2015-10-18 18:04:05 -06:00
|
|
|
casgstatus(gp, _Gwaiting, _Grunnable)
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoUnpark(gp, 0)
|
|
|
|
}
|
|
|
|
return gp, false
|
|
|
|
}
|
|
|
|
|
|
|
|
// return P and block
|
|
|
|
lock(&sched.lock)
|
2016-03-18 05:52:52 -06:00
|
|
|
if sched.gcwaiting != 0 || _p_.runSafePointFn != 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
unlock(&sched.lock)
|
|
|
|
goto top
|
|
|
|
}
|
|
|
|
if sched.runqsize != 0 {
|
2016-03-18 05:52:52 -06:00
|
|
|
gp := globrunqget(_p_, 0)
|
2015-10-18 18:04:05 -06:00
|
|
|
unlock(&sched.lock)
|
|
|
|
return gp, false
|
|
|
|
}
|
2016-03-18 05:52:52 -06:00
|
|
|
if releasep() != _p_ {
|
|
|
|
throw("findrunnable: wrong p")
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
pidleput(_p_)
|
|
|
|
unlock(&sched.lock)
|
2015-12-08 07:11:27 -07:00
|
|
|
|
|
|
|
// Delicate dance: thread transitions from spinning to non-spinning state,
|
|
|
|
// potentially concurrently with submission of new goroutines. We must
|
|
|
|
// drop nmspinning first and then check all per-P queues again (with
|
|
|
|
// #StoreLoad memory barrier in between). If we do it the other way around,
|
|
|
|
// another thread can submit a goroutine after we've checked all run queues
|
|
|
|
// but before we drop nmspinning; as the result nobody will unpark a thread
|
|
|
|
// to run the goroutine.
|
|
|
|
// If we discover new work below, we need to restore m.spinning as a signal
|
|
|
|
// for resetspinning to unpark a new worker thread (because there can be more
|
|
|
|
// than one starving goroutine). However, if after discovering new work
|
|
|
|
// we also observe no idle Ps, it is OK to just park the current thread:
|
|
|
|
// the system is fully loaded so no spinning threads are required.
|
|
|
|
// Also see "Worker thread parking/unparking" comment at the top of the file.
|
|
|
|
wasSpinning := _g_.m.spinning
|
2015-10-18 18:04:05 -06:00
|
|
|
if _g_.m.spinning {
|
|
|
|
_g_.m.spinning = false
|
2015-12-08 07:11:27 -07:00
|
|
|
if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
|
|
|
|
throw("findrunnable: negative nmspinning")
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
// check all runqueues once again
|
|
|
|
for i := 0; i < int(gomaxprocs); i++ {
|
|
|
|
_p_ := allp[i]
|
|
|
|
if _p_ != nil && !runqempty(_p_) {
|
|
|
|
lock(&sched.lock)
|
|
|
|
_p_ = pidleget()
|
|
|
|
unlock(&sched.lock)
|
|
|
|
if _p_ != nil {
|
|
|
|
acquirep(_p_)
|
2015-12-08 07:11:27 -07:00
|
|
|
if wasSpinning {
|
|
|
|
_g_.m.spinning = true
|
|
|
|
atomic.Xadd(&sched.nmspinning, 1)
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
goto top
|
|
|
|
}
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-10-30 18:43:53 -06:00
|
|
|
// Check for idle-priority GC work again.
|
|
|
|
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(nil) {
|
|
|
|
lock(&sched.lock)
|
|
|
|
_p_ = pidleget()
|
|
|
|
if _p_ != nil && _p_.gcBgMarkWorker == 0 {
|
|
|
|
pidleput(_p_)
|
|
|
|
_p_ = nil
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
if _p_ != nil {
|
|
|
|
acquirep(_p_)
|
|
|
|
if wasSpinning {
|
|
|
|
_g_.m.spinning = true
|
|
|
|
atomic.Xadd(&sched.nmspinning, 1)
|
|
|
|
}
|
|
|
|
// Go back to idle GC check.
|
|
|
|
goto stop
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
// poll network
|
2015-11-02 12:09:24 -07:00
|
|
|
if netpollinited() && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
if _g_.m.p != 0 {
|
|
|
|
throw("findrunnable: netpoll with p")
|
|
|
|
}
|
|
|
|
if _g_.m.spinning {
|
|
|
|
throw("findrunnable: netpoll with spinning")
|
|
|
|
}
|
|
|
|
gp := netpoll(true) // block until new work is available
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store64(&sched.lastpoll, uint64(nanotime()))
|
2015-10-18 18:04:05 -06:00
|
|
|
if gp != nil {
|
|
|
|
lock(&sched.lock)
|
|
|
|
_p_ = pidleget()
|
|
|
|
unlock(&sched.lock)
|
|
|
|
if _p_ != nil {
|
|
|
|
acquirep(_p_)
|
|
|
|
injectglist(gp.schedlink.ptr())
|
|
|
|
casgstatus(gp, _Gwaiting, _Grunnable)
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoUnpark(gp, 0)
|
|
|
|
}
|
|
|
|
return gp, false
|
|
|
|
}
|
|
|
|
injectglist(gp)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
stopm()
|
|
|
|
goto top
|
|
|
|
}
|
|
|
|
|
2016-10-30 18:20:17 -06:00
|
|
|
// pollWork returns true if there is non-background work this P could
|
|
|
|
// be doing. This is a fairly lightweight check to be used for
|
|
|
|
// background work loops, like idle GC. It checks a subset of the
|
|
|
|
// conditions checked by the actual scheduler.
|
|
|
|
func pollWork() bool {
|
|
|
|
if sched.runqsize != 0 {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
p := getg().m.p.ptr()
|
|
|
|
if !runqempty(p) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
if netpollinited() && sched.lastpoll != 0 {
|
|
|
|
if gp := netpoll(false); gp != nil {
|
|
|
|
injectglist(gp)
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
func resetspinning() {
|
|
|
|
_g_ := getg()
|
2015-12-08 07:11:27 -07:00
|
|
|
if !_g_.m.spinning {
|
|
|
|
throw("resetspinning: not a spinning m")
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
2015-12-08 07:11:27 -07:00
|
|
|
_g_.m.spinning = false
|
|
|
|
nmspinning := atomic.Xadd(&sched.nmspinning, -1)
|
|
|
|
if int32(nmspinning) < 0 {
|
|
|
|
throw("findrunnable: negative nmspinning")
|
|
|
|
}
|
|
|
|
// M wakeup policy is deliberately somewhat conservative, so check if we
|
|
|
|
// need to wakeup another P here. See "Worker thread parking/unparking"
|
|
|
|
// comment at the top of the file for details.
|
2015-11-02 12:09:24 -07:00
|
|
|
if nmspinning == 0 && atomic.Load(&sched.npidle) > 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
wakep()
|
|
|
|
}
|
|
|
|
}
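
// The conservative wakeup policy above can be sketched outside the runtime
// with an ordinary worker pool: wake a parked worker only when no worker is
// already spinning and at least one is parked. This is an illustrative
// sketch with hypothetical names (pool, submit, worker), not the scheduler's
// implementation; the real code closes the remaining races by re-checking
// the run queues before parking, as findrunnable does.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

type pool struct {
	work      chan int
	wake      chan struct{} // parked workers block here
	nspinning int32         // workers actively polling for work
	nidle     int32         // workers parked on wake
}

func (p *pool) worker(id int) {
	for {
		atomic.AddInt32(&p.nspinning, 1)
		select {
		case w := <-p.work: // found work while spinning
			atomic.AddInt32(&p.nspinning, -1)
			fmt.Println("worker", id, "ran item", w)
			continue
		case <-time.After(time.Millisecond): // give up and park
		}
		atomic.AddInt32(&p.nspinning, -1)
		atomic.AddInt32(&p.nidle, 1)
		<-p.wake
		atomic.AddInt32(&p.nidle, -1)
	}
}

func (p *pool) submit(w int) {
	p.work <- w
	// Mirror resetspinning: only unpark someone if nobody is spinning
	// and there is an idle worker to unpark.
	if atomic.LoadInt32(&p.nspinning) == 0 && atomic.LoadInt32(&p.nidle) > 0 {
		select {
		case p.wake <- struct{}{}:
		default:
		}
	}
}

func main() {
	p := &pool{work: make(chan int, 8), wake: make(chan struct{})}
	for i := 0; i < 2; i++ {
		go p.worker(i)
	}
	for i := 0; i < 4; i++ {
		p.submit(i)
		time.Sleep(5 * time.Millisecond)
	}
	time.Sleep(20 * time.Millisecond)
}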
|
|
|
|
|
|
|
|
// Injects the list of runnable G's into the scheduler.
|
|
|
|
// Can run concurrently with GC.
|
|
|
|
func injectglist(glist *g) {
|
|
|
|
if glist == nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if trace.enabled {
|
|
|
|
for gp := glist; gp != nil; gp = gp.schedlink.ptr() {
|
|
|
|
traceGoUnpark(gp, 0)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
lock(&sched.lock)
|
|
|
|
var n int
|
|
|
|
for n = 0; glist != nil; n++ {
|
|
|
|
gp := glist
|
|
|
|
glist = gp.schedlink.ptr()
|
|
|
|
casgstatus(gp, _Gwaiting, _Grunnable)
|
|
|
|
globrunqput(gp)
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
for ; n != 0 && sched.npidle != 0; n-- {
|
|
|
|
startm(nil, false)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// One round of scheduler: find a runnable goroutine and execute it.
|
|
|
|
// Never returns.
|
|
|
|
func schedule() {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if _g_.m.locks != 0 {
|
|
|
|
throw("schedule: holding locks")
|
|
|
|
}
|
|
|
|
|
|
|
|
if _g_.m.lockedg != nil {
|
|
|
|
stoplockedm()
|
|
|
|
execute(_g_.m.lockedg, false) // Never returns.
|
|
|
|
}
|
|
|
|
|
|
|
|
top:
|
|
|
|
if sched.gcwaiting != 0 {
|
|
|
|
gcstopm()
|
|
|
|
goto top
|
|
|
|
}
|
|
|
|
if _g_.m.p.ptr().runSafePointFn != 0 {
|
|
|
|
runSafePointFn()
|
|
|
|
}
|
|
|
|
|
|
|
|
var gp *g
|
|
|
|
var inheritTime bool
|
|
|
|
if trace.enabled || trace.shutdown {
|
|
|
|
gp = traceReader()
|
|
|
|
if gp != nil {
|
|
|
|
casgstatus(gp, _Gwaiting, _Grunnable)
|
|
|
|
traceGoUnpark(gp, 0)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if gp == nil && gcBlackenEnabled != 0 {
|
|
|
|
gp = gcController.findRunnableGCWorker(_g_.m.p.ptr())
|
|
|
|
}
|
|
|
|
if gp == nil {
|
|
|
|
// Check the global runnable queue once in a while to ensure fairness.
|
|
|
|
// Otherwise two goroutines can completely occupy the local runqueue
|
|
|
|
// by constantly respawning each other.
|
|
|
|
if _g_.m.p.ptr().schedtick%61 == 0 && sched.runqsize > 0 {
|
|
|
|
lock(&sched.lock)
|
|
|
|
gp = globrunqget(_g_.m.p.ptr(), 1)
|
|
|
|
unlock(&sched.lock)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if gp == nil {
|
|
|
|
gp, inheritTime = runqget(_g_.m.p.ptr())
|
|
|
|
if gp != nil && _g_.m.spinning {
|
|
|
|
throw("schedule: spinning with local work")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if gp == nil {
|
|
|
|
gp, inheritTime = findrunnable() // blocks until work is available
|
2015-12-08 07:11:27 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// This thread is going to run a goroutine and is not spinning anymore,
|
|
|
|
// so if it was marked as spinning we need to reset it now and potentially
|
|
|
|
// start a new spinning M.
|
|
|
|
if _g_.m.spinning {
|
2015-10-18 18:04:05 -06:00
|
|
|
resetspinning()
|
|
|
|
}
|
|
|
|
|
|
|
|
if gp.lockedm != nil {
|
|
|
|
// Hands off own p to the locked m,
|
|
|
|
// then blocks waiting for a new p.
|
|
|
|
startlockedm(gp)
|
|
|
|
goto top
|
|
|
|
}
|
|
|
|
|
|
|
|
execute(gp, inheritTime)
|
|
|
|
}
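
// The schedtick%61 check above is a general fairness trick: a consumer that
// normally serves its own queue must occasionally poll the shared queue, or
// self-replenishing local work starves everything queued globally. A minimal
// standalone sketch (hypothetical names localQ/globalQ):

package main

import "fmt"

func main() {
	localQ := []int{0}              // local work that keeps respawning itself
	globalQ := []string{"g1", "g2"} // work queued globally

	for tick := 1; tick <= 150; tick++ {
		if tick%61 == 0 && len(globalQ) > 0 {
			// Forced global check; without it g1 and g2 would
			// never run while localQ stays non-empty.
			fmt.Println("tick", tick, "ran global item", globalQ[0])
			globalQ = globalQ[1:]
			continue
		}
		n := localQ[0]
		localQ = append(localQ[1:], n+1) // respawn local work
	}
}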
|
|
|
|
|
|
|
|
// dropg removes the association between m and the current goroutine m->curg (gp for short).
|
|
|
|
// Typically a caller sets gp's status away from Grunning and then
|
|
|
|
// immediately calls dropg to finish the job. The caller is also responsible
|
|
|
|
// for arranging that gp will be restarted using ready at an
|
|
|
|
// appropriate time. After calling dropg and arranging for gp to be
|
|
|
|
// readied later, the caller can do other work but eventually should
|
|
|
|
// call schedule to restart the scheduling of goroutines on this m.
|
|
|
|
func dropg() {
|
|
|
|
_g_ := getg()
|
|
|
|
|
2016-10-19 14:00:07 -06:00
|
|
|
setMNoWB(&_g_.m.curg.m, nil)
|
|
|
|
setGNoWB(&_g_.m.curg, nil)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
|
|
|
|
unlock((*mutex)(lock))
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// park continuation on g0.
|
|
|
|
func park_m(gp *g) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip, gp)
|
|
|
|
}
|
|
|
|
|
|
|
|
casgstatus(gp, _Grunning, _Gwaiting)
|
|
|
|
dropg()
|
|
|
|
|
|
|
|
if _g_.m.waitunlockf != nil {
|
|
|
|
fn := *(*func(*g, unsafe.Pointer) bool)(unsafe.Pointer(&_g_.m.waitunlockf))
|
|
|
|
ok := fn(gp, _g_.m.waitlock)
|
|
|
|
_g_.m.waitunlockf = nil
|
|
|
|
_g_.m.waitlock = nil
|
|
|
|
if !ok {
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoUnpark(gp, 2)
|
|
|
|
}
|
|
|
|
casgstatus(gp, _Gwaiting, _Grunnable)
|
|
|
|
execute(gp, true) // Schedule it back, never returns.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
schedule()
|
|
|
|
}
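
// The waitunlockf callback invoked above exists so the lock guarding the
// wait condition is released only once the goroutine is safely parked, so a
// concurrent wakeup cannot be lost. Loosely, that is the same contract
// sync.Cond.Wait offers user code (atomically unlock and sleep); a small
// illustrative program:

package main

import (
	"fmt"
	"sync"
)

func main() {
	var mu sync.Mutex
	cond := sync.NewCond(&mu)
	ready := false

	go func() {
		mu.Lock()
		ready = true
		mu.Unlock()
		cond.Signal()
	}()

	mu.Lock()
	for !ready {
		cond.Wait() // unlocks mu while sleeping, relocks before returning
	}
	mu.Unlock()
	fmt.Println("woken with the condition already satisfied")
}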
|
|
|
|
|
|
|
|
func goschedImpl(gp *g) {
|
|
|
|
status := readgstatus(gp)
|
|
|
|
if status&^_Gscan != _Grunning {
|
|
|
|
dumpgstatus(gp)
|
|
|
|
throw("bad g status")
|
|
|
|
}
|
|
|
|
casgstatus(gp, _Grunning, _Grunnable)
|
|
|
|
dropg()
|
|
|
|
lock(&sched.lock)
|
|
|
|
globrunqput(gp)
|
|
|
|
unlock(&sched.lock)
|
|
|
|
|
|
|
|
schedule()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Gosched continuation on g0.
|
|
|
|
func gosched_m(gp *g) {
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoSched()
|
|
|
|
}
|
|
|
|
goschedImpl(gp)
|
|
|
|
}
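
// goschedImpl above is reached from the exported runtime.Gosched: the
// calling goroutine goes onto the global run queue and the scheduler picks
// someone else. A tiny runnable illustration:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	done := make(chan struct{})
	go func() {
		fmt.Println("other goroutine ran")
		close(done)
	}()
	runtime.Gosched() // yield the processor so the goroutine above can run
	<-done
	fmt.Println("main resumes")
}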
|
|
|
|
|
|
|
|
func gopreempt_m(gp *g) {
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoPreempt()
|
|
|
|
}
|
|
|
|
goschedImpl(gp)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Finishes execution of the current goroutine.
|
|
|
|
func goexit1() {
|
|
|
|
if raceenabled {
|
|
|
|
racegoend()
|
|
|
|
}
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoEnd()
|
|
|
|
}
|
|
|
|
mcall(goexit0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// goexit continuation on g0.
|
|
|
|
func goexit0(gp *g) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
casgstatus(gp, _Grunning, _Gdead)
|
2016-01-06 19:16:01 -07:00
|
|
|
if isSystemGoroutine(gp) {
|
|
|
|
atomic.Xadd(&sched.ngsys, -1)
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
gp.m = nil
|
|
|
|
gp.lockedm = nil
|
|
|
|
_g_.m.lockedg = nil
|
|
|
|
gp.paniconfault = false
|
|
|
|
gp._defer = nil // should be true already but just in case.
|
|
|
|
gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data.
|
|
|
|
gp.writebuf = nil
|
|
|
|
gp.waitreason = ""
|
|
|
|
gp.param = nil
|
|
|
|
|
2016-03-04 09:58:26 -07:00
|
|
|
// Note that gp's stack scan is now "valid" because it has no
|
|
|
|
// stack. We could dequeueRescan, but that takes a lock and
|
|
|
|
// isn't really necessary.
|
|
|
|
gp.gcscanvalid = true
|
2015-10-18 18:04:05 -06:00
|
|
|
dropg()
|
|
|
|
|
|
|
|
if _g_.m.locked&^_LockExternal != 0 {
|
|
|
|
print("invalid m->locked = ", _g_.m.locked, "\n")
|
|
|
|
throw("internal lockOSThread error")
|
|
|
|
}
|
|
|
|
_g_.m.locked = 0
|
|
|
|
gfput(_g_.m.p.ptr(), gp)
|
|
|
|
schedule()
|
|
|
|
}
|
|
|
|
|
2016-10-19 14:16:40 -06:00
|
|
|
// save updates getg().sched to refer to pc and sp so that a following
|
|
|
|
// gogo will restore pc and sp.
|
|
|
|
//
|
|
|
|
// save must not have write barriers because invoking a write barrier
|
|
|
|
// can clobber getg().sched.
|
|
|
|
//
|
2015-10-18 18:04:05 -06:00
|
|
|
//go:nosplit
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func save(pc, sp uintptr) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
_g_.sched.pc = pc
|
|
|
|
_g_.sched.sp = sp
|
|
|
|
_g_.sched.lr = 0
|
|
|
|
_g_.sched.ret = 0
|
|
|
|
_g_.sched.g = guintptr(unsafe.Pointer(_g_))
|
2016-10-19 14:16:40 -06:00
|
|
|
// We need to ensure ctxt is zero, but can't have a write
|
|
|
|
// barrier here. However, it should always already be zero.
|
|
|
|
// Assert that.
|
|
|
|
if _g_.sched.ctxt != nil {
|
|
|
|
badctxt()
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
// The goroutine g is about to enter a system call.
|
|
|
|
// Record that it's not using the cpu anymore.
|
|
|
|
// This is called only from the go syscall library and cgocall,
|
|
|
|
// not from the low-level system calls used by the runtime.
|
|
|
|
//
|
|
|
|
// Entersyscall cannot split the stack: the gosave must
|
|
|
|
// make g->sched refer to the caller's stack segment, because
|
|
|
|
// entersyscall is going to return immediately after.
|
|
|
|
//
|
|
|
|
// Nothing entersyscall calls can split the stack either.
|
|
|
|
// We cannot safely move the stack during an active call to syscall,
|
|
|
|
// because we do not know which of the uintptr arguments are
|
|
|
|
// really pointers (back into the stack).
|
|
|
|
// In practice, this means that we make the fast path run through
|
|
|
|
// entersyscall doing no-split things, and the slow path has to use systemstack
|
|
|
|
// to run bigger things on the system stack.
|
|
|
|
//
|
|
|
|
// reentersyscall is the entry point used by cgo callbacks, where explicitly
|
|
|
|
// saved SP and PC are restored. This is needed when exitsyscall will be called
|
|
|
|
// from a function further up in the call stack than the parent, as g->syscallsp
|
|
|
|
// must always point to a valid stack frame. entersyscall below is the normal
|
|
|
|
// entry point for syscalls, which obtains the SP and PC from the caller.
|
|
|
|
//
|
|
|
|
// Syscall tracing:
|
|
|
|
// At the start of a syscall we emit traceGoSysCall to capture the stack trace.
|
|
|
|
// If the syscall does not block, that is it, we do not emit any other events.
|
|
|
|
// If the syscall blocks (that is, P is retaken), retaker emits traceGoSysBlock;
|
|
|
|
// when syscall returns we emit traceGoSysExit and when the goroutine starts running
|
|
|
|
// (potentially instantly, if exitsyscallfast returns true) we emit traceGoStart.
|
|
|
|
// To ensure that traceGoSysExit is emitted strictly after traceGoSysBlock,
|
|
|
|
// we remember current value of syscalltick in m (_g_.m.syscalltick = _g_.m.p.ptr().syscalltick),
|
|
|
|
// whoever emits traceGoSysBlock increments p.syscalltick afterwards;
|
|
|
|
// and we wait for the increment before emitting traceGoSysExit.
|
|
|
|
// Note that the increment is done even if tracing is not enabled,
|
|
|
|
// because tracing can be enabled in the middle of a syscall. We don't want the wait to hang.
|
|
|
|
//
|
|
|
|
//go:nosplit
|
|
|
|
func reentersyscall(pc, sp uintptr) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
// Disable preemption because during this function g is in Gsyscall status,
|
|
|
|
// but can have inconsistent g->sched, do not let GC observe it.
|
|
|
|
_g_.m.locks++
|
|
|
|
|
|
|
|
// Entersyscall must not call any function that might split/grow the stack.
|
|
|
|
// (See details in comment above.)
|
|
|
|
// Catch calls that might, by replacing the stack guard with something that
|
|
|
|
// will trip any stack check and leaving a flag to tell newstack to die.
|
|
|
|
_g_.stackguard0 = stackPreempt
|
|
|
|
_g_.throwsplit = true
|
|
|
|
|
|
|
|
// Leave SP around for GC and traceback.
|
|
|
|
save(pc, sp)
|
|
|
|
_g_.syscallsp = sp
|
|
|
|
_g_.syscallpc = pc
|
|
|
|
casgstatus(_g_, _Grunning, _Gsyscall)
|
|
|
|
if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
|
|
|
|
systemstack(func() {
|
|
|
|
print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
|
|
|
|
throw("entersyscall")
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
if trace.enabled {
|
|
|
|
systemstack(traceGoSysCall)
|
|
|
|
// systemstack itself clobbers g.sched.{pc,sp} and we might
|
|
|
|
// need them later when the G is genuinely blocked in a
|
|
|
|
// syscall
|
|
|
|
save(pc, sp)
|
|
|
|
}
|
|
|
|
|
2016-08-30 10:29:16 -06:00
|
|
|
if atomic.Load(&sched.sysmonwait) != 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
systemstack(entersyscall_sysmon)
|
|
|
|
save(pc, sp)
|
|
|
|
}
|
|
|
|
|
|
|
|
if _g_.m.p.ptr().runSafePointFn != 0 {
|
|
|
|
// runSafePointFn may stack split if run on this stack
|
|
|
|
systemstack(runSafePointFn)
|
|
|
|
save(pc, sp)
|
|
|
|
}
|
|
|
|
|
|
|
|
_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
|
|
|
|
_g_.sysblocktraced = true
|
|
|
|
_g_.m.mcache = nil
|
|
|
|
_g_.m.p.ptr().m = 0
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store(&_g_.m.p.ptr().status, _Psyscall)
|
2015-10-18 18:04:05 -06:00
|
|
|
if sched.gcwaiting != 0 {
|
|
|
|
systemstack(entersyscall_gcwait)
|
|
|
|
save(pc, sp)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Goroutines must not split stacks in Gsyscall status (it would corrupt g->sched).
|
|
|
|
// We set _StackGuard to StackPreempt so that first split stack check calls morestack.
|
|
|
|
// Morestack detects this case and throws.
|
|
|
|
_g_.stackguard0 = stackPreempt
|
|
|
|
_g_.m.locks--
|
|
|
|
}
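
// What the bookkeeping above buys user code: a goroutine sitting in a
// blocking system call does not pin its P, so other goroutines keep running
// even with a single P. Illustrative program only (whether the read blocks
// an M in a syscall or parks on the netpoller depends on platform and Go
// version, but either way the P is freed):

package main

import (
	"fmt"
	"os"
	"runtime"
	"time"
)

func main() {
	runtime.GOMAXPROCS(1)

	r, w, err := os.Pipe()
	if err != nil {
		panic(err)
	}

	go func() {
		buf := make([]byte, 1)
		r.Read(buf) // blocks until something is written
		fmt.Println("reader woke up with", string(buf))
	}()

	for i := 0; i < 3; i++ {
		fmt.Println("main still making progress", i) // runs while the reader is blocked
		time.Sleep(10 * time.Millisecond)
	}
	w.Write([]byte("x"))
	time.Sleep(10 * time.Millisecond)
}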
|
|
|
|
|
|
|
|
// Standard syscall entry used by the go syscall library and normal cgo calls.
|
|
|
|
//go:nosplit
|
|
|
|
func entersyscall(dummy int32) {
|
|
|
|
reentersyscall(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
|
|
|
|
}
|
|
|
|
|
|
|
|
func entersyscall_sysmon() {
|
|
|
|
lock(&sched.lock)
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Load(&sched.sysmonwait) != 0 {
|
|
|
|
atomic.Store(&sched.sysmonwait, 0)
|
2015-10-18 18:04:05 -06:00
|
|
|
notewakeup(&sched.sysmonnote)
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
}
|
|
|
|
|
|
|
|
func entersyscall_gcwait() {
|
|
|
|
_g_ := getg()
|
|
|
|
_p_ := _g_.m.p.ptr()
|
|
|
|
|
|
|
|
lock(&sched.lock)
|
2015-11-02 12:09:24 -07:00
|
|
|
if sched.stopwait > 0 && atomic.Cas(&_p_.status, _Psyscall, _Pgcstop) {
|
2015-10-18 18:04:05 -06:00
|
|
|
if trace.enabled {
|
|
|
|
traceGoSysBlock(_p_)
|
|
|
|
traceProcStop(_p_)
|
|
|
|
}
|
|
|
|
_p_.syscalltick++
|
|
|
|
if sched.stopwait--; sched.stopwait == 0 {
|
|
|
|
notewakeup(&sched.stopnote)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// The same as entersyscall(), but with a hint that the syscall is blocking.
|
|
|
|
//go:nosplit
|
|
|
|
func entersyscallblock(dummy int32) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
_g_.m.locks++ // see comment in entersyscall
|
|
|
|
_g_.throwsplit = true
|
|
|
|
_g_.stackguard0 = stackPreempt // see comment in entersyscall
|
|
|
|
_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
|
|
|
|
_g_.sysblocktraced = true
|
|
|
|
_g_.m.p.ptr().syscalltick++
|
|
|
|
|
|
|
|
// Leave SP around for GC and traceback.
|
|
|
|
pc := getcallerpc(unsafe.Pointer(&dummy))
|
|
|
|
sp := getcallersp(unsafe.Pointer(&dummy))
|
|
|
|
save(pc, sp)
|
|
|
|
_g_.syscallsp = _g_.sched.sp
|
|
|
|
_g_.syscallpc = _g_.sched.pc
|
|
|
|
if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
|
|
|
|
sp1 := sp
|
|
|
|
sp2 := _g_.sched.sp
|
|
|
|
sp3 := _g_.syscallsp
|
|
|
|
systemstack(func() {
|
|
|
|
print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
|
|
|
|
throw("entersyscallblock")
|
|
|
|
})
|
|
|
|
}
|
|
|
|
casgstatus(_g_, _Grunning, _Gsyscall)
|
|
|
|
if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
|
|
|
|
systemstack(func() {
|
|
|
|
print("entersyscallblock inconsistent ", hex(sp), " ", hex(_g_.sched.sp), " ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
|
|
|
|
throw("entersyscallblock")
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
systemstack(entersyscallblock_handoff)
|
|
|
|
|
|
|
|
// Resave for traceback during blocked call.
|
|
|
|
save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
|
|
|
|
|
|
|
|
_g_.m.locks--
|
|
|
|
}
|
|
|
|
|
|
|
|
func entersyscallblock_handoff() {
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoSysCall()
|
|
|
|
traceGoSysBlock(getg().m.p.ptr())
|
|
|
|
}
|
|
|
|
handoffp(releasep())
|
|
|
|
}
|
|
|
|
|
|
|
|
// The goroutine g exited its system call.
|
|
|
|
// Arrange for it to run on a cpu again.
|
|
|
|
// This is called only from the go syscall library, not
|
2016-02-20 21:24:27 -07:00
|
|
|
// from the low-level system calls used by the runtime.
|
2016-10-10 14:46:28 -06:00
|
|
|
//
|
|
|
|
// Write barriers are not allowed because our P may have been stolen.
|
|
|
|
//
|
2015-10-18 18:04:05 -06:00
|
|
|
//go:nosplit
|
2016-10-10 14:46:28 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func exitsyscall(dummy int32) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
_g_.m.locks++ // see comment in entersyscall
|
|
|
|
if getcallersp(unsafe.Pointer(&dummy)) > _g_.syscallsp {
|
2016-05-16 16:39:43 -06:00
|
|
|
// throw calls print which may try to grow the stack,
|
|
|
|
// but throwsplit == true so the stack can not be grown;
|
|
|
|
// use systemstack to avoid that possible problem.
|
|
|
|
systemstack(func() {
|
|
|
|
throw("exitsyscall: syscall frame is no longer valid")
|
|
|
|
})
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
_g_.waitsince = 0
|
|
|
|
oldp := _g_.m.p.ptr()
|
|
|
|
if exitsyscallfast() {
|
|
|
|
if _g_.m.mcache == nil {
|
|
|
|
throw("lost mcache")
|
|
|
|
}
|
|
|
|
if trace.enabled {
|
|
|
|
if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
|
|
|
|
systemstack(traceGoStart)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// There's a cpu for us, so we can run.
|
|
|
|
_g_.m.p.ptr().syscalltick++
|
|
|
|
// We need to cas the status and scan before resuming...
|
|
|
|
casgstatus(_g_, _Gsyscall, _Grunning)
|
|
|
|
|
|
|
|
// Garbage collector isn't running (since we are),
|
|
|
|
// so okay to clear syscallsp.
|
|
|
|
_g_.syscallsp = 0
|
|
|
|
_g_.m.locks--
|
|
|
|
if _g_.preempt {
|
|
|
|
// restore the preemption request in case we've cleared it in newstack
|
|
|
|
_g_.stackguard0 = stackPreempt
|
|
|
|
} else {
|
|
|
|
// otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock
|
|
|
|
_g_.stackguard0 = _g_.stack.lo + _StackGuard
|
|
|
|
}
|
|
|
|
_g_.throwsplit = false
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
_g_.sysexitticks = 0
|
|
|
|
if trace.enabled {
|
|
|
|
// Wait till traceGoSysBlock event is emitted.
|
|
|
|
// This ensures consistency of the trace (the goroutine is started after it is blocked).
|
|
|
|
for oldp != nil && oldp.syscalltick == _g_.m.syscalltick {
|
|
|
|
osyield()
|
|
|
|
}
|
|
|
|
// We can't trace syscall exit right now because we don't have a P.
|
|
|
|
// Tracing code can invoke write barriers that cannot run without a P.
|
|
|
|
// So instead we remember the syscall exit time and emit the event
|
|
|
|
// in execute when we have a P.
|
2016-04-05 07:29:14 -06:00
|
|
|
_g_.sysexitticks = cputicks()
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
_g_.m.locks--
|
|
|
|
|
|
|
|
// Call the scheduler.
|
|
|
|
mcall(exitsyscall0)
|
|
|
|
|
|
|
|
if _g_.m.mcache == nil {
|
|
|
|
throw("lost mcache")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Scheduler returned, so we're allowed to run now.
|
|
|
|
// Delete the syscallsp information that we left for
|
|
|
|
// the garbage collector during the system call.
|
|
|
|
// Must wait until now because until gosched returns
|
|
|
|
// we don't know for sure that the garbage collector
|
|
|
|
// is not running.
|
|
|
|
_g_.syscallsp = 0
|
|
|
|
_g_.m.p.ptr().syscalltick++
|
|
|
|
_g_.throwsplit = false
|
|
|
|
}
|
|
|
|
|
|
|
|
//go:nosplit
|
|
|
|
func exitsyscallfast() bool {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
// Freezetheworld sets stopwait but does not retake P's.
|
|
|
|
if sched.stopwait == freezeStopWait {
|
|
|
|
_g_.m.mcache = nil
|
|
|
|
_g_.m.p = 0
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// Try to re-acquire the last P.
|
2015-11-02 12:09:24 -07:00
|
|
|
if _g_.m.p != 0 && _g_.m.p.ptr().status == _Psyscall && atomic.Cas(&_g_.m.p.ptr().status, _Psyscall, _Prunning) {
|
2015-10-18 18:04:05 -06:00
|
|
|
// There's a cpu for us, so we can run.
|
2016-10-10 14:46:28 -06:00
|
|
|
exitsyscallfast_reacquired()
|
2015-10-18 18:04:05 -06:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// Try to get any other idle P.
|
|
|
|
oldp := _g_.m.p.ptr()
|
|
|
|
_g_.m.mcache = nil
|
|
|
|
_g_.m.p = 0
|
|
|
|
if sched.pidle != 0 {
|
|
|
|
var ok bool
|
|
|
|
systemstack(func() {
|
|
|
|
ok = exitsyscallfast_pidle()
|
|
|
|
if ok && trace.enabled {
|
|
|
|
if oldp != nil {
|
|
|
|
// Wait till traceGoSysBlock event is emitted.
|
|
|
|
// This ensures consistency of the trace (the goroutine is started after it is blocked).
|
|
|
|
for oldp.syscalltick == _g_.m.syscalltick {
|
|
|
|
osyield()
|
|
|
|
}
|
|
|
|
}
|
2016-04-05 07:29:14 -06:00
|
|
|
traceGoSysExit(0)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
})
|
|
|
|
if ok {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2016-10-10 14:46:28 -06:00
|
|
|
// exitsyscallfast_reacquired is the exitsyscall path on which this G
|
|
|
|
// has successfully reacquired the P it was running on before the
|
|
|
|
// syscall.
|
|
|
|
//
|
|
|
|
// This function is allowed to have write barriers because exitsyscall
|
|
|
|
// has acquired a P at this point.
|
|
|
|
//
|
|
|
|
//go:yeswritebarrierrec
|
|
|
|
//go:nosplit
|
|
|
|
func exitsyscallfast_reacquired() {
|
|
|
|
_g_ := getg()
|
|
|
|
_g_.m.mcache = _g_.m.p.ptr().mcache
|
|
|
|
_g_.m.p.ptr().m.set(_g_.m)
|
|
|
|
if _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
|
|
|
|
if trace.enabled {
|
|
|
|
// The p was retaken and then entered a syscall again (since _g_.m.syscalltick has changed).
|
|
|
|
// traceGoSysBlock for this syscall was already emitted,
|
|
|
|
// but here we effectively retake the p from the new syscall running on the same p.
|
|
|
|
systemstack(func() {
|
|
|
|
// Denote blocking of the new syscall.
|
|
|
|
traceGoSysBlock(_g_.m.p.ptr())
|
|
|
|
// Denote completion of the current syscall.
|
|
|
|
traceGoSysExit(0)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
_g_.m.p.ptr().syscalltick++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
func exitsyscallfast_pidle() bool {
|
|
|
|
lock(&sched.lock)
|
|
|
|
_p_ := pidleget()
|
2015-11-02 12:09:24 -07:00
|
|
|
if _p_ != nil && atomic.Load(&sched.sysmonwait) != 0 {
|
|
|
|
atomic.Store(&sched.sysmonwait, 0)
|
2015-10-18 18:04:05 -06:00
|
|
|
notewakeup(&sched.sysmonnote)
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
if _p_ != nil {
|
|
|
|
acquirep(_p_)
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// exitsyscall slow path on g0.
|
|
|
|
// Failed to acquire P, enqueue gp as runnable.
|
2016-10-10 15:14:14 -06:00
|
|
|
//
|
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func exitsyscall0(gp *g) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
casgstatus(gp, _Gsyscall, _Grunnable)
|
|
|
|
dropg()
|
|
|
|
lock(&sched.lock)
|
|
|
|
_p_ := pidleget()
|
|
|
|
if _p_ == nil {
|
|
|
|
globrunqput(gp)
|
2015-11-02 12:09:24 -07:00
|
|
|
} else if atomic.Load(&sched.sysmonwait) != 0 {
|
|
|
|
atomic.Store(&sched.sysmonwait, 0)
|
2015-10-18 18:04:05 -06:00
|
|
|
notewakeup(&sched.sysmonnote)
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
if _p_ != nil {
|
|
|
|
acquirep(_p_)
|
|
|
|
execute(gp, false) // Never returns.
|
|
|
|
}
|
|
|
|
if _g_.m.lockedg != nil {
|
|
|
|
// Wait until another thread schedules gp and so m again.
|
|
|
|
stoplockedm()
|
|
|
|
execute(gp, false) // Never returns.
|
|
|
|
}
|
|
|
|
stopm()
|
|
|
|
schedule() // Never returns.
|
|
|
|
}
|
|
|
|
|
|
|
|
func beforefork() {
|
|
|
|
gp := getg().m.curg
|
|
|
|
|
|
|
|
// Fork can hang if preempted with signals frequently enough (see issue 5517).
|
|
|
|
// Ensure that we stay on the same M where we disable profiling.
|
|
|
|
gp.m.locks++
|
|
|
|
if gp.m.profilehz != 0 {
|
|
|
|
resetcpuprofiler(0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// This function is called before fork in syscall package.
|
|
|
|
// Code between fork and exec must not allocate memory nor even try to grow stack.
|
|
|
|
// Here we spoil g->_StackGuard to reliably detect any attempts to grow stack.
|
|
|
|
// runtime_AfterFork will undo this in parent process, but not in child.
|
|
|
|
gp.stackguard0 = stackFork
|
|
|
|
}
|
|
|
|
|
|
|
|
// Called from syscall package before fork.
|
|
|
|
//go:linkname syscall_runtime_BeforeFork syscall.runtime_BeforeFork
|
|
|
|
//go:nosplit
|
|
|
|
func syscall_runtime_BeforeFork() {
|
|
|
|
systemstack(beforefork)
|
|
|
|
}
|
|
|
|
|
|
|
|
func afterfork() {
|
|
|
|
gp := getg().m.curg
|
|
|
|
|
|
|
|
// See the comment in beforefork.
|
|
|
|
gp.stackguard0 = gp.stack.lo + _StackGuard
|
|
|
|
|
|
|
|
hz := sched.profilehz
|
|
|
|
if hz != 0 {
|
|
|
|
resetcpuprofiler(hz)
|
|
|
|
}
|
|
|
|
gp.m.locks--
|
|
|
|
}
|
|
|
|
|
|
|
|
// Called from syscall package after fork in parent.
|
|
|
|
//go:linkname syscall_runtime_AfterFork syscall.runtime_AfterFork
|
|
|
|
//go:nosplit
|
|
|
|
func syscall_runtime_AfterFork() {
|
|
|
|
systemstack(afterfork)
|
|
|
|
}
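
// On Unix systems these hooks are invoked by the syscall package around the
// fork it performs on the program's behalf; ordinary code should reach them
// through os/exec (or syscall.ForkExec) rather than forking directly, e.g.:

package main

import (
	"fmt"
	"os/exec"
)

func main() {
	out, err := exec.Command("echo", "hello from child").Output()
	if err != nil {
		fmt.Println("exec failed:", err)
		return
	}
	fmt.Print(string(out))
}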
|
|
|
|
|
|
|
|
// Allocate a new g, with a stack big enough for stacksize bytes.
|
|
|
|
func malg(stacksize int32) *g {
|
|
|
|
newg := new(g)
|
|
|
|
if stacksize >= 0 {
|
|
|
|
stacksize = round2(_StackSystem + stacksize)
|
|
|
|
systemstack(func() {
|
|
|
|
newg.stack, newg.stkbar = stackalloc(uint32(stacksize))
|
|
|
|
})
|
|
|
|
newg.stackguard0 = newg.stack.lo + _StackGuard
|
|
|
|
newg.stackguard1 = ^uintptr(0)
|
|
|
|
newg.stackAlloc = uintptr(stacksize)
|
|
|
|
}
|
|
|
|
return newg
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create a new g running fn with siz bytes of arguments.
|
|
|
|
// Put it on the queue of g's waiting to run.
|
|
|
|
// The compiler turns a go statement into a call to this.
|
|
|
|
// Cannot split the stack because it assumes that the arguments
|
|
|
|
// are available sequentially after &fn; they would not be
|
|
|
|
// copied if a stack split occurred.
|
|
|
|
//go:nosplit
|
|
|
|
func newproc(siz int32, fn *funcval) {
|
2015-11-11 10:39:30 -07:00
|
|
|
argp := add(unsafe.Pointer(&fn), sys.PtrSize)
|
2015-10-18 18:04:05 -06:00
|
|
|
pc := getcallerpc(unsafe.Pointer(&siz))
|
|
|
|
systemstack(func() {
|
|
|
|
newproc1(fn, (*uint8)(argp), siz, 0, pc)
|
|
|
|
})
|
|
|
|
}
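
// Because newproc copies the argument frame when the goroutine is created,
// values passed directly to a go statement are captured at that moment,
// while variables captured by a closure are read when the closure runs.
// Ordinary user code showing the difference:

package main

import (
	"fmt"
	"time"
)

func main() {
	x := 1
	go fmt.Println("passed as argument:", x) // the value 1 is copied here
	x = 2
	go func() {
		fmt.Println("read from closure:", x) // reads x itself, sees 2
	}()
	time.Sleep(50 * time.Millisecond)
}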
|
|
|
|
|
|
|
|
// Create a new g running fn with narg bytes of arguments starting
|
|
|
|
// at argp and returning nret bytes of results. callerpc is the
|
2016-03-01 16:21:55 -07:00
|
|
|
// address of the go statement that created this. The new g is put
|
2015-10-18 18:04:05 -06:00
|
|
|
// on the queue of g's waiting to run.
|
|
|
|
func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr) *g {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if fn == nil {
|
|
|
|
_g_.m.throwing = -1 // do not dump full stacks
|
|
|
|
throw("go of nil func value")
|
|
|
|
}
|
|
|
|
_g_.m.locks++ // disable preemption because it can be holding p in a local var
|
|
|
|
siz := narg + nret
|
|
|
|
siz = (siz + 7) &^ 7
|
|
|
|
|
|
|
|
// We could allocate a larger initial stack if necessary.
|
|
|
|
// Not worth it: this is almost always an error.
|
|
|
|
// 4*sizeof(uintreg): extra space added below
|
|
|
|
// sizeof(uintreg): caller's LR (arm) or return address (x86, in gostartcall).
|
2015-11-11 10:39:30 -07:00
|
|
|
if siz >= _StackMin-4*sys.RegSize-sys.RegSize {
|
2015-10-18 18:04:05 -06:00
|
|
|
throw("newproc: function arguments too large for new goroutine")
|
|
|
|
}
|
|
|
|
|
|
|
|
_p_ := _g_.m.p.ptr()
|
|
|
|
newg := gfget(_p_)
|
|
|
|
if newg == nil {
|
|
|
|
newg = malg(_StackMin)
|
|
|
|
casgstatus(newg, _Gidle, _Gdead)
|
2016-03-04 09:58:26 -07:00
|
|
|
newg.gcRescan = -1
|
2015-10-18 18:04:05 -06:00
|
|
|
allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
|
|
|
|
}
|
|
|
|
if newg.stack.hi == 0 {
|
|
|
|
throw("newproc1: newg missing stack")
|
|
|
|
}
|
|
|
|
|
|
|
|
if readgstatus(newg) != _Gdead {
|
|
|
|
throw("newproc1: new g is not Gdead")
|
|
|
|
}
|
|
|
|
|
2015-11-11 10:39:30 -07:00
|
|
|
totalSize := 4*sys.RegSize + uintptr(siz) + sys.MinFrameSize // extra space in case of reads slightly beyond frame
|
|
|
|
totalSize += -totalSize & (sys.SpAlign - 1) // align to spAlign
|
2015-10-18 18:04:05 -06:00
|
|
|
sp := newg.stack.hi - totalSize
|
|
|
|
spArg := sp
|
|
|
|
if usesLR {
|
|
|
|
// caller's LR
|
2016-10-26 12:34:20 -06:00
|
|
|
*(*uintptr)(unsafe.Pointer(sp)) = 0
|
|
|
|
prepGoExitFrame(sp)
|
2015-11-11 10:39:30 -07:00
|
|
|
spArg += sys.MinFrameSize
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
2016-10-13 13:34:56 -06:00
|
|
|
if narg > 0 {
|
|
|
|
memmove(unsafe.Pointer(spArg), unsafe.Pointer(argp), uintptr(narg))
|
|
|
|
// This is a stack-to-stack copy. If write barriers
|
|
|
|
// are enabled and the source stack is grey (the
|
|
|
|
// destination is always black), then perform a
|
|
|
|
// barrier copy. We do this *after* the memmove
|
|
|
|
// because the destination stack may have garbage on
|
|
|
|
// it.
|
2016-10-13 13:34:56 -06:00
|
|
|
if writeBarrier.needed && !_g_.m.curg.gcscandone {
|
|
|
|
f := findfunc(fn.fn)
|
|
|
|
stkmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
|
|
|
|
// We're in the prologue, so it's always stack map index 0.
|
|
|
|
bv := stackmapdata(stkmap, 0)
|
|
|
|
bulkBarrierBitmap(spArg, spArg, uintptr(narg), 0, bv.bytedata)
|
2016-10-13 13:34:56 -06:00
|
|
|
}
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
|
2016-10-17 16:41:56 -06:00
|
|
|
memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
|
2015-10-18 18:04:05 -06:00
|
|
|
newg.sched.sp = sp
|
|
|
|
newg.stktopsp = sp
|
2015-11-11 10:39:30 -07:00
|
|
|
newg.sched.pc = funcPC(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function
|
2015-10-18 18:04:05 -06:00
|
|
|
newg.sched.g = guintptr(unsafe.Pointer(newg))
|
|
|
|
gostartcallfn(&newg.sched, fn)
|
|
|
|
newg.gopc = callerpc
|
|
|
|
newg.startpc = fn.fn
|
2016-01-06 19:16:01 -07:00
|
|
|
if isSystemGoroutine(newg) {
|
|
|
|
atomic.Xadd(&sched.ngsys, +1)
|
|
|
|
}
|
2016-03-04 09:58:26 -07:00
|
|
|
// The stack is dirty from the argument frame, so queue it for
|
|
|
|
// scanning. Do this before setting it to runnable so we still
|
|
|
|
// own the G. If we're recycling a G, it may already be on the
|
|
|
|
// rescan list.
|
|
|
|
if newg.gcRescan == -1 {
|
|
|
|
queueRescan(newg)
|
|
|
|
} else {
|
|
|
|
// The recycled G is already on the rescan list. Just
|
|
|
|
// mark the stack dirty.
|
|
|
|
newg.gcscanvalid = false
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
casgstatus(newg, _Gdead, _Grunnable)
|
|
|
|
|
|
|
|
if _p_.goidcache == _p_.goidcacheend {
|
|
|
|
// Sched.goidgen is the last allocated id,
|
|
|
|
// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
|
|
|
|
// At startup sched.goidgen=0, so main goroutine receives goid=1.
|
2015-11-02 12:09:24 -07:00
|
|
|
_p_.goidcache = atomic.Xadd64(&sched.goidgen, _GoidCacheBatch)
|
2015-10-18 18:04:05 -06:00
|
|
|
_p_.goidcache -= _GoidCacheBatch - 1
|
|
|
|
_p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
|
|
|
|
}
|
|
|
|
newg.goid = int64(_p_.goidcache)
|
|
|
|
_p_.goidcache++
|
|
|
|
if raceenabled {
|
|
|
|
newg.racectx = racegostart(callerpc)
|
|
|
|
}
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoCreate(newg, newg.startpc)
|
|
|
|
}
|
|
|
|
runqput(_p_, newg, true)
|
|
|
|
|
2016-08-25 20:04:04 -06:00
|
|
|
if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && runtimeInitTime != 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
wakep()
|
|
|
|
}
|
|
|
|
_g_.m.locks--
|
|
|
|
if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
|
|
|
|
_g_.stackguard0 = stackPreempt
|
|
|
|
}
|
|
|
|
return newg
|
|
|
|
}
|
|
|
|
|
|
|
|
// Put on gfree list.
|
|
|
|
// If local list is too long, transfer a batch to the global list.
|
|
|
|
func gfput(_p_ *p, gp *g) {
|
|
|
|
if readgstatus(gp) != _Gdead {
|
|
|
|
throw("gfput: bad status (not Gdead)")
|
|
|
|
}
|
|
|
|
|
|
|
|
stksize := gp.stackAlloc
|
|
|
|
|
|
|
|
if stksize != _FixedStack {
|
|
|
|
// non-standard stack size - free it.
|
|
|
|
stackfree(gp.stack, gp.stackAlloc)
|
|
|
|
gp.stack.lo = 0
|
|
|
|
gp.stack.hi = 0
|
|
|
|
gp.stackguard0 = 0
|
|
|
|
gp.stkbar = nil
|
|
|
|
gp.stkbarPos = 0
|
|
|
|
} else {
|
|
|
|
// Reset stack barriers.
|
|
|
|
gp.stkbar = gp.stkbar[:0]
|
|
|
|
gp.stkbarPos = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
gp.schedlink.set(_p_.gfree)
|
|
|
|
_p_.gfree = gp
|
|
|
|
_p_.gfreecnt++
|
|
|
|
if _p_.gfreecnt >= 64 {
|
|
|
|
lock(&sched.gflock)
|
|
|
|
for _p_.gfreecnt >= 32 {
|
|
|
|
_p_.gfreecnt--
|
|
|
|
gp = _p_.gfree
|
|
|
|
_p_.gfree = gp.schedlink.ptr()
|
2016-03-11 14:27:51 -07:00
|
|
|
if gp.stack.lo == 0 {
|
|
|
|
gp.schedlink.set(sched.gfreeNoStack)
|
|
|
|
sched.gfreeNoStack = gp
|
|
|
|
} else {
|
|
|
|
gp.schedlink.set(sched.gfreeStack)
|
|
|
|
sched.gfreeStack = gp
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
sched.ngfree++
|
|
|
|
}
|
|
|
|
unlock(&sched.gflock)
|
|
|
|
}
|
|
|
|
}
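
// The per-P list with batched spill to a locked global list used by gfput
// above and gfget below is a general object-caching pattern. A standalone
// sketch with hypothetical names (thresholds mirror the 64/32 used here);
// each local cache is owned by one goroutine, so only batch moves take the
// lock:

package main

import (
	"fmt"
	"sync"
)

type cache struct {
	local []int // owned by a single worker, no locking needed

	mu     sync.Mutex
	global []int // shared overflow list
}

func (c *cache) put(v int) {
	c.local = append(c.local, v)
	if len(c.local) >= 64 {
		c.mu.Lock()
		for len(c.local) > 32 { // spill until about half remain
			n := len(c.local) - 1
			c.global = append(c.global, c.local[n])
			c.local = c.local[:n]
		}
		c.mu.Unlock()
	}
}

func (c *cache) get() (int, bool) {
	if len(c.local) == 0 {
		c.mu.Lock()
		for len(c.local) < 32 && len(c.global) > 0 { // refill a batch
			n := len(c.global) - 1
			c.local = append(c.local, c.global[n])
			c.global = c.global[:n]
		}
		c.mu.Unlock()
	}
	if len(c.local) == 0 {
		return 0, false
	}
	n := len(c.local) - 1
	v := c.local[n]
	c.local = c.local[:n]
	return v, true
}

func main() {
	c := &cache{}
	for i := 0; i < 100; i++ {
		c.put(i)
	}
	v, _ := c.get()
	fmt.Println("got", v, "local:", len(c.local), "global:", len(c.global))
}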
|
|
|
|
|
|
|
|
// Get from gfree list.
|
|
|
|
// If local list is empty, grab a batch from global list.
|
|
|
|
func gfget(_p_ *p) *g {
|
|
|
|
retry:
|
|
|
|
gp := _p_.gfree
|
2016-03-11 14:27:51 -07:00
|
|
|
if gp == nil && (sched.gfreeStack != nil || sched.gfreeNoStack != nil) {
|
2015-10-18 18:04:05 -06:00
|
|
|
lock(&sched.gflock)
|
2016-03-11 14:27:51 -07:00
|
|
|
for _p_.gfreecnt < 32 {
|
|
|
|
if sched.gfreeStack != nil {
|
|
|
|
// Prefer Gs with stacks.
|
|
|
|
gp = sched.gfreeStack
|
|
|
|
sched.gfreeStack = gp.schedlink.ptr()
|
|
|
|
} else if sched.gfreeNoStack != nil {
|
|
|
|
gp = sched.gfreeNoStack
|
|
|
|
sched.gfreeNoStack = gp.schedlink.ptr()
|
|
|
|
} else {
|
|
|
|
break
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
_p_.gfreecnt++
|
|
|
|
sched.ngfree--
|
|
|
|
gp.schedlink.set(_p_.gfree)
|
|
|
|
_p_.gfree = gp
|
|
|
|
}
|
|
|
|
unlock(&sched.gflock)
|
|
|
|
goto retry
|
|
|
|
}
|
|
|
|
if gp != nil {
|
|
|
|
_p_.gfree = gp.schedlink.ptr()
|
|
|
|
_p_.gfreecnt--
|
|
|
|
if gp.stack.lo == 0 {
|
2016-03-01 16:21:55 -07:00
|
|
|
// Stack was deallocated in gfput. Allocate a new one.
|
2015-10-18 18:04:05 -06:00
|
|
|
systemstack(func() {
|
|
|
|
gp.stack, gp.stkbar = stackalloc(_FixedStack)
|
|
|
|
})
|
|
|
|
gp.stackguard0 = gp.stack.lo + _StackGuard
|
|
|
|
gp.stackAlloc = _FixedStack
|
|
|
|
} else {
|
|
|
|
if raceenabled {
|
|
|
|
racemalloc(unsafe.Pointer(gp.stack.lo), gp.stackAlloc)
|
|
|
|
}
|
2015-10-21 12:04:42 -06:00
|
|
|
if msanenabled {
|
|
|
|
msanmalloc(unsafe.Pointer(gp.stack.lo), gp.stackAlloc)
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return gp
|
|
|
|
}
|
|
|
|
|
|
|
|
// Purge all cached G's from gfree list to the global list.
|
|
|
|
func gfpurge(_p_ *p) {
|
|
|
|
lock(&sched.gflock)
|
|
|
|
for _p_.gfreecnt != 0 {
|
|
|
|
_p_.gfreecnt--
|
|
|
|
gp := _p_.gfree
|
|
|
|
_p_.gfree = gp.schedlink.ptr()
|
2016-03-11 14:27:51 -07:00
|
|
|
if gp.stack.lo == 0 {
|
|
|
|
gp.schedlink.set(sched.gfreeNoStack)
|
|
|
|
sched.gfreeNoStack = gp
|
|
|
|
} else {
|
|
|
|
gp.schedlink.set(sched.gfreeStack)
|
|
|
|
sched.gfreeStack = gp
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
sched.ngfree++
|
|
|
|
}
|
|
|
|
unlock(&sched.gflock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Breakpoint executes a breakpoint trap.
|
|
|
|
func Breakpoint() {
|
|
|
|
breakpoint()
|
|
|
|
}
|
|
|
|
|
|
|
|
// dolockOSThread is called by LockOSThread and lockOSThread below
|
|
|
|
// after they modify m.locked. Do not allow preemption during this call,
|
|
|
|
// or else the m might be different in this function than in the caller.
|
|
|
|
//go:nosplit
|
|
|
|
func dolockOSThread() {
|
|
|
|
_g_ := getg()
|
|
|
|
_g_.m.lockedg = _g_
|
|
|
|
_g_.lockedm = _g_.m
|
|
|
|
}
|
|
|
|
|
|
|
|
//go:nosplit
|
|
|
|
|
|
|
|
// LockOSThread wires the calling goroutine to its current operating system thread.
|
|
|
|
// Until the calling goroutine exits or calls UnlockOSThread, it will always
|
|
|
|
// execute in that thread, and no other goroutine can.
|
|
|
|
func LockOSThread() {
|
|
|
|
getg().m.locked |= _LockExternal
|
|
|
|
dolockOSThread()
|
|
|
|
}
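
// Typical use of LockOSThread: keep a goroutine on one OS thread for the
// lifetime of work that depends on thread state (for example a C library
// with thread-local context, or thread-scoped OS settings). Illustrative
// only:

package main

import (
	"fmt"
	"runtime"
)

func threadBoundWork() {
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	// Everything between Lock and Unlock runs on the same OS thread, so
	// thread-local state set here stays visible to later calls.
	fmt.Println("running wired to one OS thread")
}

func main() {
	done := make(chan struct{})
	go func() {
		threadBoundWork()
		close(done)
	}()
	<-done
}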
|
|
|
|
|
|
|
|
//go:nosplit
|
|
|
|
func lockOSThread() {
|
|
|
|
getg().m.locked += _LockInternal
|
|
|
|
dolockOSThread()
|
|
|
|
}
|
|
|
|
|
|
|
|
// dounlockOSThread is called by UnlockOSThread and unlockOSThread below
|
|
|
|
// after they update m->locked. Do not allow preemption during this call,
|
|
|
|
// or else the m might be different in this function than in the caller.
|
|
|
|
//go:nosplit
|
|
|
|
func dounlockOSThread() {
|
|
|
|
_g_ := getg()
|
|
|
|
if _g_.m.locked != 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
_g_.m.lockedg = nil
|
|
|
|
_g_.lockedm = nil
|
|
|
|
}
|
|
|
|
|
|
|
|
//go:nosplit
|
|
|
|
|
|
|
|
// UnlockOSThread unwires the calling goroutine from its fixed operating system thread.
|
|
|
|
// If the calling goroutine has not called LockOSThread, UnlockOSThread is a no-op.
|
|
|
|
func UnlockOSThread() {
|
|
|
|
getg().m.locked &^= _LockExternal
|
|
|
|
dounlockOSThread()
|
|
|
|
}
|
|
|
|
|
|
|
|
//go:nosplit
|
|
|
|
func unlockOSThread() {
|
|
|
|
_g_ := getg()
|
|
|
|
if _g_.m.locked < _LockInternal {
|
|
|
|
systemstack(badunlockosthread)
|
|
|
|
}
|
|
|
|
_g_.m.locked -= _LockInternal
|
|
|
|
dounlockOSThread()
|
|
|
|
}
|
|
|
|
|
|
|
|
func badunlockosthread() {
|
|
|
|
throw("runtime: internal error: misuse of lockOSThread/unlockOSThread")
|
|
|
|
}
|
|
|
|
|
|
|
|
func gcount() int32 {
|
2016-01-06 19:16:01 -07:00
|
|
|
n := int32(allglen) - sched.ngfree - int32(atomic.Load(&sched.ngsys))
|
2015-10-18 18:04:05 -06:00
|
|
|
for i := 0; ; i++ {
|
|
|
|
_p_ := allp[i]
|
|
|
|
if _p_ == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
n -= _p_.gfreecnt
|
|
|
|
}
|
|
|
|
|
|
|
|
// All these variables can be changed concurrently, so the result can be inconsistent.
|
|
|
|
// But at least the current goroutine is running.
|
|
|
|
if n < 1 {
|
|
|
|
n = 1
|
|
|
|
}
|
|
|
|
return n
|
|
|
|
}
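
// gcount is what the exported runtime.NumGoroutine reports; as the comment
// above says, the result is only a snapshot. Illustrative use:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	for i := 0; i < 3; i++ {
		go func() { select {} }() // goroutines that stay parked
	}
	fmt.Println("approximate goroutine count:", runtime.NumGoroutine())
}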
|
|
|
|
|
|
|
|
func mcount() int32 {
|
|
|
|
return sched.mcount
|
|
|
|
}
|
|
|
|
|
|
|
|
var prof struct {
|
|
|
|
lock uint32
|
|
|
|
hz int32
|
|
|
|
}
|
|
|
|
|
|
|
|
func _System() { _System() }
|
|
|
|
func _ExternalCode() { _ExternalCode() }
|
|
|
|
func _GC() { _GC() }
|
|
|
|
|
|
|
|
// Called if we receive a SIGPROF signal.
|
2016-06-07 22:46:25 -06:00
|
|
|
// Called by the signal handler, may run during STW.
|
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
|
|
|
|
if prof.hz == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Profiling runs concurrently with GC, so it must not allocate.
|
|
|
|
// Set a trap in case the code does allocate.
|
|
|
|
// Note that on windows, one thread takes profiles of all the
|
|
|
|
// other threads, so mp is usually not getg().m.
|
|
|
|
// In fact mp may not even be stopped.
|
|
|
|
// See golang.org/issue/17165.
|
|
|
|
getg().m.mallocing++
|
2015-10-18 18:04:05 -06:00
|
|
|
|
|
|
|
// Define that a "user g" is a user-created goroutine, and a "system g"
|
|
|
|
// is one that is m->g0 or m->gsignal.
|
|
|
|
//
|
|
|
|
// We might be interrupted for profiling halfway through a
|
|
|
|
// goroutine switch. The switch involves updating three (or four) values:
|
|
|
|
// g, PC, SP, and (on arm) LR. The PC must be the last to be updated,
|
|
|
|
// because once it gets updated the new g is running.
|
|
|
|
//
|
|
|
|
// When switching from a user g to a system g, LR is not considered live,
|
|
|
|
// so the update only affects g, SP, and PC. Since PC must be last,
|
|
|
|
// the possible partial transitions in ordinary execution are (1) g alone is updated,
|
|
|
|
// (2) both g and SP are updated, and (3) SP alone is updated.
|
|
|
|
// If SP or g alone is updated, we can detect the partial transition by checking
|
|
|
|
// whether the SP is within g's stack bounds. (We could also require that SP
|
|
|
|
// be changed only after g, but the stack bounds check is needed by other
|
|
|
|
// cases, so there is no need to impose an additional requirement.)
|
|
|
|
//
|
|
|
|
// There is one exceptional transition to a system g, not in ordinary execution.
|
|
|
|
// When a signal arrives, the operating system starts the signal handler running
|
|
|
|
// with an updated PC and SP. The g is updated last, at the beginning of the
|
|
|
|
// handler. There are two reasons this is okay. First, until g is updated the
|
|
|
|
// g and SP do not match, so the stack bounds check detects the partial transition.
|
|
|
|
// Second, signal handlers currently run with signals disabled, so a profiling
|
|
|
|
// signal cannot arrive during the handler.
|
|
|
|
//
|
|
|
|
// When switching from a system g to a user g, there are three possibilities.
|
|
|
|
//
|
|
|
|
// First, it may be that the g switch has no PC update, because the SP
|
|
|
|
// either corresponds to a user g throughout (as in asmcgocall)
|
|
|
|
// or because it has been arranged to look like a user g frame
|
|
|
|
// (as in cgocallback_gofunc). In this case, since the entire
|
|
|
|
// transition is a g+SP update, a partial transition updating just one of
|
|
|
|
// those will be detected by the stack bounds check.
|
|
|
|
//
|
|
|
|
// Second, when returning from a signal handler, the PC and SP updates
|
|
|
|
// are performed by the operating system in an atomic update, so the g
|
|
|
|
// update must be done before them. The stack bounds check detects
|
|
|
|
// the partial transition here, and (again) signal handlers run with signals
|
|
|
|
// disabled, so a profiling signal cannot arrive then anyway.
|
|
|
|
//
|
|
|
|
// Third, the common case: it may be that the switch updates g, SP, and PC
|
|
|
|
// separately. If the PC is within any of the functions that does this,
|
|
|
|
// we don't ask for a traceback. See the function setsSP for more about this.
|
|
|
|
//
|
|
|
|
// There is another apparently viable approach, recorded here in case
|
|
|
|
// the "PC within setsSP function" check turns out not to be usable.
|
|
|
|
// It would be possible to delay the update of either g or SP until immediately
|
|
|
|
// before the PC update instruction. Then, because of the stack bounds check,
|
|
|
|
// the only problematic interrupt point is just before that PC update instruction,
|
|
|
|
// and the sigprof handler can detect that instruction and simulate stepping past
|
|
|
|
// it in order to reach a consistent state. On ARM, the update of g must be made
|
|
|
|
// in two places (in R10 and also in a TLS slot), so the delayed update would
|
|
|
|
// need to be the SP update. The sigprof handler must read the instruction at
|
|
|
|
// the current PC and if it was the known instruction (for example, JMP BX or
|
|
|
|
// MOV R2, PC), use that other register in place of the PC value.
|
|
|
|
// The biggest drawback to this solution is that it requires that we can tell
|
|
|
|
// whether it's safe to read from the memory pointed at by PC.
|
|
|
|
// In a correct program, we can test PC == nil and otherwise read,
|
|
|
|
// but if a profiling signal happens at the instant that a program executes
|
|
|
|
// a bad jump (before the program manages to handle the resulting fault)
|
|
|
|
// the profiling handler could fault trying to read nonexistent memory.
|
|
|
|
//
|
|
|
|
// To recap, there are no constraints on the assembly being used for the
|
|
|
|
// transition. We simply require that g and SP match and that the PC is not
|
|
|
|
// in gogo.
|
|
|
|
traceback := true
|
2015-10-18 18:04:05 -06:00
|
|
|
if gp == nil || sp < gp.stack.lo || gp.stack.hi < sp || setsSP(pc) {
|
|
|
|
traceback = false
|
|
|
|
}
|
|
|
|
var stk [maxCPUProfStack]uintptr
|
|
|
|
var haveStackLock *g
|
2015-10-18 18:04:05 -06:00
|
|
|
n := 0
|
|
|
|
if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 {
|
2016-04-29 16:20:27 -06:00
|
|
|
cgoOff := 0
|
|
|
|
// Check cgoCallersUse to make sure that we are not
|
|
|
|
// interrupting other code that is fiddling with
|
|
|
|
// cgoCallers. We are running in a signal handler
|
|
|
|
// with all signals blocked, so we don't have to worry
|
|
|
|
// about any other code interrupting us.
|
|
|
|
if atomic.Load(&mp.cgoCallersUse) == 0 && mp.cgoCallers != nil && mp.cgoCallers[0] != 0 {
|
|
|
|
for cgoOff < len(mp.cgoCallers) && mp.cgoCallers[cgoOff] != 0 {
|
|
|
|
cgoOff++
|
|
|
|
}
|
|
|
|
copy(stk[:], mp.cgoCallers[:cgoOff])
|
|
|
|
mp.cgoCallers[0] = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// Collect Go stack that leads to the cgo call.
|
2016-01-05 13:21:27 -07:00
|
|
|
if gcTryLockStackBarriers(mp.curg) {
|
|
|
|
haveStackLock = mp.curg
|
2016-04-29 16:20:27 -06:00
|
|
|
n = gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[cgoOff], len(stk)-cgoOff, nil, nil, 0)
|
2016-01-05 13:21:27 -07:00
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
} else if traceback {
|
2015-11-23 13:13:16 -07:00
|
|
|
var flags uint = _TraceTrap
|
2016-01-05 13:21:27 -07:00
|
|
|
if gp.m.curg != nil && gcTryLockStackBarriers(gp.m.curg) {
|
|
|
|
// It's safe to traceback the user stack.
|
|
|
|
haveStackLock = gp.m.curg
|
2015-11-23 13:13:16 -07:00
|
|
|
flags |= _TraceJumpStack
|
2015-11-11 13:34:54 -07:00
|
|
|
}
|
2016-01-05 13:21:27 -07:00
|
|
|
// Traceback is safe if we're on the system stack (if
|
|
|
|
// necessary, flags will stop it before switching to
|
|
|
|
// the user stack), or if we locked the user stack.
|
|
|
|
if gp != gp.m.curg || haveStackLock != nil {
|
|
|
|
n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, flags)
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
2016-01-05 13:21:27 -07:00
|
|
|
if haveStackLock != nil {
|
|
|
|
gcUnlockStackBarriers(haveStackLock)
|
|
|
|
}
|
|
|
|
|
2016-01-06 12:02:50 -07:00
|
|
|
if n <= 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
// Normal traceback is impossible or has failed.
|
|
|
|
// See if it falls into several common cases.
|
|
|
|
n = 0
|
2016-01-05 13:21:27 -07:00
|
|
|
if GOOS == "windows" && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
// Libcall, i.e. runtime syscall on windows.
|
|
|
|
// Collect Go stack that leads to the call.
|
2016-01-05 13:21:27 -07:00
|
|
|
if gcTryLockStackBarriers(mp.libcallg.ptr()) {
|
|
|
|
n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0)
|
|
|
|
gcUnlockStackBarriers(mp.libcallg.ptr())
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
if n == 0 {
|
|
|
|
// If all of the above has failed, account it against abstract "System" or "GC".
|
|
|
|
n = 2
|
|
|
|
// "ExternalCode" is better than "etext".
|
|
|
|
if pc > firstmoduledata.etext {
|
2015-11-11 10:39:30 -07:00
|
|
|
pc = funcPC(_ExternalCode) + sys.PCQuantum
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
stk[0] = pc
|
|
|
|
if mp.preemptoff != "" || mp.helpgc != 0 {
|
2015-11-11 10:39:30 -07:00
|
|
|
stk[1] = funcPC(_GC) + sys.PCQuantum
|
2015-10-18 18:04:05 -06:00
|
|
|
} else {
|
2015-11-11 10:39:30 -07:00
|
|
|
stk[1] = funcPC(_System) + sys.PCQuantum
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if prof.hz != 0 {
|
|
|
|
// Simple cas-lock to coordinate with setcpuprofilerate.
|
2015-11-02 12:09:24 -07:00
|
|
|
for !atomic.Cas(&prof.lock, 0, 1) {
|
2015-10-18 18:04:05 -06:00
|
|
|
osyield()
|
|
|
|
}
|
|
|
|
if prof.hz != 0 {
|
|
|
|
cpuprof.add(stk[:n])
|
|
|
|
}
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store(&prof.lock, 0)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
runtime: fix Windows profiling crash
I don't have any way to test or reproduce this problem,
but the current code is clearly wrong for Windows.
Make it better.
As I said on #17165:
But the borrowing of M's and the profiling of M's by the CPU profiler
seem not synchronized enough. This code implements the CPU profiler
on Windows:
func profileloop1(param uintptr) uint32 {
stdcall2(_SetThreadPriority, currentThread, _THREAD_PRIORITY_HIGHEST)
for {
stdcall2(_WaitForSingleObject, profiletimer, _INFINITE)
first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
for mp := first; mp != nil; mp = mp.alllink {
thread := atomic.Loaduintptr(&mp.thread)
// Do not profile threads blocked on Notes,
// this includes idle worker threads,
// idle timer thread, idle heap scavenger, etc.
if thread == 0 || mp.profilehz == 0 || mp.blocked {
continue
}
stdcall1(_SuspendThread, thread)
if mp.profilehz != 0 && !mp.blocked {
profilem(mp)
}
stdcall1(_ResumeThread, thread)
}
}
}
func profilem(mp *m) {
var r *context
rbuf := make([]byte, unsafe.Sizeof(*r)+15)
tls := &mp.tls[0]
gp := *((**g)(unsafe.Pointer(tls)))
// align Context to 16 bytes
r = (*context)(unsafe.Pointer((uintptr(unsafe.Pointer(&rbuf[15]))) &^ 15))
r.contextflags = _CONTEXT_CONTROL
stdcall2(_GetThreadContext, mp.thread, uintptr(unsafe.Pointer(r)))
sigprof(r.ip(), r.sp(), 0, gp, mp)
}
func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
if prof.hz == 0 {
return
}
// Profiling runs concurrently with GC, so it must not allocate.
mp.mallocing++
... lots of code ...
mp.mallocing--
}
A borrowed M may migrate between threads. Between the
atomic.Loaduintptr(&mp.thread) and the SuspendThread, mp may have
moved to a new thread, so that it's in active use. In particular
it might be calling malloc, as in the crash stack trace. If so, the
mp.mallocing++ in sigprof would provoke the crash.
Those lines are trying to guard against allocation during sigprof.
But on Windows, mp is the thread being traced, not the current
thread. Those lines should really be using getg().m.mallocing, which
is the same on Unix but not on Windows. With that change, it's
possible the race on the actual thread is not a problem: the traceback
would get confused and eventually return an error, but that's fine.
The code expects that possibility.
Fixes #17165.
Change-Id: If6619731910d65ca4b1a6e7de761fa2518ef339e
Reviewed-on: https://go-review.googlesource.com/33132
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-11-11 08:27:36 -07:00
|
|
|
getg().m.mallocing--
|
2015-10-18 18:04:05 -06:00
|
|
|
}
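The prof.lock coordination above is just a spin lock built from a compare-and-swap on a uint32. Below is a minimal, self-contained sketch of the same pattern written with the public sync/atomic package; the profLock/profUnlock names and the use of runtime.Gosched in place of the runtime-internal osyield are our assumptions, not part of proc.go.

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

// lockWord plays the role of prof.lock: 0 means unlocked, 1 means locked.
var lockWord uint32

// profLock spins until it flips lockWord from 0 to 1, yielding between
// attempts the way sigprof calls osyield (Gosched is only an approximation
// available outside the runtime).
func profLock() {
	for !atomic.CompareAndSwapUint32(&lockWord, 0, 1) {
		runtime.Gosched()
	}
}

// profUnlock releases the lock so the next CAS can succeed.
func profUnlock() {
	atomic.StoreUint32(&lockWord, 0)
}

func main() {
	profLock()
	fmt.Println("holding the cas-lock; add samples here")
	profUnlock()
}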
|
|
|
|
|
2016-06-07 22:46:25 -06:00
|
|
|
// If the signal handler receives a SIGPROF signal on a non-Go thread,
|
|
|
|
// it tries to collect a traceback into sigprofCallers.
|
|
|
|
// sigprofCallersUse is set to non-zero while sigprofCallers holds a traceback.
|
|
|
|
var sigprofCallers cgoCallers
|
|
|
|
var sigprofCallersUse uint32
|
|
|
|
|
2016-10-04 08:11:55 -06:00
|
|
|
// sigprofNonGo is called if we receive a SIGPROF signal on a non-Go thread,
|
|
|
|
// and the signal handler collected a stack trace in sigprofCallers.
|
2016-06-07 22:46:25 -06:00
|
|
|
// When this is called, sigprofCallersUse will be non-zero.
|
|
|
|
// g is nil, and what we can do is very limited.
|
|
|
|
//go:nosplit
|
|
|
|
//go:nowritebarrierrec
|
|
|
|
func sigprofNonGo() {
|
|
|
|
if prof.hz != 0 {
|
|
|
|
n := 0
|
|
|
|
for n < len(sigprofCallers) && sigprofCallers[n] != 0 {
|
|
|
|
n++
|
|
|
|
}
|
|
|
|
|
|
|
|
// Simple cas-lock to coordinate with setcpuprofilerate.
|
2016-10-04 08:11:55 -06:00
|
|
|
for !atomic.Cas(&prof.lock, 0, 1) {
|
|
|
|
osyield()
|
2016-06-07 22:46:25 -06:00
|
|
|
}
|
2016-10-04 08:11:55 -06:00
|
|
|
if prof.hz != 0 {
|
|
|
|
cpuprof.addNonGo(sigprofCallers[:n])
|
|
|
|
}
|
|
|
|
atomic.Store(&prof.lock, 0)
|
2016-06-07 22:46:25 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
atomic.Store(&sigprofCallersUse, 0)
|
|
|
|
}
|
|
|
|
|
2016-10-04 08:11:55 -06:00
|
|
|
// sigprofNonGoPC is called when a profiling signal arrived on a
|
|
|
|
// non-Go thread and we have a single PC value, not a stack trace.
|
|
|
|
// g is nil, and what we can do is very limited.
|
|
|
|
//go:nosplit
|
|
|
|
//go:nowritebarrierrec
|
|
|
|
func sigprofNonGoPC(pc uintptr) {
|
|
|
|
if prof.hz != 0 {
|
|
|
|
pc := []uintptr{
|
|
|
|
pc,
|
|
|
|
funcPC(_ExternalCode) + sys.PCQuantum,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Simple cas-lock to coordinate with setcpuprofilerate.
|
|
|
|
for !atomic.Cas(&prof.lock, 0, 1) {
|
|
|
|
osyield()
|
|
|
|
}
|
|
|
|
if prof.hz != 0 {
|
|
|
|
cpuprof.addNonGo(pc)
|
|
|
|
}
|
|
|
|
atomic.Store(&prof.lock, 0)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-18 18:04:05 -06:00
|
|
|
// Reports whether a function will set the SP
|
|
|
|
// to an absolute value. Important that
|
|
|
|
// we don't traceback when these are at the bottom
|
|
|
|
// of the stack since we can't be sure that we will
|
|
|
|
// find the caller.
|
|
|
|
//
|
|
|
|
// If the function is not on the bottom of the stack
|
|
|
|
// we assume that it will have set it up so that traceback will be consistent,
|
|
|
|
// either by being a traceback terminating function
|
|
|
|
// or putting one on the stack at the right offset.
|
|
|
|
func setsSP(pc uintptr) bool {
|
|
|
|
f := findfunc(pc)
|
|
|
|
if f == nil {
|
|
|
|
// couldn't find the function for this PC,
|
|
|
|
// so assume the worst and stop traceback
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
switch f.entry {
|
|
|
|
case gogoPC, systemstackPC, mcallPC, morestackPC:
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// Arrange to call fn with a traceback hz times a second.
|
|
|
|
func setcpuprofilerate_m(hz int32) {
|
|
|
|
// Force sane arguments.
|
|
|
|
if hz < 0 {
|
|
|
|
hz = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// Disable preemption, otherwise we can be rescheduled to another thread
|
|
|
|
// that has profiling enabled.
|
|
|
|
_g_ := getg()
|
|
|
|
_g_.m.locks++
|
|
|
|
|
|
|
|
// Stop profiler on this thread so that it is safe to lock prof.
|
|
|
|
// if a profiling signal came in while we had prof locked,
|
|
|
|
// it would deadlock.
|
|
|
|
resetcpuprofiler(0)
|
|
|
|
|
2015-11-02 12:09:24 -07:00
|
|
|
for !atomic.Cas(&prof.lock, 0, 1) {
|
2015-10-18 18:04:05 -06:00
|
|
|
osyield()
|
|
|
|
}
|
|
|
|
prof.hz = hz
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store(&prof.lock, 0)
|
2015-10-18 18:04:05 -06:00
|
|
|
|
|
|
|
lock(&sched.lock)
|
|
|
|
sched.profilehz = hz
|
|
|
|
unlock(&sched.lock)
|
|
|
|
|
|
|
|
if hz != 0 {
|
|
|
|
resetcpuprofiler(hz)
|
|
|
|
}
|
|
|
|
|
|
|
|
_g_.m.locks--
|
|
|
|
}
|
|
|
|
|
2016-03-01 16:21:55 -07:00
|
|
|
// Change number of processors. The world is stopped, sched is locked.
|
2015-10-18 18:04:05 -06:00
|
|
|
// gcworkbufs are not being modified by either the GC or
|
|
|
|
// the write barrier code.
|
|
|
|
// Returns list of Ps with local work, they need to be scheduled by the caller.
|
|
|
|
func procresize(nprocs int32) *p {
|
|
|
|
old := gomaxprocs
|
|
|
|
if old < 0 || old > _MaxGomaxprocs || nprocs <= 0 || nprocs > _MaxGomaxprocs {
|
|
|
|
throw("procresize: invalid arg")
|
|
|
|
}
|
|
|
|
if trace.enabled {
|
|
|
|
traceGomaxprocs(nprocs)
|
|
|
|
}
|
|
|
|
|
|
|
|
// update statistics
|
|
|
|
now := nanotime()
|
|
|
|
if sched.procresizetime != 0 {
|
|
|
|
sched.totaltime += int64(old) * (now - sched.procresizetime)
|
|
|
|
}
|
|
|
|
sched.procresizetime = now
|
|
|
|
|
|
|
|
// initialize new P's
|
|
|
|
for i := int32(0); i < nprocs; i++ {
|
|
|
|
pp := allp[i]
|
|
|
|
if pp == nil {
|
|
|
|
pp = new(p)
|
|
|
|
pp.id = i
|
|
|
|
pp.status = _Pgcstop
|
|
|
|
pp.sudogcache = pp.sudogbuf[:0]
|
|
|
|
for i := range pp.deferpool {
|
|
|
|
pp.deferpool[i] = pp.deferpoolbuf[i][:0]
|
|
|
|
}
|
|
|
|
atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp))
|
|
|
|
}
|
|
|
|
if pp.mcache == nil {
|
|
|
|
if old == 0 && i == 0 {
|
|
|
|
if getg().m.mcache == nil {
|
|
|
|
throw("missing mcache?")
|
|
|
|
}
|
|
|
|
pp.mcache = getg().m.mcache // bootstrap
|
|
|
|
} else {
|
|
|
|
pp.mcache = allocmcache()
|
|
|
|
}
|
|
|
|
}
|
2016-02-26 13:57:16 -07:00
|
|
|
if raceenabled && pp.racectx == 0 {
|
|
|
|
if old == 0 && i == 0 {
|
|
|
|
pp.racectx = raceprocctx0
|
|
|
|
raceprocctx0 = 0 // bootstrap
|
|
|
|
} else {
|
|
|
|
pp.racectx = raceproccreate()
|
|
|
|
}
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
// free unused P's
|
|
|
|
for i := nprocs; i < old; i++ {
|
|
|
|
p := allp[i]
|
|
|
|
if trace.enabled {
|
|
|
|
if p == getg().m.p.ptr() {
|
|
|
|
// moving to p[0], pretend that we were descheduled
|
|
|
|
// and then scheduled again to keep the trace sane.
|
|
|
|
traceGoSched()
|
|
|
|
traceProcStop(p)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// move all runnable goroutines to the global queue
|
|
|
|
for p.runqhead != p.runqtail {
|
|
|
|
// pop from tail of local queue
|
|
|
|
p.runqtail--
|
2015-11-02 14:59:39 -07:00
|
|
|
gp := p.runq[p.runqtail%uint32(len(p.runq))].ptr()
|
2015-10-18 18:04:05 -06:00
|
|
|
// push onto head of global queue
|
|
|
|
globrunqputhead(gp)
|
|
|
|
}
|
|
|
|
if p.runnext != 0 {
|
|
|
|
globrunqputhead(p.runnext.ptr())
|
|
|
|
p.runnext = 0
|
|
|
|
}
|
|
|
|
// if there's a background worker, make it runnable and put
|
|
|
|
// it on the global queue so it can clean itself up
|
2016-01-26 12:44:58 -07:00
|
|
|
if gp := p.gcBgMarkWorker.ptr(); gp != nil {
|
|
|
|
casgstatus(gp, _Gwaiting, _Grunnable)
|
2015-10-18 18:04:05 -06:00
|
|
|
if trace.enabled {
|
2016-01-26 12:44:58 -07:00
|
|
|
traceGoUnpark(gp, 0)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
2016-01-26 12:44:58 -07:00
|
|
|
globrunqput(gp)
|
2016-01-19 20:45:37 -07:00
|
|
|
// This assignment doesn't race because the
|
|
|
|
// world is stopped.
|
2016-01-26 12:44:58 -07:00
|
|
|
p.gcBgMarkWorker.set(nil)
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
for i := range p.sudogbuf {
|
|
|
|
p.sudogbuf[i] = nil
|
|
|
|
}
|
|
|
|
p.sudogcache = p.sudogbuf[:0]
|
|
|
|
for i := range p.deferpool {
|
|
|
|
for j := range p.deferpoolbuf[i] {
|
|
|
|
p.deferpoolbuf[i][j] = nil
|
|
|
|
}
|
|
|
|
p.deferpool[i] = p.deferpoolbuf[i][:0]
|
|
|
|
}
|
|
|
|
freemcache(p.mcache)
|
|
|
|
p.mcache = nil
|
|
|
|
gfpurge(p)
|
|
|
|
traceProcFree(p)
|
2016-02-26 13:57:16 -07:00
|
|
|
if raceenabled {
|
|
|
|
raceprocdestroy(p.racectx)
|
|
|
|
p.racectx = 0
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
p.status = _Pdead
|
|
|
|
// can't free P itself because it can be referenced by an M in syscall
|
|
|
|
}
|
|
|
|
|
|
|
|
_g_ := getg()
|
|
|
|
if _g_.m.p != 0 && _g_.m.p.ptr().id < nprocs {
|
|
|
|
// continue to use the current P
|
|
|
|
_g_.m.p.ptr().status = _Prunning
|
|
|
|
} else {
|
|
|
|
// release the current P and acquire allp[0]
|
|
|
|
if _g_.m.p != 0 {
|
|
|
|
_g_.m.p.ptr().m = 0
|
|
|
|
}
|
|
|
|
_g_.m.p = 0
|
|
|
|
_g_.m.mcache = nil
|
|
|
|
p := allp[0]
|
|
|
|
p.m = 0
|
|
|
|
p.status = _Pidle
|
|
|
|
acquirep(p)
|
|
|
|
if trace.enabled {
|
|
|
|
traceGoStart()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
var runnablePs *p
|
|
|
|
for i := nprocs - 1; i >= 0; i-- {
|
|
|
|
p := allp[i]
|
|
|
|
if _g_.m.p.ptr() == p {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
p.status = _Pidle
|
|
|
|
if runqempty(p) {
|
|
|
|
pidleput(p)
|
|
|
|
} else {
|
|
|
|
p.m.set(mget())
|
|
|
|
p.link.set(runnablePs)
|
|
|
|
runnablePs = p
|
|
|
|
}
|
|
|
|
}
|
2016-03-18 05:52:52 -06:00
|
|
|
stealOrder.reset(uint32(nprocs))
|
2015-10-18 18:04:05 -06:00
|
|
|
var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
|
2015-10-18 18:04:05 -06:00
|
|
|
return runnablePs
|
|
|
|
}
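procresize hands back the Ps that still have local work as an intrusive singly linked list threaded through p.link, and the caller is expected to walk that list and start an M for each entry. Here is a minimal sketch of that consumption pattern with a stand-in proc type; the names are ours, and the real consumer is the start-the-world path in this file.

package main

import "fmt"

// proc stands in for the runtime's p; link plays the role of p.link.
type proc struct {
	id   int
	link *proc
}

// startTheWorldSketch walks the list returned by a procresize-like call,
// detaching one entry at a time and "scheduling" it.
func startTheWorldSketch(runnable *proc) {
	for runnable != nil {
		p := runnable
		runnable = p.link
		p.link = nil // detach before handing the P to an M
		fmt.Println("start an M to run P", p.id)
	}
}

func main() {
	// Build the list the way procresize does: by prepending, so the last
	// entry added ends up at the head.
	var head *proc
	for id := 3; id >= 1; id-- {
		head = &proc{id: id, link: head}
	}
	startTheWorldSketch(head)
}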
|
|
|
|
|
|
|
|
// Associate p and the current m.
|
2016-10-10 15:14:14 -06:00
|
|
|
//
|
|
|
|
// This function is allowed to have write barriers even if the caller
|
|
|
|
// isn't because it immediately acquires _p_.
|
|
|
|
//
|
|
|
|
//go:yeswritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func acquirep(_p_ *p) {
|
2016-10-10 15:14:14 -06:00
|
|
|
// Do the part that isn't allowed to have write barriers.
|
2015-10-18 18:04:05 -06:00
|
|
|
acquirep1(_p_)
|
|
|
|
|
|
|
|
// have p; write barriers now allowed
|
|
|
|
_g_ := getg()
|
|
|
|
_g_.m.mcache = _p_.mcache
|
|
|
|
|
|
|
|
if trace.enabled {
|
|
|
|
traceProcStart()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-10-10 15:14:14 -06:00
|
|
|
// acquirep1 is the first step of acquirep, which actually acquires
|
|
|
|
// _p_. This is broken out so we can disallow write barriers for this
|
|
|
|
// part, since we don't yet have a P.
|
|
|
|
//
|
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func acquirep1(_p_ *p) {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if _g_.m.p != 0 || _g_.m.mcache != nil {
|
|
|
|
throw("acquirep: already in go")
|
|
|
|
}
|
|
|
|
if _p_.m != 0 || _p_.status != _Pidle {
|
|
|
|
id := int32(0)
|
|
|
|
if _p_.m != 0 {
|
|
|
|
id = _p_.m.ptr().id
|
|
|
|
}
|
|
|
|
print("acquirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n")
|
|
|
|
throw("acquirep: invalid p state")
|
|
|
|
}
|
|
|
|
_g_.m.p.set(_p_)
|
|
|
|
_p_.m.set(_g_.m)
|
|
|
|
_p_.status = _Prunning
|
|
|
|
}
|
|
|
|
|
|
|
|
// Disassociate p and the current m.
|
|
|
|
func releasep() *p {
|
|
|
|
_g_ := getg()
|
|
|
|
|
|
|
|
if _g_.m.p == 0 || _g_.m.mcache == nil {
|
|
|
|
throw("releasep: invalid arg")
|
|
|
|
}
|
|
|
|
_p_ := _g_.m.p.ptr()
|
|
|
|
if _p_.m.ptr() != _g_.m || _p_.mcache != _g_.m.mcache || _p_.status != _Prunning {
|
|
|
|
print("releasep: m=", _g_.m, " m->p=", _g_.m.p.ptr(), " p->m=", _p_.m, " m->mcache=", _g_.m.mcache, " p->mcache=", _p_.mcache, " p->status=", _p_.status, "\n")
|
|
|
|
throw("releasep: invalid p state")
|
|
|
|
}
|
|
|
|
if trace.enabled {
|
|
|
|
traceProcStop(_g_.m.p.ptr())
|
|
|
|
}
|
|
|
|
_g_.m.p = 0
|
|
|
|
_g_.m.mcache = nil
|
|
|
|
_p_.m = 0
|
|
|
|
_p_.status = _Pidle
|
|
|
|
return _p_
|
|
|
|
}
|
|
|
|
|
|
|
|
func incidlelocked(v int32) {
|
|
|
|
lock(&sched.lock)
|
|
|
|
sched.nmidlelocked += v
|
|
|
|
if v > 0 {
|
|
|
|
checkdead()
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check for deadlock situation.
|
|
|
|
// The check is based on the number of running M's; if 0 -> deadlock.
|
|
|
|
func checkdead() {
|
|
|
|
// For -buildmode=c-shared or -buildmode=c-archive it's OK if
|
2016-03-01 16:21:55 -07:00
|
|
|
// there are no running goroutines. The calling program is
|
2015-10-18 18:04:05 -06:00
|
|
|
// assumed to be running.
|
|
|
|
if islibrary || isarchive {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we are dying because of a signal caught on an already idle thread,
|
|
|
|
// freezetheworld will cause all running threads to block.
|
|
|
|
// And runtime will essentially enter into deadlock state,
|
|
|
|
// except that there is a thread that will call exit soon.
|
|
|
|
if panicking > 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// -1 for sysmon
|
|
|
|
run := sched.mcount - sched.nmidle - sched.nmidlelocked - 1
|
|
|
|
if run > 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if run < 0 {
|
|
|
|
print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", sched.mcount, "\n")
|
|
|
|
throw("checkdead: inconsistent counts")
|
|
|
|
}
|
|
|
|
|
|
|
|
grunning := 0
|
|
|
|
lock(&allglock)
|
|
|
|
for i := 0; i < len(allgs); i++ {
|
|
|
|
gp := allgs[i]
|
|
|
|
if isSystemGoroutine(gp) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
s := readgstatus(gp)
|
|
|
|
switch s &^ _Gscan {
|
|
|
|
case _Gwaiting:
|
|
|
|
grunning++
|
|
|
|
case _Grunnable,
|
|
|
|
_Grunning,
|
|
|
|
_Gsyscall:
|
|
|
|
unlock(&allglock)
|
|
|
|
print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
|
|
|
|
throw("checkdead: runnable g")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
unlock(&allglock)
|
|
|
|
if grunning == 0 { // possible if main goroutine calls runtime·Goexit()
|
|
|
|
throw("no goroutines (main called runtime.Goexit) - deadlock!")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Maybe jump time forward for playground.
|
|
|
|
gp := timejump()
|
|
|
|
if gp != nil {
|
|
|
|
casgstatus(gp, _Gwaiting, _Grunnable)
|
|
|
|
globrunqput(gp)
|
|
|
|
_p_ := pidleget()
|
|
|
|
if _p_ == nil {
|
|
|
|
throw("checkdead: no p for timer")
|
|
|
|
}
|
|
|
|
mp := mget()
|
|
|
|
if mp == nil {
|
2015-11-10 13:24:59 -07:00
|
|
|
// There should always be a free M since
|
|
|
|
// nothing is running.
|
|
|
|
throw("checkdead: no m for timer")
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
2015-11-10 13:24:59 -07:00
|
|
|
mp.nextp.set(_p_)
|
|
|
|
notewakeup(&mp.park)
|
2015-10-18 18:04:05 -06:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
getg().m.throwing = -1 // do not dump full stacks
|
|
|
|
throw("all goroutines are asleep - deadlock!")
|
|
|
|
}
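The first test in checkdead is plain arithmetic over the scheduler counters: every M is either idle, idle-and-locked, sysmon, or running, so mcount - nmidle - nmidlelocked - 1 is the number of Ms that could still be making progress. A tiny worked sketch of that computation (the helper name is ours):

package main

import "fmt"

// runningMs reproduces checkdead's first computation: Ms that are neither
// idle, idle-and-locked, nor the sysmon thread.
func runningMs(mcount, nmidle, nmidlelocked int32) int32 {
	return mcount - nmidle - nmidlelocked - 1 // -1 for sysmon
}

func main() {
	// 5 Ms, 2 idle, 1 idle-and-locked: one M is still busy, no deadlock.
	fmt.Println(runningMs(5, 2, 1)) // 1
	// 4 Ms, 2 idle, 1 idle-and-locked: nothing is running; checkdead goes
	// on to scan goroutine states and may report a deadlock.
	fmt.Println(runningMs(4, 2, 1)) // 0
}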
|
|
|
|
|
|
|
|
// forcegcperiod is the maximum time in nanoseconds between garbage
|
|
|
|
// collections. If we go this long without a garbage collection, one
|
|
|
|
// is forced to run.
|
|
|
|
//
|
|
|
|
// This is a variable for testing purposes. It normally doesn't change.
|
|
|
|
var forcegcperiod int64 = 2 * 60 * 1e9
|
|
|
|
|
2015-11-17 15:31:04 -07:00
|
|
|
// Always runs without a P, so write barriers are not allowed.
|
|
|
|
//
|
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func sysmon() {
|
|
|
|
// If a heap span goes unused for 5 minutes after a garbage collection,
|
|
|
|
// we hand it back to the operating system.
|
|
|
|
scavengelimit := int64(5 * 60 * 1e9)
|
|
|
|
|
|
|
|
if debug.scavenge > 0 {
|
|
|
|
// Scavenge-a-lot for testing.
|
|
|
|
forcegcperiod = 10 * 1e6
|
|
|
|
scavengelimit = 20 * 1e6
|
|
|
|
}
|
|
|
|
|
|
|
|
lastscavenge := nanotime()
|
|
|
|
nscavenge := 0
|
|
|
|
|
|
|
|
lasttrace := int64(0)
|
|
|
|
idle := 0 // how many cycles in succession we had not woken somebody up
|
|
|
|
delay := uint32(0)
|
|
|
|
for {
|
|
|
|
if idle == 0 { // start with 20us sleep...
|
|
|
|
delay = 20
|
|
|
|
} else if idle > 50 { // start doubling the sleep after 1ms...
|
|
|
|
delay *= 2
|
|
|
|
}
|
|
|
|
if delay > 10*1000 { // up to 10ms
|
|
|
|
delay = 10 * 1000
|
|
|
|
}
|
|
|
|
usleep(delay)
|
2016-08-30 10:29:16 -06:00
|
|
|
if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
|
2015-10-18 18:04:05 -06:00
|
|
|
lock(&sched.lock)
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
|
|
|
|
atomic.Store(&sched.sysmonwait, 1)
|
2015-10-18 18:04:05 -06:00
|
|
|
unlock(&sched.lock)
|
|
|
|
// Make wake-up period small enough
|
|
|
|
// for the sampling to be correct.
|
|
|
|
maxsleep := forcegcperiod / 2
|
|
|
|
if scavengelimit < forcegcperiod {
|
|
|
|
maxsleep = scavengelimit / 2
|
|
|
|
}
|
|
|
|
notetsleep(&sched.sysmonnote, maxsleep)
|
|
|
|
lock(&sched.lock)
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store(&sched.sysmonwait, 0)
|
2015-10-18 18:04:05 -06:00
|
|
|
noteclear(&sched.sysmonnote)
|
|
|
|
idle = 0
|
|
|
|
delay = 20
|
|
|
|
}
|
|
|
|
unlock(&sched.lock)
|
|
|
|
}
|
|
|
|
// poll network if not polled for more than 10ms
|
2015-11-02 12:09:24 -07:00
|
|
|
lastpoll := int64(atomic.Load64(&sched.lastpoll))
|
2015-10-18 18:04:05 -06:00
|
|
|
now := nanotime()
|
|
|
|
unixnow := unixnanotime()
|
|
|
|
if lastpoll != 0 && lastpoll+10*1000*1000 < now {
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
|
2015-10-18 18:04:05 -06:00
|
|
|
gp := netpoll(false) // non-blocking - returns list of goroutines
|
|
|
|
if gp != nil {
|
|
|
|
// Need to decrement number of idle locked M's
|
|
|
|
// (pretending that one more is running) before injectglist.
|
|
|
|
// Otherwise it can lead to the following situation:
|
|
|
|
// injectglist grabs all P's but before it starts M's to run the P's,
|
|
|
|
// another M returns from syscall, finishes running its G,
|
|
|
|
// observes that there is no work to do and no other running M's
|
|
|
|
// and reports deadlock.
|
|
|
|
incidlelocked(-1)
|
|
|
|
injectglist(gp)
|
|
|
|
incidlelocked(1)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// retake P's blocked in syscalls
|
|
|
|
// and preempt long running G's
|
|
|
|
if retake(now) != 0 {
|
|
|
|
idle = 0
|
|
|
|
} else {
|
|
|
|
idle++
|
|
|
|
}
|
|
|
|
// check if we need to force a GC
|
2015-11-02 12:09:24 -07:00
|
|
|
lastgc := int64(atomic.Load64(&memstats.last_gc))
|
2015-12-14 15:25:28 -07:00
|
|
|
if gcphase == _GCoff && lastgc != 0 && unixnow-lastgc > forcegcperiod && atomic.Load(&forcegc.idle) != 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
lock(&forcegc.lock)
|
|
|
|
forcegc.idle = 0
|
|
|
|
forcegc.g.schedlink = 0
|
|
|
|
injectglist(forcegc.g)
|
|
|
|
unlock(&forcegc.lock)
|
|
|
|
}
|
|
|
|
// scavenge heap once in a while
|
|
|
|
if lastscavenge+scavengelimit/2 < now {
|
2015-11-11 17:13:51 -07:00
|
|
|
mheap_.scavenge(int32(nscavenge), uint64(now), uint64(scavengelimit))
|
2015-10-18 18:04:05 -06:00
|
|
|
lastscavenge = now
|
|
|
|
nscavenge++
|
|
|
|
}
|
2015-12-10 16:06:42 -07:00
|
|
|
if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace)*1000000 <= now {
|
2015-10-18 18:04:05 -06:00
|
|
|
lasttrace = now
|
|
|
|
schedtrace(debug.scheddetail > 0)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
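The sleep schedule at the top of sysmon's loop is an exponential backoff: 20us while recent cycles found work to retake, doubling once sysmon has gone more than 50 consecutive cycles without waking anybody, and capped at 10ms. A small self-contained sketch of just that schedule (the function and variable names are ours):

package main

import "fmt"

// nextDelay reproduces sysmon's sleep computation in isolation: delay is
// carried across iterations, idle counts cycles that found nothing to do.
func nextDelay(idle int, delay uint32) uint32 {
	if idle == 0 { // start with 20us sleep...
		delay = 20
	} else if idle > 50 { // start doubling the sleep after 1ms...
		delay *= 2
	}
	if delay > 10*1000 { // up to 10ms
		delay = 10 * 1000
	}
	return delay
}

func main() {
	delay := uint32(0)
	for idle := 0; idle <= 70; idle++ {
		delay = nextDelay(idle, delay)
		if idle%10 == 0 || idle == 51 {
			fmt.Printf("idle=%2d  sleep=%5dus\n", idle, delay)
		}
	}
}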
|
|
|
|
|
|
|
|
var pdesc [_MaxGomaxprocs]struct {
|
|
|
|
schedtick uint32
|
|
|
|
schedwhen int64
|
|
|
|
syscalltick uint32
|
|
|
|
syscallwhen int64
|
|
|
|
}
|
|
|
|
|
|
|
|
// forcePreemptNS is the time slice given to a G before it is
|
|
|
|
// preempted.
|
|
|
|
const forcePreemptNS = 10 * 1000 * 1000 // 10ms
|
|
|
|
|
|
|
|
func retake(now int64) uint32 {
|
|
|
|
n := 0
|
|
|
|
for i := int32(0); i < gomaxprocs; i++ {
|
|
|
|
_p_ := allp[i]
|
|
|
|
if _p_ == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
pd := &pdesc[i]
|
|
|
|
s := _p_.status
|
|
|
|
if s == _Psyscall {
|
|
|
|
// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
|
|
|
|
t := int64(_p_.syscalltick)
|
|
|
|
if int64(pd.syscalltick) != t {
|
|
|
|
pd.syscalltick = uint32(t)
|
|
|
|
pd.syscallwhen = now
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// On the one hand we don't want to retake Ps if there is no other work to do,
|
|
|
|
// but on the other hand we want to retake them eventually
|
|
|
|
// because they can prevent the sysmon thread from deep sleep.
|
2015-11-02 12:09:24 -07:00
|
|
|
if runqempty(_p_) && atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now {
|
2015-10-18 18:04:05 -06:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
// Need to decrement number of idle locked M's
|
|
|
|
// (pretending that one more is running) before the CAS.
|
|
|
|
// Otherwise the M from which we retake can exit the syscall,
|
|
|
|
// increment nmidle and report deadlock.
|
|
|
|
incidlelocked(-1)
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Cas(&_p_.status, s, _Pidle) {
|
2015-10-18 18:04:05 -06:00
|
|
|
if trace.enabled {
|
|
|
|
traceGoSysBlock(_p_)
|
|
|
|
traceProcStop(_p_)
|
|
|
|
}
|
|
|
|
n++
|
|
|
|
_p_.syscalltick++
|
|
|
|
handoffp(_p_)
|
|
|
|
}
|
|
|
|
incidlelocked(1)
|
|
|
|
} else if s == _Prunning {
|
|
|
|
// Preempt G if it's running for too long.
|
|
|
|
t := int64(_p_.schedtick)
|
|
|
|
if int64(pd.schedtick) != t {
|
|
|
|
pd.schedtick = uint32(t)
|
|
|
|
pd.schedwhen = now
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if pd.schedwhen+forcePreemptNS > now {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
preemptone(_p_)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return uint32(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tell all goroutines that they have been preempted and they should stop.
|
2016-03-01 16:21:55 -07:00
|
|
|
// This function is purely best-effort. It can fail to inform a goroutine if a
|
2015-10-18 18:04:05 -06:00
|
|
|
// processor just started running it.
|
|
|
|
// No locks need to be held.
|
|
|
|
// Returns true if preemption request was issued to at least one goroutine.
|
|
|
|
func preemptall() bool {
|
|
|
|
res := false
|
|
|
|
for i := int32(0); i < gomaxprocs; i++ {
|
|
|
|
_p_ := allp[i]
|
|
|
|
if _p_ == nil || _p_.status != _Prunning {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if preemptone(_p_) {
|
|
|
|
res = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return res
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tell the goroutine running on processor P to stop.
|
2016-03-01 16:21:55 -07:00
|
|
|
// This function is purely best-effort. It can incorrectly fail to inform the
|
|
|
|
// goroutine. It can inform the wrong goroutine. Even if it informs the
|
2015-10-18 18:04:05 -06:00
|
|
|
// correct goroutine, that goroutine might ignore the request if it is
|
|
|
|
// simultaneously executing newstack.
|
|
|
|
// No lock needs to be held.
|
|
|
|
// Returns true if preemption request was issued.
|
|
|
|
// The actual preemption will happen at some point in the future
|
|
|
|
// and will be indicated by the gp->status no longer being
|
|
|
|
// Grunning
|
|
|
|
func preemptone(_p_ *p) bool {
|
|
|
|
mp := _p_.m.ptr()
|
|
|
|
if mp == nil || mp == getg().m {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
gp := mp.curg
|
|
|
|
if gp == nil || gp == mp.g0 {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
gp.preempt = true
|
|
|
|
|
|
|
|
// Every call in a goroutine checks for stack overflow by
|
|
|
|
// comparing the current stack pointer to gp->stackguard0.
|
|
|
|
// Setting gp->stackguard0 to StackPreempt folds
|
|
|
|
// preemption into the normal stack overflow check.
|
|
|
|
gp.stackguard0 = stackPreempt
|
|
|
|
return true
|
|
|
|
}
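The comment above describes the trick: rather than interrupting the goroutine, preemptone poisons gp.stackguard0 so that the very next function call fails its stack check and drops into the runtime. The following is a minimal sketch of that prologue check in plain Go; fakeG, prologue, and the exact comparison are our simplification of what the compiler actually emits.

package main

import "fmt"

// stackPreempt mirrors the runtime's poison value: a "guard" larger than
// any real stack address, so every prologue check fails once it is set.
const stackPreempt = ^uintptr(0) - 1313

// fakeG stands in for the runtime's g; only the guard word matters here.
type fakeG struct {
	stackguard0 uintptr
}

// prologue mimics the compiler-generated stack check at function entry:
// if SP is at or below stackguard0 the function traps into morestack,
// where the runtime notices the pending preemption request.
func prologue(gp *fakeG, sp uintptr) string {
	if sp <= gp.stackguard0 {
		if gp.stackguard0 == stackPreempt {
			return "morestack: preemption requested"
		}
		return "morestack: grow the stack"
	}
	return "run the function body"
}

func main() {
	gp := &fakeG{stackguard0: 0x1000} // ordinary guard below the live stack
	fmt.Println(prologue(gp, 0x8000)) // run the function body
	gp.stackguard0 = stackPreempt     // what preemptone effectively does
	fmt.Println(prologue(gp, 0x8000)) // morestack: preemption requested
}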
|
|
|
|
|
|
|
|
var starttime int64
|
|
|
|
|
|
|
|
func schedtrace(detailed bool) {
|
|
|
|
now := nanotime()
|
|
|
|
if starttime == 0 {
|
|
|
|
starttime = now
|
|
|
|
}
|
|
|
|
|
|
|
|
lock(&sched.lock)
|
|
|
|
print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", sched.mcount, " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
|
|
|
|
if detailed {
|
|
|
|
print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n")
|
|
|
|
}
|
|
|
|
// We must be careful while reading data from P's, M's and G's.
|
|
|
|
// Even if we hold schedlock, most data can be changed concurrently.
|
|
|
|
// E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil.
|
|
|
|
for i := int32(0); i < gomaxprocs; i++ {
|
|
|
|
_p_ := allp[i]
|
|
|
|
if _p_ == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
mp := _p_.m.ptr()
|
2015-11-02 12:09:24 -07:00
|
|
|
h := atomic.Load(&_p_.runqhead)
|
|
|
|
t := atomic.Load(&_p_.runqtail)
|
2015-10-18 18:04:05 -06:00
|
|
|
if detailed {
|
|
|
|
id := int32(-1)
|
|
|
|
if mp != nil {
|
|
|
|
id = mp.id
|
|
|
|
}
|
|
|
|
print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gfreecnt, "\n")
|
|
|
|
} else {
|
|
|
|
// In non-detailed mode format lengths of per-P run queues as:
|
|
|
|
// [len1 len2 len3 len4]
|
|
|
|
print(" ")
|
|
|
|
if i == 0 {
|
|
|
|
print("[")
|
|
|
|
}
|
|
|
|
print(t - h)
|
|
|
|
if i == gomaxprocs-1 {
|
|
|
|
print("]\n")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if !detailed {
|
|
|
|
unlock(&sched.lock)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
for mp := allm; mp != nil; mp = mp.alllink {
|
|
|
|
_p_ := mp.p.ptr()
|
|
|
|
gp := mp.curg
|
|
|
|
lockedg := mp.lockedg
|
|
|
|
id1 := int32(-1)
|
|
|
|
if _p_ != nil {
|
|
|
|
id1 = _p_.id
|
|
|
|
}
|
|
|
|
id2 := int64(-1)
|
|
|
|
if gp != nil {
|
|
|
|
id2 = gp.goid
|
|
|
|
}
|
|
|
|
id3 := int64(-1)
|
|
|
|
if lockedg != nil {
|
|
|
|
id3 = lockedg.goid
|
|
|
|
}
|
2016-06-28 18:23:01 -06:00
|
|
|
print(" M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " helpgc=", mp.helpgc, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=", id3, "\n")
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
lock(&allglock)
|
|
|
|
for gi := 0; gi < len(allgs); gi++ {
|
|
|
|
gp := allgs[gi]
|
|
|
|
mp := gp.m
|
|
|
|
lockedm := gp.lockedm
|
|
|
|
id1 := int32(-1)
|
|
|
|
if mp != nil {
|
|
|
|
id1 = mp.id
|
|
|
|
}
|
|
|
|
id2 := int32(-1)
|
|
|
|
if lockedm != nil {
|
|
|
|
id2 = lockedm.id
|
|
|
|
}
|
|
|
|
print(" G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason, ") m=", id1, " lockedm=", id2, "\n")
|
|
|
|
}
|
|
|
|
unlock(&allglock)
|
|
|
|
unlock(&sched.lock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Put mp on midle list.
|
|
|
|
// Sched must be locked.
|
|
|
|
// May run during STW, so write barriers are not allowed.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func mput(mp *m) {
|
|
|
|
mp.schedlink = sched.midle
|
|
|
|
sched.midle.set(mp)
|
|
|
|
sched.nmidle++
|
|
|
|
checkdead()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Try to get an m from midle list.
|
|
|
|
// Sched must be locked.
|
|
|
|
// May run during STW, so write barriers are not allowed.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func mget() *m {
|
|
|
|
mp := sched.midle.ptr()
|
|
|
|
if mp != nil {
|
|
|
|
sched.midle = mp.schedlink
|
|
|
|
sched.nmidle--
|
|
|
|
}
|
|
|
|
return mp
|
|
|
|
}
|
|
|
|
|
|
|
|
// Put gp on the global runnable queue.
|
|
|
|
// Sched must be locked.
|
|
|
|
// May run during STW, so write barriers are not allowed.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func globrunqput(gp *g) {
|
|
|
|
gp.schedlink = 0
|
|
|
|
if sched.runqtail != 0 {
|
|
|
|
sched.runqtail.ptr().schedlink.set(gp)
|
|
|
|
} else {
|
|
|
|
sched.runqhead.set(gp)
|
|
|
|
}
|
|
|
|
sched.runqtail.set(gp)
|
|
|
|
sched.runqsize++
|
|
|
|
}
|
|
|
|
|
|
|
|
// Put gp at the head of the global runnable queue.
|
|
|
|
// Sched must be locked.
|
|
|
|
// May run during STW, so write barriers are not allowed.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func globrunqputhead(gp *g) {
|
|
|
|
gp.schedlink = sched.runqhead
|
|
|
|
sched.runqhead.set(gp)
|
|
|
|
if sched.runqtail == 0 {
|
|
|
|
sched.runqtail.set(gp)
|
|
|
|
}
|
|
|
|
sched.runqsize++
|
|
|
|
}
|
|
|
|
|
|
|
|
// Put a batch of runnable goroutines on the global runnable queue.
|
|
|
|
// Sched must be locked.
|
|
|
|
func globrunqputbatch(ghead *g, gtail *g, n int32) {
|
|
|
|
gtail.schedlink = 0
|
|
|
|
if sched.runqtail != 0 {
|
|
|
|
sched.runqtail.ptr().schedlink.set(ghead)
|
|
|
|
} else {
|
|
|
|
sched.runqhead.set(ghead)
|
|
|
|
}
|
|
|
|
sched.runqtail.set(gtail)
|
|
|
|
sched.runqsize += n
|
|
|
|
}
|
|
|
|
|
|
|
|
// Try to get a batch of G's from the global runnable queue.
|
|
|
|
// Sched must be locked.
|
|
|
|
func globrunqget(_p_ *p, max int32) *g {
|
|
|
|
if sched.runqsize == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
n := sched.runqsize/gomaxprocs + 1
|
|
|
|
if n > sched.runqsize {
|
|
|
|
n = sched.runqsize
|
|
|
|
}
|
|
|
|
if max > 0 && n > max {
|
|
|
|
n = max
|
|
|
|
}
|
|
|
|
if n > int32(len(_p_.runq))/2 {
|
|
|
|
n = int32(len(_p_.runq)) / 2
|
|
|
|
}
|
|
|
|
|
|
|
|
sched.runqsize -= n
|
|
|
|
if sched.runqsize == 0 {
|
|
|
|
sched.runqtail = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
gp := sched.runqhead.ptr()
|
|
|
|
sched.runqhead = gp.schedlink
|
|
|
|
n--
|
|
|
|
for ; n > 0; n-- {
|
|
|
|
gp1 := sched.runqhead.ptr()
|
|
|
|
sched.runqhead = gp1.schedlink
|
|
|
|
runqput(_p_, gp1, false)
|
|
|
|
}
|
|
|
|
return gp
|
|
|
|
}
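The batch size chosen just above is a fair-share heuristic: take about 1/GOMAXPROCS of the global queue plus one, but never more than the queue holds, never more than the caller's max, and never more than half of the local run queue (whose capacity is 256 in this version of the runtime). A standalone sketch of that arithmetic, with globBatch and its parameters being our naming:

package main

import "fmt"

// globBatch mirrors globrunqget's sizing logic on plain integers.
func globBatch(runqsize, gomaxprocs, max, localCap int32) int32 {
	n := runqsize/gomaxprocs + 1
	if n > runqsize {
		n = runqsize
	}
	if max > 0 && n > max {
		n = max
	}
	if n > localCap/2 {
		n = localCap / 2
	}
	return n
}

func main() {
	fmt.Println(globBatch(1000, 8, 0, 256))  // 126: fair share of a big queue
	fmt.Println(globBatch(3, 8, 0, 256))     // 1: small queue, take one
	fmt.Println(globBatch(1000, 1, 64, 256)) // 64: clipped by the caller's max
}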
|
|
|
|
|
|
|
|
// Put p on the _Pidle list.
|
|
|
|
// Sched must be locked.
|
|
|
|
// May run during STW, so write barriers are not allowed.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func pidleput(_p_ *p) {
|
|
|
|
if !runqempty(_p_) {
|
|
|
|
throw("pidleput: P has non-empty run queue")
|
|
|
|
}
|
|
|
|
_p_.link = sched.pidle
|
|
|
|
sched.pidle.set(_p_)
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xadd(&sched.npidle, 1) // TODO: fast atomic
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
// Try to get a p from the _Pidle list.
|
|
|
|
// Sched must be locked.
|
|
|
|
// May run during STW, so write barriers are not allowed.
|
2016-10-10 15:14:14 -06:00
|
|
|
//go:nowritebarrierrec
|
2015-10-18 18:04:05 -06:00
|
|
|
func pidleget() *p {
|
|
|
|
_p_ := sched.pidle.ptr()
|
|
|
|
if _p_ != nil {
|
|
|
|
sched.pidle = _p_.link
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xadd(&sched.npidle, -1) // TODO: fast atomic
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
return _p_
|
|
|
|
}
|
|
|
|
|
|
|
|
// runqempty returns true if _p_ has no Gs on its local run queue.
|
2016-03-18 09:34:11 -06:00
|
|
|
// It never returns true spuriously.
|
2015-10-18 18:04:05 -06:00
|
|
|
func runqempty(_p_ *p) bool {
|
2016-03-18 09:34:11 -06:00
|
|
|
// Defend against a race where 1) _p_ has G1 in runqnext but runqhead == runqtail,
|
|
|
|
// 2) runqput on _p_ kicks G1 to the runq, 3) runqget on _p_ empties runqnext.
|
|
|
|
// Simply observing that runqhead == runqtail and then observing that runqnext == nil
|
|
|
|
// does not mean the queue is empty.
|
|
|
|
for {
|
|
|
|
head := atomic.Load(&_p_.runqhead)
|
|
|
|
tail := atomic.Load(&_p_.runqtail)
|
|
|
|
runnext := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&_p_.runnext)))
|
|
|
|
if tail == atomic.Load(&_p_.runqtail) {
|
|
|
|
return head == tail && runnext == 0
|
|
|
|
}
|
|
|
|
}
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
// To shake out latent assumptions about scheduling order,
|
|
|
|
// we introduce some randomness into scheduling decisions
|
|
|
|
// when running with the race detector.
|
|
|
|
// The need for this was made obvious by changing the
|
|
|
|
// (deterministic) scheduling order in Go 1.5 and breaking
|
|
|
|
// many poorly-written tests.
|
|
|
|
// With the randomness here, as long as the tests pass
|
|
|
|
// consistently with -race, they shouldn't have latent scheduling
|
|
|
|
// assumptions.
|
|
|
|
const randomizeScheduler = raceenabled
|
|
|
|
|
|
|
|
// runqput tries to put g on the local runnable queue.
|
|
|
|
// If next is false, runqput adds g to the tail of the runnable queue.
|
|
|
|
// If next is true, runqput puts g in the _p_.runnext slot.
|
|
|
|
// If the run queue is full, runqput puts g on the global queue.
|
|
|
|
// Executed only by the owner P.
|
|
|
|
func runqput(_p_ *p, gp *g, next bool) {
|
2016-06-28 10:22:46 -06:00
|
|
|
if randomizeScheduler && next && fastrand()%2 == 0 {
|
2015-10-18 18:04:05 -06:00
|
|
|
next = false
|
|
|
|
}
|
|
|
|
|
|
|
|
if next {
|
|
|
|
retryNext:
|
|
|
|
oldnext := _p_.runnext
|
|
|
|
if !_p_.runnext.cas(oldnext, guintptr(unsafe.Pointer(gp))) {
|
|
|
|
goto retryNext
|
|
|
|
}
|
|
|
|
if oldnext == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// Kick the old runnext out to the regular run queue.
|
|
|
|
gp = oldnext.ptr()
|
|
|
|
}
|
|
|
|
|
|
|
|
retry:
|
2015-11-02 12:09:24 -07:00
|
|
|
h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with consumers
|
2015-10-18 18:04:05 -06:00
|
|
|
t := _p_.runqtail
|
|
|
|
if t-h < uint32(len(_p_.runq)) {
|
2015-11-02 14:59:39 -07:00
|
|
|
_p_.runq[t%uint32(len(_p_.runq))].set(gp)
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Store(&_p_.runqtail, t+1) // store-release, makes the item available for consumption
|
2015-10-18 18:04:05 -06:00
|
|
|
return
|
|
|
|
}
|
|
|
|
if runqputslow(_p_, gp, h, t) {
|
|
|
|
return
|
|
|
|
}
|
2016-02-24 03:55:20 -07:00
|
|
|
// the queue is not full, now the put above must succeed
|
2015-10-18 18:04:05 -06:00
|
|
|
goto retry
|
|
|
|
}
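runqput (and runqget/runqgrab below) treat the fixed-size runq array as a ring indexed by two free-running uint32 counters: tail-head is the queue length and counter%len(runq) is the slot, which stays correct even when the counters wrap around. Here is a small single-threaded sketch of that arithmetic with the atomics stripped out; the ring type and its methods are ours.

package main

import "fmt"

const ringSize = 4

// ring is a toy analogue of p.runq: free-running head/tail counters over a
// fixed array, indexed modulo its length.
type ring struct {
	head, tail uint32
	buf        [ringSize]string
}

func (r *ring) put(s string) bool {
	if r.tail-r.head >= ringSize {
		return false // full; the runtime would fall back to the global queue
	}
	r.buf[r.tail%ringSize] = s
	r.tail++
	return true
}

func (r *ring) get() (string, bool) {
	if r.tail == r.head {
		return "", false // empty
	}
	s := r.buf[r.head%ringSize]
	r.head++
	return s, true
}

func main() {
	var r ring
	for i := 0; i < 6; i++ {
		fmt.Println("put", i, "->", r.put(fmt.Sprint("g", i)))
	}
	for {
		s, ok := r.get()
		if !ok {
			break
		}
		fmt.Println("get ->", s)
	}
}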
|
|
|
|
|
|
|
|
// Put g and a batch of work from local runnable queue on global queue.
|
|
|
|
// Executed only by the owner P.
|
|
|
|
func runqputslow(_p_ *p, gp *g, h, t uint32) bool {
|
|
|
|
var batch [len(_p_.runq)/2 + 1]*g
|
|
|
|
|
|
|
|
// First, grab a batch from local queue.
|
|
|
|
n := t - h
|
|
|
|
n = n / 2
|
|
|
|
if n != uint32(len(_p_.runq)/2) {
|
|
|
|
throw("runqputslow: queue is not full")
|
|
|
|
}
|
|
|
|
for i := uint32(0); i < n; i++ {
|
2015-11-02 14:59:39 -07:00
|
|
|
batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr()
|
2015-10-18 18:04:05 -06:00
|
|
|
}
|
2015-11-02 12:09:24 -07:00
|
|
|
if !atomic.Cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
|
2015-10-18 18:04:05 -06:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
batch[n] = gp
|
|
|
|
|
|
|
|
if randomizeScheduler {
|
|
|
|
for i := uint32(1); i <= n; i++ {
|
2016-06-28 10:22:46 -06:00
|
|
|
j := fastrand() % (i + 1)
|
2015-10-18 18:04:05 -06:00
|
|
|
batch[i], batch[j] = batch[j], batch[i]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Link the goroutines.
|
|
|
|
for i := uint32(0); i < n; i++ {
|
|
|
|
batch[i].schedlink.set(batch[i+1])
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now put the batch on global queue.
|
|
|
|
lock(&sched.lock)
|
|
|
|
globrunqputbatch(batch[0], batch[n], int32(n+1))
|
|
|
|
unlock(&sched.lock)
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get g from local runnable queue.
|
|
|
|
// If inheritTime is true, gp should inherit the remaining time in the
|
|
|
|
// current time slice. Otherwise, it should start a new time slice.
|
|
|
|
// Executed only by the owner P.
|
|
|
|
func runqget(_p_ *p) (gp *g, inheritTime bool) {
|
|
|
|
// If there's a runnext, it's the next G to run.
|
|
|
|
for {
|
|
|
|
next := _p_.runnext
|
|
|
|
if next == 0 {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
if _p_.runnext.cas(next, 0) {
|
|
|
|
return next.ptr(), true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for {
|
2015-11-02 12:09:24 -07:00
|
|
|
h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with other consumers
|
2015-10-18 18:04:05 -06:00
|
|
|
t := _p_.runqtail
|
|
|
|
if t == h {
|
|
|
|
return nil, false
|
|
|
|
}
|
2015-11-02 14:59:39 -07:00
|
|
|
gp := _p_.runq[h%uint32(len(_p_.runq))].ptr()
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Cas(&_p_.runqhead, h, h+1) { // cas-release, commits consume
|
2015-10-18 18:04:05 -06:00
|
|
|
return gp, false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}

// Grabs a batch of goroutines from _p_'s runnable queue into batch.
// Batch is a ring buffer starting at batchHead.
// Returns number of grabbed goroutines.
// Can be executed by any P.
func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 {
	for {
		h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with other consumers
		t := atomic.Load(&_p_.runqtail) // load-acquire, synchronize with the producer
		n := t - h
		n = n - n/2
		if n == 0 {
			if stealRunNextG {
				// Try to steal from _p_.runnext.
				if next := _p_.runnext; next != 0 {
					// Sleep to ensure that _p_ isn't about to run the g we
					// are about to steal.
					// The important use case here is when the g running on _p_
					// ready()s another g and then almost immediately blocks.
					// Instead of stealing runnext in this window, back off
					// to give _p_ a chance to schedule runnext. This will avoid
					// thrashing gs between different Ps.
					// A sync chan send/recv takes ~50ns as of the time of writing,
					// so 3us gives ~50x overshoot.
					if GOOS != "windows" {
						usleep(3)
					} else {
						// On Windows, system timer granularity is 1-15ms,
						// which is way too much for this optimization.
						// So just yield.
						osyield()
					}
					if !_p_.runnext.cas(next, 0) {
						continue
					}
					batch[batchHead%uint32(len(batch))] = next
					return 1
				}
			}
			return 0
		}
		if n > uint32(len(_p_.runq)/2) { // read inconsistent h and t
			continue
		}
		for i := uint32(0); i < n; i++ {
			g := _p_.runq[(h+i)%uint32(len(_p_.runq))]
			batch[(batchHead+i)%uint32(len(batch))] = g
		}
		if atomic.Cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
			return n
		}
	}
}
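
// A worked example of the batch-size arithmetic above: if the victim has 5
// runnable goroutines, n = t-h = 5 and n = n - n/2 = 3, so runqgrab takes the
// larger half (3) and leaves 2 behind. Because h and t come from two separate
// atomic loads, the pair may not be a consistent snapshot; since a consistent
// queue can never yield n greater than half its capacity here, the
// n > uint32(len(_p_.runq)/2) check discards such readings and retries.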

// Steal half of the elements from the local runnable queue of p2
// and put them onto the local runnable queue of _p_.
// Returns one of the stolen elements (or nil if failed).
func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
	t := _p_.runqtail
	n := runqgrab(p2, &_p_.runq, t, stealRunNextG)
	if n == 0 {
		return nil
	}
	n--
	gp := _p_.runq[(t+n)%uint32(len(_p_.runq))].ptr()
	if n == 0 {
		return gp
	}
	h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with consumers
	if t-h+n >= uint32(len(_p_.runq)) {
		throw("runqsteal: runq overflow")
	}
	atomic.Store(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
	return gp
}
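
// Continuing the example: if runqgrab returns n=3, the three goroutines were
// written into _p_.runq at positions t, t+1, and t+2. runqsteal then returns
// the last one (at t+2) for immediate execution and publishes only the
// remaining two by advancing runqtail by n=2 with a store-release.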

//go:linkname setMaxThreads runtime/debug.setMaxThreads
func setMaxThreads(in int) (out int) {
	lock(&sched.lock)
	out = int(sched.maxmcount)
	if in > 0x7fffffff { // MaxInt32
		sched.maxmcount = 0x7fffffff
	} else {
		sched.maxmcount = int32(in)
	}
	checkmcount()
	unlock(&sched.lock)
	return
}
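
// setMaxThreads is the runtime side of runtime/debug.SetMaxThreads, wired up
// by the go:linkname directive above. A call from user code raising the limit
// looks roughly like (20000 is just an example value):
//
//	import "runtime/debug"
//
//	prev := debug.SetMaxThreads(20000) // returns the previous limit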

func haveexperiment(name string) bool {
	if name == "framepointer" {
		return framepointer_enabled // set by linker
	}
	x := sys.Goexperiment
	for x != "" {
		xname := ""
		i := index(x, ",")
		if i < 0 {
			xname, x = x, ""
		} else {
			xname, x = x[:i], x[i+1:]
		}
		if xname == name {
			return true
		}
		if len(xname) > 2 && xname[:2] == "no" && xname[2:] == name {
			return false
		}
	}
	return false
}
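
// For example (hypothetical experiment names): with GOEXPERIMENT=foo,nobar
// baked into sys.Goexperiment, haveexperiment("foo") returns true, while
// haveexperiment("bar") matches the "no"-prefixed form and returns false.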

//go:nosplit
func procPin() int {
	_g_ := getg()
	mp := _g_.m

	mp.locks++
	return int(mp.p.ptr().id)
}

//go:nosplit
func procUnpin() {
	_g_ := getg()
	_g_.m.locks--
}
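
// Pinning works by bumping m.locks: while it is non-zero the goroutine is not
// preempted, so it keeps running on the same P and the P id returned by
// procPin stays valid until the matching procUnpin. Callers reached through
// the linknamed wrappers below (e.g. sync.Pool) rely on this to index per-P
// data safely.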

//go:linkname sync_runtime_procPin sync.runtime_procPin
//go:nosplit
func sync_runtime_procPin() int {
	return procPin()
}

//go:linkname sync_runtime_procUnpin sync.runtime_procUnpin
//go:nosplit
func sync_runtime_procUnpin() {
	procUnpin()
}

//go:linkname sync_atomic_runtime_procPin sync/atomic.runtime_procPin
//go:nosplit
func sync_atomic_runtime_procPin() int {
	return procPin()
}

//go:linkname sync_atomic_runtime_procUnpin sync/atomic.runtime_procUnpin
//go:nosplit
func sync_atomic_runtime_procUnpin() {
	procUnpin()
}

// Active spinning for sync.Mutex.
//go:linkname sync_runtime_canSpin sync.runtime_canSpin
//go:nosplit
func sync_runtime_canSpin(i int) bool {
	// sync.Mutex is cooperative, so we are conservative with spinning.
	// Spin only a few times and only if running on a multicore machine and
	// GOMAXPROCS>1 and there is at least one other running P and local runq is empty.
	// As opposed to runtime mutexes we don't do passive spinning here,
	// because there can be work on the global runq or on other Ps.
	if i >= active_spin || ncpu <= 1 || gomaxprocs <= int32(sched.npidle+sched.nmspinning)+1 {
		return false
	}
	if p := getg().m.p.ptr(); !runqempty(p) {
		return false
	}
	return true
}

//go:linkname sync_runtime_doSpin sync.runtime_doSpin
//go:nosplit
func sync_runtime_doSpin() {
	procyield(active_spin_cnt)
}
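
// On the sync side these surface as runtime_canSpin and runtime_doSpin. A
// rough sketch of how a mutex slow path might use them (not the verbatim
// sync.Mutex code; tryAcquire and blockOnSemaphore are stand-ins) is:
//
//	iter := 0
//	for !tryAcquire() { // tryAcquire: stand-in for the real CAS on the mutex word
//		if runtime_canSpin(iter) {
//			runtime_doSpin() // spins the CPU briefly (active_spin_cnt PAUSE-style iterations)
//			iter++
//			continue
//		}
//		blockOnSemaphore() // stand-in: give up spinning and park
//	}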

var stealOrder randomOrder

// randomOrder/randomEnum are helper types for randomized work stealing.
// They allow enumerating all Ps in different pseudo-random orders without repetition.
// The algorithm is based on the fact that if we have X such that X and GOMAXPROCS
// are coprime, then the sequence (i + X) % GOMAXPROCS gives the required enumeration
// (see the worked example after gcd below).
type randomOrder struct {
	count    uint32
	coprimes []uint32
}

type randomEnum struct {
	i     uint32
	count uint32
	pos   uint32
	inc   uint32
}

func (ord *randomOrder) reset(count uint32) {
	ord.count = count
	ord.coprimes = ord.coprimes[:0]
	for i := uint32(1); i <= count; i++ {
		if gcd(i, count) == 1 {
			ord.coprimes = append(ord.coprimes, i)
		}
	}
}

func (ord *randomOrder) start(i uint32) randomEnum {
	return randomEnum{
		count: ord.count,
		pos:   i % ord.count,
		inc:   ord.coprimes[i%uint32(len(ord.coprimes))],
	}
}

func (enum *randomEnum) done() bool {
	return enum.i == enum.count
}

func (enum *randomEnum) next() {
	enum.i++
	enum.pos = (enum.pos + enum.inc) % enum.count
}

func (enum *randomEnum) position() uint32 {
	return enum.pos
}

func gcd(a, b uint32) uint32 {
	for b != 0 {
		a, b = b, a%b
	}
	return a
}
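
// Worked example of the enumeration above: with count = 6 (GOMAXPROCS=6),
// reset computes coprimes = [1 5]. start(3) picks pos = 3 and
// inc = coprimes[3%2] = 5, so successive next() calls visit Ps
//
//	3, (3+5)%6 = 2, 1, 0, 5, 4
//
// i.e. all six Ps exactly once, in an order that varies with the start value.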