runtime: measure stack usage; start stacks larger if needed
Measure the average stack size used by goroutines at every GC. When
starting a new goroutine, allocate an initial goroutine stack of that
average size. Intuition is that we'll waste at most 2x in stack space
because only half the goroutines can be below average. In turn, we
avoid some of the early stack growth / copying needed in the average
case.

More details in the design doc at:
https://docs.google.com/document/d/1YDlGIdVTPnmUiTAavlZxBI1d9pwGQgZT7IKFKlIXohQ/edit?usp=sharing

name         old time/op  new time/op  delta
Issue18138   95.3µs ± 0%  67.3µs ±13%  -29.35%  (p=0.000 n=9+10)

Fixes #18138

Change-Id: Iba34d22ed04279da7e718bbd569bbf2734922eaa
Reviewed-on: https://go-review.googlesource.com/c/go/+/345889
Run-TryBot: Keith Randall <khr@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Parent: da0a6f4b57
Commit: 016d755213
@@ -1294,7 +1294,7 @@ func (c *GCController) StartCycle(stackSize, globalsSize uint64, scannableFrac f
     if c.heapMarked > trigger {
         trigger = c.heapMarked
     }
-    c.scannableStackSize = stackSize
+    c.maxStackScan = stackSize
     c.globalsScan = globalsSize
     c.heapLive = trigger
     c.heapScan += uint64(float64(trigger-c.heapMarked) * scannableFrac)
@@ -171,6 +171,12 @@ func initMetrics() {
                }
            },
        },
+       "/gc/stack/starting-size:bytes": {
+           compute: func(in *statAggregate, out *metricValue) {
+               out.kind = metricKindUint64
+               out.scalar = uint64(startingStackSize)
+           },
+       },
        "/memory/classes/heap/free:bytes": {
            deps: makeStatDepSet(heapStatsDep),
            compute: func(in *statAggregate, out *metricValue) {
@@ -140,6 +140,12 @@ var allDesc = []Description{
        Kind:        KindFloat64Histogram,
        Cumulative:  true,
    },
+   {
+       Name:        "/gc/stack/starting-size:bytes",
+       Description: "The stack size of new goroutines.",
+       Kind:        KindUint64,
+       Cumulative:  false,
+   },
    {
        Name:        "/memory/classes/heap/free:bytes",
        Description: "Memory that is completely free and eligible to be returned to the underlying system, " +
@@ -102,6 +102,9 @@ Below is the full list of supported metrics, ordered lexicographically.
    /gc/pauses:seconds
        Distribution individual GC-related stop-the-world pause latencies.

+   /gc/stack/starting-size:bytes
+       The stack size of new goroutines.
+
    /memory/classes/heap/free:bytes
        Memory that is completely free and eligible to be returned to
        the underlying system, but has not been. This metric is the
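The new metric is readable through the public runtime/metrics API. Below is a minimal sketch, not part of this CL, of how a program built against a Go release containing this change could sample it:

package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	// Sample the starting stack size added by this CL.
	const name = "/gc/stack/starting-size:bytes"
	sample := []metrics.Sample{{Name: name}}
	metrics.Read(sample)
	if sample[0].Value.Kind() == metrics.KindUint64 {
		fmt.Printf("new goroutines start with %d-byte stacks\n", sample[0].Value.Uint64())
	}
}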
@@ -885,6 +885,8 @@ top:
        goto top
    }

+   gcComputeStartingStackSize()
+
    // Disable assists and background workers. We must do
    // this before waking blocked assists.
    atomic.Store(&gcBlackenEnabled, 0)
@@ -1111,7 +1113,7 @@ func gcMarkTermination() {
        print(" ms cpu, ",
            work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
            gcController.heapGoal()>>20, " MB goal, ",
-           gcController.stackScan>>20, " MB stacks, ",
+           atomic.Load64(&gcController.maxStackScan)>>20, " MB stacks, ",
            gcController.globalsScan>>20, " MB globals, ",
            work.maxprocs, " P")
        if work.userForced {
@@ -746,14 +746,22 @@ func scanstack(gp *g, gcw *gcWork) int64 {
        throw("can't scan our own stack")
    }

-   // stackSize is the amount of work we'll be reporting.
-   //
-   // We report the total stack size, more than we scan,
-   // because this number needs to line up with gcControllerState's
-   // stackScan and scannableStackSize fields.
-   //
-   // See the documentation on those fields for more information.
-   stackSize := gp.stack.hi - gp.stack.lo
+   // scannedSize is the amount of work we'll be reporting.
+   //
+   // It is less than the allocated size (which is hi-lo).
+   var sp uintptr
+   if gp.syscallsp != 0 {
+       sp = gp.syscallsp // If in a system call this is the stack pointer (gp.sched.sp can be 0 in this case on Windows).
+   } else {
+       sp = gp.sched.sp
+   }
+   scannedSize := gp.stack.hi - sp
+
+   // Keep statistics for initial stack size calculation.
+   // Note that this accumulates the scanned size, not the allocated size.
+   p := getg().m.p.ptr()
+   p.scannedStackSize += uint64(scannedSize)
+   p.scannedStacks++

    if isShrinkStackSafe(gp) {
        // Shrink the stack if not much of it is being used.
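The key change above is that the scanner now reports hi - sp (the stack actually in use at scan time) rather than hi - lo (the whole allocation). A toy illustration of the distinction, with made-up numbers and outside the runtime:

package main

import "fmt"

func main() {
	var (
		lo uintptr = 0x100000  // pretend stack bottom (lowest address)
		hi         = lo + 8192 // pretend stack top: an 8KB allocation
		sp         = hi - 1024 // pretend saved stack pointer: only 1KB in use
	)
	allocated := hi - lo // what the old code reported (stackSize)
	scanned := hi - sp   // what the new code reports (scannedSize)
	fmt.Println("allocated:", allocated, "scanned:", scanned)
}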
@@ -894,7 +902,7 @@ func scanstack(gp *g, gcw *gcWork) int64 {
    if state.buf != nil || state.cbuf != nil || state.freeBuf != nil {
        throw("remaining pointer buffers")
    }
-   return int64(stackSize)
+   return int64(scannedSize)
 }

 // Scan a stack frame: local variables and function arguments/results.
@@ -63,9 +63,9 @@ const (
    defaultHeapMinimum = (goexperiment.HeapMinimum512KiBInt)*(512<<10) +
        (1-goexperiment.HeapMinimum512KiBInt)*(4<<20)

-   // scannableStackSizeSlack is the bytes of stack space allocated or freed
+   // maxStackScanSlack is the bytes of stack space allocated or freed
    // that can accumulate on a P before updating gcController.stackSize.
-   scannableStackSizeSlack = 8 << 10
+   maxStackScanSlack = 8 << 10

    // memoryLimitHeapGoalHeadroom is the amount of headroom the pacer gives to
    // the heap goal when operating in the memory-limited regime. That is,
@@ -227,13 +227,11 @@ type gcControllerState struct {
    // Updated when the world is stopped.
    lastHeapScan uint64

-   // stackScan is a snapshot of scannableStackSize taken at each GC
-   // STW pause and is used in pacing decisions.
-   //
-   // Updated only while the world is stopped.
-   stackScan uint64
+   // lastStackScan is the number of bytes of stack that were scanned
+   // last GC cycle.
+   lastStackScan uint64

-   // scannableStackSize is the amount of allocated goroutine stack space in
+   // maxStackScan is the amount of allocated goroutine stack space in
    // use by goroutines.
    //
    // This number tracks allocated goroutine stack space rather than used
@@ -243,7 +241,7 @@ type gcControllerState struct {
    // to conservatively overcount than undercount.
    //
    // Read and updated atomically.
-   scannableStackSize uint64
+   maxStackScan uint64

    // globalsScan is the total amount of global variable space
    // that is scannable.
@@ -269,8 +267,8 @@ type gcControllerState struct {
    // Currently these are measured in bytes. For most uses, this is an
    // opaque unit of work, but for estimation the definition is important.
    //
-   // Note that stackScanWork includes all allocated space, not just the
-   // size of the stack itself, mirroring stackSize.
+   // Note that stackScanWork includes only stack space scanned, not all
+   // of the allocated stack.
    heapScanWork    atomic.Int64
    stackScanWork   atomic.Int64
    globalsScanWork atomic.Int64
@@ -441,7 +439,6 @@ func (c *gcControllerState) startCycle(markStartTime int64, procs int, trigger g
    c.fractionalMarkTime = 0
    c.idleMarkTime = 0
    c.markStartTime = markStartTime
-   c.stackScan = atomic.Load64(&c.scannableStackSize)
    c.triggered = c.heapLive

    // Compute the background mark utilization goal. In general,
@@ -553,13 +550,15 @@ func (c *gcControllerState) revise() {
    heapGoal := int64(c.heapGoal())

    // The expected scan work is computed as the amount of bytes scanned last
-   // GC cycle, plus our estimate of stacks and globals work for this cycle.
-   scanWorkExpected := int64(c.lastHeapScan + c.stackScan + c.globalsScan)
+   // GC cycle (both heap and stack), plus our estimate of globals work for this cycle.
+   scanWorkExpected := int64(c.lastHeapScan + c.lastStackScan + c.globalsScan)

    // maxScanWork is a worst-case estimate of the amount of scan work that
    // needs to be performed in this GC cycle. Specifically, it represents
-   // the case where *all* scannable memory turns out to be live.
-   maxScanWork := int64(scan + c.stackScan + c.globalsScan)
+   // the case where *all* scannable memory turns out to be live, and
+   // *all* allocated stack space is scannable.
+   maxStackScan := atomic.Load64(&c.maxStackScan)
+   maxScanWork := int64(scan + maxStackScan + c.globalsScan)
    if work > scanWorkExpected {
        // We've already done more scan work than expected. Because our expectation
        // is based on a steady-state scannable heap size, we assume this means our
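In revise(), the expected scan work now uses the stack bytes actually scanned last cycle (lastStackScan), while the worst case uses all allocated stack space (maxStackScan). A toy recomputation of just those two formulas with made-up byte counts, not the real pacer state:

package main

import "fmt"

func main() {
	var (
		lastHeapScan  uint64 = 32 << 20 // heap bytes scanned last cycle
		lastStackScan uint64 = 2 << 20  // stack bytes scanned last cycle
		globalsScan   uint64 = 1 << 20  // scannable globals
		liveScannable uint64 = 40 << 20 // "scan" in the diff: live scannable heap
		maxStackScan  uint64 = 6 << 20  // total allocated goroutine stack space
	)
	scanWorkExpected := int64(lastHeapScan + lastStackScan + globalsScan)
	maxScanWork := int64(liveScannable + maxStackScan + globalsScan)
	fmt.Println("expected:", scanWorkExpected, "worst case:", maxScanWork)
}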
@@ -736,7 +735,7 @@ func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) {
        printlock()
        goal := gcGoalUtilization * 100
        print("pacer: ", int(utilization*100), "% CPU (", int(goal), " exp.) for ")
-       print(c.heapScanWork.Load(), "+", c.stackScanWork.Load(), "+", c.globalsScanWork.Load(), " B work (", c.lastHeapScan+c.stackScan+c.globalsScan, " B exp.) ")
+       print(c.heapScanWork.Load(), "+", c.stackScanWork.Load(), "+", c.globalsScanWork.Load(), " B work (", c.lastHeapScan+c.lastStackScan+c.globalsScan, " B exp.) ")
        print("in ", c.triggered, " B -> ", c.heapLive, " B (∆goal ", int64(c.heapLive)-int64(heapGoal), ", cons/mark ", oldConsMark, ")")
        if !ok {
            print("[controller reset]")
@@ -884,6 +883,7 @@ func (c *gcControllerState) resetLive(bytesMarked uint64) {
    c.heapLive = bytesMarked
    c.heapScan = uint64(c.heapScanWork.Load())
    c.lastHeapScan = uint64(c.heapScanWork.Load())
+   c.lastStackScan = uint64(c.stackScanWork.Load())
    c.triggered = ^uint64(0) // Reset triggered.

    // heapLive was updated, so emit a trace event.
@@ -935,13 +935,13 @@ func (c *gcControllerState) update(dHeapLive, dHeapScan int64) {

 func (c *gcControllerState) addScannableStack(pp *p, amount int64) {
    if pp == nil {
-       atomic.Xadd64(&c.scannableStackSize, amount)
+       atomic.Xadd64(&c.maxStackScan, amount)
        return
    }
-   pp.scannableStackSizeDelta += amount
-   if pp.scannableStackSizeDelta >= scannableStackSizeSlack || pp.scannableStackSizeDelta <= -scannableStackSizeSlack {
-       atomic.Xadd64(&c.scannableStackSize, pp.scannableStackSizeDelta)
-       pp.scannableStackSizeDelta = 0
+   pp.maxStackScanDelta += amount
+   if pp.maxStackScanDelta >= maxStackScanSlack || pp.maxStackScanDelta <= -maxStackScanSlack {
+       atomic.Xadd64(&c.maxStackScan, pp.maxStackScanDelta)
+       pp.maxStackScanDelta = 0
    }
 }

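addScannableStack keeps the same per-P batching scheme, just renamed: small deltas accumulate on the P and are flushed to the shared atomic counter only once they exceed the slack. A standalone sketch of that pattern (the type and names below are illustrative, not the runtime's):

package main

import (
	"fmt"
	"sync/atomic"
)

const slack = 8 << 10 // stand-in for maxStackScanSlack

var global int64 // stand-in for gcController.maxStackScan

type perP struct {
	delta int64 // stand-in for p.maxStackScanDelta
}

func (p *perP) add(amount int64) {
	p.delta += amount
	if p.delta >= slack || p.delta <= -slack {
		// Publish the batched delta and reset the local accumulator.
		atomic.AddInt64(&global, p.delta)
		p.delta = 0
	}
}

func main() {
	var p perP
	p.add(4 << 10)  // below slack: stays local
	p.add(6 << 10)  // crosses slack: flushed to global
	p.add(-2 << 10) // small free: stays local
	fmt.Println("global:", atomic.LoadInt64(&global), "local:", p.delta)
}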
@@ -1248,7 +1248,7 @@ func (c *gcControllerState) commit(isSweepDone bool) {
    // plus additional runway for non-heap sources of GC work.
    gcPercentHeapGoal := ^uint64(0)
    if gcPercent := c.gcPercent.Load(); gcPercent >= 0 {
-       gcPercentHeapGoal = c.heapMarked + (c.heapMarked+atomic.Load64(&c.stackScan)+atomic.Load64(&c.globalsScan))*uint64(gcPercent)/100
+       gcPercentHeapGoal = c.heapMarked + (c.heapMarked+atomic.Load64(&c.lastStackScan)+atomic.Load64(&c.globalsScan))*uint64(gcPercent)/100
    }
    // Apply the minimum heap size here. It's defined in terms of gcPercent
    // and is only updated by functions that call commit.
@@ -1280,7 +1280,7 @@ func (c *gcControllerState) commit(isSweepDone bool) {
    // Furthermore, by setting the runway so that CPU resources are divided
    // this way, assuming that the cons/mark ratio is correct, we make that
    // division a reality.
-   c.runway.Store(uint64((c.consMark * (1 - gcGoalUtilization) / (gcGoalUtilization)) * float64(c.lastHeapScan+c.stackScan+c.globalsScan)))
+   c.runway.Store(uint64((c.consMark * (1 - gcGoalUtilization) / (gcGoalUtilization)) * float64(c.lastHeapScan+c.lastStackScan+c.globalsScan)))
 }

 // setGCPercent updates gcPercent. commit must be called after.
@@ -4237,7 +4237,7 @@ func gfput(_p_ *p, gp *g) {

    stksize := gp.stack.hi - gp.stack.lo

-   if stksize != _FixedStack {
+   if stksize != uintptr(startingStackSize) {
        // non-standard stack size - free it.
        stackfree(gp.stack)
        gp.stack.lo = 0
@@ -4299,10 +4299,21 @@ retry:
        return nil
    }
    _p_.gFree.n--
-   if gp.stack.lo == 0 {
-       // Stack was deallocated in gfput. Allocate a new one.
+   if gp.stack.lo != 0 && gp.stack.hi-gp.stack.lo != uintptr(startingStackSize) {
+       // Deallocate old stack. We kept it in gfput because it was the
+       // right size when the goroutine was put on the free list, but
+       // the right size has changed since then.
        systemstack(func() {
-           gp.stack = stackalloc(_FixedStack)
+           stackfree(gp.stack)
+           gp.stack.lo = 0
+           gp.stack.hi = 0
+           gp.stackguard0 = 0
+       })
+   }
+   if gp.stack.lo == 0 {
+       // Stack was deallocated in gfput or just above. Allocate a new one.
+       systemstack(func() {
+           gp.stack = stackalloc(startingStackSize)
        })
        gp.stackguard0 = gp.stack.lo + _StackGuard
    } else {
@@ -321,6 +321,7 @@ var debug struct {
    tracebackancestors int32
    asyncpreemptoff    int32
    harddecommit       int32
+   adaptivestackstart int32

    // debug.malloc is used as a combined debug check
    // in the malloc function and should be set
@@ -351,12 +352,14 @@ var dbgvars = []dbgVar{
    {"asyncpreemptoff", &debug.asyncpreemptoff},
    {"inittrace", &debug.inittrace},
    {"harddecommit", &debug.harddecommit},
+   {"adaptivestackstart", &debug.adaptivestackstart},
 }

 func parsedebugvars() {
    // defaults
    debug.cgocheck = 1
    debug.invalidptr = 1
+   debug.adaptivestackstart = 1 // go119 - set this to 0 to turn larger initial goroutine stacks off
    if GOOS == "linux" {
        // On Linux, MADV_FREE is faster than MADV_DONTNEED,
        // but doesn't affect many of the statistics that
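The new behavior is on by default (debug.adaptivestackstart = 1) and can be switched off per process through the GODEBUG environment variable. One plausible way to compare the two modes using the benchmark added later in this CL (shown only as an illustration; standard go test flags):

GODEBUG=adaptivestackstart=0 go test -run=NONE -bench=Issue18138 runtime
GODEBUG=adaptivestackstart=1 go test -run=NONE -bench=Issue18138 runtime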
@@ -732,11 +732,19 @@ type p struct {
    // Race context used while executing timer functions.
    timerRaceCtx uintptr

-   // scannableStackSizeDelta accumulates the amount of stack space held by
+   // maxStackScanDelta accumulates the amount of stack space held by
    // live goroutines (i.e. those eligible for stack scanning).
-   // Flushed to gcController.scannableStackSize once scannableStackSizeSlack
-   // or -scannableStackSizeSlack is reached.
-   scannableStackSizeDelta int64
+   // Flushed to gcController.maxStackScan once maxStackScanSlack
+   // or -maxStackScanSlack is reached.
+   maxStackScanDelta int64
+
+   // gc-time statistics about current goroutines
+   // Note that this differs from maxStackScan in that this
+   // accumulates the actual stack observed to be used at GC time (hi - sp),
+   // not an instantaneous measure of the total stack size that might need
+   // to be scanned (hi - lo).
+   scannedStackSize uint64 // stack size of goroutines scanned by this P
+   scannedStacks    uint64 // number of goroutines scanned by this P

    // preempt is set to indicate that this P should be enter the
    // scheduler ASAP (regardless of what G is running on it).
@@ -1436,3 +1436,49 @@ func (r *stackObjectRecord) gcdata() *byte {
 func morestackc() {
    throw("attempt to execute system stack code on user stack")
 }
+
+// startingStackSize is the amount of stack that new goroutines start with.
+// It is a power of 2, and between _FixedStack and maxstacksize, inclusive.
+// startingStackSize is updated every GC by tracking the average size of
+// stacks scanned during the GC.
+var startingStackSize uint32 = _FixedStack
+
+func gcComputeStartingStackSize() {
+   if debug.adaptivestackstart == 0 {
+       return
+   }
+   // For details, see the design doc at
+   // https://docs.google.com/document/d/1YDlGIdVTPnmUiTAavlZxBI1d9pwGQgZT7IKFKlIXohQ/edit?usp=sharing
+   // The basic algorithm is to track the average size of stacks
+   // and start goroutines with stack equal to that average size.
+   // Starting at the average size uses at most 2x the space that
+   // an ideal algorithm would have used.
+   // This is just a heuristic to avoid excessive stack growth work
+   // early in a goroutine's lifetime. See issue 18138. Stacks that
+   // are allocated too small can still grow, and stacks allocated
+   // too large can still shrink.
+   var scannedStackSize uint64
+   var scannedStacks uint64
+   for _, p := range allp {
+       scannedStackSize += p.scannedStackSize
+       scannedStacks += p.scannedStacks
+       // Reset for next time
+       p.scannedStackSize = 0
+       p.scannedStacks = 0
+   }
+   if scannedStacks == 0 {
+       startingStackSize = _FixedStack
+       return
+   }
+   avg := scannedStackSize/scannedStacks + _StackGuard
+   // Note: we add _StackGuard to ensure that a goroutine that
+   // uses the average space will not trigger a growth.
+   if avg > uint64(maxstacksize) {
+       avg = uint64(maxstacksize)
+   }
+   if avg < _FixedStack {
+       avg = _FixedStack
+   }
+   // Note: maxstacksize fits in 30 bits, so avg also does.
+   startingStackSize = uint32(round2(int32(avg)))
+}
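For readers outside the runtime, the sizing rule above is: average scanned bytes per goroutine, plus the stack guard, clamped to [_FixedStack, maxstacksize], rounded up to a power of two. A self-contained sketch of that rule using stand-in constants and a local helper in place of the runtime's internal _FixedStack, _StackGuard, maxstacksize, and round2 (the real values are platform-dependent):

package main

import (
	"fmt"
	"math/bits"
)

const (
	fixedStack   = 2048       // stand-in for _FixedStack
	stackGuard   = 928        // stand-in for _StackGuard (varies by GOOS/GOARCH)
	maxStackSize = 1000000000 // stand-in for maxstacksize
)

// roundPow2 rounds n up to the next power of two, like the runtime's round2.
func roundPow2(n uint64) uint64 {
	if n <= 1 {
		return 1
	}
	return 1 << bits.Len64(n-1)
}

func startingSize(scannedBytes, scannedStacks uint64) uint64 {
	if scannedStacks == 0 {
		return fixedStack
	}
	// Add the guard so a goroutine using the average space won't grow immediately.
	avg := scannedBytes/scannedStacks + stackGuard
	if avg > maxStackSize {
		avg = maxStackSize
	}
	if avg < fixedStack {
		avg = fixedStack
	}
	return roundPow2(avg)
}

func main() {
	fmt.Println(startingSize(0, 0))          // 2048: no data, fall back to the fixed default
	fmt.Println(startingSize(300_000, 100))  // 4096: ~3KB average plus guard rounds up to 4KB
	fmt.Println(startingSize(2_000_000, 10)) // 262144: deep stacks push the starting size up
}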
@@ -597,6 +597,39 @@ func BenchmarkStackCopyWithStkobj(b *testing.B) {
    }
 }

+func BenchmarkIssue18138(b *testing.B) {
+   // Channel with N "can run a goroutine" tokens
+   const N = 10
+   c := make(chan []byte, N)
+   for i := 0; i < N; i++ {
+       c <- make([]byte, 1)
+   }
+
+   for i := 0; i < b.N; i++ {
+       <-c // get token
+       go func() {
+           useStackPtrs(1000, false) // uses ~1MB max
+           m := make([]byte, 8192)   // make GC trigger occasionally
+           c <- m                    // return token
+       }()
+   }
+}
+
+func useStackPtrs(n int, b bool) {
+   if b {
+       // This code contributes to the stack frame size, and hence to the
+       // stack copying cost. But since b is always false, it costs no
+       // execution time (not even the zeroing of a).
+       var a [128]*int // 1KB of pointers
+       a[n] = &n
+       n = *a[0]
+   }
+   if n == 0 {
+       return
+   }
+   useStackPtrs(n-1, b)
+}
+
 type structWithMethod struct{}

 func (s structWithMethod) caller() string {