1
0
mirror of https://github.com/golang/go synced 2024-11-07 03:26:10 -07:00

runtime: make piController much more defensive about overflow

If something goes horribly wrong with the assumptions surrounding a
piController, its internal error state might accumulate in an unbounded
manner. In practice this means unexpected Inf and NaN values.

Avoid this by identifying cases where the error overflows and resetting
controller state.

In the scavenger, this case is much more likely. All that has to happen
is the proportional relationship between sleep time and estimated CPU
usage has to break down. Unfortunately because we're just measuring
monotonic time for all this, there are lots of ways it could happen,
especially in an oversubscribed system. In these cases, just fall back
on a conservative pace for scavenging and try to wait out the issue.

In the pacer I'm pretty sure this is impossible. Because we wire the
output of the controller to the input, the response is very directly
correlated, so it's impossible for the controller's core assumption to
break down.

While we're in the pacer, add more detail about why that controller is
even there, as well as its purpose.

Finally, let's be proactive about other sources of overflow, namely
overflow from a very large input value. This change adds a check after
the first few operations to detect overflow issues from the input,
specifically the multiplication.

No tests for the pacer because I was unable to actually break the
pacer's controller under a fuzzer, and no tests for the scavenger because
it is not really in a testable state.

However:
* This change includes a fuzz test for the piController.
* I broke out the scavenger code locally and fuzz tested it, confirming
  that the patch eliminates the original failure mode.
* I tested that on a local heap-spike test, the scavenger continues
  operating as expected under normal conditions.

Fixes #51061.

Change-Id: I02a01d2dbf0eb9d2a8a8e7274d4165c2b6a3415a
Reviewed-on: https://go-review.googlesource.com/c/go/+/383954
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Michael Anthony Knyszek 2022-02-08 00:52:11 +00:00 committed by Michael Knyszek
parent 18c2033ba5
commit e4a173adf6
4 changed files with 179 additions and 14 deletions

View File

@ -1332,3 +1332,21 @@ func Releasem() {
} }
var Timediv = timediv var Timediv = timediv
type PIController struct {
piController
}
func NewPIController(kp, ti, tt, min, max float64) *PIController {
return &PIController{piController{
kp: kp,
ti: ti,
tt: tt,
min: min,
max: max,
}}
}
func (c *PIController) Next(input, setpoint, period float64) (float64, bool) {
return c.piController.next(input, setpoint, period)
}

View File

@ -154,6 +154,8 @@ type gcControllerState struct {
// For goexperiment.PacerRedesign. // For goexperiment.PacerRedesign.
consMarkController piController consMarkController piController
_ uint32 // Padding for atomics on 32-bit platforms.
// heapGoal is the goal heapLive for when next GC ends. // heapGoal is the goal heapLive for when next GC ends.
// Set to ^uint64(0) if disabled. // Set to ^uint64(0) if disabled.
// //
@ -670,10 +672,31 @@ func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) floa
currentConsMark := (float64(c.heapLive-c.trigger) * (utilization + idleUtilization)) / currentConsMark := (float64(c.heapLive-c.trigger) * (utilization + idleUtilization)) /
(float64(scanWork) * (1 - utilization)) (float64(scanWork) * (1 - utilization))
// Update cons/mark controller. // Update cons/mark controller. The time period for this is 1 GC cycle.
// Period for this is 1 GC cycle. //
// This use of a PI controller might seem strange. So, here's an explanation:
//
// currentConsMark represents the consMark we *should've* had to be perfectly
// on-target for this cycle. Given that we assume the next GC will be like this
// one in the steady-state, it stands to reason that we should just pick that
// as our next consMark. In practice, however, currentConsMark is too noisy:
// we're going to be wildly off-target in each GC cycle if we do that.
//
// What we do instead is make a long-term assumption: there is some steady-state
// consMark value, but it's obscured by noise. By constantly shooting for this
// noisy-but-perfect consMark value, the controller will bounce around a bit,
// but its average behavior, in aggregate, should be less noisy and closer to
// the true long-term consMark value, provided its tuned to be slightly overdamped.
var ok bool
oldConsMark := c.consMark oldConsMark := c.consMark
c.consMark = c.consMarkController.next(c.consMark, currentConsMark, 1.0) c.consMark, ok = c.consMarkController.next(c.consMark, currentConsMark, 1.0)
if !ok {
// The error spiraled out of control. This is incredibly unlikely seeing
// as this controller is essentially just a smoothing function, but it might
// mean that something went very wrong with how currentConsMark was calculated.
// Just reset consMark and keep going.
c.consMark = 0
}
if debug.gcpacertrace > 0 { if debug.gcpacertrace > 0 {
printlock() printlock()
@ -681,6 +704,9 @@ func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) floa
print("pacer: ", int(utilization*100), "% CPU (", int(goal), " exp.) for ") print("pacer: ", int(utilization*100), "% CPU (", int(goal), " exp.) for ")
print(c.heapScanWork.Load(), "+", c.stackScanWork.Load(), "+", c.globalsScanWork.Load(), " B work (", c.lastHeapScan+c.stackScan+c.globalsScan, " B exp.) ") print(c.heapScanWork.Load(), "+", c.stackScanWork.Load(), "+", c.globalsScanWork.Load(), " B work (", c.lastHeapScan+c.stackScan+c.globalsScan, " B exp.) ")
print("in ", c.trigger, " B -> ", c.heapLive, " B (∆goal ", int64(c.heapLive)-int64(c.heapGoal), ", cons/mark ", oldConsMark, ")") print("in ", c.trigger, " B -> ", c.heapLive, " B (∆goal ", int64(c.heapLive)-int64(c.heapGoal), ", cons/mark ", oldConsMark, ")")
if !ok {
print("[controller reset]")
}
println() println()
printunlock() printunlock()
} }
@ -1263,15 +1289,38 @@ type piController struct {
// PI controller state. // PI controller state.
errIntegral float64 // Integral of the error from t=0 to now. errIntegral float64 // Integral of the error from t=0 to now.
// Error flags.
errOverflow bool // Set if errIntegral ever overflowed.
inputOverflow bool // Set if an operation with the input overflowed.
} }
func (c *piController) next(input, setpoint, period float64) float64 { // next provides a new sample to the controller.
//
// input is the sample, setpoint is the desired point, and period is how much
// time (in whatever unit makes the most sense) has passed since the last sample.
//
// Returns a new value for the variable it's controlling, and whether the operation
// completed successfully. One reason this might fail is if error has been growing
// in an unbounded manner, to the point of overflow.
//
// In the specific case of an error overflow occurs, the errOverflow field will be
// set and the rest of the controller's internal state will be fully reset.
func (c *piController) next(input, setpoint, period float64) (float64, bool) {
// Compute the raw output value. // Compute the raw output value.
prop := c.kp * (setpoint - input) prop := c.kp * (setpoint - input)
rawOutput := prop + c.errIntegral rawOutput := prop + c.errIntegral
// Clamp rawOutput into output. // Clamp rawOutput into output.
output := rawOutput output := rawOutput
if isInf(output) || isNaN(output) {
// The input had a large enough magnitude that either it was already
// overflowed, or some operation with it overflowed.
// Set a flag and reset. That's the safest thing to do.
c.reset()
c.inputOverflow = true
return c.min, false
}
if output < c.min { if output < c.min {
output = c.min output = c.min
} else if output > c.max { } else if output > c.max {
@ -1281,6 +1330,19 @@ func (c *piController) next(input, setpoint, period float64) float64 {
// Update the controller's state. // Update the controller's state.
if c.ti != 0 && c.tt != 0 { if c.ti != 0 && c.tt != 0 {
c.errIntegral += (c.kp*period/c.ti)*(setpoint-input) + (period/c.tt)*(output-rawOutput) c.errIntegral += (c.kp*period/c.ti)*(setpoint-input) + (period/c.tt)*(output-rawOutput)
if isInf(c.errIntegral) || isNaN(c.errIntegral) {
// So much error has accumulated that we managed to overflow.
// The assumptions around the controller have likely broken down.
// Set a flag and reset. That's the safest thing to do.
c.reset()
c.errOverflow = true
return c.min, false
} }
return output }
return output, true
}
// reset resets the controller state, except for controller error flags.
func (c *piController) reset() {
c.errIntegral = 0
} }

View File

@ -715,3 +715,48 @@ func (f float64Stream) limit(min, max float64) float64Stream {
return v return v
} }
} }
func FuzzPIController(f *testing.F) {
isNormal := func(x float64) bool {
return !math.IsInf(x, 0) && !math.IsNaN(x)
}
isPositive := func(x float64) bool {
return isNormal(x) && x > 0
}
// Seed with constants from controllers in the runtime.
// It's not critical that we keep these in sync, they're just
// reasonable seed inputs.
f.Add(0.3375, 3.2e6, 1e9, 0.001, 1000.0, 0.01)
f.Add(0.9, 4.0, 1000.0, -1000.0, 1000.0, 0.84)
f.Fuzz(func(t *testing.T, kp, ti, tt, min, max, setPoint float64) {
// Ignore uninteresting invalid parameters. These parameters
// are constant, so in practice surprising values will be documented
// or will be other otherwise immediately visible.
//
// We just want to make sure that given a non-Inf, non-NaN input,
// we always get a non-Inf, non-NaN output.
if !isPositive(kp) || !isPositive(ti) || !isPositive(tt) {
return
}
if !isNormal(min) || !isNormal(max) || min > max {
return
}
// Use a random source, but make it deterministic.
rs := rand.New(rand.NewSource(800))
randFloat64 := func() float64 {
return math.Float64frombits(rs.Uint64())
}
p := NewPIController(kp, ti, tt, min, max)
state := float64(0)
for i := 0; i < 100; i++ {
input := randFloat64()
// Ignore the "ok" parameter. We're just trying to break it.
// state is intentionally completely uncorrelated with the input.
var ok bool
state, ok = p.Next(input, setPoint, 1.0)
if !isNormal(state) {
t.Fatalf("got NaN or Inf result from controller: %f %v", state, ok)
}
}
})
}

View File

@ -170,6 +170,7 @@ var scavenge struct {
parked bool parked bool
timer *timer timer *timer
sysmonWake uint32 // Set atomically. sysmonWake uint32 // Set atomically.
printControllerReset bool // Whether the scavenger is in cooldown.
} }
// readyForScavenger signals sysmon to wake the scavenger because // readyForScavenger signals sysmon to wake the scavenger because
@ -295,8 +296,14 @@ func bgscavenge(c chan int) {
max: 1000.0, // 1000:1 max: 1000.0, // 1000:1
} }
// It doesn't really matter what value we start at, but we can't be zero, because // It doesn't really matter what value we start at, but we can't be zero, because
// that'll cause divide-by-zero issues. // that'll cause divide-by-zero issues. Pick something conservative which we'll
critSleepRatio := 0.001 // also use as a fallback.
const startingCritSleepRatio = 0.001
critSleepRatio := startingCritSleepRatio
// Duration left in nanoseconds during which we avoid using the controller and
// we hold critSleepRatio at a conservative value. Used if the controller's
// assumptions fail to hold.
controllerCooldown := int64(0)
for { for {
released := uintptr(0) released := uintptr(0)
crit := float64(0) crit := float64(0)
@ -383,9 +390,22 @@ func bgscavenge(c chan int) {
// because of the additional overheads of using scavenged memory. // because of the additional overheads of using scavenged memory.
crit *= 1 + scavengeCostRatio crit *= 1 + scavengeCostRatio
// Go to sleep for our current sleepNS. // Go to sleep based on how much time we spent doing work.
slept := scavengeSleep(int64(crit / critSleepRatio)) slept := scavengeSleep(int64(crit / critSleepRatio))
// Stop here if we're cooling down from the controller.
if controllerCooldown > 0 {
// crit and slept aren't exact measures of time, but it's OK to be a bit
// sloppy here. We're just hoping we're avoiding some transient bad behavior.
t := slept + int64(crit)
if t > controllerCooldown {
controllerCooldown = 0
} else {
controllerCooldown -= t
}
continue
}
// Calculate the CPU time spent. // Calculate the CPU time spent.
// //
// This may be slightly inaccurate with respect to GOMAXPROCS, but we're // This may be slightly inaccurate with respect to GOMAXPROCS, but we're
@ -395,7 +415,20 @@ func bgscavenge(c chan int) {
cpuFraction := float64(crit) / ((float64(slept) + crit) * float64(gomaxprocs)) cpuFraction := float64(crit) / ((float64(slept) + crit) * float64(gomaxprocs))
// Update the critSleepRatio, adjusting until we reach our ideal fraction. // Update the critSleepRatio, adjusting until we reach our ideal fraction.
critSleepRatio = critSleepController.next(cpuFraction, idealFraction, float64(slept)+crit) var ok bool
critSleepRatio, ok = critSleepController.next(cpuFraction, idealFraction, float64(slept)+crit)
if !ok {
// The core assumption of the controller, that we can get a proportional
// response, broke down. This may be transient, so temporarily switch to
// sleeping a fixed, conservative amount.
critSleepRatio = startingCritSleepRatio
controllerCooldown = 5e9 // 5 seconds.
// Signal the scav trace printer to output this.
lock(&scavenge.lock)
scavenge.printControllerReset = true
unlock(&scavenge.lock)
}
} }
} }
@ -434,7 +467,11 @@ func (p *pageAlloc) scavenge(nbytes uintptr) uintptr {
// released should be the amount of memory released since the last time this // released should be the amount of memory released since the last time this
// was called, and forced indicates whether the scavenge was forced by the // was called, and forced indicates whether the scavenge was forced by the
// application. // application.
//
// scavenge.lock must be held.
func printScavTrace(gen uint32, released uintptr, forced bool) { func printScavTrace(gen uint32, released uintptr, forced bool) {
assertLockHeld(&scavenge.lock)
printlock() printlock()
print("scav ", gen, " ", print("scav ", gen, " ",
released>>10, " KiB work, ", released>>10, " KiB work, ",
@ -443,6 +480,9 @@ func printScavTrace(gen uint32, released uintptr, forced bool) {
) )
if forced { if forced {
print(" (forced)") print(" (forced)")
} else if scavenge.printControllerReset {
print(" [controller reset]")
scavenge.printControllerReset = false
} }
println() println()
printunlock() printunlock()