runtime: make CPU limiter assist time much less error-prone
At the expense of performance (having to update another atomic counter) this
change makes CPU limiter assist time much less error-prone to manage. There
are currently a number of issues with respect to how scavenge assist time is
treated, and this change resolves those by just having the limiter maintain
its own internal pool that's drained on each update.

While we're here, clear the measured assist time each cycle, which was the
impetus for the change.

Change-Id: I84c513a9f012b4007362a33cddb742c5779782b7
Reviewed-on: https://go-review.googlesource.com/c/go/+/404304
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
This commit is contained in:
parent 5a4ba6d905
commit 2eb8b6eec6
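The commit message above describes the key mechanism: instead of callers passing a monotonically increasing total of assist time into every update, producers deposit assist CPU time into an atomic pool owned by the limiter, and each update drains whatever has accumulated. Below is a minimal, self-contained Go sketch of that pool-drain pattern; the type and function names are hypothetical stand-ins, not the runtime's actual internals.

// Sketch of the pool-drain pattern (hypothetical names, not the runtime API).
// Producers call addAssistTime wherever assist work happens; update drains
// the pool atomically, so nobody has to track a running total.
// Requires Go 1.19+ for atomic.Int64.
package main

import (
	"fmt"
	"sync/atomic"
)

type limiter struct {
	assistTimePool atomic.Int64 // assist CPU time accumulated since the last update
}

// addAssistTime deposits assist CPU time; it is folded in on the next update.
func (l *limiter) addAssistTime(t int64) {
	l.assistTimePool.Add(t)
}

// update drains the pool and returns the assist time accumulated since the
// previous call.
func (l *limiter) update() int64 {
	t := l.assistTimePool.Load()
	if t != 0 {
		// Subtract exactly what was observed; concurrent deposits that land
		// after the Load stay in the pool for the next update.
		l.assistTimePool.Add(-t)
	}
	return t
}

func main() {
	var l limiter
	l.addAssistTime(500)
	l.addAssistTime(250)
	fmt.Println(l.update()) // 750
	fmt.Println(l.update()) // 0
}

Because the drain subtracts only the amount it observed, assist time added concurrently is never lost; it simply rides along into the next update.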
@@ -1439,16 +1439,20 @@ func (l *GCCPULimiter) NeedUpdate(now int64) bool {
 	return l.limiter.needUpdate(now)
 }
 
-func (l *GCCPULimiter) StartGCTransition(enableGC bool, totalAssistTime, now int64) {
-	l.limiter.startGCTransition(enableGC, totalAssistTime, now)
+func (l *GCCPULimiter) StartGCTransition(enableGC bool, now int64) {
+	l.limiter.startGCTransition(enableGC, now)
 }
 
 func (l *GCCPULimiter) FinishGCTransition(now int64) {
 	l.limiter.finishGCTransition(now)
 }
 
-func (l *GCCPULimiter) Update(totalAssistTime int64, now int64) {
-	l.limiter.update(totalAssistTime, now)
+func (l *GCCPULimiter) Update(now int64) {
+	l.limiter.update(now)
 }
 
+func (l *GCCPULimiter) AddAssistTime(t int64) {
+	l.limiter.addAssistTime(t)
+}
+
 func (l *GCCPULimiter) ResetCapacity(now int64, nprocs int32) {
@@ -677,7 +677,7 @@ func gcStart(trigger gcTrigger) {
 	gcController.startCycle(now, int(gomaxprocs), trigger)
 
 	// Notify the CPU limiter that assists may begin.
-	gcCPULimiter.startGCTransition(true, 0, now)
+	gcCPULimiter.startGCTransition(true, now)
 
 	// In STW mode, disable scheduling of user Gs. This may also
 	// disable scheduling of this goroutine, so it may block as
@@ -891,8 +891,8 @@ top:
 	// this before waking blocked assists.
 	atomic.Store(&gcBlackenEnabled, 0)
 
-	// Notify the CPU limiter that assists will now cease.
-	gcCPULimiter.startGCTransition(false, gcController.assistTime.Load(), now)
+	// Notify the CPU limiter that GC assists will now cease.
+	gcCPULimiter.startGCTransition(false, now)
 
 	// Wake all blocked assists. These will run when we
 	// start the world again.
@@ -1017,6 +1017,12 @@ func gcMarkTermination() {
 	totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
 	memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
 
+	// Reset assist time stat.
+	//
+	// Do this now, instead of at the start of the next GC cycle, because
+	// these two may keep accumulating even if the GC is not active.
+	mheap_.pages.scav.assistTime.Store(0)
+
 	// Reset sweep state.
 	sweep.nbgsweep = 0
 	sweep.npausesweep = 0
@@ -55,11 +55,10 @@ type gcCPULimiterState struct {
 	// the mark and sweep phases.
 	transitioning bool
 
-	// lastTotalAssistTime is the last value of a monotonically increasing
-	// count of GC assist time, like gcController.assistTime.
-	lastTotalAssistTime int64
-
-	_ uint32 // Align lastUpdate on 32-bit platforms.
+	_ uint32 // Align assistTimePool and lastUpdate on 32-bit platforms.
+
+	// assistTimePool is the accumulated assist time since the last update.
+	assistTimePool atomic.Int64
 
 	// lastUpdate is the nanotime timestamp of the last time update was called.
 	//
@@ -81,15 +80,13 @@ func (l *gcCPULimiterState) limiting() bool {
 	return l.enabled.Load()
 }
 
-// startGCTransition notifies the limiter of a GC transition. totalAssistTime
-// is the same as described for update. now must be the start of the STW pause
-// for the GC transition.
+// startGCTransition notifies the limiter of a GC transition.
 //
 // This call takes ownership of the limiter and disables all other means of
 // updating the limiter. Release ownership by calling finishGCTransition.
 //
 // It is safe to call concurrently with other operations.
-func (l *gcCPULimiterState) startGCTransition(enableGC bool, totalAssistTime, now int64) {
+func (l *gcCPULimiterState) startGCTransition(enableGC bool, now int64) {
 	if !l.tryLock() {
 		// This must happen during a STW, so we can't fail to acquire the lock.
 		// If we did, something went wrong. Throw.
@@ -99,10 +96,7 @@ func (l *gcCPULimiterState) startGCTransition(enableGC bool, totalAssistTime, no
 		throw("transitioning GC to the same state as before?")
 	}
 	// Flush whatever was left between the last update and now.
-	l.updateLocked(totalAssistTime, now)
-	if enableGC && totalAssistTime != 0 {
-		throw("assist time must be zero on entry to a GC cycle")
-	}
+	l.updateLocked(now)
 	l.gcEnabled = enableGC
 	l.transitioning = true
 	// N.B. finishGCTransition releases the lock.
@@ -127,8 +121,6 @@ func (l *gcCPULimiterState) finishGCTransition(now int64) {
 	}
 	l.lastUpdate.Store(now)
 	l.transitioning = false
-	// Reset lastTotalAssistTime for the next GC cycle.
-	l.lastTotalAssistTime = 0
 	l.unlock()
 }
 
@@ -142,12 +134,17 @@ func (l *gcCPULimiterState) needUpdate(now int64) bool {
 	return now-l.lastUpdate.Load() > gcCPULimiterUpdatePeriod
 }
 
-// update updates the bucket given runtime-specific information. totalAssistTime must
-// be a value that increases monotonically throughout the GC cycle, and is reset
-// at the start of a new mark phase. now is the current monotonic time in nanoseconds.
+// addAssistTime notifies the limiter of additional assist time. It will be
+// included in the next update.
+func (l *gcCPULimiterState) addAssistTime(t int64) {
+	l.assistTimePool.Add(t)
+}
+
+// update updates the bucket given runtime-specific information. now is the
+// current monotonic time in nanoseconds.
 //
 // This is safe to call concurrently with other operations, except *GCTransition.
-func (l *gcCPULimiterState) update(totalAssistTime int64, now int64) {
+func (l *gcCPULimiterState) update(now int64) {
 	if !l.tryLock() {
 		// We failed to acquire the lock, which means something else is currently
 		// updating. Just drop our update, the next one to update will include
@@ -157,31 +154,32 @@ func (l *gcCPULimiterState) update(totalAssistTime int64, now int64) {
 	if l.transitioning {
 		throw("update during transition")
 	}
-	l.updateLocked(totalAssistTime, now)
+	l.updateLocked(now)
 	l.unlock()
 }
 
 // updatedLocked is the implementation of update. l.lock must be held.
-func (l *gcCPULimiterState) updateLocked(totalAssistTime int64, now int64) {
+func (l *gcCPULimiterState) updateLocked(now int64) {
 	lastUpdate := l.lastUpdate.Load()
-	if now < lastUpdate || totalAssistTime < l.lastTotalAssistTime {
+	if now < lastUpdate {
 		// Defensively avoid overflow. This isn't even the latest update anyway.
-		// This might seem like a lot to back out on, but provided that both
-		// totalAssistTime and now are fresh, updaters must've been closely
-		// racing. It's close enough that it doesn't matter, and in the long
-		// term the result is the same.
 		return
 	}
 	windowTotalTime := (now - lastUpdate) * int64(l.nprocs)
 	l.lastUpdate.Store(now)
-	if !l.gcEnabled {
-		l.accumulate(windowTotalTime, 0)
-		return
-	}
-	windowGCTime := totalAssistTime - l.lastTotalAssistTime
-	windowGCTime += int64(float64(windowTotalTime) * gcBackgroundUtilization)
+
+	// Drain the pool of assist time.
+	assistTime := l.assistTimePool.Load()
+	if assistTime != 0 {
+		l.assistTimePool.Add(-assistTime)
+	}
+
+	// Accumulate.
+	windowGCTime := assistTime
+	if l.gcEnabled {
+		windowGCTime += int64(float64(windowTotalTime) * gcBackgroundUtilization)
+	}
 	l.accumulate(windowTotalTime-windowGCTime, windowGCTime)
-	l.lastTotalAssistTime = totalAssistTime
 }
 
 // accumulate adds time to the bucket and signals whether the limiter is enabled.
@@ -249,7 +247,7 @@ func (l *gcCPULimiterState) resetCapacity(now int64, nprocs int32) {
 		throw("failed to acquire lock to reset capacity")
 	}
 	// Flush the rest of the time for this period.
-	l.updateLocked(0, now)
+	l.updateLocked(now)
 	l.nprocs = nprocs
 
 	l.bucket.capacity = uint64(nprocs) * capacityPerProc
@@ -21,12 +21,11 @@ func TestGCCPULimiter(t *testing.T) {
 		return ticks
 	}
 
-	// Create mock assist time.
-	assistCPUTime := int64(0)
-	doAssist := func(d time.Duration, frac float64) int64 {
+	// assistTime computes the CPU time for assists using frac of GOMAXPROCS
+	// over the wall-clock duration d.
+	assistTime := func(d time.Duration, frac float64) int64 {
 		t.Helper()
-		assistCPUTime += int64(frac * float64(d) * procs)
-		return assistCPUTime
+		return int64(frac * float64(d) * procs)
 	}
 
 	l := NewGCCPULimiter(ticks, procs)
@@ -45,9 +44,9 @@ func TestGCCPULimiter(t *testing.T) {
 
 	// Test filling the bucket with just mutator time.
 
-	l.Update(0, advance(10*time.Millisecond))
-	l.Update(0, advance(1*time.Second))
-	l.Update(0, advance(1*time.Hour))
+	l.Update(advance(10 * time.Millisecond))
+	l.Update(advance(1 * time.Second))
+	l.Update(advance(1 * time.Hour))
 	if l.Fill() != 0 {
 		t.Fatalf("expected empty bucket from only accumulating mutator time, got fill of %d cpu-ns", l.Fill())
 	}
@@ -60,14 +59,14 @@ func TestGCCPULimiter(t *testing.T) {
 	if !l.NeedUpdate(advance(GCCPULimiterUpdatePeriod)) {
 		t.Fatal("doesn't need update even though updated 1.5 periods ago")
 	}
-	l.Update(0, advance(0))
+	l.Update(advance(0))
 	if l.NeedUpdate(advance(0)) {
 		t.Fatal("need update even though just updated")
 	}
 
 	// Test transitioning the bucket to enable the GC.
 
-	l.StartGCTransition(true, 0, advance(109*time.Millisecond))
+	l.StartGCTransition(true, advance(109*time.Millisecond))
 	l.FinishGCTransition(advance(2*time.Millisecond + 1*time.Microsecond))
 
 	if expect := uint64((2*time.Millisecond + 1*time.Microsecond) * procs); l.Fill() != expect {
@@ -88,25 +87,27 @@ func TestGCCPULimiter(t *testing.T) {
 	// And here we want n=procs:
 	factor := (1 / (1 - 2*GCBackgroundUtilization))
 	fill := (2*time.Millisecond + 1*time.Microsecond) * procs
-	l.Update(0, advance(time.Duration(factor*float64(fill-procs)/procs)))
+	l.Update(advance(time.Duration(factor * float64(fill-procs) / procs)))
 	if l.Fill() != procs {
 		t.Fatalf("expected fill %d cpu-ns from draining after a GC started, got fill of %d cpu-ns", procs, l.Fill())
 	}
 
 	// Drain to zero for the rest of the test.
-	l.Update(0, advance(2*procs*CapacityPerProc))
+	l.Update(advance(2 * procs * CapacityPerProc))
 	if l.Fill() != 0 {
 		t.Fatalf("expected empty bucket from draining, got fill of %d cpu-ns", l.Fill())
 	}
 
 	// Test filling up the bucket with 50% total GC work (so, not moving the bucket at all).
-	l.Update(doAssist(10*time.Millisecond, 0.5-GCBackgroundUtilization), advance(10*time.Millisecond))
+	l.AddAssistTime(assistTime(10*time.Millisecond, 0.5-GCBackgroundUtilization))
+	l.Update(advance(10 * time.Millisecond))
 	if l.Fill() != 0 {
 		t.Fatalf("expected empty bucket from 50%% GC work, got fill of %d cpu-ns", l.Fill())
 	}
 
 	// Test adding to the bucket overall with 100% GC work.
-	l.Update(doAssist(time.Millisecond, 1.0-GCBackgroundUtilization), advance(time.Millisecond))
+	l.AddAssistTime(assistTime(time.Millisecond, 1.0-GCBackgroundUtilization))
+	l.Update(advance(time.Millisecond))
 	if expect := uint64(procs * time.Millisecond); l.Fill() != expect {
 		t.Errorf("expected %d fill from 100%% GC CPU, got fill of %d cpu-ns", expect, l.Fill())
 	}
@@ -118,7 +119,8 @@ func TestGCCPULimiter(t *testing.T) {
 	}
 
 	// Test filling the bucket exactly full.
-	l.Update(doAssist(CapacityPerProc-time.Millisecond, 1.0-GCBackgroundUtilization), advance(CapacityPerProc-time.Millisecond))
+	l.AddAssistTime(assistTime(CapacityPerProc-time.Millisecond, 1.0-GCBackgroundUtilization))
+	l.Update(advance(CapacityPerProc - time.Millisecond))
 	if l.Fill() != l.Capacity() {
 		t.Errorf("expected bucket filled to capacity %d, got %d", l.Capacity(), l.Fill())
 	}
@@ -134,7 +136,8 @@ func TestGCCPULimiter(t *testing.T) {
 
 	// Test adding with a delta of exactly zero. That is, GC work is exactly 50% of all resources.
 	// Specifically, the limiter should still be on, and no overflow should accumulate.
-	l.Update(doAssist(1*time.Second, 0.5-GCBackgroundUtilization), advance(1*time.Second))
+	l.AddAssistTime(assistTime(1*time.Second, 0.5-GCBackgroundUtilization))
+	l.Update(advance(1 * time.Second))
 	if l.Fill() != l.Capacity() {
 		t.Errorf("expected bucket filled to capacity %d, got %d", l.Capacity(), l.Fill())
 	}
@@ -149,7 +152,8 @@ func TestGCCPULimiter(t *testing.T) {
 	}
 
 	// Drain the bucket by half.
-	l.Update(doAssist(CapacityPerProc, 0), advance(CapacityPerProc))
+	l.AddAssistTime(assistTime(CapacityPerProc, 0))
+	l.Update(advance(CapacityPerProc))
 	if expect := l.Capacity() / 2; l.Fill() != expect {
 		t.Errorf("failed to drain to %d, got fill %d", expect, l.Fill())
 	}
@@ -161,7 +165,8 @@ func TestGCCPULimiter(t *testing.T) {
 	}
 
 	// Test overfilling the bucket.
-	l.Update(doAssist(CapacityPerProc, 1.0-GCBackgroundUtilization), advance(CapacityPerProc))
+	l.AddAssistTime(assistTime(CapacityPerProc, 1.0-GCBackgroundUtilization))
+	l.Update(advance(CapacityPerProc))
 	if l.Fill() != l.Capacity() {
 		t.Errorf("failed to fill to capacity %d, got fill %d", l.Capacity(), l.Fill())
 	}
@@ -176,8 +181,8 @@ func TestGCCPULimiter(t *testing.T) {
 	}
 
 	// Test ending the cycle with some assists left over.
-
-	l.StartGCTransition(false, doAssist(1*time.Millisecond, 1.0-GCBackgroundUtilization), advance(1*time.Millisecond))
+	l.AddAssistTime(assistTime(1*time.Millisecond, 1.0-GCBackgroundUtilization))
+	l.StartGCTransition(false, advance(1*time.Millisecond))
 	if l.Fill() != l.Capacity() {
 		t.Errorf("failed to maintain fill to capacity %d, got fill %d", l.Capacity(), l.Fill())
 	}
@@ -206,9 +211,6 @@ func TestGCCPULimiter(t *testing.T) {
 		t.FailNow()
 	}
 
-	// Reset the mock total assist CPU time, since we just ended the cycle.
-	assistCPUTime = 0
-
 	// Resize procs up and make sure limiting stops.
 	expectFill := l.Capacity()
 	l.ResetCapacity(advance(0), procs+10)
|
@ -594,9 +594,10 @@ func gcAssistAlloc1(gp *g, scanWork int64) {
|
||||
_p_ := gp.m.p.ptr()
|
||||
_p_.gcAssistTime += duration
|
||||
if _p_.gcAssistTime > gcAssistTimeSlack {
|
||||
assistTime := gcController.assistTime.Add(_p_.gcAssistTime)
|
||||
gcController.assistTime.Add(_p_.gcAssistTime)
|
||||
gcCPULimiter.addAssistTime(_p_.gcAssistTime)
|
||||
gcCPULimiter.update(now)
|
||||
_p_.gcAssistTime = 0
|
||||
gcCPULimiter.update(assistTime+mheap_.pages.scav.assistTime.Load(), now)
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -800,7 +800,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p, now int64) *g {
 	// case the limiter is on but hasn't been checked in a while and
 	// so may have left sufficient headroom to turn off again.
 	if gcCPULimiter.needUpdate(now) {
-		gcCPULimiter.update(gcController.assistTime.Load(), now)
+		gcCPULimiter.update(now)
 	}
 
 	if !gcMarkWorkAvailable(_p_) {
@@ -1301,8 +1301,9 @@ HaveSpan:
 		start := nanotime()
 		h.pages.scavenge(bytesToScavenge)
 		now := nanotime()
-		assistTime := h.pages.scav.assistTime.Add(now - start)
-		gcCPULimiter.update(gcController.assistTime.Load()+assistTime, now)
+		h.pages.scav.assistTime.Add(now - start)
+		gcCPULimiter.addAssistTime(now - start)
+		gcCPULimiter.update(now)
 	}
 
 	// Commit and account for any scavenged memory that the span now owns.