From db7fd1c1420d5ef2f874f40c0349b35007568c77 Mon Sep 17 00:00:00 2001
From: Rick Hudson
Date: Tue, 6 Jan 2015 14:58:49 -0500
Subject: [PATCH] runtime: increase GC concurrency.

Run the GC in its own background goroutine, making the caller runnable
if resources are available. This is critical in single-goroutine
applications. Allow goroutines that allocate a lot to help out the GC
and in doing so throttle their own allocation.

Adjust the test so that it only detects that a GC is run during init
calls, not whether the GC is memory efficient. Memory efficiency work
will happen later in 1.5.

Change-Id: I4306f5e377bb47c69bda1aedba66164f12b20c2b
Reviewed-on: https://go-review.googlesource.com/2349
Reviewed-by: Russ Cox
Reviewed-by: Austin Clements
---
 src/runtime/malloc.go |  76 +++++++++++++++++++++++++------
 src/runtime/mgc.go    | 103 ++++++++++++++++++++++++++++++++++++------
 src/runtime/mgc0.go   |  13 ++++++
 test/init1.go         |   2 +-
 4 files changed, 166 insertions(+), 28 deletions(-)

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 5cf4582822..bc14d2222d 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -39,10 +39,27 @@ type pageID uintptr
 // base address for all 0-byte allocations
 var zerobase uintptr

+// Determine whether to initiate a GC.
+// Currently the primitive heuristic we use will start a new
+// concurrent GC when approximately half the available space
+// made available by the last GC cycle has been used.
+// If the GC is already working, there is no need to trigger another one.
+// This should establish a feedback loop where if the GC does not
+// have sufficient time to complete then more memory will be
+// requested from the OS, increasing heap size and thus allowing
+// future GCs more time to complete.
+// The memstats.heap_alloc and memstats.next_gc reads have benign races.
+// A false negative simply does not start a GC, a false positive
+// will start a GC needlessly. Neither has correctness issues.
+func shouldtriggergc() bool {
+    return memstats.heap_alloc+memstats.heap_alloc*3/4 >= memstats.next_gc && atomicloaduint(&bggc.working) == 0
+}
+
 // Allocate an object of size bytes.
 // Small objects are allocated from the per-P cache's free lists.
 // Large objects (> 32 kB) are allocated straight from the heap.
 func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
+    shouldhelpgc := false
     if size == 0 {
         return unsafe.Pointer(&zerobase)
     }
@@ -144,6 +161,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
             systemstack(func() {
                 mCache_Refill(c, tinySizeClass)
             })
+            shouldhelpgc = true
             s = c.alloc[tinySizeClass]
             v = s.freelist
         }
@@ -174,6 +192,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
             systemstack(func() {
                 mCache_Refill(c, int32(sizeclass))
             })
+            shouldhelpgc = true
             s = c.alloc[sizeclass]
             v = s.freelist
         }
@@ -191,6 +210,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
         c.local_cachealloc += intptr(size)
     } else {
         var s *mspan
+        shouldhelpgc = true
         systemstack(func() {
             s = largeAlloc(size, uint32(flags))
         })
@@ -345,8 +365,15 @@ marked:
         }
     }

-    if memstats.heap_alloc >= memstats.next_gc/2 {
+    if shouldtriggergc() {
         gogc(0)
+    } else if shouldhelpgc && atomicloaduint(&bggc.working) == 1 {
+        // bggc.lock not taken since race on bggc.working is benign.
+        // At worst we don't call gchelpwork.
+        // Delay the gchelpwork until the epilogue so that it doesn't
+        // interfere with the inner workings of malloc, such as
+        // mcache refills that might happen while doing the gchelpwork.
+        systemstack(gchelpwork)
     }

     return x
@@ -466,14 +493,25 @@ func gogc(force int32) {
     releasem(mp)
     mp = nil

-    semacquire(&worldsema, false)
-
-    if force == 0 && memstats.heap_alloc < memstats.next_gc {
-        // typically threads which lost the race to grab
-        // worldsema exit here when gc is done.
-        semrelease(&worldsema)
-        return
+    if force == 0 {
+        lock(&bggc.lock)
+        if !bggc.started {
+            bggc.working = 1
+            bggc.started = true
+            go backgroundgc()
+        } else if bggc.working == 0 {
+            bggc.working = 1
+            ready(bggc.g)
+        }
+        unlock(&bggc.lock)
+    } else {
+        gcwork(force)
     }
+}
+
+func gcwork(force int32) {
+
+    semacquire(&worldsema, false)

     // Pick up the remaining unswept/not being swept spans concurrently
     for gosweepone() != ^uintptr(0) {
@@ -482,14 +520,17 @@

     // Ok, we're doing it! Stop everybody else

-    startTime := nanotime()
-    mp = acquirem()
+    mp := acquirem()
     mp.gcing = 1
     releasem(mp)
     gctimer.count++
     if force == 0 {
         gctimer.cycle.sweepterm = nanotime()
     }
+    // Pick up the remaining unswept/not being swept spans before we STW
+    for gosweepone() != ^uintptr(0) {
+        sweep.nbgsweep++
+    }
     systemstack(stoptheworld)
     systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
     if force == 0 {            // Do as much work concurrently as possible
@@ -500,7 +541,7 @@
         systemstack(gcscan_m)
         gctimer.cycle.installmarkwb = nanotime()
         systemstack(stoptheworld)
-        gcinstallmarkwb()
+        systemstack(gcinstallmarkwb)
         systemstack(starttheworld)
         gctimer.cycle.mark = nanotime()
         systemstack(gcmark_m)
@@ -509,6 +550,7 @@
         systemstack(gcinstalloffwb_m)
     }

+    startTime := nanotime()
     if mp != acquirem() {
         throw("gogc: rescheduled")
     }
@@ -527,6 +569,7 @@
     eagersweep := force >= 2
     for i := 0; i < n; i++ {
         if i > 0 {
+            // refresh the start time if doing a second GC
             startTime = nanotime()
         }
         // switch to g0, call gc, then switch back
@@ -579,8 +622,8 @@ func GCcheckmarkdisable() {
 // gctimes records the time in nanoseconds of each phase of the concurrent GC.
 type gctimes struct {
     sweepterm     int64 // stw
-    scan          int64 // stw
-    installmarkwb int64
+    scan          int64
+    installmarkwb int64 // stw
     mark          int64
     markterm      int64 // stw
     sweep         int64
@@ -601,7 +644,7 @@ type gcchronograph struct {

 var gctimer gcchronograph

-// GCstarttimes initializes the gc timess. All previous timess are lost.
+// GCstarttimes initializes the gc times. All previous times are lost.
 func GCstarttimes(verbose int64) {
     gctimer = gcchronograph{verbose: verbose}
 }
@@ -655,6 +698,11 @@ func calctimes() gctimes {
 // the information from the most recent Concurent GC cycle. Calls from the
 // application to runtime.GC() are ignored.
 func GCprinttimes() {
+    if gctimer.verbose == 0 {
+        println("GC timers not enabled")
+        return
+    }
+
     // Explicitly put times on the heap so printPhase can use it.
     times := new(gctimes)
     *times = calctimes()
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 35edd8aa30..4d0900a41c 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -123,7 +123,7 @@ const (
     _DebugGCPtrs = false // if true, print trace of every pointer load during GC

     _ConcurrentSweep = true
-    _WorkbufSize     = 4 * 1024
+    _WorkbufSize     = 4 * 256
     _FinBlockSize    = 4 * 1024
     _RootData        = 0
     _RootBss         = 1
@@ -191,9 +191,9 @@ var badblock [1024]uintptr
 var nbadblock int32

 type workdata struct {
-    full    uint64 // lock-free list of full blocks
-    empty   uint64 // lock-free list of empty blocks
-    partial uint64 // lock-free list of partially filled blocks
+    full    uint64 // lock-free list of full workbufs
+    empty   uint64 // lock-free list of empty workbufs
+    partial uint64 // lock-free list of partially filled workbufs
     pad0    [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
     nproc   uint32
     tstart  int64
@@ -587,6 +587,11 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8) {
     // base and extent.
     b := b0
     n := n0
+
+    // ptrmask can have 2 possible values:
+    // 1. nil - obtain pointer mask from GC bitmap.
+    // 2. pointer to a compact mask (for stacks and data).
+
     wbuf := getpartialorempty()
     if b != 0 {
         wbuf = scanobject(b, n, ptrmask, wbuf)
@@ -600,23 +605,23 @@
             return
         }
     }
-    if gcphase == _GCscan {
-        throw("scanblock: In GCscan phase but no b passed in.")
-    }
-    keepworking := b == 0
+    drainallwbufs := b == 0
+    drainworkbuf(wbuf, drainallwbufs)
+}

+// Scan objects in wbuf until wbuf is empty.
+// If drainallwbufs is true, find all other available workbufs and repeat the process.
+//go:nowritebarrier
+func drainworkbuf(wbuf *workbuf, drainallwbufs bool) {
     if gcphase != _GCmark && gcphase != _GCmarktermination {
         println("gcphase", gcphase)
         throw("scanblock phase")
     }

-    // ptrmask can have 2 possible values:
-    // 1. nil - obtain pointer mask from GC bitmap.
-    // 2. pointer to a compact mask (for stacks and data).
     for {
         if wbuf.nobj == 0 {
-            if !keepworking {
+            if !drainallwbufs {
                 putempty(wbuf)
                 return
             }
@@ -641,11 +646,32 @@
         //     PREFETCH(wbuf->obj[wbuf->nobj - 3];
         // }
         wbuf.nobj--
-        b = wbuf.obj[wbuf.nobj]
+        b := wbuf.obj[wbuf.nobj]
         wbuf = scanobject(b, mheap_.arena_used-b, nil, wbuf)
     }
 }

+// Scan at most count objects in the wbuf.
+//go:nowritebarrier
+func drainobjects(wbuf *workbuf, count uintptr) {
+    for i := uintptr(0); i < count; i++ {
+        if wbuf.nobj == 0 {
+            putempty(wbuf)
+            return
+        }
+
+        // This might be a good place to add prefetch code...
+        // if(wbuf->nobj > 4) {
+        //     PREFETCH(wbuf->obj[wbuf->nobj - 3];
+        // }
+        wbuf.nobj--
+        b := wbuf.obj[wbuf.nobj]
+        wbuf = scanobject(b, mheap_.arena_used-b, nil, wbuf)
+    }
+    putpartial(wbuf)
+    return
+}
+
 //go:nowritebarrier
 func markroot(desc *parfor, i uint32) {
     // Note: if you add a case here, please also update heapdump.c:dumproots.
@@ -809,6 +835,17 @@ func putpartial(b *workbuf) {
     }
 }

+// trygetfull tries to get a full or partially empty workbuffer.
+// If one is not immediately available, return nil.
+//go:nowritebarrier
+func trygetfull() *workbuf {
+    wbuf := (*workbuf)(lfstackpop(&work.full))
+    if wbuf == nil {
+        wbuf = (*workbuf)(lfstackpop(&work.partial))
+    }
+    return wbuf
+}
+
 // Get a full work buffer off the work.full or a partially
 // filled one off the work.partial list. If nothing is available
 // wait until all the other gc helpers have finished and then
@@ -1090,6 +1127,38 @@ func gcmarkwb_m(slot *uintptr, ptr uintptr) {
     }
 }

+// gchelpwork does a small bounded amount of gc work. The purpose is to
+// shorten the time (as measured by allocations) spent doing a concurrent GC.
+// The number of mutator calls is roughly proportional to the number of
+// allocations made by that mutator. This slows down the allocation while
+// speeding up the GC.
+//go:nowritebarrier
+func gchelpwork() {
+    switch gcphase {
+    default:
+        throw("gcphasework in bad gcphase")
+    case _GCoff, _GCquiesce, _GCstw:
+        // No work.
+    case _GCsweep:
+        // We could help by calling sweepone to sweep a single span.
+        // _ = sweepone()
+    case _GCscan:
+        // scan the stack, mark the objects, put pointers in work buffers
+        // hanging off the P where this is being run.
+        // scanstack(gp)
+    case _GCmark:
+        // Get a full work buffer and empty it.
+        var wbuf *workbuf
+        wbuf = trygetfull()
+        if wbuf != nil {
+            drainobjects(wbuf, uintptr(len(wbuf.obj))) // drain up to one buffer's worth of objects
+        }
+    case _GCmarktermination:
+        // We should never be here since the world is stopped.
+        // All available mark work will be emptied before returning.
+        throw("gcphasework in bad gcphase")
+    }
+}
+
 // The gp has been moved to a GC safepoint. GC phase specific
 // work is done here.
 //go:nowritebarrier
@@ -1425,6 +1494,14 @@ type sweepdata struct {

 var sweep sweepdata

+// State of the background concurrent GC goroutine.
+var bggc struct {
+    lock    mutex
+    g       *g
+    working uint
+    started bool
+}
+
 // sweeps one span
 // returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep
 //go:nowritebarrier
diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go
index b9718cbd18..625c8740f7 100644
--- a/src/runtime/mgc0.go
+++ b/src/runtime/mgc0.go
@@ -78,6 +78,19 @@ func clearpools() {
     }
 }

+// backgroundgc runs in a goroutine and does the concurrent GC work.
+// bggc holds the state of the backgroundgc.
+func backgroundgc() {
+    bggc.g = getg()
+    bggc.g.issystem = true
+    for {
+        gcwork(0)
+        lock(&bggc.lock)
+        bggc.working = 0
+        goparkunlock(&bggc.lock, "Concurrent GC wait")
+    }
+}
+
 func bgsweep() {
     sweep.g = getg()
     getg().issystem = true
diff --git a/test/init1.go b/test/init1.go
index f6eda6edfe..83e9149f4c 100644
--- a/test/init1.go
+++ b/test/init1.go
@@ -31,7 +31,7 @@ func init() {
     }
     runtime.ReadMemStats(memstats)
     sys1 := memstats.Sys
-    if sys1-sys > chunk*50 {
+    if sys1-sys > chunk*500 {
         println("allocated 1000 chunks of", chunk, "and used ", sys1-sys, "memory")
         panic("init1")
     }
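A note on the trigger arithmetic, with a minimal stand-alone sketch that is not part of the patch: shouldtriggergc fires when heap_alloc+heap_alloc*3/4 >= next_gc, which is algebraically the same as the live heap reaching 4/7 (about 57%) of the next_gc goal, provided no background cycle is already running. The names shouldTrigger, heapAlloc and nextGC below are stand-ins chosen for illustration, not the runtime's memstats fields.

package main

import "fmt"

// shouldTrigger mirrors the comparison in shouldtriggergc:
// heapAlloc+heapAlloc*3/4 >= nextGC, i.e. heapAlloc >= 4/7 of nextGC.
func shouldTrigger(heapAlloc, nextGC uint64, gcRunning bool) bool {
    return heapAlloc+heapAlloc*3/4 >= nextGC && !gcRunning
}

func main() {
    const nextGC = 7 << 20 // hypothetical 7 MB goal
    for _, alloc := range []uint64{3 << 20, 4 << 20, 5 << 20} {
        // The 4 MB case is exactly 4/7 of the goal and triggers.
        fmt.Printf("alloc=%d MB -> trigger=%v\n", alloc>>20, shouldTrigger(alloc, nextGC, false))
    }
}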
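On the background goroutine itself: gogc(0) starts backgroundgc once, afterwards only calls ready(bggc.g) when bggc.working is 0, and backgroundgc parks between cycles with goparkunlock. Those are runtime-internal scheduler hooks, so the sketch below is only a rough user-level analogue of the same wake-only-if-idle pattern, built from a buffered channel and an atomic flag; backgroundWorker, requestCycle, kick and working are invented names for this sketch.

package main

import (
    "fmt"
    "sync/atomic"
    "time"
)

var working int32                 // 1 while a cycle is running (stand-in for bggc.working)
var kick = make(chan struct{}, 1) // wake-up signal (stand-in for ready/goparkunlock)

// backgroundWorker runs one cycle per wake-up, much as backgroundgc runs gcwork(0).
func backgroundWorker(cycle func()) {
    for range kick {
        cycle()
        atomic.StoreInt32(&working, 0)
    }
}

// requestCycle mirrors the gogc(0) path: wake the worker only if it is idle.
func requestCycle() {
    if atomic.CompareAndSwapInt32(&working, 0, 1) {
        kick <- struct{}{}
    }
}

func main() {
    go backgroundWorker(func() {
        fmt.Println("cycle running")
        time.Sleep(10 * time.Millisecond)
    })
    for i := 0; i < 5; i++ {
        requestCycle() // requests arriving while a cycle runs are dropped
        time.Sleep(3 * time.Millisecond)
    }
    time.Sleep(20 * time.Millisecond) // let the last cycle finish
}

The property shared with the patch is that the caller never blocks: it either hands the work to the background goroutine or sees that a cycle is already in flight and moves on.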
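Finally, the allocation-throttling side: while bggc.working is 1, an allocating goroutine that had to refill an mcache or take the large-object path calls gchelpwork, which in the mark phase grabs at most one workbuf via trygetfull and drains at most len(wbuf.obj) objects with drainobjects, so each helper pays a small, bounded tax. Below is a simplified sketch of that bounded-drain shape; workBuffer, drainObjects and the scan callback are illustrative stand-ins for the runtime's workbuf and scanobject, not the real types.

package main

import "fmt"

// workBuffer is a toy stand-in for the runtime's workbuf: a stack of
// pending items with an explicit count, drained from the top.
type workBuffer struct {
    obj  []int
    nobj int
}

// drainObjects processes at most count items, mirroring the loop shape of
// drainobjects in the patch; whatever is left stays in the buffer for
// other workers (putpartial in the runtime).
func drainObjects(w *workBuffer, count int, scan func(int)) {
    for i := 0; i < count; i++ {
        if w.nobj == 0 {
            return // buffer fully drained
        }
        w.nobj--
        scan(w.obj[w.nobj])
    }
}

func main() {
    w := &workBuffer{obj: []int{1, 2, 3, 4, 5}, nobj: 5}
    drainObjects(w, 2, func(o int) { fmt.Println("scanned", o) })
    fmt.Println("objects left for other workers:", w.nobj)
}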