From 0368a7ceb6e741a641a07e3ae381bdc9fc160a15 Mon Sep 17 00:00:00 2001 From: Carl Shapiro Date: Tue, 3 Dec 2013 14:12:55 -0800 Subject: [PATCH] runtime: move stack scanning into the parallel mark phase This change reduces the cost of the stack scanning by frames. It moves the stack scanning from the serial root enumeration phase to the parallel tracing phase. The output that follows are timings for the issue 6482 benchmark Baseline BenchmarkGoroutineSelect 50 108027405 ns/op BenchmarkGoroutineBlocking 50 89573332 ns/op BenchmarkGoroutineForRange 20 95614116 ns/op BenchmarkGoroutineIdle 20 122809512 ns/op Stack scan by frames, non-parallel BenchmarkGoroutineSelect 20 297138929 ns/op BenchmarkGoroutineBlocking 20 301137599 ns/op BenchmarkGoroutineForRange 10 312499469 ns/op BenchmarkGoroutineIdle 10 209428876 ns/op Stack scan by frames, parallel BenchmarkGoroutineSelect 20 183938431 ns/op BenchmarkGoroutineBlocking 20 170109999 ns/op BenchmarkGoroutineForRange 20 179628882 ns/op BenchmarkGoroutineIdle 20 157541498 ns/op The remaining performance disparity is due to inefficiencies in gentraceback and its callees. The effect was isolated by using a parallel stack scan where scanstack was modified to do a conservative scan of the stack segments without gentraceback followed by a call of gentrackback with a no-op callback. The output that follows are the top-10 most frequent tops of stacks as determined by the Linux perf record facility. Baseline + 25.19% gc.test gc.test [.] runtime.xchg + 19.00% gc.test gc.test [.] scanblock + 8.53% gc.test gc.test [.] scanstack + 8.46% gc.test gc.test [.] flushptrbuf + 5.08% gc.test gc.test [.] procresize + 3.57% gc.test gc.test [.] runtime.chanrecv + 2.94% gc.test gc.test [.] dequeue + 2.74% gc.test gc.test [.] addroots + 2.25% gc.test gc.test [.] runtime.ready + 1.33% gc.test gc.test [.] runtime.cas64 Gentraceback + 18.12% gc.test gc.test [.] runtime.xchg + 14.68% gc.test gc.test [.] scanblock + 8.20% gc.test gc.test [.] runtime.gentraceback + 7.38% gc.test gc.test [.] flushptrbuf + 6.84% gc.test gc.test [.] scanstack + 5.92% gc.test gc.test [.] runtime.findfunc + 3.62% gc.test gc.test [.] procresize + 3.15% gc.test gc.test [.] readvarint + 1.92% gc.test gc.test [.] addroots + 1.87% gc.test gc.test [.] runtime.chanrecv R=golang-dev, dvyukov, rsc CC=golang-dev https://golang.org/cl/17410043 --- src/pkg/runtime/mgc0.c | 324 ++++++++++++++++++++++++----------------- 1 file changed, 191 insertions(+), 133 deletions(-) diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c index 4b2108ba7a1..8275af5042c 100644 --- a/src/pkg/runtime/mgc0.c +++ b/src/pkg/runtime/mgc0.c @@ -19,7 +19,7 @@ enum { Debug = 0, DebugMark = 0, // run second pass to check mark CollectStats = 0, - ScanStackByFrames = 0, + ScanStackByFrames = 1, IgnorePreciseGC = 0, // Four bits per word (see #defines below). @@ -151,6 +151,7 @@ static Workbuf* getfull(Workbuf*); static void putempty(Workbuf*); static Workbuf* handoff(Workbuf*); static void gchelperstart(void); +static void scanstack(G* gp, void *scanbuf); static struct { uint64 full; // lock-free list of full blocks @@ -176,6 +177,7 @@ static struct { enum { GC_DEFAULT_PTR = GC_NUM_INSTR, GC_CHAN, + GC_G_PTR, GC_NUM_INSTR2 }; @@ -317,6 +319,24 @@ struct PtrTarget uintptr ti; }; +typedef struct Scanbuf Scanbuf; +struct Scanbuf +{ + struct { + PtrTarget *begin; + PtrTarget *end; + PtrTarget *pos; + } ptr; + struct { + Obj *begin; + Obj *end; + Obj *pos; + } obj; + Workbuf *wbuf; + Obj *wp; + uintptr nobj; +}; + typedef struct BufferList BufferList; struct BufferList { @@ -350,7 +370,7 @@ static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj); // flushptrbuf // (find block start, mark and enqueue) static void -flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj) +flushptrbuf(Scanbuf *sbuf) { byte *p, *arena_start, *obj; uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti, n; @@ -358,17 +378,19 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf PageID k; Obj *wp; Workbuf *wbuf; + PtrTarget *ptrbuf; PtrTarget *ptrbuf_end; arena_start = runtime·mheap.arena_start; - wp = *_wp; - wbuf = *_wbuf; - nobj = *_nobj; + wp = sbuf->wp; + wbuf = sbuf->wbuf; + nobj = sbuf->nobj; - ptrbuf_end = *ptrbufpos; - n = ptrbuf_end - ptrbuf; - *ptrbufpos = ptrbuf; + ptrbuf = sbuf->ptr.begin; + ptrbuf_end = sbuf->ptr.pos; + n = ptrbuf_end - sbuf->ptr.begin; + sbuf->ptr.pos = sbuf->ptr.begin; if(CollectStats) { runtime·xadd64(&gcstats.ptr.sum, n); @@ -514,25 +536,27 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf } } - *_wp = wp; - *_wbuf = wbuf; - *_nobj = nobj; + sbuf->wp = wp; + sbuf->wbuf = wbuf; + sbuf->nobj = nobj; } static void -flushobjbuf(Obj *objbuf, Obj **objbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj) +flushobjbuf(Scanbuf *sbuf) { uintptr nobj, off; Obj *wp, obj; Workbuf *wbuf; + Obj *objbuf; Obj *objbuf_end; - wp = *_wp; - wbuf = *_wbuf; - nobj = *_nobj; + wp = sbuf->wp; + wbuf = sbuf->wbuf; + nobj = sbuf->nobj; - objbuf_end = *objbufpos; - *objbufpos = objbuf; + objbuf = sbuf->obj.begin; + objbuf_end = sbuf->obj.pos; + sbuf->obj.pos = sbuf->obj.begin; while(objbuf < objbuf_end) { obj = *objbuf++; @@ -570,9 +594,9 @@ flushobjbuf(Obj *objbuf, Obj **objbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_ wp = wbuf->obj + nobj; } - *_wp = wp; - *_wbuf = wbuf; - *_nobj = nobj; + sbuf->wp = wp; + sbuf->wbuf = wbuf; + sbuf->nobj = nobj; } // Program that scans the whole block and treats every block element as a potential pointer @@ -581,6 +605,9 @@ static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR}; // Hchan program static uintptr chanProg[2] = {0, GC_CHAN}; +// G* program +static uintptr gptrProg[2] = {0, GC_G_PTR}; + // Local variables of a program fragment or loop typedef struct Frame Frame; struct Frame { @@ -666,8 +693,7 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) Slice *sliceptr; Frame *stack_ptr, stack_top, stack[GC_STACK_CAPACITY+4]; BufferList *scanbuffers; - PtrTarget *ptrbuf, *ptrbuf_end, *ptrbufpos; - Obj *objbuf, *objbuf_end, *objbufpos; + Scanbuf sbuf; Eface *eface; Iface *iface; Hchan *chan; @@ -681,21 +707,22 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) arena_used = runtime·mheap.arena_used; stack_ptr = stack+nelem(stack)-1; - + precise_type = false; nominal_size = 0; - // Allocate ptrbuf - { - scanbuffers = &bufferList[m->helpgc]; - ptrbuf = &scanbuffers->ptrtarget[0]; - ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget); - objbuf = &scanbuffers->obj[0]; - objbuf_end = &scanbuffers->obj[0] + nelem(scanbuffers->obj); - } + // Initialize sbuf + scanbuffers = &bufferList[m->helpgc]; - ptrbufpos = ptrbuf; - objbufpos = objbuf; + sbuf.ptr.begin = sbuf.ptr.pos = &scanbuffers->ptrtarget[0]; + sbuf.ptr.end = sbuf.ptr.begin + nelem(scanbuffers->ptrtarget); + + sbuf.obj.begin = sbuf.obj.pos = &scanbuffers->obj[0]; + sbuf.obj.end = sbuf.obj.begin + nelem(scanbuffers->obj); + + sbuf.wbuf = wbuf; + sbuf.wp = wp; + sbuf.nobj = nobj; // (Silence the compiler) chan = nil; @@ -713,7 +740,7 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) if(CollectStats) { runtime·xadd64(&gcstats.nbytes, n); - runtime·xadd64(&gcstats.obj.sum, nobj); + runtime·xadd64(&gcstats.obj.sum, sbuf.nobj); runtime·xadd64(&gcstats.obj.cnt, 1); } @@ -839,9 +866,9 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // eface->type t = eface->type; if((void*)t >= arena_start && (void*)t < arena_used) { - *ptrbufpos++ = (PtrTarget){t, 0}; - if(ptrbufpos == ptrbuf_end) - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); + *sbuf.ptr.pos++ = (PtrTarget){t, 0}; + if(sbuf.ptr.pos == sbuf.ptr.end) + flushptrbuf(&sbuf); } // eface->data @@ -868,9 +895,9 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // iface->tab if((void*)iface->tab >= arena_start && (void*)iface->tab < arena_used) { - *ptrbufpos++ = (PtrTarget){iface->tab, (uintptr)itabtype->gc}; - if(ptrbufpos == ptrbuf_end) - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); + *sbuf.ptr.pos++ = (PtrTarget){iface->tab, (uintptr)itabtype->gc}; + if(sbuf.ptr.pos == sbuf.ptr.end) + flushptrbuf(&sbuf); } // iface->data @@ -895,9 +922,9 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) obj = *(byte**)stack_top.b; stack_top.b += PtrSize; if(obj >= arena_start && obj < arena_used) { - *ptrbufpos++ = (PtrTarget){obj, 0}; - if(ptrbufpos == ptrbuf_end) - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); + *sbuf.ptr.pos++ = (PtrTarget){obj, 0}; + if(sbuf.ptr.pos == sbuf.ptr.end) + flushptrbuf(&sbuf); } } goto next_block; @@ -926,7 +953,7 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) if(*(byte**)i != nil) { // Found a value that may be a pointer. // Do a rescan of the entire block. - enqueue((Obj){b, n, 0}, &wbuf, &wp, &nobj); + enqueue((Obj){b, n, 0}, &sbuf.wbuf, &sbuf.wp, &sbuf.nobj); if(CollectStats) { runtime·xadd64(&gcstats.rescan, 1); runtime·xadd64(&gcstats.rescanbytes, n); @@ -972,9 +999,9 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) objti = pc[3]; pc += 4; - *objbufpos++ = (Obj){obj, size, objti}; - if(objbufpos == objbuf_end) - flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + *sbuf.obj.pos++ = (Obj){obj, size, objti}; + if(sbuf.obj.pos == sbuf.obj.end) + flushobjbuf(&sbuf); continue; case GC_CHAN_PTR: @@ -1007,10 +1034,10 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // in-use part of the circular buffer is scanned. // (Channel routines zero the unused part, so the current // code does not lead to leaks, it's just a little inefficient.) - *objbufpos++ = (Obj){(byte*)chan+runtime·Hchansize, chancap*chantype->elem->size, + *sbuf.obj.pos++ = (Obj){(byte*)chan+runtime·Hchansize, chancap*chantype->elem->size, (uintptr)chantype->elem->gc | PRECISE | LOOP}; - if(objbufpos == objbuf_end) - flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + if(sbuf.obj.pos == sbuf.obj.end) + flushobjbuf(&sbuf); } } if(chan_ret == nil) @@ -1018,15 +1045,20 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) pc = chan_ret; continue; + case GC_G_PTR: + obj = (void*)stack_top.b; + scanstack(obj, &sbuf); + goto next_block; + default: runtime·throw("scanblock: invalid GC instruction"); return; } if(obj >= arena_start && obj < arena_used) { - *ptrbufpos++ = (PtrTarget){obj, objti}; - if(ptrbufpos == ptrbuf_end) - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); + *sbuf.ptr.pos++ = (PtrTarget){obj, objti}; + if(sbuf.ptr.pos == sbuf.ptr.end) + flushptrbuf(&sbuf); } } @@ -1034,34 +1066,32 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // Done scanning [b, b+n). Prepare for the next iteration of // the loop by setting b, n, ti to the parameters for the next block. - if(nobj == 0) { - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); - flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + if(sbuf.nobj == 0) { + flushptrbuf(&sbuf); + flushobjbuf(&sbuf); - if(nobj == 0) { + if(sbuf.nobj == 0) { if(!keepworking) { - if(wbuf) - putempty(wbuf); - goto endscan; + if(sbuf.wbuf) + putempty(sbuf.wbuf); + return; } // Emptied our buffer: refill. - wbuf = getfull(wbuf); - if(wbuf == nil) - goto endscan; - nobj = wbuf->nobj; - wp = wbuf->obj + wbuf->nobj; + sbuf.wbuf = getfull(sbuf.wbuf); + if(sbuf.wbuf == nil) + return; + sbuf.nobj = sbuf.wbuf->nobj; + sbuf.wp = sbuf.wbuf->obj + sbuf.wbuf->nobj; } } // Fetch b from the work buffer. - --wp; - b = wp->p; - n = wp->n; - ti = wp->ti; - nobj--; + --sbuf.wp; + b = sbuf.wp->p; + n = sbuf.wp->n; + ti = sbuf.wp->ti; + sbuf.nobj--; } - -endscan:; } // debug_scanblock is the debug copy of scanblock. @@ -1340,7 +1370,7 @@ struct BitVector // Scans an interface data value when the interface type indicates // that it is a pointer. static void -scaninterfacedata(uintptr bits, byte *scanp, bool afterprologue) +scaninterfacedata(uintptr bits, byte *scanp, bool afterprologue, Scanbuf *sbuf) { Itab *tab; Type *type; @@ -1356,12 +1386,14 @@ scaninterfacedata(uintptr bits, byte *scanp, bool afterprologue) return; } } - addroot((Obj){scanp+PtrSize, PtrSize, 0}); + *sbuf->obj.pos++ = (Obj){scanp+PtrSize, PtrSize, 0}; + if(sbuf->obj.pos == sbuf->obj.end) + flushobjbuf(sbuf); } // Starting from scanp, scans words corresponding to set bits. static void -scanbitvector(byte *scanp, BitVector *bv, bool afterprologue) +scanbitvector(byte *scanp, BitVector *bv, bool afterprologue, Scanbuf *sbuf) { uintptr word, bits; uint32 *wordp; @@ -1378,71 +1410,28 @@ scanbitvector(byte *scanp, BitVector *bv, bool afterprologue) for(; i > 0; i--) { bits = word & 3; if(bits != BitsNoPointer && *(void**)scanp != nil) - if(bits == BitsPointer) - addroot((Obj){scanp, PtrSize, 0}); - else - scaninterfacedata(bits, scanp, afterprologue); + if(bits == BitsPointer) { + *sbuf->obj.pos++ = (Obj){scanp, PtrSize, 0}; + if(sbuf->obj.pos == sbuf->obj.end) + flushobjbuf(sbuf); + } else + scaninterfacedata(bits, scanp, afterprologue, sbuf); word >>= BitsPerPointer; scanp += PtrSize; } } } -// Scan a stack frame: local variables and function arguments/results. -static void -addframeroots(Stkframe *frame, void*) -{ - Func *f; - BitVector *args, *locals; - uintptr size; - bool afterprologue; - - f = frame->fn; - - // Scan local variables if stack frame has been allocated. - // Use pointer information if known. - afterprologue = (frame->varp > (byte*)frame->sp); - if(afterprologue) { - locals = runtime·funcdata(f, FUNCDATA_GCLocals); - if(locals == nil) { - // No locals information, scan everything. - size = frame->varp - (byte*)frame->sp; - addroot((Obj){frame->varp - size, size, 0}); - } else if(locals->n < 0) { - // Locals size information, scan just the - // locals. - size = -locals->n; - addroot((Obj){frame->varp - size, size, 0}); - } else if(locals->n > 0) { - // Locals bitmap information, scan just the - // pointers in locals. - size = (locals->n*PtrSize) / BitsPerPointer; - scanbitvector(frame->varp - size, locals, afterprologue); - } - } - - // Scan arguments. - // Use pointer information if known. - args = runtime·funcdata(f, FUNCDATA_GCArgs); - if(args != nil && args->n > 0) - scanbitvector(frame->argp, args, false); - else - addroot((Obj){frame->argp, frame->arglen, 0}); -} - static void addstackroots(G *gp) { M *mp; int32 n; Stktop *stk; - uintptr sp, guard, pc, lr; + uintptr sp, guard; void *base; uintptr size; - stk = (Stktop*)gp->stackbase; - guard = gp->stackguard; - if(gp == g) runtime·throw("can't scan our own stack"); if((mp = gp->m) != nil && mp->helpgc) @@ -1454,28 +1443,24 @@ addstackroots(G *gp) // Use the stack segment and stack pointer at the time of // the system call instead, since that won't change underfoot. sp = gp->syscallsp; - pc = gp->syscallpc; - lr = 0; stk = (Stktop*)gp->syscallstack; guard = gp->syscallguard; } else { // Scanning another goroutine's stack. // The goroutine is usually asleep (the world is stopped). sp = gp->sched.sp; - pc = gp->sched.pc; - lr = gp->sched.lr; - + stk = (Stktop*)gp->stackbase; + guard = gp->stackguard; // For function about to start, context argument is a root too. if(gp->sched.ctxt != 0 && runtime·mlookup(gp->sched.ctxt, &base, &size, nil)) addroot((Obj){base, size, 0}); } if(ScanStackByFrames) { + USED(sp); USED(stk); USED(guard); - runtime·gentraceback(pc, sp, lr, gp, 0, nil, 0x7fffffff, addframeroots, nil, false); + addroot((Obj){(byte*)gp, PtrSize, (uintptr)gptrProg}); } else { - USED(lr); - USED(pc); n = 0; while(stk) { if(sp < guard-StackGuard || (uintptr)stk < sp) { @@ -1491,6 +1476,79 @@ addstackroots(G *gp) } } +static void +scanframe(Stkframe *frame, void *arg) +{ + BitVector *args, *locals; + Scanbuf *sbuf; + uintptr size; + bool afterprologue; + + sbuf = arg; + // Scan local variables if stack frame has been allocated. + // Use pointer information if known. + afterprologue = (frame->varp > (byte*)frame->sp); + if(afterprologue) { + locals = runtime·funcdata(frame->fn, FUNCDATA_GCLocals); + if(locals == nil) { + // No locals information, scan everything. + size = frame->varp - (byte*)frame->sp; + *sbuf->obj.pos++ = (Obj){frame->varp - size, size, 0}; + if(sbuf->obj.pos == sbuf->obj.end) + flushobjbuf(sbuf); + } else if(locals->n < 0) { + // Locals size information, scan just the + // locals. + size = -locals->n; + *sbuf->obj.pos++ = (Obj){frame->varp - size, size, 0}; + if(sbuf->obj.pos == sbuf->obj.end) + flushobjbuf(sbuf); + } else if(locals->n > 0) { + // Locals bitmap information, scan just the + // pointers in locals. + size = (locals->n*PtrSize) / BitsPerPointer; + scanbitvector(frame->varp - size, locals, afterprologue, sbuf); + } + } + + // Scan arguments. + // Use pointer information if known. + args = runtime·funcdata(frame->fn, FUNCDATA_GCArgs); + if(args != nil && args->n > 0) + scanbitvector(frame->argp, args, false, sbuf); + else { + *sbuf->obj.pos++ = (Obj){frame->argp, frame->arglen, 0}; + if(sbuf->obj.pos == sbuf->obj.end) + flushobjbuf(sbuf); + } +} + +static void +scanstack(G* gp, void *scanbuf) +{ + uintptr pc; + uintptr sp; + uintptr lr; + + if(gp->syscallstack != (uintptr)nil) { + // Scanning another goroutine that is about to enter or might + // have just exited a system call. It may be executing code such + // as schedlock and may have needed to start a new stack segment. + // Use the stack segment and stack pointer at the time of + // the system call instead, since that won't change underfoot. + sp = gp->syscallsp; + pc = gp->syscallpc; + lr = 0; + } else { + // Scanning another goroutine's stack. + // The goroutine is usually asleep (the world is stopped). + sp = gp->sched.sp; + pc = gp->sched.pc; + lr = gp->sched.lr; + } + runtime·gentraceback(pc, sp, lr, gp, 0, nil, 0x7fffffff, scanframe, scanbuf, false); +} + static void addfinroots(void *v) {