From d4c80d19a80cbdf946102f3b787ce23bf95e4e12 Mon Sep 17 00:00:00 2001 From: Dmitriy Vyukov Date: Thu, 21 Mar 2013 12:48:02 +0400 Subject: [PATCH] runtime: faster parallel GC Use per-thread work buffers instead of global mutex-protected pool. This eliminates contention from parallel scan phase. benchmark old ns/op new ns/op delta garbage.BenchmarkTree2-8 97100768 71417553 -26.45% garbage.BenchmarkTree2LastPause-8 970931485 714103692 -26.45% garbage.BenchmarkTree2Pause-8 469127802 345029253 -26.45% garbage.BenchmarkParser-8 2880950854 2715456901 -5.74% garbage.BenchmarkParserLastPause-8 137047399 103336476 -24.60% garbage.BenchmarkParserPause-8 80686028 58922680 -26.97% R=golang-dev, 0xe2.0x9a.0x9b, dave, adg, rsc, iant CC=golang-dev https://golang.org/cl/7816044 --- src/pkg/runtime/mgc0.c | 42 +++++++++++++++++++++--------------------- src/pkg/runtime/proc.c | 8 ++++---- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c index a79c22ef955..dd268fcda0f 100644 --- a/src/pkg/runtime/mgc0.c +++ b/src/pkg/runtime/mgc0.c @@ -140,6 +140,7 @@ static Workbuf* getempty(Workbuf*); static Workbuf* getfull(Workbuf*); static void putempty(Workbuf*); static Workbuf* handoff(Workbuf*); +static void gchelperstart(void); static struct { uint64 full; // lock-free list of full blocks @@ -287,11 +288,12 @@ struct BufferList { PtrTarget ptrtarget[IntermediateBufferCapacity]; Obj obj[IntermediateBufferCapacity]; - BufferList *next; + uint32 busy; + byte pad[CacheLineSize]; }; -static BufferList *bufferList; +#pragma dataflag 16 // no pointers +static BufferList bufferList[MaxGcproc]; -static Lock lock; static Type *itabtype; static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj); @@ -598,23 +600,11 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // Allocate ptrbuf { - runtime·lock(&lock); - - if(bufferList == nil) { - bufferList = runtime·SysAlloc(sizeof(*bufferList)); - if(bufferList == nil) - runtime·throw("runtime: cannot allocate memory"); - bufferList->next = nil; - } - scanbuffers = bufferList; - bufferList = bufferList->next; - + scanbuffers = &bufferList[m->helpgc]; ptrbuf = &scanbuffers->ptrtarget[0]; ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget); objbuf = &scanbuffers->obj[0]; objbuf_end = &scanbuffers->obj[0] + nelem(scanbuffers->obj); - - runtime·unlock(&lock); } ptrbufpos = ptrbuf; @@ -1056,11 +1046,7 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) nobj--; } -endscan: - runtime·lock(&lock); - scanbuffers->next = bufferList; - bufferList = scanbuffers; - runtime·unlock(&lock); +endscan:; } // debug_scanblock is the debug copy of scanblock. @@ -1688,6 +1674,8 @@ runtime·memorydump(void) void runtime·gchelper(void) { + gchelperstart(); + // parallel mark for over gc roots runtime·parfordo(work.markfor); @@ -1701,6 +1689,7 @@ runtime·gchelper(void) } runtime·parfordo(work.sweepfor); + bufferList[m->helpgc].busy = 0; if(runtime·xadd(&work.ndone, +1) == work.nproc-1) runtime·notewakeup(&work.alldone); } @@ -1892,6 +1881,7 @@ gc(struct gc_args *args) t1 = runtime·nanotime(); + gchelperstart(); runtime·parfordo(work.markfor); scanblock(nil, nil, 0, true); @@ -1903,6 +1893,7 @@ gc(struct gc_args *args) t2 = runtime·nanotime(); runtime·parfordo(work.sweepfor); + bufferList[m->helpgc].busy = 0; t3 = runtime·nanotime(); if(work.nproc > 1) @@ -2043,6 +2034,15 @@ runtime∕debug·setGCPercent(intgo in, intgo out) FLUSH(&out); } +static void +gchelperstart(void) +{ + if(m->helpgc < 0 || m->helpgc >= MaxGcproc) + runtime·throw("gchelperstart: bad m->helpgc"); + if(runtime·xchg(&bufferList[m->helpgc].busy, 1)) + runtime·throw("gchelperstart: already busy"); +} + static void runfinq(void) { diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c index a6ef83ba73d..8d05730e431 100644 --- a/src/pkg/runtime/proc.c +++ b/src/pkg/runtime/proc.c @@ -332,7 +332,7 @@ runtime·helpgc(int32 nproc) mp = mget(); if(mp == nil) runtime·throw("runtime·gcprocs inconsistency"); - mp->helpgc = 1; + mp->helpgc = n; mp->mcache = runtime·allp[pos]->mcache; pos++; runtime·notewakeup(&mp->park); @@ -386,7 +386,7 @@ runtime·stoptheworld(void) static void mhelpgc(void) { - m->helpgc = 1; + m->helpgc = -1; } void @@ -485,7 +485,7 @@ runtime·mstart(void) m->mstartfn(); if(m->helpgc) { - m->helpgc = false; + m->helpgc = 0; stopm(); } else if(m != &runtime·m0) { acquirep(m->nextp); @@ -794,8 +794,8 @@ retry: runtime·notesleep(&m->park); runtime·noteclear(&m->park); if(m->helpgc) { - m->helpgc = 0; runtime·gchelper(); + m->helpgc = 0; m->mcache = nil; goto retry; }