go/src/pkg/runtime/mgc0.c

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Garbage collector (GC).
//
// GC is:
// - mark&sweep
// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
// - parallel (up to MaxGcproc threads)
// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
// - non-moving/non-compacting
// - full (non-partial)
//
// GC rate.
// Next GC is after we've allocated an extra amount of memory proportional to
// the amount already in use. The proportion is controlled by GOGC environment variable
// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
// (and also the amount of extra memory used).
//
// Concurrent sweep.
// The sweep phase proceeds concurrently with normal program execution.
// The heap is swept span-by-span both lazily (when a goroutine needs another span)
// and concurrently in a background goroutine (this helps programs that are not CPU bound).
// However, at the end of the stop-the-world GC phase we don't know the size of the live heap,
// and so next_gc calculation is tricky and happens as follows.
// At the end of the stop-the-world phase next_gc is conservatively set based on total
// heap size; all spans are marked as "needs sweeping".
// Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory.
// The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc
// closer to the target value. However, this is not enough to avoid over-allocating memory.
// Consider that a goroutine wants to allocate a new span for a large object and
// there are no free swept spans, but there are small-object unswept spans.
// If the goroutine naively allocates a new span, it can surpass the yet-unknown
// target next_gc value. In order to prevent such cases (1) when a goroutine needs
// to allocate a new small-object span, it sweeps small-object spans for the same
// object size until it frees at least one object; (2) when a goroutine needs to
// allocate large-object span from heap, it sweeps spans until it frees at least
// that many pages into heap. Together these two measures ensure that we don't surpass
// target next_gc value by a large margin. There is an exception: if a goroutine sweeps
// and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span,
// but there can still be other one-page unswept spans which could be combined into a two-page span.
// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
// The finalizer goroutine is kicked off only when all spans are swept.
// When the next GC starts, it sweeps all not-yet-swept spans (if any).

#include "runtime.h"
#include "arch_GOARCH.h"
#include "malloc.h"
#include "stack.h"
#include "mgc0.h"
#include "chan.h"
#include "race.h"
#include "type.h"
#include "typekind.h"
#include "funcdata.h"
#include "../../cmd/ld/textflag.h"

enum {
	Debug		= 0,
	ConcurrentSweep	= 1,
	PreciseScan	= 1,

	WorkbufSize	= 4*1024,
	FinBlockSize	= 4*1024,
	RootData	= 0,
	RootBss		= 1,
	RootFinalizers	= 2,
	RootSpans	= 3,
	RootFlushCaches = 4,
	RootCount	= 5,
};

#define ScanConservatively ((byte*)1)

// Initialized from $GOGC.  GOGC=off means no gc.
extern int32 runtime·gcpercent;

static FuncVal* poolcleanup;

void
sync·runtime_registerPoolCleanup(FuncVal *f)
{
	poolcleanup = f;
}

void
runtime·clearpools(void)
{
	P *p, **pp;
	MCache *c;
	int32 i;

	// clear sync.Pool's
	if(poolcleanup != nil)
		reflect·call(poolcleanup, nil, 0, 0);

	for(pp=runtime·allp; p=*pp; pp++) {
		// clear tinyalloc pool
		c = p->mcache;
		if(c != nil) {
			c->tiny = nil;
			c->tinysize = 0;
			c->sudogcache = nil;
		}
		// clear defer pools
		for(i=0; i<nelem(p->deferpool); i++)
			p->deferpool[i] = nil;
	}
}

// Holding worldsema grants an M the right to try to stop the world.
// The procedure is:
//
//	runtime·semacquire(&runtime·worldsema);
//	m->gcing = 1;
//	runtime·stoptheworld();
//
//	... do stuff ...
//
//	m->gcing = 0;
//	runtime·semrelease(&runtime·worldsema);
//	runtime·starttheworld();
//
uint32 runtime·worldsema = 1;

typedef struct Workbuf Workbuf;
struct Workbuf
{
	LFNode	node; // must be first
	uintptr	nobj;
	byte*	obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize];
};

typedef struct Finalizer Finalizer;
struct Finalizer
{
	FuncVal *fn;
	void *arg;
	uintptr nret;
	Type *fint;
	PtrType *ot;
};

typedef struct FinBlock FinBlock;
struct FinBlock
{
	FinBlock *alllink;
	FinBlock *next;
	int32 cnt;
	int32 cap;
	Finalizer fin[1];
};

extern byte data[];
extern byte edata[];
extern byte bss[];
extern byte ebss[];

extern byte gcdata[];
extern byte gcbss[];

static Lock	finlock;	// protects the following variables
static FinBlock	*finq;		// list of finalizers that are to be executed
static FinBlock	*finc;		// cache of free blocks
static FinBlock	*allfin;	// list of all blocks
bool	runtime·fingwait;
bool	runtime·fingwake;
byte*	runtime·gcdatamask;
byte*	runtime·gcbssmask;

static Lock	gclock;

static void	runfinq(void);
static void	bgsweep(void);
static Workbuf* getempty(Workbuf*);
static Workbuf* getfull(Workbuf*);
static void	putempty(Workbuf*);
static Workbuf* handoff(Workbuf*);
static void	gchelperstart(void);
static void	flushallmcaches(void);
static bool	scanframe(Stkframe *frame, void *unused);
static void	scanstack(G *gp);
static byte*	unrollglobgcprog(byte *prog, uintptr size);

static FuncVal runfinqv = {runfinq};
static FuncVal bgsweepv = {bgsweep};

static struct {
	uint64	full;  // lock-free list of full blocks
	uint64	empty; // lock-free list of empty blocks
	byte	pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
	uint32	nproc;
	int64	tstart;
	volatile uint32	nwait;
	volatile uint32	ndone;
	Note	alldone;
	ParFor*	markfor;

	// Copy of mheap.allspans for marker or sweeper.
	MSpan**	spans;
	uint32	nspan;
} work;

// scanblock scans a block of n bytes starting at pointer b for references
// to other objects, scanning any it finds recursively until there are no
// unscanned objects left.  Instead of using an explicit recursion, it keeps
// a work list in the Workbuf* structures and loops in the main function
// body.  Keeping an explicit work list is easier on the stack allocator and
// more efficient.
static void
scanblock(byte *b, uintptr n, byte *ptrmask)
{
	byte *obj, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp, bits, xbits, shift, cached;
	uintptr i, nobj, size, idx, x, off, scanbufpos;
	intptr ncached;
	Workbuf *wbuf;
	String *str;
	Slice *slice;
	Iface *iface;
	Eface *eface;
	Type *typ;
	MSpan *s;
	PageID k;
	bool keepworking;

	// Cache memory arena parameters in local vars.
	arena_start = runtime·mheap.arena_start;
	arena_used = runtime·mheap.arena_used;

	wbuf = getempty(nil);
	nobj = wbuf->nobj;
	wp = &wbuf->obj[nobj];
	keepworking = b == nil;
	scanbufpos = 0;
	for(i = 0; i < nelem(scanbuf); i++)
		scanbuf[i] = nil;

	ptrbitp = nil;
	cached = 0;
	ncached = 0;

	// ptrmask can have 3 possible values:
	// 1. nil - obtain pointer mask from GC bitmap.
	// 2. ScanConservatively - don't use any mask, scan conservatively.
	// 3. pointer to a compact mask (for stacks and data).
	if(b != nil)
		goto scanobj;
	for(;;) {
		if(nobj == 0) {
			// Out of work in workbuf.
			// First, see is there is any work in scanbuf.
			for(i = 0; i < nelem(scanbuf); i++) {
				b = scanbuf[scanbufpos];
				scanbuf[scanbufpos++] = nil;
				if(scanbufpos == nelem(scanbuf))
					scanbufpos = 0;
				if(b != nil) {
					n = arena_used - b; // scan until bitBoundary or BitsDead
					ptrmask = nil; // use GC bitmap for pointer info
					goto scanobj;
				}
			}
			if(!keepworking) {
				putempty(wbuf);
				return;
			}
			// Refill workbuf from global queue.
			wbuf = getfull(wbuf);
			if(wbuf == nil)
				return;
			nobj = wbuf->nobj;
			wp = &wbuf->obj[nobj];
		}

		// If another proc wants a pointer, give it some.
		if(work.nwait > 0 && nobj > 4 && work.full == 0) {
			wbuf->nobj = nobj;
			wbuf = handoff(wbuf);
			nobj = wbuf->nobj;
			wp = &wbuf->obj[nobj];
		}

		wp--;
		nobj--;
		b = *wp;
		n = arena_used - b; // scan until next bitBoundary or BitsDead
		ptrmask = nil; // use GC bitmap for pointer info

	scanobj:
		if(!PreciseScan) {
			if(ptrmask == nil) {
				// Heap obj, obtain real size.
				if(!runtime·mlookup(b, &p, &n, nil))
					continue; // not an allocated obj
				if(b != p)
					runtime·throw("bad heap object");
			}
			ptrmask = ScanConservatively;
		}
		// Find bits of the beginning of the object.
		if(ptrmask == nil) {
			off = (uintptr*)b - (uintptr*)arena_start;
			ptrbitp = arena_start - off/wordsPerBitmapByte - 1;
			shift = (off % wordsPerBitmapByte) * gcBits;
			cached = *ptrbitp >> shift;
			cached &= ~bitBoundary;
			ncached = (8 - shift)/gcBits;
		}
		for(i = 0; i < n; i += PtrSize) {
			obj = nil;
			// Find bits for this word.
			if(ptrmask == nil) {
				// Check is we have reached end of span.
				if((((uintptr)b+i)%PageSize) == 0 &&
					runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift])
					break;
				// Consult GC bitmap.
				if(ncached <= 0) {
					// Refill cache.
					cached = *--ptrbitp;
					ncached = 2;
				}
				bits = cached;
				cached >>= gcBits;
				ncached--;
				if((bits&bitBoundary) != 0)
					break; // reached beginning of the next object
				bits = (bits>>2)&BitsMask;
				if(bits == BitsDead)
					break; // reached no-scan part of the object
			} else if(ptrmask != ScanConservatively) // dense mask (stack or data)
				bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask;
			else
				bits = BitsPointer;

			if(bits == BitsScalar || bits == BitsDead)
				continue;
			if(bits == BitsPointer) {
				obj = *(byte**)(b+i);
				goto markobj;
			}
			// Find the next pair of bits.
			if(ptrmask == nil) {
				if(ncached <= 0) {
					// Refill cache.
					cached = *--ptrbitp;
					ncached = 2;
				}
				bits = (cached>>2)&BitsMask;
			} else
				bits = (ptrmask[((i+PtrSize)/PtrSize)/4]>>((((i+PtrSize)/PtrSize)%4)*BitsPerPointer))&BitsMask;

			switch(bits) {
			case BitsString:
				str = (String*)(b+i);
				if(str->len > 0)
					obj = str->str;
				break;
			case BitsSlice:
				slice = (Slice*)(b+i);
				if(Debug && slice->cap < slice->len) {
					g->m->traceback = 2;
					runtime·printf("bad slice in object %p: %p/%p/%p\n",
						b, slice->array, slice->len, slice->cap);
					runtime·throw("bad slice in heap object");
				}
				if(slice->cap > 0)
					obj = slice->array;
				break;
			case BitsIface:
				iface = (Iface*)(b+i);
				if(iface->tab != nil) {
					typ = iface->tab->type;
					if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers))
						obj = iface->data;
				}
				break;
			case BitsEface:
				eface = (Eface*)(b+i);
				typ = eface->type;
				if(typ != nil) {
					if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers))
						obj = eface->data;
				}
				break;
			}

			if(bits == BitsSlice) {
				i += 2*PtrSize;
				if(ncached == 2)
					ncached = 0;
				else if(ptrmask == nil) {
					// Refill cache and consume one quadruple.
					cached = *--ptrbitp;
					cached >>= gcBits;
					ncached = 1;
				}
			} else {
				i += PtrSize;
				cached >>= gcBits;
				ncached--;
			}

		markobj:
			// At this point we have extracted the next potential pointer.
			// Check if it points into heap.
			if(obj == nil || obj < arena_start || obj >= arena_used)
				continue;
			// Mark the object.
			off = (uintptr*)obj - (uintptr*)arena_start;
			bitp = arena_start - off/wordsPerBitmapByte - 1;
			shift = (off % wordsPerBitmapByte) * gcBits;
			xbits = *bitp;
			bits = (xbits >> shift) & bitMask;
			if((bits&bitBoundary) == 0) {
				// Not a beginning of a block, consult span table to find the block beginning.
				k = (uintptr)obj>>PageShift;
				x = k;
				x -= (uintptr)arena_start>>PageShift;
				s = runtime·mheap.spans[x];
				if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse)
					continue;
				p = (byte*)((uintptr)s->start<<PageShift);
				if(s->sizeclass != 0) {
					size = s->elemsize;
					idx = ((byte*)obj - p)/size;
					p = p+idx*size;
				}
				if(p == obj) {
					runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n",
						p, s->start*PageSize, s->limit);
					runtime·throw("failed to find block beginning");
				}
				obj = p;
				goto markobj;
			}

			// Now we have bits, bitp, and shift correct for
			// obj pointing at the base of the object.
			// Only care about not marked objects.
			if((bits&bitMarked) != 0)
				continue;
			// If obj size is greater than 8, then each byte of GC bitmap
			// contains info for at most one object. In such case we use
			// non-atomic byte store to mark the object. This can lead
			// to double enqueue of the object for scanning, but scanning
			// is an idempotent operation, so it is OK. This cannot lead
			// to bitmap corruption because the single marked bit is the
			// only thing that can change in the byte.
			// For 8-byte objects we use non-atomic store, if the other
			// quadruple is already marked. Otherwise we resort to CAS
			// loop for marking.
			if((xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) ||
				work.nproc == 1)
				*bitp = xbits | (bitMarked<<shift);
			else
				runtime·atomicor8(bitp, bitMarked<<shift);

			if(((xbits>>(shift+2))&BitsMask) == BitsDead)
				continue;  // noscan object

			// Queue the obj for scanning.
			PREFETCH(obj);
			obj = (byte*)((uintptr)obj & ~(PtrSize-1));
			p = scanbuf[scanbufpos];
			scanbuf[scanbufpos++] = obj;
			if(scanbufpos == nelem(scanbuf))
				scanbufpos = 0;
			if(p == nil)
				continue;

			// If workbuf is full, obtain an empty one.
			if(nobj >= nelem(wbuf->obj)) {
				wbuf->nobj = nobj;
				wbuf = getempty(wbuf);
				nobj = wbuf->nobj;
				wp = &wbuf->obj[nobj];
			}
			*wp = p;
			wp++;
			nobj++;
		}

		if(Debug && ptrmask == nil) {
			// For heap objects ensure that we did not overscan.
			n = 0;
			p = nil;
			if(!runtime·mlookup(b, &p, &n, nil) || b != p || i > n) {
				runtime·printf("runtime: scanned (%p,%p), heap object (%p,%p)\n", b, i, p, n);
				runtime·throw("scanblock: scanned invalid object");
			}
		}
	}
}

static void
markroot(ParFor *desc, uint32 i)
{
	FinBlock *fb;
	MSpan *s;
	uint32 spanidx, sg;
	G *gp;
	void *p;

	USED(&desc);
	// Note: if you add a case here, please also update heapdump.c:dumproots.
	switch(i) {
	case RootData:
		scanblock(data, edata - data, runtime·gcdatamask);
		break;

	case RootBss:
		scanblock(bss, ebss - bss, runtime·gcbssmask);
		break;

	case RootFinalizers:
		for(fb=allfin; fb; fb=fb->alllink)
			scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), ScanConservatively);
		break;

	case RootSpans:
		// mark MSpan.specials
		sg = runtime·mheap.sweepgen;
		for(spanidx=0; spanidx<work.nspan; spanidx++) {
			Special *sp;
			SpecialFinalizer *spf;

			s = work.spans[spanidx];
			if(s->state != MSpanInUse)
				continue;
			if(s->sweepgen != sg) {
				runtime·printf("sweep %d %d\n", s->sweepgen, sg);
				runtime·throw("gc: unswept span");
			}
			for(sp = s->specials; sp != nil; sp = sp->next) {
				if(sp->kind != KindSpecialFinalizer)
					continue;
				// don't mark finalized object, but scan it so we
				// retain everything it points to.
				spf = (SpecialFinalizer*)sp;
				// A finalizer can be set for an inner byte of an object, find object beginning.
				p = (void*)((s->start << PageShift) + spf->special.offset/s->elemsize*s->elemsize);
				scanblock(p, s->elemsize, nil);
				scanblock((void*)&spf->fn, PtrSize, ScanConservatively);
			}
		}
		break;

	case RootFlushCaches:
		flushallmcaches();
		break;

	default:
		// the rest is scanning goroutine stacks
		if(i - RootCount >= runtime·allglen)
			runtime·throw("markroot: bad index");
		gp = runtime·allg[i - RootCount];
		// remember when we've first observed the G blocked
		// needed only to output in traceback
		if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince == 0)
			gp->waitsince = work.tstart;
		// Shrink a stack if not much of it is being used.
		runtime·shrinkstack(gp);
		scanstack(gp);
		break;
		
	}
}

// Get an empty work buffer off the work.empty list,
// allocating new buffers as needed.
static Workbuf*
getempty(Workbuf *b)
{
	MCache *c;

	if(b != nil)
		runtime·lfstackpush(&work.full, &b->node);
	b = nil;
	c = g->m->mcache;
	if(c->gcworkbuf != nil) {
		b = c->gcworkbuf;
		c->gcworkbuf = nil;
	}
	if(b == nil)
		b = (Workbuf*)runtime·lfstackpop(&work.empty);
	if(b == nil)
		b = runtime·persistentalloc(sizeof(*b), CacheLineSize, &mstats.gc_sys);
	b->nobj = 0;
	return b;
}

static void
putempty(Workbuf *b)
{
	MCache *c;

	c = g->m->mcache;
	if(c->gcworkbuf == nil) {
		c->gcworkbuf = b;
		return;
	}
	runtime·lfstackpush(&work.empty, &b->node);
}

void
runtime·gcworkbuffree(void *b)
{
	if(b != nil)
		putempty(b);
}

// Get a full work buffer off the work.full list, or return nil.
static Workbuf*
getfull(Workbuf *b)
{
	int32 i;

	if(b != nil)
		runtime·lfstackpush(&work.empty, &b->node);
	b = (Workbuf*)runtime·lfstackpop(&work.full);
	if(b != nil || work.nproc == 1)
		return b;

	runtime·xadd(&work.nwait, +1);
	for(i=0;; i++) {
		if(work.full != 0) {
			runtime·xadd(&work.nwait, -1);
			b = (Workbuf*)runtime·lfstackpop(&work.full);
			if(b != nil)
				return b;
			runtime·xadd(&work.nwait, +1);
		}
		if(work.nwait == work.nproc)
			return nil;
		if(i < 10) {
			g->m->gcstats.nprocyield++;
			runtime·procyield(20);
		} else if(i < 20) {
			g->m->gcstats.nosyield++;
			runtime·osyield();
		} else {
			g->m->gcstats.nsleep++;
			runtime·usleep(100);
		}
	}
}

static Workbuf*
handoff(Workbuf *b)
{
	int32 n;
	Workbuf *b1;

	// Make new buffer with half of b's pointers.
	b1 = getempty(nil);
	n = b->nobj/2;
	b->nobj -= n;
	b1->nobj = n;
	runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
	g->m->gcstats.nhandoff++;
	g->m->gcstats.nhandoffcnt += n;

	// Put b on full list - let first half of b get stolen.
	runtime·lfstackpush(&work.full, &b->node);
	return b1;
}

BitVector
runtime·stackmapdata(StackMap *stackmap, int32 n)
{
	if(n < 0 || n >= stackmap->n)
		runtime·throw("stackmapdata: index out of range");
	return (BitVector){stackmap->nbit, stackmap->data + n*((stackmap->nbit+31)/32)};
}

// Scan a stack frame: local variables and function arguments/results.
static bool
scanframe(Stkframe *frame, void *unused)
{
	Func *f;
	StackMap *stackmap;
	BitVector bv;
	uintptr size;
	uintptr targetpc;
	int32 pcdata;

	USED(unused);
	f = frame->fn;
	targetpc = frame->continpc;
	if(targetpc == 0) {
		// Frame is dead.
		return true;
	}
	if(Debug > 1)
		runtime·printf("scanframe %s\n", runtime·funcname(f));
	if(targetpc != f->entry)
		targetpc--;
	pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc);
	if(pcdata == -1) {
		// We do not have a valid pcdata value but there might be a
		// stackmap for this function.  It is likely that we are looking
		// at the function prologue, assume so and hope for the best.
		pcdata = 0;
	}

	// Scan local variables if stack frame has been allocated.
	// Use pointer information if known.
	stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps);
	if(stackmap == nil) {
		// No locals information, scan everything.
		size = frame->varp - (byte*)frame->sp;
		if(Debug > 2)
			runtime·printf("frame %s unsized locals %p+%p\n", runtime·funcname(f), frame->varp-size, size);
		scanblock(frame->varp - size, size, ScanConservatively);
	} else if(stackmap->n < 0) {
		// Locals size information, scan just the locals.
		size = -stackmap->n;
		if(Debug > 2)
			runtime·printf("frame %s conservative locals %p+%p\n", runtime·funcname(f), frame->varp-size, size);
		scanblock(frame->varp - size, size, ScanConservatively);
	} else if(stackmap->n > 0) {
		// Locals bitmap information, scan just the pointers in locals.
		if(pcdata < 0 || pcdata >= stackmap->n) {
			// don't know where we are
			runtime·printf("pcdata is %d and %d stack map entries for %s (targetpc=%p)\n",
				pcdata, stackmap->n, runtime·funcname(f), targetpc);
			runtime·throw("scanframe: bad symbol table");
		}
		bv = runtime·stackmapdata(stackmap, pcdata);
		size = (bv.n * PtrSize) / BitsPerPointer;
		scanblock(frame->varp - size, bv.n/BitsPerPointer*PtrSize, (byte*)bv.data);
	}

	// Scan arguments.
	// Use pointer information if known.
	stackmap = runtime·funcdata(f, FUNCDATA_ArgsPointerMaps);
	if(stackmap != nil) {
		bv = runtime·stackmapdata(stackmap, pcdata);
		scanblock(frame->argp, bv.n/BitsPerPointer*PtrSize, (byte*)bv.data);
	} else {
		if(Debug > 2)
			runtime·printf("frame %s conservative args %p+%p\n", runtime·funcname(f), frame->argp, (uintptr)frame->arglen);
		scanblock(frame->argp, frame->arglen, ScanConservatively);
	}
	return true;
}

static void
scanstack(G *gp)
{
	M *mp;
	int32 n;
	Stktop *stk;
	uintptr sp, guard;

	switch(gp->status){
	default:
		runtime·printf("unexpected G.status %d (goroutine %p %D)\n", gp->status, gp, gp->goid);
		runtime·throw("mark - bad status");
	case Gdead:
		return;
	case Grunning:
		runtime·throw("mark - world not stopped");
	case Grunnable:
	case Gsyscall:
	case Gwaiting:
		break;
	}

	if(gp == g)
		runtime·throw("can't scan our own stack");
	if((mp = gp->m) != nil && mp->helpgc)
		runtime·throw("can't scan gchelper stack");

	if(gp->syscallstack != (uintptr)nil) {
		// Scanning another goroutine that is about to enter or might
		// have just exited a system call. It may be executing code such
		// as schedlock and may have needed to start a new stack segment.
		// Use the stack segment and stack pointer at the time of
		// the system call instead, since that won't change underfoot.
		sp = gp->syscallsp;
		stk = (Stktop*)gp->syscallstack;
		guard = gp->syscallguard;
	} else {
		// Scanning another goroutine's stack.
		// The goroutine is usually asleep (the world is stopped).
		sp = gp->sched.sp;
		stk = (Stktop*)gp->stackbase;
		guard = gp->stackguard;
	}
	if(ScanStackByFrames) {
		USED(sp);
		USED(stk);
		USED(guard);
		runtime·gentraceback(~(uintptr)0, ~(uintptr)0, 0, gp, 0, nil, 0x7fffffff, scanframe, nil, false);
	} else {
		n = 0;
		while(stk) {
			if(sp < guard-StackGuard || (uintptr)stk < sp) {
				runtime·printf("scanstack inconsistent: g%D#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
				runtime·throw("scanstack");
			}
			if(Debug > 2)
				runtime·printf("conservative stack %p+%p\n", (byte*)sp, (uintptr)stk-sp);
			scanblock((byte*)sp, (uintptr)stk - sp, ScanConservatively);
			sp = stk->gobuf.sp;
			guard = stk->stackguard;
			stk = (Stktop*)stk->stackbase;
			n++;
		}
	}
}

void
runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType *ot)
{
	FinBlock *block;
	Finalizer *f;

	runtime·lock(&finlock);
	if(finq == nil || finq->cnt == finq->cap) {
		if(finc == nil) {
			finc = runtime·persistentalloc(FinBlockSize, 0, &mstats.gc_sys);
			finc->cap = (FinBlockSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
			finc->alllink = allfin;
			allfin = finc;
		}
		block = finc;
		finc = block->next;
		block->next = finq;
		finq = block;
	}
	f = &finq->fin[finq->cnt];
	finq->cnt++;
	f->fn = fn;
	f->nret = nret;
	f->fint = fint;
	f->ot = ot;
	f->arg = p;
	runtime·fingwake = true;
	runtime·unlock(&finlock);
}

void
runtime·iterate_finq(void (*callback)(FuncVal*, byte*, uintptr, Type*, PtrType*))
{
	FinBlock *fb;
	Finalizer *f;
	uintptr i;

	for(fb = allfin; fb; fb = fb->alllink) {
		for(i = 0; i < fb->cnt; i++) {
			f = &fb->fin[i];
			callback(f->fn, f->arg, f->nret, f->fint, f->ot);
		}
	}
}

void
runtime·MSpan_EnsureSwept(MSpan *s)
{
	uint32 sg;

	// Caller must disable preemption.
	// Otherwise when this function returns the span can become unswept again
	// (if GC is triggered on another goroutine).
	if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0)
		runtime·throw("MSpan_EnsureSwept: m is not locked");

	sg = runtime·mheap.sweepgen;
	if(runtime·atomicload(&s->sweepgen) == sg)
		return;
	if(runtime·cas(&s->sweepgen, sg-2, sg-1)) {
		runtime·MSpan_Sweep(s, false);
		return;
	}
	// unfortunate condition, and we don't have efficient means to wait
	while(runtime·atomicload(&s->sweepgen) != sg)
		runtime·osyield();  
}

// Sweep frees or collects finalizers for blocks not marked in the mark phase.
// It clears the mark bits in preparation for the next GC round.
// Returns true if the span was returned to heap.
// If preserve=true, don't return it to heap nor relink in MCentral lists;
// caller takes care of it.
bool
runtime·MSpan_Sweep(MSpan *s, bool preserve)
{
	int32 cl, n, npages, nfree;
	uintptr size, off, step;
	uint32 sweepgen;
	byte *p, *bitp, shift, xbits, bits;
	MCache *c;
	byte *arena_start;
	MLink head, *end, *link;
	Special *special, **specialp, *y;
	bool res, sweepgenset;

	// It's critical that we enter this function with preemption disabled,
	// GC must not start while we are in the middle of this function.
	if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0)
		runtime·throw("MSpan_Sweep: m is not locked");
	sweepgen = runtime·mheap.sweepgen;
	if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
		runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
			s->state, s->sweepgen, sweepgen);
		runtime·throw("MSpan_Sweep: bad span state");
	}
	arena_start = runtime·mheap.arena_start;
	cl = s->sizeclass;
	size = s->elemsize;
	if(cl == 0) {
		n = 1;
	} else {
		// Chunk full of small blocks.
		npages = runtime·class_to_allocnpages[cl];
		n = (npages << PageShift) / size;
	}
	res = false;
	nfree = 0;
	end = &head;
	c = g->m->mcache;
	sweepgenset = false;

	// Mark any free objects in this span so we don't collect them.
	for(link = s->freelist; link != nil; link = link->next) {
		off = (uintptr*)link - (uintptr*)arena_start;
		bitp = arena_start - off/wordsPerBitmapByte - 1;
		shift = (off % wordsPerBitmapByte) * gcBits;
		*bitp |= bitMarked<<shift;
	}

	// Unlink & free special records for any objects we're about to free.
	specialp = &s->specials;
	special = *specialp;
	while(special != nil) {
		// A finalizer can be set for an inner byte of an object, find object beginning.
		p = (byte*)(s->start << PageShift) + special->offset/size*size;
		off = (uintptr*)p - (uintptr*)arena_start;
		bitp = arena_start - off/wordsPerBitmapByte - 1;
		shift = (off % wordsPerBitmapByte) * gcBits;
		bits = (*bitp>>shift) & bitMask;
		if((bits&bitMarked) == 0) {
			// Find the exact byte for which the special was setup
			// (as opposed to object beginning).
			p = (byte*)(s->start << PageShift) + special->offset;
			// about to free object: splice out special record
			y = special;
			special = special->next;
			*specialp = special;
			if(!runtime·freespecial(y, p, size, false)) {
				// stop freeing of object if it has a finalizer
				*bitp |= bitMarked << shift;
			}
		} else {
			// object is still live: keep special record
			specialp = &special->next;
			special = *specialp;
		}
	}

	// Sweep through n objects of given size starting at p.
	// This thread owns the span now, so it can manipulate
	// the block bitmap without atomic operations.
	p = (byte*)(s->start << PageShift);
	// Find bits for the beginning of the span.
	off = (uintptr*)p - (uintptr*)arena_start;
	bitp = arena_start - off/wordsPerBitmapByte - 1;
	shift = 0;
	step = size/(PtrSize*wordsPerBitmapByte);
	// Rewind to the previous quadruple as we move to the next
	// in the beginning of the loop.
	bitp += step;
	if(step == 0) {
		// 8-byte objects.
		bitp++;
		shift = gcBits;
	}
	for(; n > 0; n--, p += size) {
		bitp -= step;
		if(step == 0) {
			if(shift != 0)
				bitp--;
			shift = gcBits - shift;
		}

		xbits = *bitp;
		bits = (xbits>>shift) & bitMask;

		// Allocated and marked object, reset bits to allocated.
		if((bits&bitMarked) != 0) {
			*bitp &= ~(bitMarked<<shift);
			continue;
		}
		// At this point we know that we are looking at garbage object
		// that needs to be collected.
		if(runtime·debug.allocfreetrace)
			runtime·tracefree(p, size);
		// Reset to allocated+noscan.
		*bitp = (xbits & ~((bitMarked|(BitsMask<<2))<<shift)) | ((uintptr)BitsDead<<(shift+2));
		if(cl == 0) {
			// Free large span.
			if(preserve)
				runtime·throw("can't preserve large span");
			runtime·unmarkspan(p, s->npages<<PageShift);
			s->needzero = 1;
			// important to set sweepgen before returning it to heap
			runtime·atomicstore(&s->sweepgen, sweepgen);
			sweepgenset = true;
			// NOTE(rsc,dvyukov): The original implementation of efence
			// in CL 22060046 used SysFree instead of SysFault, so that
			// the operating system would eventually give the memory
			// back to us again, so that an efence program could run
			// longer without running out of memory. Unfortunately,
			// calling SysFree here without any kind of adjustment of the
			// heap data structures means that when the memory does
			// come back to us, we have the wrong metadata for it, either in
			// the MSpan structures or in the garbage collection bitmap.
			// Using SysFault here means that the program will run out of
			// memory fairly quickly in efence mode, but at least it won't
			// have mysterious crashes due to confused memory reuse.
			// It should be possible to switch back to SysFree if we also
			// implement and then call some kind of MHeap_DeleteSpan.
			if(runtime·debug.efence) {
				s->limit = nil;	// prevent mlookup from finding this span
				runtime·SysFault(p, size);
			} else
				runtime·MHeap_Free(&runtime·mheap, s, 1);
			c->local_nlargefree++;
			c->local_largefree += size;
			runtime·xadd64(&mstats.next_gc, -(uint64)(size * (runtime·gcpercent + 100)/100));
			res = true;
		} else {
			// Free small object.
			if(size > 2*sizeof(uintptr))
				((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll;	// mark as "needs to be zeroed"
			else if(size > sizeof(uintptr))
				((uintptr*)p)[1] = 0;

			end->next = (MLink*)p;
			end = (MLink*)p;
			nfree++;
		}
	}

	// We need to set s->sweepgen = h->sweepgen only when all blocks are swept,
	// because of the potential for a concurrent free/SetFinalizer.
	// But we need to set it before we make the span available for allocation
	// (return it to heap or mcentral), because allocation code assumes that a
	// span is already swept if available for allocation.

	if(!sweepgenset && nfree == 0) {
		// The span must be in our exclusive ownership until we update sweepgen,
		// check for potential races.
		if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
			runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
				s->state, s->sweepgen, sweepgen);
			runtime·throw("MSpan_Sweep: bad span state after sweep");
		}
		runtime·atomicstore(&s->sweepgen, sweepgen);
	}
	if(nfree > 0) {
		c->local_nsmallfree[cl] += nfree;
		c->local_cachealloc -= nfree * size;
		runtime·xadd64(&mstats.next_gc, -(uint64)(nfree * size * (runtime·gcpercent + 100)/100));
		res = runtime·MCentral_FreeSpan(&runtime·mheap.central[cl].mcentral, s, nfree, head.next, end, preserve);
		// MCentral_FreeSpan updates sweepgen
	}
	return res;
}

// State of background sweep.
// Pretected by gclock.
static struct
{
	G*	g;
	bool	parked;

	uint32	spanidx;	// background sweeper position

	uint32	nbgsweep;
	uint32	npausesweep;
} sweep;

// background sweeping goroutine
static void
bgsweep(void)
{
	g->issystem = 1;
	for(;;) {
		while(runtime·sweepone() != -1) {
			sweep.nbgsweep++;
			runtime·gosched();
		}
		runtime·lock(&gclock);
		if(!runtime·mheap.sweepdone) {
			// It's possible if GC has happened between sweepone has
			// returned -1 and gclock lock.
			runtime·unlock(&gclock);
			continue;
		}
		sweep.parked = true;
		g->isbackground = true;
		runtime·parkunlock(&gclock, runtime·gostringnocopy((byte*)"GC sweep wait"));
		g->isbackground = false;
	}
}

// sweeps one span
// returns number of pages returned to heap, or -1 if there is nothing to sweep
uintptr
runtime·sweepone(void)
{
	MSpan *s;
	uint32 idx, sg;
	uintptr npages;

	// increment locks to ensure that the goroutine is not preempted
	// in the middle of sweep thus leaving the span in an inconsistent state for next GC
	g->m->locks++;
	sg = runtime·mheap.sweepgen;
	for(;;) {
		idx = runtime·xadd(&sweep.spanidx, 1) - 1;
		if(idx >= work.nspan) {
			runtime·mheap.sweepdone = true;
			g->m->locks--;
			return -1;
		}
		s = work.spans[idx];
		if(s->state != MSpanInUse) {
			s->sweepgen = sg;
			continue;
		}
		if(s->sweepgen != sg-2 || !runtime·cas(&s->sweepgen, sg-2, sg-1))
			continue;
		npages = s->npages;
		if(!runtime·MSpan_Sweep(s, false))
			npages = 0;
		g->m->locks--;
		return npages;
	}
}

void
runtime·gchelper(void)
{
	uint32 nproc;

	g->m->traceback = 2;
	gchelperstart();

	// parallel mark for over gc roots
	runtime·parfordo(work.markfor);

	// help other threads scan secondary blocks
	scanblock(nil, 0, nil);

	nproc = work.nproc;  // work.nproc can change right after we increment work.ndone
	if(runtime·xadd(&work.ndone, +1) == nproc-1)
		runtime·notewakeup(&work.alldone);
	g->m->traceback = 0;
}

static void
cachestats(void)
{
	MCache *c;
	P *p, **pp;

	for(pp=runtime·allp; p=*pp; pp++) {
		c = p->mcache;
		if(c==nil)
			continue;
		runtime·purgecachedstats(c);
	}
}

static void
flushallmcaches(void)
{
	P *p, **pp;
	MCache *c;

	// Flush MCache's to MCentral.
	for(pp=runtime·allp; p=*pp; pp++) {
		c = p->mcache;
		if(c==nil)
			continue;
		runtime·MCache_ReleaseAll(c);
		runtime·stackcache_clear(c);
	}
}

static void
flushallmcaches_m(G *gp)
{
	flushallmcaches();
	runtime·gogo(&gp->sched);
}

void
runtime·updatememstats(GCStats *stats)
{
	M *mp;
	MSpan *s;
	int32 i;
	uint64 smallfree;
	uint64 *src, *dst;

	if(stats)
		runtime·memclr((byte*)stats, sizeof(*stats));
	for(mp=runtime·allm; mp; mp=mp->alllink) {
		if(stats) {
			src = (uint64*)&mp->gcstats;
			dst = (uint64*)stats;
			for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
				dst[i] += src[i];
			runtime·memclr((byte*)&mp->gcstats, sizeof(mp->gcstats));
		}
	}
	mstats.mcache_inuse = runtime·mheap.cachealloc.inuse;
	mstats.mspan_inuse = runtime·mheap.spanalloc.inuse;
	mstats.sys = mstats.heap_sys + mstats.stacks_sys + mstats.mspan_sys +
		mstats.mcache_sys + mstats.buckhash_sys + mstats.gc_sys + mstats.other_sys;
	
	// Calculate memory allocator stats.
	// During program execution we only count number of frees and amount of freed memory.
	// Current number of alive object in the heap and amount of alive heap memory
	// are calculated by scanning all spans.
	// Total number of mallocs is calculated as number of frees plus number of alive objects.
	// Similarly, total amount of allocated memory is calculated as amount of freed memory
	// plus amount of alive heap memory.
	mstats.alloc = 0;
	mstats.total_alloc = 0;
	mstats.nmalloc = 0;
	mstats.nfree = 0;
	for(i = 0; i < nelem(mstats.by_size); i++) {
		mstats.by_size[i].nmalloc = 0;
		mstats.by_size[i].nfree = 0;
	}

	// Flush MCache's to MCentral.
	if(g == g->m->g0)
		flushallmcaches();
	else
		runtime·mcall(flushallmcaches_m);

	// Aggregate local stats.
	cachestats();

	// Scan all spans and count number of alive objects.
	runtime·lock(&runtime·mheap.lock);
	for(i = 0; i < runtime·mheap.nspan; i++) {
		s = runtime·mheap.allspans[i];
		if(s->state != MSpanInUse)
			continue;
		if(s->sizeclass == 0) {
			mstats.nmalloc++;
			mstats.alloc += s->elemsize;
		} else {
			mstats.nmalloc += s->ref;
			mstats.by_size[s->sizeclass].nmalloc += s->ref;
			mstats.alloc += s->ref*s->elemsize;
		}
	}
	runtime·unlock(&runtime·mheap.lock);

	// Aggregate by size class.
	smallfree = 0;
	mstats.nfree = runtime·mheap.nlargefree;
	for(i = 0; i < nelem(mstats.by_size); i++) {
		mstats.nfree += runtime·mheap.nsmallfree[i];
		mstats.by_size[i].nfree = runtime·mheap.nsmallfree[i];
		mstats.by_size[i].nmalloc += runtime·mheap.nsmallfree[i];
		smallfree += runtime·mheap.nsmallfree[i] * runtime·class_to_size[i];
	}
	mstats.nmalloc += mstats.nfree;

	// Calculate derived stats.
	mstats.total_alloc = mstats.alloc + runtime·mheap.largefree + smallfree;
	mstats.heap_alloc = mstats.alloc;
	mstats.heap_objects = mstats.nmalloc - mstats.nfree;
}

// Structure of arguments passed to function gc().
// This allows the arguments to be passed via runtime·mcall.
struct gc_args
{
	int64 start_time; // start time of GC in ns (just before stoptheworld)
	bool  eagersweep;
};

static void gc(struct gc_args *args);
static void mgc(G *gp);

int32
runtime·readgogc(void)
{
	byte *p;

	p = runtime·getenv("GOGC");
	if(p == nil || p[0] == '\0')
		return 100;
	if(runtime·strcmp(p, (byte*)"off") == 0)
		return -1;
	return runtime·atoi(p);
}

void
runtime·gcinit(void)
{
	if(sizeof(Workbuf) != WorkbufSize)
		runtime·throw("runtime: size of Workbuf is suboptimal");

	work.markfor = runtime·parforalloc(MaxGcproc);
	runtime·gcpercent = runtime·readgogc();
	runtime·gcdatamask = unrollglobgcprog(gcdata, edata - data);
	runtime·gcbssmask = unrollglobgcprog(gcbss, ebss - bss);
}

// force = 1 - do GC regardless of current heap usage
// force = 2 - go GC and eager sweep
void
runtime·gc(int32 force)
{
	struct gc_args a;
	int32 i;

	// The gc is turned off (via enablegc) until
	// the bootstrap has completed.
	// Also, malloc gets called in the guts
	// of a number of libraries that might be
	// holding locks.  To avoid priority inversion
	// problems, don't bother trying to run gc
	// while holding a lock.  The next mallocgc
	// without a lock will do the gc instead.
	if(!mstats.enablegc || g == g->m->g0 || g->m->locks > 0 || runtime·panicking)
		return;

	if(runtime·gcpercent < 0)
		return;

	runtime·semacquire(&runtime·worldsema, false);
	if(force==0 && mstats.heap_alloc < mstats.next_gc) {
		// typically threads which lost the race to grab
		// worldsema exit here when gc is done.
		runtime·semrelease(&runtime·worldsema);
		return;
	}

	// Ok, we're doing it!  Stop everybody else
	a.start_time = runtime·nanotime();
	a.eagersweep = force >= 2;
	g->m->gcing = 1;
	runtime·stoptheworld();
	
	runtime·clearpools();

	// Run gc on the g0 stack.  We do this so that the g stack
	// we're currently running on will no longer change.  Cuts
	// the root set down a bit (g0 stacks are not scanned, and
	// we don't need to scan gc's internal state).  Also an
	// enabler for copyable stacks.
	for(i = 0; i < (runtime·debug.gctrace > 1 ? 2 : 1); i++) {
		if(i > 0)
			a.start_time = runtime·nanotime();
		// switch to g0, call gc(&a), then switch back
		g->param = &a;
		g->status = Gwaiting;
		g->waitreason = runtime·gostringnocopy((byte*)"garbage collection");
		runtime·mcall(mgc);
	}

	// all done
	g->m->gcing = 0;
	g->m->locks++;
	runtime·semrelease(&runtime·worldsema);
	runtime·starttheworld();
	g->m->locks--;

	// now that gc is done, kick off finalizer thread if needed
	if(!ConcurrentSweep) {
		// give the queued finalizers, if any, a chance to run
		runtime·gosched();
	}
}

static void
mgc(G *gp)
{
	gc(gp->param);
	gp->param = nil;
	gp->status = Grunning;
	runtime·gogo(&gp->sched);
}

void
runtime·gc_m(void)
{
	struct gc_args a;
	G *gp;

	gp = g->m->curg;
	gp->status = Gwaiting;
	gp->waitreason = runtime·gostringnocopy((byte*)"garbage collection");

	a.start_time = (uint64)(g->m->scalararg[0]) | ((uint64)(g->m->scalararg[1]) << 32);
	a.eagersweep = g->m->scalararg[2];
	gc(&a);

	gp->status = Grunning;
}

static void
gc(struct gc_args *args)
{
	int64 t0, t1, t2, t3, t4;
	uint64 heap0, heap1, obj;
	GCStats stats;

	if(runtime·debug.allocfreetrace)
		runtime·tracegc();

	g->m->traceback = 2;
	t0 = args->start_time;
	work.tstart = args->start_time; 

	t1 = 0;
	if(runtime·debug.gctrace)
		t1 = runtime·nanotime();

	// Sweep what is not sweeped by bgsweep.
	while(runtime·sweepone() != -1)
		sweep.npausesweep++;

	// Cache runtime.mheap.allspans in work.spans to avoid conflicts with
	// resizing/freeing allspans.
	// New spans can be created while GC progresses, but they are not garbage for
	// this round:
	//  - new stack spans can be created even while the world is stopped.
	//  - new malloc spans can be created during the concurrent sweep

	// Even if this is stop-the-world, a concurrent exitsyscall can allocate a stack from heap.
	runtime·lock(&runtime·mheap.lock);
	// Free the old cached sweep array if necessary.
	if(work.spans != nil && work.spans != runtime·mheap.allspans)
		runtime·SysFree(work.spans, work.nspan*sizeof(work.spans[0]), &mstats.other_sys);
	// Cache the current array for marking.
	runtime·mheap.gcspans = runtime·mheap.allspans;
	work.spans = runtime·mheap.allspans;
	work.nspan = runtime·mheap.nspan;
	runtime·unlock(&runtime·mheap.lock);

	work.nwait = 0;
	work.ndone = 0;
	work.nproc = runtime·gcprocs();
	runtime·parforsetup(work.markfor, work.nproc, RootCount + runtime·allglen, nil, false, markroot);
	if(work.nproc > 1) {
		runtime·noteclear(&work.alldone);
		runtime·helpgc(work.nproc);
	}

	t2 = 0;
	if(runtime·debug.gctrace)
		t2 = runtime·nanotime();

	gchelperstart();
	runtime·parfordo(work.markfor);
	scanblock(nil, 0, nil);

	t3 = 0;
	if(runtime·debug.gctrace)
		t3 = runtime·nanotime();

	if(work.nproc > 1)
		runtime·notesleep(&work.alldone);

	cachestats();
	// next_gc calculation is tricky with concurrent sweep since we don't know size of live heap
	// estimate what was live heap size after previous GC (for tracing only)
	heap0 = mstats.next_gc*100/(runtime·gcpercent+100);
	// conservatively set next_gc to high value assuming that everything is live
	// concurrent/lazy sweep will reduce this number while discovering new garbage
	mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*runtime·gcpercent/100;

	t4 = runtime·nanotime();
	mstats.last_gc = runtime·unixnanotime();  // must be Unix time to make sense to user
	mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t4 - t0;
	mstats.pause_total_ns += t4 - t0;
	mstats.numgc++;
	if(mstats.debuggc)
		runtime·printf("pause %D\n", t4-t0);

	if(runtime·debug.gctrace) {
		heap1 = mstats.heap_alloc;
		runtime·updatememstats(&stats);
		if(heap1 != mstats.heap_alloc) {
			runtime·printf("runtime: mstats skew: heap=%D/%D\n", heap1, mstats.heap_alloc);
			runtime·throw("mstats skew");
		}
		obj = mstats.nmalloc - mstats.nfree;

		stats.nprocyield += work.markfor->nprocyield;
		stats.nosyield += work.markfor->nosyield;
		stats.nsleep += work.markfor->nsleep;

		runtime·printf("gc%d(%d): %D+%D+%D+%D us, %D -> %D MB, %D (%D-%D) objects,"
				" %d/%d/%d sweeps,"
				" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
			mstats.numgc, work.nproc, (t1-t0)/1000, (t2-t1)/1000, (t3-t2)/1000, (t4-t3)/1000,
			heap0>>20, heap1>>20, obj,
			mstats.nmalloc, mstats.nfree,
			work.nspan, sweep.nbgsweep, sweep.npausesweep,
			stats.nhandoff, stats.nhandoffcnt,
			work.markfor->nsteal, work.markfor->nstealcnt,
			stats.nprocyield, stats.nosyield, stats.nsleep);
		sweep.nbgsweep = sweep.npausesweep = 0;
	}

	// See the comment in the beginning of this function as to why we need the following.
	// Even if this is still stop-the-world, a concurrent exitsyscall can allocate a stack from heap.
	runtime·lock(&runtime·mheap.lock);
	// Free the old cached mark array if necessary.
	if(work.spans != nil && work.spans != runtime·mheap.allspans)
		runtime·SysFree(work.spans, work.nspan*sizeof(work.spans[0]), &mstats.other_sys);
	// Cache the current array for sweeping.
	runtime·mheap.gcspans = runtime·mheap.allspans;
	runtime·mheap.sweepgen += 2;
	runtime·mheap.sweepdone = false;
	work.spans = runtime·mheap.allspans;
	work.nspan = runtime·mheap.nspan;
	sweep.spanidx = 0;
	runtime·unlock(&runtime·mheap.lock);

	// Temporary disable concurrent sweep, because we see failures on builders.
	if(ConcurrentSweep && !args->eagersweep) {
		runtime·lock(&gclock);
		if(sweep.g == nil)
			sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, runtime·gc);
		else if(sweep.parked) {
			sweep.parked = false;
			runtime·ready(sweep.g);
		}
		runtime·unlock(&gclock);
	} else {
		// Sweep all spans eagerly.
		while(runtime·sweepone() != -1)
			sweep.npausesweep++;
	}

	runtime·MProf_GC();
	g->m->traceback = 0;
}

extern uintptr runtime·sizeof_C_MStats;

void
runtime·ReadMemStats(MStats *stats)
{
	// Have to acquire worldsema to stop the world,
	// because stoptheworld can only be used by
	// one goroutine at a time, and there might be
	// a pending garbage collection already calling it.
	runtime·semacquire(&runtime·worldsema, false);
	g->m->gcing = 1;
	runtime·stoptheworld();
	runtime·updatememstats(nil);
	// Size of the trailing by_size array differs between Go and C,
	// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility.
	runtime·memmove(stats, &mstats, runtime·sizeof_C_MStats);

	// Stack numbers are part of the heap numbers, separate those out for user consumption
	stats->stacks_sys = stats->stacks_inuse;
	stats->heap_inuse -= stats->stacks_inuse;
	stats->heap_sys -= stats->stacks_inuse;
	
	g->m->gcing = 0;
	g->m->locks++;
	runtime·semrelease(&runtime·worldsema);
	runtime·starttheworld();
	g->m->locks--;
}

void
runtime∕debug·readGCStats(Slice *pauses)
{
	uint64 *p;
	uint32 i, n;

	// Calling code in runtime/debug should make the slice large enough.
	if(pauses->cap < nelem(mstats.pause_ns)+3)
		runtime·throw("runtime: short slice passed to readGCStats");

	// Pass back: pauses, last gc (absolute time), number of gc, total pause ns.
	p = (uint64*)pauses->array;
	runtime·lock(&runtime·mheap.lock);
	n = mstats.numgc;
	if(n > nelem(mstats.pause_ns))
		n = nelem(mstats.pause_ns);
	
	// The pause buffer is circular. The most recent pause is at
	// pause_ns[(numgc-1)%nelem(pause_ns)], and then backward
	// from there to go back farther in time. We deliver the times
	// most recent first (in p[0]).
	for(i=0; i<n; i++)
		p[i] = mstats.pause_ns[(mstats.numgc-1-i)%nelem(mstats.pause_ns)];

	p[n] = mstats.last_gc;
	p[n+1] = mstats.numgc;
	p[n+2] = mstats.pause_total_ns;	
	runtime·unlock(&runtime·mheap.lock);
	pauses->len = n+3;
}

void
runtime·setgcpercent_m(void) {
	int32 in;
	int32 out;

	in = (int32)(intptr)g->m->scalararg[0];

	runtime·lock(&runtime·mheap.lock);
	out = runtime·gcpercent;
	if(in < 0)
		in = -1;
	runtime·gcpercent = in;
	runtime·unlock(&runtime·mheap.lock);

	g->m->scalararg[0] = (uintptr)(intptr)out;
}

static void
gchelperstart(void)
{
	if(g->m->helpgc < 0 || g->m->helpgc >= MaxGcproc)
		runtime·throw("gchelperstart: bad m->helpgc");
	if(g != g->m->g0)
		runtime·throw("gchelper not running on g0 stack");
}

static void
runfinq(void)
{
	Finalizer *f;
	FinBlock *fb, *next;
	byte *frame;
	uint32 framesz, framecap, i;
	Eface *ef, ef1;

	// This function blocks for long periods of time, and because it is written in C
	// we have no liveness information. Zero everything so that uninitialized pointers
	// do not cause memory leaks.
	f = nil;
	fb = nil;
	next = nil;
	frame = nil;
	framecap = 0;
	framesz = 0;
	i = 0;
	ef = nil;
	ef1.type = nil;
	ef1.data = nil;
	
	// force flush to memory
	USED(&f);
	USED(&fb);
	USED(&next);
	USED(&framesz);
	USED(&i);
	USED(&ef);
	USED(&ef1);

	for(;;) {
		runtime·lock(&finlock);
		fb = finq;
		finq = nil;
		if(fb == nil) {
			runtime·fingwait = true;
			g->isbackground = true;
			runtime·parkunlock(&finlock, runtime·gostringnocopy((byte*)"finalizer wait"));
			g->isbackground = false;
			continue;
		}
		runtime·unlock(&finlock);
		if(raceenabled)
			runtime·racefingo();
		for(; fb; fb=next) {
			next = fb->next;
			for(i=0; i<fb->cnt; i++) {
				f = &fb->fin[i];
				framesz = sizeof(Eface) + f->nret;
				if(framecap < framesz) {
					// The frame does not contain pointers interesting for GC,
					// all not yet finalized objects are stored in finq.
					// If we do not mark it as FlagNoScan,
					// the last finalized object is not collected.
					frame = runtime·mallocgc(framesz, 0, FlagNoScan);
					framecap = framesz;
				}
				if(f->fint == nil)
					runtime·throw("missing type in runfinq");
				if((f->fint->kind&KindMask) == KindPtr) {
					// direct use of pointer
					*(void**)frame = f->arg;
				} else if(((InterfaceType*)f->fint)->mhdr.len == 0) {
					// convert to empty interface
					ef = (Eface*)frame;
					ef->type = &f->ot->typ;
					ef->data = f->arg;
				} else {
					// convert to interface with methods, via empty interface.
					ef1.type = &f->ot->typ;
					ef1.data = f->arg;
					if(!runtime·ifaceE2I2((InterfaceType*)f->fint, ef1, (Iface*)frame))
						runtime·throw("invalid type conversion in runfinq");
				}
				reflect·call(f->fn, frame, framesz, framesz);
				f->fn = nil;
				f->arg = nil;
				f->ot = nil;
			}
			fb->cnt = 0;
			runtime·lock(&finlock);
			fb->next = finc;
			finc = fb;
			runtime·unlock(&finlock);
		}

		// Zero everything that's dead, to avoid memory leaks.
		// See comment at top of function.
		f = nil;
		fb = nil;
		next = nil;
		i = 0;
		ef = nil;
		ef1.type = nil;
		ef1.data = nil;
		runtime·gc(1);	// trigger another gc to clean up the finalized objects, if possible
	}
}

void
runtime·createfing(void)
{
	if(runtime·fing != nil)
		return;
	// Here we use gclock instead of finlock,
	// because newproc1 can allocate, which can cause on-demand span sweep,
	// which can queue finalizers, which would deadlock.
	runtime·lock(&gclock);
	if(runtime·fing == nil)
		runtime·fing = runtime·newproc1(&runfinqv, nil, 0, 0, runtime·gc);
	runtime·unlock(&gclock);
}

void
runtime·createfingM(G *gp)
{
	runtime·createfing();
	runtime·gogo(&gp->sched);
}

G*
runtime·wakefing(void)
{
	G *res;

	res = nil;
	runtime·lock(&finlock);
	if(runtime·fingwait && runtime·fingwake) {
		runtime·fingwait = false;
		runtime·fingwake = false;
		res = runtime·fing;
	}
	runtime·unlock(&finlock);
	return res;
}

// Recursively unrolls GC program in prog.
// mask is where to store the result.
// ppos is a pointer to position in mask, in bits.
// sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
static byte*
unrollgcprog1(byte *mask, byte *prog, uintptr *ppos, bool inplace, bool sparse)
{
	uintptr pos, siz, i, off;
	byte *arena_start, *prog1, v, *bitp, shift;

	arena_start = runtime·mheap.arena_start;
	pos = *ppos;
	for(;;) {
		switch(prog[0]) {
		case insData:
			prog++;
			siz = prog[0];
			prog++;
			for(i = 0; i < siz; i++) {
				v = prog[i/PointersPerByte];
				v >>= (i%PointersPerByte)*BitsPerPointer;
				v &= BitsMask;
				if(inplace) {
					// Store directly into GC bitmap.
					off = (uintptr*)(mask+pos) - (uintptr*)arena_start;
					bitp = arena_start - off/wordsPerBitmapByte - 1;
					shift = (off % wordsPerBitmapByte) * gcBits;
					if(shift==0)
						*bitp = 0;
					*bitp |= v<<(shift+2);
					pos += PtrSize;
				} else if(sparse) {
					// 4-bits per word
					v <<= (pos%8)+2;
					mask[pos/8] |= v;
					pos += gcBits;
				} else {
					// 2-bits per word
					v <<= pos%8;
					mask[pos/8] |= v;
					pos += BitsPerPointer;
				}
			}
			prog += ROUND(siz*BitsPerPointer, 8)/8;
			break;
		case insArray:
			prog++;
			siz = 0;
			for(i = 0; i < PtrSize; i++)
				siz = (siz<<8) + prog[PtrSize-i-1];
			prog += PtrSize;
			prog1 = nil;
			for(i = 0; i < siz; i++)
				prog1 = unrollgcprog1(mask, prog, &pos, inplace, sparse);
			if(prog1[0] != insArrayEnd)
				runtime·throw("unrollgcprog: array does not end with insArrayEnd");
			prog = prog1+1;
			break;
		case insArrayEnd:
		case insEnd:
			*ppos = pos;
			return prog;
		default:
			runtime·throw("unrollgcprog: unknown instruction");
		}
	}
}

// Unrolls GC program prog for data/bss, returns dense GC mask.
static byte*
unrollglobgcprog(byte *prog, uintptr size)
{
	byte *mask;
	uintptr pos, masksize;

	masksize = ROUND(ROUND(size, PtrSize)/PtrSize*BitsPerPointer, 8)/8;
	mask = runtime·persistentalloc(masksize+1, 0, &mstats.gc_sys);
	mask[masksize] = 0xa1;
	pos = 0;
	prog = unrollgcprog1(mask, prog, &pos, false, false);
	if(pos != size/PtrSize*BitsPerPointer) {
		runtime·printf("unrollglobgcprog: bad program size, got %D, expect %D\n",
			(uint64)pos, (uint64)size/PtrSize*BitsPerPointer);
		runtime·throw("unrollglobgcprog: bad program size");
	}
	if(prog[0] != insEnd)
		runtime·throw("unrollglobgcprog: program does not end with insEnd");
	if(mask[masksize] != 0xa1)
		runtime·throw("unrollglobgcprog: overflow");
	return mask;
}

void
runtime·unrollgcproginplace_m(void)
{
	uintptr size, size0, pos, off;
	byte *arena_start, *prog, *bitp, shift;
	Type *typ;
	void *v;

	v = g->m->ptrarg[0];
	typ = g->m->ptrarg[1];
	size = g->m->scalararg[0];
	size0 = g->m->scalararg[1];
	g->m->ptrarg[0] = nil;
	g->m->ptrarg[1] = nil;

	pos = 0;
	prog = (byte*)typ->gc[1];
	while(pos != size0)
		unrollgcprog1(v, prog, &pos, true, true);
	// Mark first word as bitAllocated.
	arena_start = runtime·mheap.arena_start;
	off = (uintptr*)v - (uintptr*)arena_start;
	bitp = arena_start - off/wordsPerBitmapByte - 1;
	shift = (off % wordsPerBitmapByte) * gcBits;
	*bitp |= bitBoundary<<shift;
	// Mark word after last as BitsDead.
	if(size0 < size) {
		off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
		bitp = arena_start - off/wordsPerBitmapByte - 1;
		shift = (off % wordsPerBitmapByte) * gcBits;
		*bitp &= ~(bitPtrMask<<shift) | ((uintptr)BitsDead<<(shift+2));
	}
}

// Unrolls GC program in typ->gc[1] into typ->gc[0]
void
runtime·unrollgcprog_m(void)
{
	static Lock lock;
	Type *typ;
	byte *mask, *prog;
	uintptr pos;
	uint32 x;

	typ = g->m->ptrarg[0];
	g->m->ptrarg[0] = nil;

	runtime·lock(&lock);
	mask = (byte*)typ->gc[0];
	if(mask[0] == 0) {
		pos = 8;  // skip the unroll flag
		prog = (byte*)typ->gc[1];
		prog = unrollgcprog1(mask, prog, &pos, false, true);
		if(prog[0] != insEnd)
			runtime·throw("unrollgcprog: program does not end with insEnd");
		if(((typ->size/PtrSize)%2) != 0) {
			// repeat the program twice
			prog = (byte*)typ->gc[1];
			unrollgcprog1(mask, prog, &pos, false, true);
		}
		// atomic way to say mask[0] = 1
		x = ((uint32*)mask)[0];
		runtime·atomicstore((uint32*)mask, x|1);
	}
	runtime·unlock(&lock);
}

// mark the span of memory at v as having n blocks of the given size.
// if leftover is true, there is left over space at the end of the span.
void
runtime·markspan(void *v, uintptr size, uintptr n, bool leftover)
{
	uintptr i, off, step;
	byte *b;

	if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
		runtime·throw("markspan: bad pointer");

	// Find bits of the beginning of the span.
	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
	b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
	if((off%wordsPerBitmapByte) != 0)
		runtime·throw("markspan: unaligned length");

	// Okay to use non-atomic ops here, because we control
	// the entire span, and each bitmap byte has bits for only
	// one span, so no other goroutines are changing these bitmap words.

	if(size == PtrSize) {
		// Possible only on 64-bits (minimal size class is 8 bytes).
		// Poor man's memset(0x11).
		if(0x11 != ((bitBoundary+BitsDead)<<gcBits) + (bitBoundary+BitsDead))
			runtime·throw("markspan: bad bits");
		if((n%(wordsPerBitmapByte*PtrSize)) != 0)
			runtime·throw("markspan: unaligned length");
		b = b - n/wordsPerBitmapByte + 1;	// find first byte
		if(((uintptr)b%PtrSize) != 0)
			runtime·throw("markspan: unaligned pointer");
		for(i = 0; i != n; i += wordsPerBitmapByte*PtrSize, b += PtrSize)
			*(uintptr*)b = (uintptr)0x1111111111111111ULL;  // bitBoundary+BitsDead
		return;
	}

	if(leftover)
		n++;	// mark a boundary just past end of last block too
	step = size/(PtrSize*wordsPerBitmapByte);
	for(i = 0; i != n; i++, b -= step)
		*b = bitBoundary|(BitsDead<<2);
}

// unmark the span of memory at v of length n bytes.
void
runtime·unmarkspan(void *v, uintptr n)
{
	uintptr off;
	byte *b;

	if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
		runtime·throw("markspan: bad pointer");

	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
	if((off % (PtrSize*wordsPerBitmapByte)) != 0)
		runtime·throw("markspan: unaligned pointer");
	b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
	n /= PtrSize;
	if(n%(PtrSize*wordsPerBitmapByte) != 0)
		runtime·throw("unmarkspan: unaligned length");
	// Okay to use non-atomic ops here, because we control
	// the entire span, and each bitmap word has bits for only
	// one span, so no other goroutines are changing these
	// bitmap words.
	n /= wordsPerBitmapByte;
	runtime·memclr(b - n + 1, n);
}

void
runtime·MHeap_MapBits(MHeap *h)
{
	// Caller has added extra mappings to the arena.
	// Add extra mappings of bitmap words as needed.
	// We allocate extra bitmap pieces in chunks of bitmapChunk.
	enum {
		bitmapChunk = 8192
	};
	uintptr n;

	n = (h->arena_used - h->arena_start) / (PtrSize*wordsPerBitmapByte);
	n = ROUND(n, bitmapChunk);
	n = ROUND(n, PhysPageSize);
	if(h->bitmap_mapped >= n)
		return;

	runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped, h->arena_reserved, &mstats.gc_sys);
	h->bitmap_mapped = n;
}

static bool
getgcmaskcb(Stkframe *frame, void *ctxt)
{
	Stkframe *frame0;

	frame0 = ctxt;
	if(frame0->sp >= (uintptr)frame->varp - frame->sp && frame0->sp < (uintptr)frame->varp) {
		*frame0 = *frame;
		return false;
	}
	return true;
}

// Returns GC type info for object p for testing.
void
runtime·getgcmask(byte *p, Type *t, byte **mask, uintptr *len)
{
	Stkframe frame;
	uintptr i, n, off;
	byte *base, bits, shift, *b;

	*mask = nil;
	*len = 0;

	// data
	if(p >= data && p < edata) {
		n = ((PtrType*)t)->elem->size;
		*len = n/PtrSize;
		*mask = runtime·mallocgc(*len, nil, 0);
		for(i = 0; i < n; i += PtrSize) {
			off = (p+i-data)/PtrSize;
			bits = (runtime·gcdatamask[off/PointersPerByte] >> ((off%PointersPerByte)*BitsPerPointer))&BitsMask;
			(*mask)[i/PtrSize] = bits;
		}
		return;
	}
	// bss
	if(p >= bss && p < ebss) {
		n = ((PtrType*)t)->elem->size;
		*len = n/PtrSize;
		*mask = runtime·mallocgc(*len, nil, 0);
		for(i = 0; i < n; i += PtrSize) {
			off = (p+i-bss)/PtrSize;
			bits = (runtime·gcbssmask[off/PointersPerByte] >> ((off%PointersPerByte)*BitsPerPointer))&BitsMask;
			(*mask)[i/PtrSize] = bits;
		}
		return;
	}
	// heap
	if(runtime·mlookup(p, &base, &n, nil)) {
		*len = n/PtrSize;
		*mask = runtime·mallocgc(*len, nil, 0);
		for(i = 0; i < n; i += PtrSize) {
			off = (uintptr*)(base+i) - (uintptr*)runtime·mheap.arena_start;
			b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
			shift = (off % wordsPerBitmapByte) * gcBits;
			bits = (*b >> (shift+2))&BitsMask;
			(*mask)[i/PtrSize] = bits;
		}
		return;
	}
	// stack
	frame.fn = nil;
	frame.sp = (uintptr)p;
	runtime·gentraceback((uintptr)runtime·getcallerpc(&p), (uintptr)runtime·getcallersp(&p), 0, g, 0, nil, 1000, getgcmaskcb, &frame, false);
	if(frame.fn != nil) {
		Func *f;
		StackMap *stackmap;
		BitVector bv;
		uintptr size;
		uintptr targetpc;
		int32 pcdata;

		f = frame.fn;
		targetpc = frame.continpc;
		if(targetpc == 0)
			return;
		if(targetpc != f->entry)
			targetpc--;
		pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc);
		if(pcdata == -1)
			return;
		stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps);
		if(stackmap == nil || stackmap->n <= 0)
			return;
		bv = runtime·stackmapdata(stackmap, pcdata);
		size = bv.n/BitsPerPointer*PtrSize;
		n = ((PtrType*)t)->elem->size;
		*len = n/PtrSize;
		*mask = runtime·mallocgc(*len, nil, 0);
		for(i = 0; i < n; i += PtrSize) {
			off = (p+i-frame.varp+size)/PtrSize;
			bits = (bv.data[off*BitsPerPointer/32] >> ((off*BitsPerPointer)%32))&BitsMask;
			(*mask)[i/PtrSize] = bits;
		}
	}
}

void runtime·gc_unixnanotime(int64 *now);

int64 runtime·unixnanotime(void)
{
	int64 now;

	runtime·gc_unixnanotime(&now);
	return now;
}
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+								// Copyright 2009 The Go Authors. All rights reserved.
 								// Use of this source code is governed by a BSD-style
 								// license that can be found in the LICENSE file.
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+								// Garbage collector (GC).
 								//
 								// GC is:
 								// - mark&sweep
 								// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
 								// - parallel (up to MaxGcproc threads)
 								// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
 								// - non-moving/non-compacting
 								// - full (non-partial)
 								//
 								// GC rate.
 								// Next GC is after we've allocated an extra amount of memory proportional to
 								// the amount already in use. The proportion is controlled by GOGC environment variable
 								// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
 								// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
 								// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
 								// (and also the amount of extra memory used).
 								//
 								// Concurrent sweep.
 								// The sweep phase proceeds concurrently with normal program execution.
 								// The heap is swept span-by-span both lazily (when a goroutine needs another span)
 								// and concurrently in a background goroutine (this helps programs that are not CPU bound).
 								// However, at the end of the stop-the-world GC phase we don't know the size of the live heap,
 								// and so next_gc calculation is tricky and happens as follows.
 								// At the end of the stop-the-world phase next_gc is conservatively set based on total
 								// heap size; all spans are marked as "needs sweeping".
 								// Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory.
 								// The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc
 								// closer to the target value. However, this is not enough to avoid over-allocating memory.
 								// Consider that a goroutine wants to allocate a new span for a large object and
 								// there are no free swept spans, but there are small-object unswept spans.
 								// If the goroutine naively allocates a new span, it can surpass the yet-unknown
 								// target next_gc value. In order to prevent such cases (1) when a goroutine needs
 								// to allocate a new small-object span, it sweeps small-object spans for the same
 								// object size until it frees at least one object; (2) when a goroutine needs to
 								// allocate large-object span from heap, it sweeps spans until it frees at least
 								// that many pages into heap. Together these two measures ensure that we don't surpass
 								// target next_gc value by a large margin. There is an exception: if a goroutine sweeps
 								// and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span,
 								// but there can still be other one-page unswept spans which could be combined into a two-page span.
 								// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
 								// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
 								// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
 								// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
 								// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
 								// The finalizer goroutine is kicked off only when all spans are swept.
 								// When the next GC starts, it sweeps all not-yet-swept spans (if any).
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
 								#include "runtime.h"
-												runtime: make more build-friendly

Collapse the arch,os-specific directories into the main directory
by renaming xxx/foo.c to foo_xxx.c, and so on.

There are no substantial edits here, except to the Makefile.
The assumption is that the Go tool will #define GOOS_darwin
and GOARCH_amd64 and will make any file named something
like signals_darwin.h available as signals_GOOS.h during the
build.  This replaces what used to be done with -I$(GOOS).

There is still work to be done to make runtime build with
standard tools, but this is a big step.  After this we will have
to write a script to generate all the generated files so they
can be checked in (instead of generated during the build).

R=r, iant, r, lucio.dere
CC=golang-dev
https://golang.org/cl/5490053

											
										
										
											2011-12-16 13:33:58 -07:00
+								#include "arch_GOARCH.h"
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+								#include "malloc.h"
-												runtime: stack split + garbage collection bug

The g->sched.sp saved stack pointer and the
g->stackbase and g->stackguard stack bounds
can change even while "the world is stopped",
because a goroutine has to call functions (and
therefore might split its stack) when exiting a
system call to check whether the world is stopped
(and if so, wait until the world continues).

That means the garbage collector cannot access
those values safely (without a race) for goroutines
executing system calls.  Instead, save a consistent
triple in g->gcsp, g->gcstack, g->gcguard during
entersyscall and have the garbage collector refer
to those.

The old code was occasionally seeing (because of
the race) an sp and stk that did not correspond to
each other, so that stk - sp was not the number of
stack bytes following sp.  In that case, if sp < stk
then the call scanblock(sp, stk - sp) scanned too
many bytes (anything between the two pointers,
which pointed into different allocation blocks).
If sp > stk then stk - sp wrapped around.
On 32-bit, stk - sp is a uintptr (uint32) converted
to int64 in the call to scanblock, so a large (~4G)
but positive number.  Scanblock would try to scan
that many bytes and eventually fault accessing
unmapped memory.  On 64-bit, stk - sp is a uintptr (uint64)
promoted to int64 in the call to scanblock, so a negative
number.  Scanblock would not scan anything, possibly
causing in-use blocks to be freed.

In short, 32-bit platforms would have seen either
ineffective garbage collection or crashes during garbage
collection, while 64-bit platforms would have seen
either ineffective or incorrect garbage collection.
You can see the invalid arguments to scanblock in the
stack traces in issue 1620.

Fixes #1620.
Fixes #1746.

R=iant, r
CC=golang-dev
https://golang.org/cl/4437075

											
										
										
											2011-04-27 21:21:12 -06:00
+								#include "stack.h"
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
+								#include "mgc0.h"
-												reflect, runtime: fix crash in GC due to reflect.call + precise GC

Given
        type Outer struct {
                *Inner
                ...
        }
the compiler generates the implementation of (*Outer).M dispatching to
the embedded Inner. The implementation is logically:
        func (p *Outer) M() {
                (p.Inner).M()
        }
but since the only change here is the replacement of one pointer
receiver with another, the actual generated code overwrites the
original receiver with the p.Inner pointer and then jumps to the M
method expecting the *Inner receiver.

During reflect.Value.Call, we create an argument frame and the
associated data structures to describe it to the garbage collector,
populate the frame, call reflect.call to run a function call using
that frame, and then copy the results back out of the frame. The
reflect.call function does a memmove of the frame structure onto the
stack (to set up the inputs), runs the call, and the memmoves the
stack back to the frame structure (to preserve the outputs).

Originally reflect.call did not distinguish inputs from outputs: both
memmoves were for the full stack frame. However, in the case where the
called function was one of these wrappers, the rewritten receiver is
almost certainly a different type than the original receiver. This is
not a problem on the stack, where we use the program counter to
determine the type information and understand that during (*Outer).M
the receiver is an *Outer while during (*Inner).M the receiver in the
same memory word is now an *Inner. But in the statically typed
argument frame created by reflect, the receiver is always an *Outer.
Copying the modified receiver pointer off the stack into the frame
will store an *Inner there, and then if a garbage collection happens
to scan that argument frame before it is discarded, it will scan the
*Inner memory as if it were an *Outer. If the two have different
memory layouts, the collection will intepret the memory incorrectly.

Fix by only copying back the results.

Fixes #7725.

LGTM=khr
R=khr
CC=dave, golang-codereviews
https://golang.org/cl/85180043

											
										
										
											2014-04-08 09:11:35 -06:00
+								#include "chan.h"
-												race: runtime changes
This is a part of a bigger change that adds data race detection feature:
https://golang.org/cl/6456044

R=rsc
CC=gobot, golang-dev
https://golang.org/cl/6535050

											
										
										
											2012-10-07 12:05:32 -06:00
+								#include "race.h"
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+								#include "type.h"
 								#include "typekind.h"
-												runtime: use funcdata to supply garbage collection information

This CL introduces a FUNCDATA number for runtime-specific
garbage collection metadata, changes the C and Go compilers
to emit that metadata, and changes the runtime to expect it.

The old pseudo-instructions that carried this information
are gone, as is the linker code to process them.

R=golang-dev, dvyukov, cshapiro
CC=golang-dev
https://golang.org/cl/11406044

											
										
										
											2013-07-19 14:04:09 -06:00
+								#include "funcdata.h"
-												cmd/cc,runtime: change preprocessor to expand macros inside of
#pragma textflag and #pragma dataflag directives.
Update dataflag directives to use symbols instead of integer constants.

R=golang-dev, bradfitz
CC=golang-dev
https://golang.org/cl/13310043

											
										
										
											2013-08-29 13:36:59 -06:00
+								#include "../../cmd/ld/textflag.h"
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
 								enum {
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									Debug		= 0,
 									ConcurrentSweep	= 1,
 									PreciseScan	= 1,
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									WorkbufSize	= 4*1024,
-												runtime: increase page size to 8K
Tcmalloc uses 8K, 32K and 64K pages, and in custom setups 256K pages.
Only Chromium uses 4K pages today (in "slow but small" configuration).
The general tendency is to increase page size, because it reduces
metadata size and DTLB pressure.
This change reduces GC pause by ~10% and slightly improves other metrics.

json-1
allocated                 8037492      8038689      +0.01%
allocs                     105762       105573      -0.18%
cputime                 158400000    155800000      -1.64%
gc-pause-one              4412234      4135702      -6.27%
gc-pause-total            2647340      2398707      -9.39%
rss                      54923264     54525952      -0.72%
sys-gc                    3952624      3928048      -0.62%
sys-heap                 46399488     46006272      -0.85%
sys-other                 5597504      5290304      -5.49%
sys-stack                  393216       393216      +0.00%
sys-total                56342832     55617840      -1.29%
time                    158478890    156046916      -1.53%
virtual-mem             256548864    256593920      +0.02%

garbage-1
allocated                 2991113      2986259      -0.16%
allocs                      62844        62652      -0.31%
cputime                  16330000     15860000      -2.88%
gc-pause-one            789108229    725555211      -8.05%
gc-pause-total            3945541      3627776      -8.05%
rss                    1143660544   1132253184      -1.00%
sys-gc                   65609600     65806208      +0.30%
sys-heap               1032388608   1035599872      +0.31%
sys-other                37501632     22777664     -39.26%
sys-stack                 8650752      8781824      +1.52%
sys-total              1144150592   1132965568      -0.98%
time                     16364602     15891994      -2.89%
virtual-mem            1327296512   1313746944      -1.02%

This is the exact reincarnation of already LGTMed:
https://golang.org/cl/45770044
which must not break darwin/freebsd after:
https://golang.org/cl/56630043
TBR=iant

LGTM=khr, iant
R=iant, khr
CC=golang-codereviews
https://golang.org/cl/58230043

											
										
										
											2014-01-30 02:28:19 -07:00
+									FinBlockSize	= 4*1024,
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+									RootData	= 0,
 									RootBss		= 1,
 									RootFinalizers	= 2,
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									RootSpans	= 3,
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+									RootFlushCaches = 4,
 									RootCount	= 5,
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+								};
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+								#define ScanConservatively ((byte*)1)
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
 								// Initialized from $GOGC.  GOGC=off means no gc.
-												runtime: fix dump of data/bss
Fixes #8530.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/124440043

											
										
										
											2014-08-18 06:42:24 -06:00
+								extern int32 runtime·gcpercent;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
-												sync: less agressive local caching in Pool
Currently Pool can cache up to 15 elements per P, and these elements are not accesible to other Ps.
If a Pool caches large objects, say 2MB, and GOMAXPROCS is set to a large value, say 32,
then the Pool can waste up to 960MB.
The new caching policy caches at most 1 per-P element, the rest is shared between Ps.

Get/Put performance is unchanged. Nested Get/Put performance is 57% worse.
However, overall scalability of nested Get/Put is significantly improved,
so the new policy starts winning under contention.

benchmark                     old ns/op     new ns/op     delta
BenchmarkPool                 27.4          26.7          -2.55%
BenchmarkPool-4               6.63          6.59          -0.60%
BenchmarkPool-16              1.98          1.87          -5.56%
BenchmarkPool-64              1.93          1.86          -3.63%
BenchmarkPoolOverlflow        3970          6235          +57.05%
BenchmarkPoolOverlflow-4      10935         1668          -84.75%
BenchmarkPoolOverlflow-16     13419         520           -96.12%
BenchmarkPoolOverlflow-64     10295         380           -96.31%

LGTM=rsc
R=rsc
CC=golang-codereviews, khr
https://golang.org/cl/86020043

											
										
										
											2014-04-14 11:13:32 -06:00
+								static FuncVal* poolcleanup;
-												sync: add Pool type

Adds the Pool type and docs, and use it in fmt.
This is a temporary implementation, until Dmitry
makes it fast.

Uses the API proposal from Russ in http://goo.gl/cCKeb2 but
adds an optional New field, as used in fmt and elsewhere.
Almost all callers want that.

Update #4720

R=golang-dev, rsc, cshapiro, iant, r, dvyukov, khr
CC=golang-dev
https://golang.org/cl/41860043

											
										
										
											2013-12-18 12:08:34 -07:00
 								void
-												sync: less agressive local caching in Pool
Currently Pool can cache up to 15 elements per P, and these elements are not accesible to other Ps.
If a Pool caches large objects, say 2MB, and GOMAXPROCS is set to a large value, say 32,
then the Pool can waste up to 960MB.
The new caching policy caches at most 1 per-P element, the rest is shared between Ps.

Get/Put performance is unchanged. Nested Get/Put performance is 57% worse.
However, overall scalability of nested Get/Put is significantly improved,
so the new policy starts winning under contention.

benchmark                     old ns/op     new ns/op     delta
BenchmarkPool                 27.4          26.7          -2.55%
BenchmarkPool-4               6.63          6.59          -0.60%
BenchmarkPool-16              1.98          1.87          -5.56%
BenchmarkPool-64              1.93          1.86          -3.63%
BenchmarkPoolOverlflow        3970          6235          +57.05%
BenchmarkPoolOverlflow-4      10935         1668          -84.75%
BenchmarkPoolOverlflow-16     13419         520           -96.12%
BenchmarkPoolOverlflow-64     10295         380           -96.31%

LGTM=rsc
R=rsc
CC=golang-codereviews, khr
https://golang.org/cl/86020043

											
										
										
											2014-04-14 11:13:32 -06:00
+								sync·runtime_registerPoolCleanup(FuncVal *f)
-												sync: add Pool type

Adds the Pool type and docs, and use it in fmt.
This is a temporary implementation, until Dmitry
makes it fast.

Uses the API proposal from Russ in http://goo.gl/cCKeb2 but
adds an optional New field, as used in fmt and elsewhere.
Almost all callers want that.

Update #4720

R=golang-dev, rsc, cshapiro, iant, r, dvyukov, khr
CC=golang-dev
https://golang.org/cl/41860043

											
										
										
											2013-12-18 12:08:34 -07:00
+								{
-												sync: less agressive local caching in Pool
Currently Pool can cache up to 15 elements per P, and these elements are not accesible to other Ps.
If a Pool caches large objects, say 2MB, and GOMAXPROCS is set to a large value, say 32,
then the Pool can waste up to 960MB.
The new caching policy caches at most 1 per-P element, the rest is shared between Ps.

Get/Put performance is unchanged. Nested Get/Put performance is 57% worse.
However, overall scalability of nested Get/Put is significantly improved,
so the new policy starts winning under contention.

benchmark                     old ns/op     new ns/op     delta
BenchmarkPool                 27.4          26.7          -2.55%
BenchmarkPool-4               6.63          6.59          -0.60%
BenchmarkPool-16              1.98          1.87          -5.56%
BenchmarkPool-64              1.93          1.86          -3.63%
BenchmarkPoolOverlflow        3970          6235          +57.05%
BenchmarkPoolOverlflow-4      10935         1668          -84.75%
BenchmarkPoolOverlflow-16     13419         520           -96.12%
BenchmarkPoolOverlflow-64     10295         380           -96.31%

LGTM=rsc
R=rsc
CC=golang-codereviews, khr
https://golang.org/cl/86020043

											
										
										
											2014-04-14 11:13:32 -06:00
+									poolcleanup = f;
-												sync: add Pool type

Adds the Pool type and docs, and use it in fmt.
This is a temporary implementation, until Dmitry
makes it fast.

Uses the API proposal from Russ in http://goo.gl/cCKeb2 but
adds an optional New field, as used in fmt and elsewhere.
Almost all callers want that.

Update #4720

R=golang-dev, rsc, cshapiro, iant, r, dvyukov, khr
CC=golang-dev
https://golang.org/cl/41860043

											
										
										
											2013-12-18 12:08:34 -07:00
+								}
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+								void
 								runtime·clearpools(void)
-												sync: add Pool type

Adds the Pool type and docs, and use it in fmt.
This is a temporary implementation, until Dmitry
makes it fast.

Uses the API proposal from Russ in http://goo.gl/cCKeb2 but
adds an optional New field, as used in fmt and elsewhere.
Almost all callers want that.

Update #4720

R=golang-dev, rsc, cshapiro, iant, r, dvyukov, khr
CC=golang-dev
https://golang.org/cl/41860043

											
										
										
											2013-12-18 12:08:34 -07:00
+								{
-												runtime: per-P defer pool
Instead of a per-goroutine stack of defers for all sizes,
introduce per-P defer pool for argument sizes 8, 24, 40, 56, 72 bytes.

For a program that starts 1e6 goroutines and then joins then:
old: rss=6.6g virtmem=10.2g time=4.85s
new: rss=4.5g virtmem= 8.2g time=3.48s

R=golang-codereviews, rsc
CC=golang-codereviews
https://golang.org/cl/42750044

											
										
										
											2014-01-21 00:20:23 -07:00
+									P *p, **pp;
-												runtime: fix windows build
Currently windows crashes because early allocs in schedinit
try to allocate tiny memory blocks, but m->p is not yet setup.
I've considered calling procresize(1) earlier in schedinit,
but this refactoring is better and must fix the issue as well.
Fixes #7218.

R=golang-codereviews, r
CC=golang-codereviews
https://golang.org/cl/54570045

											
										
										
											2014-01-27 13:26:56 -07:00
+									MCache *c;
-												runtime: per-P defer pool
Instead of a per-goroutine stack of defers for all sizes,
introduce per-P defer pool for argument sizes 8, 24, 40, 56, 72 bytes.

For a program that starts 1e6 goroutines and then joins then:
old: rss=6.6g virtmem=10.2g time=4.85s
new: rss=4.5g virtmem= 8.2g time=3.48s

R=golang-codereviews, rsc
CC=golang-codereviews
https://golang.org/cl/42750044

											
										
										
											2014-01-21 00:20:23 -07:00
+									int32 i;
-												sync: add Pool type

Adds the Pool type and docs, and use it in fmt.
This is a temporary implementation, until Dmitry
makes it fast.

Uses the API proposal from Russ in http://goo.gl/cCKeb2 but
adds an optional New field, as used in fmt and elsewhere.
Almost all callers want that.

Update #4720

R=golang-dev, rsc, cshapiro, iant, r, dvyukov, khr
CC=golang-dev
https://golang.org/cl/41860043

											
										
										
											2013-12-18 12:08:34 -07:00
-												runtime: per-P defer pool
Instead of a per-goroutine stack of defers for all sizes,
introduce per-P defer pool for argument sizes 8, 24, 40, 56, 72 bytes.

For a program that starts 1e6 goroutines and then joins then:
old: rss=6.6g virtmem=10.2g time=4.85s
new: rss=4.5g virtmem= 8.2g time=3.48s

R=golang-codereviews, rsc
CC=golang-codereviews
https://golang.org/cl/42750044

											
										
										
											2014-01-21 00:20:23 -07:00
+									// clear sync.Pool's
-												sync: less agressive local caching in Pool
Currently Pool can cache up to 15 elements per P, and these elements are not accesible to other Ps.
If a Pool caches large objects, say 2MB, and GOMAXPROCS is set to a large value, say 32,
then the Pool can waste up to 960MB.
The new caching policy caches at most 1 per-P element, the rest is shared between Ps.

Get/Put performance is unchanged. Nested Get/Put performance is 57% worse.
However, overall scalability of nested Get/Put is significantly improved,
so the new policy starts winning under contention.

benchmark                     old ns/op     new ns/op     delta
BenchmarkPool                 27.4          26.7          -2.55%
BenchmarkPool-4               6.63          6.59          -0.60%
BenchmarkPool-16              1.98          1.87          -5.56%
BenchmarkPool-64              1.93          1.86          -3.63%
BenchmarkPoolOverlflow        3970          6235          +57.05%
BenchmarkPoolOverlflow-4      10935         1668          -84.75%
BenchmarkPoolOverlflow-16     13419         520           -96.12%
BenchmarkPoolOverlflow-64     10295         380           -96.31%

LGTM=rsc
R=rsc
CC=golang-codereviews, khr
https://golang.org/cl/86020043

											
										
										
											2014-04-14 11:13:32 -06:00
+									if(poolcleanup != nil)
 										reflect·call(poolcleanup, nil, 0, 0);
-												runtime: per-P defer pool
Instead of a per-goroutine stack of defers for all sizes,
introduce per-P defer pool for argument sizes 8, 24, 40, 56, 72 bytes.

For a program that starts 1e6 goroutines and then joins then:
old: rss=6.6g virtmem=10.2g time=4.85s
new: rss=4.5g virtmem= 8.2g time=3.48s

R=golang-codereviews, rsc
CC=golang-codereviews
https://golang.org/cl/42750044

											
										
										
											2014-01-21 00:20:23 -07:00
 									for(pp=runtime·allp; p=*pp; pp++) {
-												runtime: combine small NoScan allocations
Combine NoScan allocations < 16 bytes into a single memory block.
Reduces number of allocations on json/garbage benchmarks by 10+%.

json-1
allocated                 8039872      7949194      -1.13%
allocs                     105774        93776     -11.34%
cputime                 156200000    100700000     -35.53%
gc-pause-one              4908873      3814853     -22.29%
gc-pause-total            2748969      2899288      +5.47%
rss                      52674560     43560960     -17.30%
sys-gc                    3796976      3256304     -14.24%
sys-heap                 43843584     35192832     -19.73%
sys-other                 5589312      5310784      -4.98%
sys-stack                  393216       393216      +0.00%
sys-total                53623088     44153136     -17.66%
time                    156193436    100886714     -35.41%
virtual-mem             256548864    256540672      -0.00%

garbage-1
allocated                 2996885      2932982      -2.13%
allocs                      62904        55200     -12.25%
cputime                  17470000     17400000      -0.40%
gc-pause-one            932757485    925806143      -0.75%
gc-pause-total            4663787      4629030      -0.75%
rss                    1151074304   1133670400      -1.51%
sys-gc                   66068352     65085312      -1.49%
sys-heap               1039728640   1024065536      -1.51%
sys-other                38038208     37485248      -1.45%
sys-stack                 8650752      8781824      +1.52%
sys-total              1152485952   1135417920      -1.48%
time                     17478088     17418005      -0.34%
virtual-mem            1343709184   1324204032      -1.45%

LGTM=iant, bradfitz
R=golang-codereviews, dave, iant, rsc, bradfitz
CC=golang-codereviews, khr
https://golang.org/cl/38750047

											
										
										
											2014-01-24 11:35:11 -07:00
+										// clear tinyalloc pool
-												runtime: fix windows build
Currently windows crashes because early allocs in schedinit
try to allocate tiny memory blocks, but m->p is not yet setup.
I've considered calling procresize(1) earlier in schedinit,
but this refactoring is better and must fix the issue as well.
Fixes #7218.

R=golang-codereviews, r
CC=golang-codereviews
https://golang.org/cl/54570045

											
										
										
											2014-01-27 13:26:56 -07:00
+										c = p->mcache;
 										if(c != nil) {
 											c->tiny = nil;
 											c->tinysize = 0;
-												runtime: convert common scheduler functions to Go
These are required for chans, semaphores, timers, etc.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/123640043

											
										
										
											2014-08-21 10:41:09 -06:00
+											c->sudogcache = nil;
-												runtime: fix windows build
Currently windows crashes because early allocs in schedinit
try to allocate tiny memory blocks, but m->p is not yet setup.
I've considered calling procresize(1) earlier in schedinit,
but this refactoring is better and must fix the issue as well.
Fixes #7218.

R=golang-codereviews, r
CC=golang-codereviews
https://golang.org/cl/54570045

											
										
										
											2014-01-27 13:26:56 -07:00
+										}
-												runtime: combine small NoScan allocations
Combine NoScan allocations < 16 bytes into a single memory block.
Reduces number of allocations on json/garbage benchmarks by 10+%.

json-1
allocated                 8039872      7949194      -1.13%
allocs                     105774        93776     -11.34%
cputime                 156200000    100700000     -35.53%
gc-pause-one              4908873      3814853     -22.29%
gc-pause-total            2748969      2899288      +5.47%
rss                      52674560     43560960     -17.30%
sys-gc                    3796976      3256304     -14.24%
sys-heap                 43843584     35192832     -19.73%
sys-other                 5589312      5310784      -4.98%
sys-stack                  393216       393216      +0.00%
sys-total                53623088     44153136     -17.66%
time                    156193436    100886714     -35.41%
virtual-mem             256548864    256540672      -0.00%

garbage-1
allocated                 2996885      2932982      -2.13%
allocs                      62904        55200     -12.25%
cputime                  17470000     17400000      -0.40%
gc-pause-one            932757485    925806143      -0.75%
gc-pause-total            4663787      4629030      -0.75%
rss                    1151074304   1133670400      -1.51%
sys-gc                   66068352     65085312      -1.49%
sys-heap               1039728640   1024065536      -1.51%
sys-other                38038208     37485248      -1.45%
sys-stack                 8650752      8781824      +1.52%
sys-total              1152485952   1135417920      -1.48%
time                     17478088     17418005      -0.34%
virtual-mem            1343709184   1324204032      -1.45%

LGTM=iant, bradfitz
R=golang-codereviews, dave, iant, rsc, bradfitz
CC=golang-codereviews, khr
https://golang.org/cl/38750047

											
										
										
											2014-01-24 11:35:11 -07:00
+										// clear defer pools
-												runtime: per-P defer pool
Instead of a per-goroutine stack of defers for all sizes,
introduce per-P defer pool for argument sizes 8, 24, 40, 56, 72 bytes.

For a program that starts 1e6 goroutines and then joins then:
old: rss=6.6g virtmem=10.2g time=4.85s
new: rss=4.5g virtmem= 8.2g time=3.48s

R=golang-codereviews, rsc
CC=golang-codereviews
https://golang.org/cl/42750044

											
										
										
											2014-01-21 00:20:23 -07:00
+										for(i=0; i<nelem(p->deferpool); i++)
 											p->deferpool[i] = nil;
 									}
-												sync: add Pool type

Adds the Pool type and docs, and use it in fmt.
This is a temporary implementation, until Dmitry
makes it fast.

Uses the API proposal from Russ in http://goo.gl/cCKeb2 but
adds an optional New field, as used in fmt and elsewhere.
Almost all callers want that.

Update #4720

R=golang-dev, rsc, cshapiro, iant, r, dvyukov, khr
CC=golang-dev
https://golang.org/cl/41860043

											
										
										
											2013-12-18 12:08:34 -07:00
+								}
-												runtime: goroutine profile, stack dumps

R=golang-dev, r, r
CC=golang-dev
https://golang.org/cl/5687076

											
										
										
											2012-02-22 19:45:01 -07:00
+								// Holding worldsema grants an M the right to try to stop the world.
 								// The procedure is:
 								//
 								//	runtime·semacquire(&runtime·worldsema);
 								//	m->gcing = 1;
 								//	runtime·stoptheworld();
 								//
 								//	... do stuff ...
 								//
 								//	m->gcing = 0;
 								//	runtime·semrelease(&runtime·worldsema);
 								//	runtime·starttheworld();
 								//
 								uint32 runtime·worldsema = 1;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+								typedef struct Workbuf Workbuf;
 								struct Workbuf
-												runtime: use manual stack for garbage collection

Old code was using recursion to traverse object graph.
New code uses an explicit stack, cutting the per-pointer
footprint to two words during the recursion and avoiding
the standard allocator and stack splitting code.

in test/garbage:

Reduces parser runtime by 2-3%
Reduces Peano runtime by 40%
Increases tree runtime by 4-5%

R=r
CC=golang-dev
https://golang.org/cl/2150042

											
										
										
											2010-09-07 07:57:22 -06:00
+								{
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									LFNode	node; // must be first
 									uintptr	nobj;
 									byte*	obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize];
-												runtime: use manual stack for garbage collection

Old code was using recursion to traverse object graph.
New code uses an explicit stack, cutting the per-pointer
footprint to two words during the recursion and avoiding
the standard allocator and stack splitting code.

in test/garbage:

Reduces parser runtime by 2-3%
Reduces Peano runtime by 40%
Increases tree runtime by 4-5%

R=r
CC=golang-dev
https://golang.org/cl/2150042

											
										
										
											2010-09-07 07:57:22 -06:00
+								};
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+								typedef struct Finalizer Finalizer;
 								struct Finalizer
 								{
-												cmd/gc, reflect, runtime: switch to indirect func value representation

Step 1 of http://golang.org/s/go11func.

R=golang-dev, r, daniel.morsing, remyoudompheng
CC=golang-dev
https://golang.org/cl/7393045

											
										
										
											2013-02-21 15:01:13 -07:00
+									FuncVal *fn;
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+									void *arg;
-												runtime: prepare for 64-bit ints

This CL makes the runtime understand that the type of
the len or cap of a map, slice, or string is 'int', not 'int32',
and it is also careful to distinguish between function arguments
and results of type 'int' vs type 'int32'.

In the runtime, the new typedefs 'intgo' and 'uintgo' refer
to Go int and uint. The C types int and uint continue to be
unavailable (cause intentional compile errors).

This CL does not change the meaning of int, but it should make
the eventual change of the meaning of int on amd64 a bit
smoother.

Update #2188.

R=iant, r, dave, remyoudompheng
CC=golang-dev
https://golang.org/cl/6551067

											
										
										
											2012-09-24 12:58:34 -06:00
+									uintptr nret;
-												runtime: make SetFinalizer(x, f) accept any f for which f(x) is valid

Originally the requirement was f(x) where f's argument is
exactly x's type.

CL 11858043 relaxed the requirement in a non-standard
way: f's argument must be exactly x's type or interface{}.

If we're going to relax the requirement, it should be done
in a way consistent with the rest of Go. This CL allows f's
argument to have any type for which x is assignable;
that's the same requirement the compiler would impose
if compiling f(x) directly.

Fixes #5368.

R=dvyukov, bradfitz, pieter
CC=golang-dev
https://golang.org/cl/12895043

											
										
										
											2013-08-14 12:54:31 -06:00
+									Type *fint;
-												runtime: allow SetFinalizer with a func(interface{})

Fixes #5368.

R=golang-dev, dvyukov
CC=golang-dev, rsc
https://golang.org/cl/11858043

											
										
										
											2013-07-29 09:43:08 -06:00
+									PtrType *ot;
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+								};
 								typedef struct FinBlock FinBlock;
 								struct FinBlock
 								{
 									FinBlock *alllink;
 									FinBlock *next;
 									int32 cnt;
 									int32 cap;
 									Finalizer fin[1];
 								};
-												FFI step 2: can ask for libc.so.6.

introduced explicit "data" symbol instead of etext
to mark beginning of data, so that using larger
alignment (i.e. 4MB like GNU loader) doesn't
confuse garbage collector.

split dodata into dodata and dobss in preparation
for putting the dynamic data + headers in the data
segment instead of stuffed at the beginning of the binary.

R=r
DELTA=52  (37 added, 3 deleted, 12 changed)
OCL=33610
CL=33618

											
										
										
											2009-08-20 17:09:38 -06:00
+								extern byte data[];
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
+								extern byte edata[];
 								extern byte bss[];
-												ld: add NOPTRBSS for large, pointer-free uninitialized data
cc: add #pragma textflag to set it
runtime: mark mheap to go into noptr-bss.
        remove special case in garbage collector

Remove the ARM from.flag field created by CL 5687044.
The DUPOK flag was already in p->reg, so keep using that.

Otherwise test/nilptr.go creates a very large binary.
Should fix the arm build.
Diagnosed by minux.ma; replacement for CL 5690044.

R=golang-dev, minux.ma, r
CC=golang-dev
https://golang.org/cl/5686060

											
										
										
											2012-02-21 20:08:42 -07:00
+								extern byte ebss[];
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
+								extern byte gcdata[];
 								extern byte gcbss[];
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+								static Lock	finlock;	// protects the following variables
 								static FinBlock	*finq;		// list of finalizers that are to be executed
 								static FinBlock	*finc;		// cache of free blocks
 								static FinBlock	*allfin;	// list of all blocks
 								bool	runtime·fingwait;
 								bool	runtime·fingwake;
-												runtime: fix dump of data/bss
Fixes #8530.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/124440043

											
										
										
											2014-08-18 06:42:24 -06:00
+								byte*	runtime·gcdatamask;
 								byte*	runtime·gcbssmask;
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+								static Lock	gclock;
-												runtime: use explicit flag when finalizer goroutine is waiting

Avoids spurious wakeups during other sleeping by that goroutine.
Fixes #711.

R=r
CC=golang-dev
https://golang.org/cl/902041

											
										
										
											2010-04-07 21:38:02 -06:00
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+								static void	runfinq(void);
 								static void	bgsweep(void);
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+								static Workbuf* getempty(Workbuf*);
 								static Workbuf* getfull(Workbuf*);
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+								static void	putempty(Workbuf*);
 								static Workbuf* handoff(Workbuf*);
-												runtime: faster parallel GC
Use per-thread work buffers instead of global mutex-protected pool. This eliminates contention from parallel scan phase.

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkTree2-8               97100768     71417553  -26.45%
garbage.BenchmarkTree2LastPause-8     970931485    714103692  -26.45%
garbage.BenchmarkTree2Pause-8         469127802    345029253  -26.45%
garbage.BenchmarkParser-8            2880950854   2715456901   -5.74%
garbage.BenchmarkParserLastPause-8    137047399    103336476  -24.60%
garbage.BenchmarkParserPause-8         80686028     58922680  -26.97%

R=golang-dev, 0xe2.0x9a.0x9b, dave, adg, rsc, iant
CC=golang-dev
https://golang.org/cl/7816044

											
										
										
											2013-03-21 02:48:02 -06:00
+								static void	gchelperstart(void);
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+								static void	flushallmcaches(void);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+								static bool	scanframe(Stkframe *frame, void *unused);
 								static void	scanstack(G *gp);
 								static byte*	unrollglobgcprog(byte *prog, uintptr size);
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+								static FuncVal runfinqv = {runfinq};
 								static FuncVal bgsweepv = {bgsweep};
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+								static struct {
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+									uint64	full;  // lock-free list of full blocks
 									uint64	empty; // lock-free list of empty blocks
 									byte	pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+									uint32	nproc;
-												runtime: output how long goroutines are blocked
Example of output:

goroutine 4 [sleep for 3 min]:
time.Sleep(0x34630b8a000)
        src/pkg/runtime/time.goc:31 +0x31
main.func·002()
        block.go:16 +0x2c
created by main.main
        block.go:17 +0x33

Full program and output are here:
http://play.golang.org/p/NEZdADI3Td

Fixes #6809.

R=golang-codereviews, khr, kamil.kisiel, bradfitz, rsc
CC=golang-codereviews
https://golang.org/cl/50420043

											
										
										
											2014-01-16 01:54:46 -07:00
+									int64	tstart;
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+									volatile uint32	nwait;
 									volatile uint32	ndone;
 									Note	alldone;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									ParFor*	markfor;
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
 									// Copy of mheap.allspans for marker or sweeper.
 									MSpan**	spans;
 									uint32	nspan;
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+								} work;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
+								// scanblock scans a block of n bytes starting at pointer b for references
 								// to other objects, scanning any it finds recursively until there are no
 								// unscanned objects left.  Instead of using an explicit recursion, it keeps
 								// a work list in the Workbuf* structures and loops in the main function
 								// body.  Keeping an explicit work list is easier on the stack allocator and
 								// more efficient.
 								static void
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+								scanblock(byte *b, uintptr n, byte *ptrmask)
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
+								{
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									byte *obj, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp, bits, xbits, shift, cached;
 									uintptr i, nobj, size, idx, x, off, scanbufpos;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									intptr ncached;
 									Workbuf *wbuf;
 									String *str;
 									Slice *slice;
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+									Iface *iface;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									Eface *eface;
 									Type *typ;
 									MSpan *s;
 									PageID k;
 									bool keepworking;
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									// Cache memory arena parameters in local vars.
-												runtime: make mheap statically allocated again
This depends on: 9791044: runtime: allocate page table lazily
Once page table is moved out of heap, the heap becomes small.
This removes unnecessary dereferences during heap access.
No logical changes.

R=golang-dev, khr
CC=golang-dev
https://golang.org/cl/9802043

											
										
										
											2013-05-28 12:14:47 -06:00
+									arena_start = runtime·mheap.arena_start;
 									arena_used = runtime·mheap.arena_used;
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									wbuf = getempty(nil);
 									nobj = wbuf->nobj;
 									wp = &wbuf->obj[nobj];
 									keepworking = b == nil;
 									scanbufpos = 0;
 									for(i = 0; i < nelem(scanbuf); i++)
 										scanbuf[i] = nil;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									ptrbitp = nil;
 									cached = 0;
 									ncached = 0;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									// ptrmask can have 3 possible values:
 									// 1. nil - obtain pointer mask from GC bitmap.
 									// 2. ScanConservatively - don't use any mask, scan conservatively.
 									// 3. pointer to a compact mask (for stacks and data).
 									if(b != nil)
 										goto scanobj;
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
+									for(;;) {
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										if(nobj == 0) {
 											// Out of work in workbuf.
 											// First, see is there is any work in scanbuf.
 											for(i = 0; i < nelem(scanbuf); i++) {
 												b = scanbuf[scanbufpos];
 												scanbuf[scanbufpos++] = nil;
 												if(scanbufpos == nelem(scanbuf))
 													scanbufpos = 0;
 												if(b != nil) {
 													n = arena_used - b; // scan until bitBoundary or BitsDead
 													ptrmask = nil; // use GC bitmap for pointer info
 													goto scanobj;
-												runtime: poor man's heap type info checker
It's not trivial to make a comprehensive check
due to inferior pointers, reflect, gob, etc.
But this is essentially what I've used to debug
the GC issues.
Update #5193.

R=golang-dev, iant, 0xe2.0x9a.0x9b, r
CC=golang-dev
https://golang.org/cl/8455043

											
										
										
											2013-04-08 14:36:35 -06:00
+												}
 											}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											if(!keepworking) {
 												putempty(wbuf);
 												return;
-												runtime: try to determine the actual type during garbage collection

If the scanned block has no typeinfo the garbage collector will attempt
to get the actual type of the block.

R=golang-dev, bradfitz, rsc
CC=golang-dev
https://golang.org/cl/7093045

											
										
										
											2013-01-18 14:56:17 -07:00
+											}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											// Refill workbuf from global queue.
 											wbuf = getfull(wbuf);
 											if(wbuf == nil)
 												return;
 											nobj = wbuf->nobj;
 											wp = &wbuf->obj[nobj];
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+										}
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										// If another proc wants a pointer, give it some.
 										if(work.nwait > 0 && nobj > 4 && work.full == 0) {
 											wbuf->nobj = nobj;
 											wbuf = handoff(wbuf);
 											nobj = wbuf->nobj;
 											wp = &wbuf->obj[nobj];
 										}
-												runtime: scan the type of an interface value

R=golang-dev, rsc, bradfitz
CC=golang-dev
https://golang.org/cl/7744047

											
										
										
											2013-03-15 14:07:52 -06:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										wp--;
 										nobj--;
 										b = *wp;
 										n = arena_used - b; // scan until next bitBoundary or BitsDead
 										ptrmask = nil; // use GC bitmap for pointer info
 									scanobj:
 										if(!PreciseScan) {
 											if(ptrmask == nil) {
 												// Heap obj, obtain real size.
 												if(!runtime·mlookup(b, &p, &n, nil))
 													continue; // not an allocated obj
 												if(b != p)
 													runtime·throw("bad heap object");
-												runtime: scan the type of an interface value

R=golang-dev, rsc, bradfitz
CC=golang-dev
https://golang.org/cl/7744047

											
										
										
											2013-03-15 14:07:52 -06:00
+											}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											ptrmask = ScanConservatively;
 										}
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+										// Find bits of the beginning of the object.
 										if(ptrmask == nil) {
 											off = (uintptr*)b - (uintptr*)arena_start;
 											ptrbitp = arena_start - off/wordsPerBitmapByte - 1;
 											shift = (off % wordsPerBitmapByte) * gcBits;
 											cached = *ptrbitp >> shift;
 											cached &= ~bitBoundary;
 											ncached = (8 - shift)/gcBits;
 										}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										for(i = 0; i < n; i += PtrSize) {
 											obj = nil;
 											// Find bits for this word.
 											if(ptrmask == nil) {
 												// Check is we have reached end of span.
 												if((((uintptr)b+i)%PageSize) == 0 &&
 													runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift])
 													break;
 												// Consult GC bitmap.
 												if(ncached <= 0) {
 													// Refill cache.
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+													cached = *--ptrbitp;
 													ncached = 2;
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+												}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												bits = cached;
 												cached >>= gcBits;
 												ncached--;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+												if((bits&bitBoundary) != 0)
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+													break; // reached beginning of the next object
 												bits = (bits>>2)&BitsMask;
 												if(bits == BitsDead)
 													break; // reached no-scan part of the object
 											} else if(ptrmask != ScanConservatively) // dense mask (stack or data)
 												bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask;
 											else
 												bits = BitsPointer;
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											if(bits == BitsScalar || bits == BitsDead)
-												runtime: scan the type of an interface value

R=golang-dev, rsc, bradfitz
CC=golang-dev
https://golang.org/cl/7744047

											
										
										
											2013-03-15 14:07:52 -06:00
+												continue;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											if(bits == BitsPointer) {
 												obj = *(byte**)(b+i);
 												goto markobj;
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+											}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											// Find the next pair of bits.
 											if(ptrmask == nil) {
 												if(ncached <= 0) {
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+													// Refill cache.
 													cached = *--ptrbitp;
 													ncached = 2;
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+												}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												bits = (cached>>2)&BitsMask;
 											} else
 												bits = (ptrmask[((i+PtrSize)/PtrSize)/4]>>((((i+PtrSize)/PtrSize)%4)*BitsPerPointer))&BitsMask;
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											switch(bits) {
 											case BitsString:
 												str = (String*)(b+i);
 												if(str->len > 0)
 													obj = str->str;
 												break;
 											case BitsSlice:
 												slice = (Slice*)(b+i);
 												if(Debug && slice->cap < slice->len) {
 													g->m->traceback = 2;
 													runtime·printf("bad slice in object %p: %p/%p/%p\n",
 														b, slice->array, slice->len, slice->cap);
 													runtime·throw("bad slice in heap object");
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+												}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												if(slice->cap > 0)
 													obj = slice->array;
 												break;
 											case BitsIface:
 												iface = (Iface*)(b+i);
 												if(iface->tab != nil) {
 													typ = iface->tab->type;
-												cmd/gc, runtime: refactor interface inlining decision into compiler

We need to change the interface value representation for
concurrent garbage collection, so that there is no ambiguity
about whether the data word holds a pointer or scalar.

This CL does NOT make any representation changes.

Instead, it removes representation assumptions from
various pieces of code throughout the tree.
The isdirectiface function in cmd/gc/subr.c is now
the only place that decides that policy.
The policy propagates out from there in the reflect
metadata, as a new flag in the internal kind value.

A follow-up CL will change the representation by
changing the isdirectiface function. If that CL causes
problems, it will be easy to roll back.

Update #8405.

LGTM=iant
R=golang-codereviews, iant
CC=golang-codereviews, r
https://golang.org/cl/129090043

											
										
										
											2014-08-18 19:13:11 -06:00
+													if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers))
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+														obj = iface->data;
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+												}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												break;
 											case BitsEface:
 												eface = (Eface*)(b+i);
 												typ = eface->type;
 												if(typ != nil) {
-												cmd/gc, runtime: refactor interface inlining decision into compiler

We need to change the interface value representation for
concurrent garbage collection, so that there is no ambiguity
about whether the data word holds a pointer or scalar.

This CL does NOT make any representation changes.

Instead, it removes representation assumptions from
various pieces of code throughout the tree.
The isdirectiface function in cmd/gc/subr.c is now
the only place that decides that policy.
The policy propagates out from there in the reflect
metadata, as a new flag in the internal kind value.

A follow-up CL will change the representation by
changing the isdirectiface function. If that CL causes
problems, it will be easy to roll back.

Update #8405.

LGTM=iant
R=golang-codereviews, iant
CC=golang-codereviews, r
https://golang.org/cl/129090043

											
										
										
											2014-08-18 19:13:11 -06:00
+													if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers))
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+														obj = eface->data;
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
+												}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												break;
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
+											}
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											if(bits == BitsSlice) {
 												i += 2*PtrSize;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+												if(ncached == 2)
 													ncached = 0;
 												else if(ptrmask == nil) {
 													// Refill cache and consume one quadruple.
 													cached = *--ptrbitp;
 													cached >>= gcBits;
 													ncached = 1;
 												}
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+											} else {
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												i += PtrSize;
 												cached >>= gcBits;
 												ncached--;
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+											}
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										markobj:
 											// At this point we have extracted the next potential pointer.
 											// Check if it points into heap.
 											if(obj == nil || obj < arena_start || obj >= arena_used)
-												cmd/gc: support channel types in the garbage collector

R=golang-dev, dvyukov, rsc
CC=golang-dev
https://golang.org/cl/7473044

											
										
										
											2013-03-19 12:51:03 -06:00
+												continue;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											// Mark the object.
 											off = (uintptr*)obj - (uintptr*)arena_start;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+											bitp = arena_start - off/wordsPerBitmapByte - 1;
 											shift = (off % wordsPerBitmapByte) * gcBits;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											xbits = *bitp;
 											bits = (xbits >> shift) & bitMask;
-												runtime: keep objects in free lists marked as allocated.
Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043

											
										
										
											2014-08-13 10:42:55 -06:00
+											if((bits&bitBoundary) == 0) {
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+												// Not a beginning of a block, consult span table to find the block beginning.
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												k = (uintptr)obj>>PageShift;
 												x = k;
 												x -= (uintptr)arena_start>>PageShift;
 												s = runtime·mheap.spans[x];
 												if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse)
-												cmd/gc: support channel types in the garbage collector

R=golang-dev, dvyukov, rsc
CC=golang-dev
https://golang.org/cl/7473044

											
										
										
											2013-03-19 12:51:03 -06:00
+													continue;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												p = (byte*)((uintptr)s->start<<PageShift);
 												if(s->sizeclass != 0) {
 													size = s->elemsize;
 													idx = ((byte*)obj - p)/size;
 													p = p+idx*size;
-												cmd/gc: support channel types in the garbage collector

R=golang-dev, dvyukov, rsc
CC=golang-dev
https://golang.org/cl/7473044

											
										
										
											2013-03-19 12:51:03 -06:00
+												}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												if(p == obj) {
-												runtime: keep objects in free lists marked as allocated.
Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043

											
										
										
											2014-08-13 10:42:55 -06:00
+													runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n",
 														p, s->start*PageSize, s->limit);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+													runtime·throw("failed to find block beginning");
 												}
 												obj = p;
 												goto markobj;
-												cmd/gc: support channel types in the garbage collector

R=golang-dev, dvyukov, rsc
CC=golang-dev
https://golang.org/cl/7473044

											
										
										
											2013-03-19 12:51:03 -06:00
+											}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											// Now we have bits, bitp, and shift correct for
 											// obj pointing at the base of the object.
-												runtime: keep objects in free lists marked as allocated.
Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043

											
										
										
											2014-08-13 10:42:55 -06:00
+											// Only care about not marked objects.
 											if((bits&bitMarked) != 0)
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												continue;
-												runtime: mark objects with non-atomic operations

On the go.benchmarks/garbage benchmark with GOMAXPROCS=16:
                   old ns/op     new ns/op     delta
time               1392254       1353170       -2.81%
cputime            21995751      21373999      -2.83%
gc-pause-one       15044812      13050524      -13.26%
gc-pause-total     213636        185317        -13.26%

LGTM=rlh
R=golang-codereviews, rlh
CC=golang-codereviews, khr, rsc
https://golang.org/cl/123380043

											
										
										
											2014-08-14 11:38:24 -06:00
+											// If obj size is greater than 8, then each byte of GC bitmap
 											// contains info for at most one object. In such case we use
 											// non-atomic byte store to mark the object. This can lead
 											// to double enqueue of the object for scanning, but scanning
 											// is an idempotent operation, so it is OK. This cannot lead
 											// to bitmap corruption because the single marked bit is the
 											// only thing that can change in the byte.
 											// For 8-byte objects we use non-atomic store, if the other
 											// quadruple is already marked. Otherwise we resort to CAS
 											// loop for marking.
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+											if((xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) ||
-												runtime: mark with non-atomic operations when GOMAXPROCS=1
Perf builders show 3-5% GC pause increase with GOMAXPROCS=1 when marking with atomic ops:
http://goperfd.appspot.com/perfdetail?commit=a8a6e765d6a87f7ccb71fd85a60eb5a821151f85&commit0=3b864e02b987171e05e2e9d0840b85b5b6476386&kind=builder&builder=linux-amd64&benchmark=http

LGTM=rlh
R=golang-codereviews, rlh
CC=dave, golang-codereviews, khr, rsc
https://golang.org/cl/128340043

											
										
										
											2014-08-15 23:07:55 -06:00
+												work.nproc == 1)
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+												*bitp = xbits | (bitMarked<<shift);
 											else
 												runtime·atomicor8(bitp, bitMarked<<shift);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											if(((xbits>>(shift+2))&BitsMask) == BitsDead)
 												continue;  // noscan object
 											// Queue the obj for scanning.
 											PREFETCH(obj);
 											obj = (byte*)((uintptr)obj & ~(PtrSize-1));
 											p = scanbuf[scanbufpos];
 											scanbuf[scanbufpos++] = obj;
 											if(scanbufpos == nelem(scanbuf))
 												scanbufpos = 0;
 											if(p == nil)
 												continue;
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											// If workbuf is full, obtain an empty one.
 											if(nobj >= nelem(wbuf->obj)) {
 												wbuf->nobj = nobj;
 												wbuf = getempty(wbuf);
 												nobj = wbuf->nobj;
 												wp = &wbuf->obj[nobj];
 											}
 											*wp = p;
 											wp++;
 											nobj++;
-												runtime: interpret type information during garbage collection

R=rsc, dvyukov, remyoudompheng, dave, minux.ma, bradfitz
CC=golang-dev
https://golang.org/cl/6945069

											
										
										
											2013-01-10 13:45:46 -07:00
+										}
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										if(Debug && ptrmask == nil) {
 											// For heap objects ensure that we did not overscan.
 											n = 0;
 											p = nil;
 											if(!runtime·mlookup(b, &p, &n, nil) || b != p || i > n) {
 												runtime·printf("runtime: scanned (%p,%p), heap object (%p,%p)\n", b, i, p, n);
 												runtime·throw("scanblock: scanned invalid object");
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+											}
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+										}
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+									}
 								}
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+								static void
 								markroot(ParFor *desc, uint32 i)
 								{
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+									FinBlock *fb;
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+									MSpan *s;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									uint32 spanidx, sg;
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+									G *gp;
-												runtime: combine small NoScan allocations
Combine NoScan allocations < 16 bytes into a single memory block.
Reduces number of allocations on json/garbage benchmarks by 10+%.

json-1
allocated                 8039872      7949194      -1.13%
allocs                     105774        93776     -11.34%
cputime                 156200000    100700000     -35.53%
gc-pause-one              4908873      3814853     -22.29%
gc-pause-total            2748969      2899288      +5.47%
rss                      52674560     43560960     -17.30%
sys-gc                    3796976      3256304     -14.24%
sys-heap                 43843584     35192832     -19.73%
sys-other                 5589312      5310784      -4.98%
sys-stack                  393216       393216      +0.00%
sys-total                53623088     44153136     -17.66%
time                    156193436    100886714     -35.41%
virtual-mem             256548864    256540672      -0.00%

garbage-1
allocated                 2996885      2932982      -2.13%
allocs                      62904        55200     -12.25%
cputime                  17470000     17400000      -0.40%
gc-pause-one            932757485    925806143      -0.75%
gc-pause-total            4663787      4629030      -0.75%
rss                    1151074304   1133670400      -1.51%
sys-gc                   66068352     65085312      -1.49%
sys-heap               1039728640   1024065536      -1.51%
sys-other                38038208     37485248      -1.45%
sys-stack                 8650752      8781824      +1.52%
sys-total              1152485952   1135417920      -1.48%
time                     17478088     17418005      -0.34%
virtual-mem            1343709184   1324204032      -1.45%

LGTM=iant, bradfitz
R=golang-codereviews, dave, iant, rsc, bradfitz
CC=golang-codereviews, khr
https://golang.org/cl/38750047

											
										
										
											2014-01-24 11:35:11 -07:00
+									void *p;
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+									USED(&desc);
-												runtime: WriteHeapDump dumps the heap to a file.

See http://golang.org/s/go13heapdump for the file format.

LGTM=rsc
R=rsc, bradfitz, dvyukov, khr
CC=golang-codereviews
https://golang.org/cl/37540043

											
										
										
											2014-03-25 16:09:49 -06:00
+									// Note: if you add a case here, please also update heapdump.c:dumproots.
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+									switch(i) {
 									case RootData:
-												runtime: fix dump of data/bss
Fixes #8530.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/124440043

											
										
										
											2014-08-18 06:42:24 -06:00
+										scanblock(data, edata - data, runtime·gcdatamask);
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+										break;
 									case RootBss:
-												runtime: fix dump of data/bss
Fixes #8530.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/124440043

											
										
										
											2014-08-18 06:42:24 -06:00
+										scanblock(bss, ebss - bss, runtime·gcbssmask);
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+										break;
 									case RootFinalizers:
 										for(fb=allfin; fb; fb=fb->alllink)
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), ScanConservatively);
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+										break;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									case RootSpans:
 										// mark MSpan.specials
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+										sg = runtime·mheap.sweepgen;
 										for(spanidx=0; spanidx<work.nspan; spanidx++) {
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+											Special *sp;
 											SpecialFinalizer *spf;
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+											s = work.spans[spanidx];
-												undo CL 101570044 / 2c57aaea79c4

redo stack allocation.  This is mostly the same as
the original CL with a few bug fixes.

1. add racemalloc() for stack allocations
2. fix poolalloc/poolfree to terminate free lists correctly.
3. adjust span ref count correctly.
4. don't use cache for sizes >= StackCacheSize.

Should fix bugs and memory leaks in original changelist.

««« original CL description
undo CL 104200047 / 318b04f28372

Breaks windows and race detector.
TBR=rsc

««« original CL description
runtime: stack allocator, separate from mallocgc

In order to move malloc to Go, we need to have a
separate stack allocator.  If we run out of stack
during malloc, malloc will not be available
to allocate a new stack.

Stacks are the last remaining FlagNoGC objects in the
GC heap.  Once they are out, we can get rid of the
distinction between the allocated/blockboundary bits.
(This will be in a separate change.)

Fixes #7468
Fixes #7424

LGTM=rsc, dvyukov
R=golang-codereviews, dvyukov, khr, dave, rsc
CC=golang-codereviews
https://golang.org/cl/104200047
»»»

TBR=rsc
CC=golang-codereviews
https://golang.org/cl/101570044
»»»

LGTM=dvyukov
R=dvyukov, dave, khr, alex.brainman
CC=golang-codereviews
https://golang.org/cl/112240044

											
										
										
											2014-07-17 15:41:46 -06:00
+											if(s->state != MSpanInUse)
 												continue;
-												runtime: introduce MSpan.needzero instead of writing to span data

This cleans up the code significantly, and it avoids any
possible problems with madvise zeroing out some but
not all of the data.

Fixes #6400.

LGTM=dave
R=dvyukov, dave
CC=golang-codereviews
https://golang.org/cl/57680046

											
										
										
											2014-02-13 09:10:31 -07:00
+											if(s->sweepgen != sg) {
 												runtime·printf("sweep %d %d\n", s->sweepgen, sg);
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+												runtime·throw("gc: unswept span");
-												runtime: introduce MSpan.needzero instead of writing to span data

This cleans up the code significantly, and it avoids any
possible problems with madvise zeroing out some but
not all of the data.

Fixes #6400.

LGTM=dave
R=dvyukov, dave
CC=golang-codereviews
https://golang.org/cl/57680046

											
										
										
											2014-02-13 09:10:31 -07:00
+											}
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+											for(sp = s->specials; sp != nil; sp = sp->next) {
 												if(sp->kind != KindSpecialFinalizer)
 													continue;
 												// don't mark finalized object, but scan it so we
 												// retain everything it points to.
 												spf = (SpecialFinalizer*)sp;
-												runtime: combine small NoScan allocations
Combine NoScan allocations < 16 bytes into a single memory block.
Reduces number of allocations on json/garbage benchmarks by 10+%.

json-1
allocated                 8039872      7949194      -1.13%
allocs                     105774        93776     -11.34%
cputime                 156200000    100700000     -35.53%
gc-pause-one              4908873      3814853     -22.29%
gc-pause-total            2748969      2899288      +5.47%
rss                      52674560     43560960     -17.30%
sys-gc                    3796976      3256304     -14.24%
sys-heap                 43843584     35192832     -19.73%
sys-other                 5589312      5310784      -4.98%
sys-stack                  393216       393216      +0.00%
sys-total                53623088     44153136     -17.66%
time                    156193436    100886714     -35.41%
virtual-mem             256548864    256540672      -0.00%

garbage-1
allocated                 2996885      2932982      -2.13%
allocs                      62904        55200     -12.25%
cputime                  17470000     17400000      -0.40%
gc-pause-one            932757485    925806143      -0.75%
gc-pause-total            4663787      4629030      -0.75%
rss                    1151074304   1133670400      -1.51%
sys-gc                   66068352     65085312      -1.49%
sys-heap               1039728640   1024065536      -1.51%
sys-other                38038208     37485248      -1.45%
sys-stack                 8650752      8781824      +1.52%
sys-total              1152485952   1135417920      -1.48%
time                     17478088     17418005      -0.34%
virtual-mem            1343709184   1324204032      -1.45%

LGTM=iant, bradfitz
R=golang-codereviews, dave, iant, rsc, bradfitz
CC=golang-codereviews, khr
https://golang.org/cl/38750047

											
										
										
											2014-01-24 11:35:11 -07:00
+												// A finalizer can be set for an inner byte of an object, find object beginning.
-												cmd/cc, runtime: eliminate use of the unnamed substructure C extension

Eliminating use of this extension makes it easier to port the Go runtime
to other compilers. This CL also disables the extension in cc to prevent
accidental use.

LGTM=rsc, khr
R=rsc, aram, khr, dvyukov
CC=axwalk, golang-codereviews
https://golang.org/cl/106790044

											
										
										
											2014-08-07 07:00:02 -06:00
+												p = (void*)((s->start << PageShift) + spf->special.offset/s->elemsize*s->elemsize);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+												scanblock(p, s->elemsize, nil);
 												scanblock((void*)&spf->fn, PtrSize, ScanConservatively);
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+											}
 										}
 										break;
 									case RootFlushCaches:
 										flushallmcaches();
 										break;
 									default:
 										// the rest is scanning goroutine stacks
 										if(i - RootCount >= runtime·allglen)
 											runtime·throw("markroot: bad index");
 										gp = runtime·allg[i - RootCount];
 										// remember when we've first observed the G blocked
 										// needed only to output in traceback
 										if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince == 0)
 											gp->waitsince = work.tstart;
-												runtime: shrink stacks in parallel
Shrinkstack does not touch normal heap anymore,
so we can shink stacks concurrently with marking.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, khr, rlh, rsc
https://golang.org/cl/122130043

											
										
										
											2014-08-07 02:55:28 -06:00
+										// Shrink a stack if not much of it is being used.
 										runtime·shrinkstack(gp);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										scanstack(gp);
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+										break;
 									}
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+								}
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+								// Get an empty work buffer off the work.empty list,
 								// allocating new buffers as needed.
 								static Workbuf*
 								getempty(Workbuf *b)
 								{
-												runtime: cache one GC workbuf in thread-local storage
We call scanblock for lots of small root pieces
e.g. for every stack frame args and locals area.
Every scanblock invocation calls getempty/putempty,
which accesses lock-free stack shared among all worker threads.
One-element local cache allows most scanblock calls
to proceed without accessing the shared stack.

LGTM=rsc
R=golang-codereviews, rlh
CC=golang-codereviews, khr, rsc
https://golang.org/cl/121250043

											
										
										
											2014-08-05 15:50:37 -06:00
+									MCache *c;
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+									if(b != nil)
 										runtime·lfstackpush(&work.full, &b->node);
-												runtime: cache one GC workbuf in thread-local storage
We call scanblock for lots of small root pieces
e.g. for every stack frame args and locals area.
Every scanblock invocation calls getempty/putempty,
which accesses lock-free stack shared among all worker threads.
One-element local cache allows most scanblock calls
to proceed without accessing the shared stack.

LGTM=rsc
R=golang-codereviews, rlh
CC=golang-codereviews, khr, rsc
https://golang.org/cl/121250043

											
										
										
											2014-08-05 15:50:37 -06:00
+									b = nil;
 									c = g->m->mcache;
 									if(c->gcworkbuf != nil) {
 										b = c->gcworkbuf;
 										c->gcworkbuf = nil;
 									}
 									if(b == nil)
 										b = (Workbuf*)runtime·lfstackpop(&work.empty);
-												runtime: simplify code

LGTM=khr
R=golang-codereviews, dave, khr
CC=golang-codereviews, rsc
https://golang.org/cl/116950043

											
										
										
											2014-07-21 15:56:01 -06:00
+									if(b == nil)
 										b = runtime·persistentalloc(sizeof(*b), CacheLineSize, &mstats.gc_sys);
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+									b->nobj = 0;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+									return b;
 								}
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+								static void
 								putempty(Workbuf *b)
 								{
-												runtime: cache one GC workbuf in thread-local storage
We call scanblock for lots of small root pieces
e.g. for every stack frame args and locals area.
Every scanblock invocation calls getempty/putempty,
which accesses lock-free stack shared among all worker threads.
One-element local cache allows most scanblock calls
to proceed without accessing the shared stack.

LGTM=rsc
R=golang-codereviews, rlh
CC=golang-codereviews, khr, rsc
https://golang.org/cl/121250043

											
										
										
											2014-08-05 15:50:37 -06:00
+									MCache *c;
 									c = g->m->mcache;
 									if(c->gcworkbuf == nil) {
 										c->gcworkbuf = b;
 										return;
 									}
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+									runtime·lfstackpush(&work.empty, &b->node);
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+								}
-												runtime: cache one GC workbuf in thread-local storage
We call scanblock for lots of small root pieces
e.g. for every stack frame args and locals area.
Every scanblock invocation calls getempty/putempty,
which accesses lock-free stack shared among all worker threads.
One-element local cache allows most scanblock calls
to proceed without accessing the shared stack.

LGTM=rsc
R=golang-codereviews, rlh
CC=golang-codereviews, khr, rsc
https://golang.org/cl/121250043

											
										
										
											2014-08-05 15:50:37 -06:00
+								void
 								runtime·gcworkbuffree(void *b)
 								{
 									if(b != nil)
 										putempty(b);
 								}
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+								// Get a full work buffer off the work.full list, or return nil.
 								static Workbuf*
 								getfull(Workbuf *b)
 								{
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+									int32 i;
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+									if(b != nil)
 										runtime·lfstackpush(&work.empty, &b->node);
 									b = (Workbuf*)runtime·lfstackpop(&work.full);
 									if(b != nil || work.nproc == 1)
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+										return b;
 									runtime·xadd(&work.nwait, +1);
 									for(i=0;; i++) {
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+										if(work.full != 0) {
 											runtime·xadd(&work.nwait, -1);
 											b = (Workbuf*)runtime·lfstackpop(&work.full);
 											if(b != nil)
 												return b;
 											runtime·xadd(&work.nwait, +1);
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+										}
 										if(work.nwait == work.nproc)
 											return nil;
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+										if(i < 10) {
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+											g->m->gcstats.nprocyield++;
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+											runtime·procyield(20);
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+										} else if(i < 20) {
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+											g->m->gcstats.nosyield++;
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+											runtime·osyield();
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+										} else {
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+											g->m->gcstats.nsleep++;
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+											runtime·usleep(100);
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+										}
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+									}
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+								}
 								static Workbuf*
 								handoff(Workbuf *b)
 								{
 									int32 n;
 									Workbuf *b1;
 									// Make new buffer with half of b's pointers.
 									b1 = getempty(nil);
 									n = b->nobj/2;
 									b->nobj -= n;
 									b1->nobj = n;
 									runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->gcstats.nhandoff++;
 									g->m->gcstats.nhandoffcnt += n;
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
 									// Put b on full list - let first half of b get stolen.
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+									runtime·lfstackpush(&work.full, &b->node);
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+									return b1;
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+								}
-												cmd/gc, cmd/ld, runtime: compact liveness bitmaps

Reduce footprint of liveness bitmaps by about 5x.

1. Mark all liveness bitmap symbols as 4-byte aligned
(they were aligned to a larger size by default).

2. The bitmap data is a bitmap count n followed by n bitmaps.
Each bitmap begins with its own count m giving the number
of bits. All the m's are the same for the n bitmaps.
Emit this bitmap length once instead of n times.

3. Many bitmaps within a function have the same bit values,
but each call site was given a distinct bitmap. Merge duplicate
bitmaps so that no bitmap is written more than once.

4. Many functions end up with the same aggregate bitmap data.
We used to name the bitmap data funcname.gcargs and funcname.gclocals.
Instead, name it gclocals.<md5 of data> and mark it dupok so
that the linker coalesces duplicate sets. This cut the bitmap
data remaining after step 3 by 40%; I was not expecting it to
be quite so dramatic.

Applied to "go build -ldflags -w code.google.com/p/go.tools/cmd/godoc":

                bitmaps           pclntab           binary on disk
before this CL  1326600           1985854           12738268
4-byte align    1154288 (0.87x)   1985854 (1.00x)   12566236 (0.99x)
one bitmap len   782528 (0.54x)   1985854 (1.00x)   12193500 (0.96x)
dedup bitmap     414748 (0.31x)   1948478 (0.98x)   11787996 (0.93x)
dedup bitmap set 245580 (0.19x)   1948478 (0.98x)   11620060 (0.91x)

While here, remove various dead blocks of code from plive.c.

Fixes #6929.
Fixes #7568.

LGTM=khr
R=khr
CC=golang-codereviews
https://golang.org/cl/83630044

											
										
										
											2014-04-02 14:49:27 -06:00
+								BitVector
-												runtime: grow stack by copying

On stack overflow, if all frames on the stack are
copyable, we copy the frames to a new stack twice
as large as the old one.  During GC, if a G is using
less than 1/4 of its stack, copy the stack to a stack
half its size.

TODO
- Do something about C frames.  When a C frame is in the
  stack segment, it isn't copyable.  We allocate a new segment
  in this case.
  - For idempotent C code, we can abort it, copy the stack,
    then retry.  I'm working on a separate CL for this.
  - For other C code, we can raise the stackguard
    to the lowest Go frame so the next call that Go frame
    makes triggers a copy, which will then succeed.
- Pick a starting stack size?

The plan is that eventually we reach a point where the
stack contains only copyable frames.

LGTM=rsc
R=dvyukov, rsc
CC=golang-codereviews
https://golang.org/cl/54650044

											
										
										
											2014-02-27 00:28:44 -07:00
+								runtime·stackmapdata(StackMap *stackmap, int32 n)
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+								{
-												cmd/gc, cmd/ld, runtime: compact liveness bitmaps

Reduce footprint of liveness bitmaps by about 5x.

1. Mark all liveness bitmap symbols as 4-byte aligned
(they were aligned to a larger size by default).

2. The bitmap data is a bitmap count n followed by n bitmaps.
Each bitmap begins with its own count m giving the number
of bits. All the m's are the same for the n bitmaps.
Emit this bitmap length once instead of n times.

3. Many bitmaps within a function have the same bit values,
but each call site was given a distinct bitmap. Merge duplicate
bitmaps so that no bitmap is written more than once.

4. Many functions end up with the same aggregate bitmap data.
We used to name the bitmap data funcname.gcargs and funcname.gclocals.
Instead, name it gclocals.<md5 of data> and mark it dupok so
that the linker coalesces duplicate sets. This cut the bitmap
data remaining after step 3 by 40%; I was not expecting it to
be quite so dramatic.

Applied to "go build -ldflags -w code.google.com/p/go.tools/cmd/godoc":

                bitmaps           pclntab           binary on disk
before this CL  1326600           1985854           12738268
4-byte align    1154288 (0.87x)   1985854 (1.00x)   12566236 (0.99x)
one bitmap len   782528 (0.54x)   1985854 (1.00x)   12193500 (0.96x)
dedup bitmap     414748 (0.31x)   1948478 (0.98x)   11787996 (0.93x)
dedup bitmap set 245580 (0.19x)   1948478 (0.98x)   11620060 (0.91x)

While here, remove various dead blocks of code from plive.c.

Fixes #6929.
Fixes #7568.

LGTM=khr
R=khr
CC=golang-codereviews
https://golang.org/cl/83630044

											
										
										
											2014-04-02 14:49:27 -06:00
+									if(n < 0 || n >= stackmap->n)
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+										runtime·throw("stackmapdata: index out of range");
-												cmd/gc, cmd/ld, runtime: compact liveness bitmaps

Reduce footprint of liveness bitmaps by about 5x.

1. Mark all liveness bitmap symbols as 4-byte aligned
(they were aligned to a larger size by default).

2. The bitmap data is a bitmap count n followed by n bitmaps.
Each bitmap begins with its own count m giving the number
of bits. All the m's are the same for the n bitmaps.
Emit this bitmap length once instead of n times.

3. Many bitmaps within a function have the same bit values,
but each call site was given a distinct bitmap. Merge duplicate
bitmaps so that no bitmap is written more than once.

4. Many functions end up with the same aggregate bitmap data.
We used to name the bitmap data funcname.gcargs and funcname.gclocals.
Instead, name it gclocals.<md5 of data> and mark it dupok so
that the linker coalesces duplicate sets. This cut the bitmap
data remaining after step 3 by 40%; I was not expecting it to
be quite so dramatic.

Applied to "go build -ldflags -w code.google.com/p/go.tools/cmd/godoc":

                bitmaps           pclntab           binary on disk
before this CL  1326600           1985854           12738268
4-byte align    1154288 (0.87x)   1985854 (1.00x)   12566236 (0.99x)
one bitmap len   782528 (0.54x)   1985854 (1.00x)   12193500 (0.96x)
dedup bitmap     414748 (0.31x)   1948478 (0.98x)   11787996 (0.93x)
dedup bitmap set 245580 (0.19x)   1948478 (0.98x)   11620060 (0.91x)

While here, remove various dead blocks of code from plive.c.

Fixes #6929.
Fixes #7568.

LGTM=khr
R=khr
CC=golang-codereviews
https://golang.org/cl/83630044

											
										
										
											2014-04-02 14:49:27 -06:00
+									return (BitVector){stackmap->nbit, stackmap->data + n*((stackmap->nbit+31)/32)};
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+								}
 								// Scan a stack frame: local variables and function arguments/results.
-												runtime: grow stack by copying

On stack overflow, if all frames on the stack are
copyable, we copy the frames to a new stack twice
as large as the old one.  During GC, if a G is using
less than 1/4 of its stack, copy the stack to a stack
half its size.

TODO
- Do something about C frames.  When a C frame is in the
  stack segment, it isn't copyable.  We allocate a new segment
  in this case.
  - For idempotent C code, we can abort it, copy the stack,
    then retry.  I'm working on a separate CL for this.
  - For other C code, we can raise the stackguard
    to the lowest Go frame so the next call that Go frame
    makes triggers a copy, which will then succeed.
- Pick a starting stack size?

The plan is that eventually we reach a point where the
stack contains only copyable frames.

LGTM=rsc
R=dvyukov, rsc
CC=golang-codereviews
https://golang.org/cl/54650044

											
										
										
											2014-02-27 00:28:44 -07:00
+								static bool
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+								scanframe(Stkframe *frame, void *unused)
-												cmd/ld, runtime: restrict stack root scan to locals and arguments

Updates #5134

R=golang-dev, bradfitz, cshapiro, daniel.morsing, ality, iant
CC=golang-dev
https://golang.org/cl/8022044

											
										
										
											2013-03-28 15:36:23 -06:00
+								{
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+									Func *f;
 									StackMap *stackmap;
-												cmd/gc, cmd/ld, runtime: compact liveness bitmaps

Reduce footprint of liveness bitmaps by about 5x.

1. Mark all liveness bitmap symbols as 4-byte aligned
(they were aligned to a larger size by default).

2. The bitmap data is a bitmap count n followed by n bitmaps.
Each bitmap begins with its own count m giving the number
of bits. All the m's are the same for the n bitmaps.
Emit this bitmap length once instead of n times.

3. Many bitmaps within a function have the same bit values,
but each call site was given a distinct bitmap. Merge duplicate
bitmaps so that no bitmap is written more than once.

4. Many functions end up with the same aggregate bitmap data.
We used to name the bitmap data funcname.gcargs and funcname.gclocals.
Instead, name it gclocals.<md5 of data> and mark it dupok so
that the linker coalesces duplicate sets. This cut the bitmap
data remaining after step 3 by 40%; I was not expecting it to
be quite so dramatic.

Applied to "go build -ldflags -w code.google.com/p/go.tools/cmd/godoc":

                bitmaps           pclntab           binary on disk
before this CL  1326600           1985854           12738268
4-byte align    1154288 (0.87x)   1985854 (1.00x)   12566236 (0.99x)
one bitmap len   782528 (0.54x)   1985854 (1.00x)   12193500 (0.96x)
dedup bitmap     414748 (0.31x)   1948478 (0.98x)   11787996 (0.93x)
dedup bitmap set 245580 (0.19x)   1948478 (0.98x)   11620060 (0.91x)

While here, remove various dead blocks of code from plive.c.

Fixes #6929.
Fixes #7568.

LGTM=khr
R=khr
CC=golang-codereviews
https://golang.org/cl/83630044

											
										
										
											2014-04-02 14:49:27 -06:00
+									BitVector bv;
-												cmd/cc, cmd/gc, runtime: emit bitmaps for scanning locals.

Previously, all word aligned locations in the local variables
area were scanned as conservative roots.  With this change, a
bitmap is generated describing the locations of pointer values
in local variables.

With this change the argument bitmap information has been
changed to only store information about arguments.  The locals
member, has been removed.  In its place, the bitmap data for
local variables is now used to store the size of locals.  If
the size is negative, the magnitude indicates the size of the
local variables area.

R=rsc
CC=golang-dev
https://golang.org/cl/12328044

											
										
										
											2013-08-07 13:47:01 -06:00
+									uintptr size;
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+									uintptr targetpc;
 									int32 pcdata;
-												cmd/ld, runtime: restrict stack root scan to locals and arguments

Updates #5134

R=golang-dev, bradfitz, cshapiro, daniel.morsing, ality, iant
CC=golang-dev
https://golang.org/cl/8022044

											
										
										
											2013-03-28 15:36:23 -06:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									USED(unused);
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+									f = frame->fn;
-												runtime: make continuation pc available to stack walk

The 'continuation pc' is where the frame will continue
execution, if anywhere. For a frame that stopped execution
due to a CALL instruction, the continuation pc is immediately
after the CALL. But for a frame that stopped execution due to
a fault, the continuation pc is the pc after the most recent CALL
to deferproc in that frame, or else 0. That is where execution
will continue, if anywhere.

The liveness information is only recorded for CALL instructions.
This change makes sure that we never look for liveness information
except for CALL instructions.

Using a valid PC fixes crashes when a garbage collection or
stack copying tries to process a stack frame that has faulted.

Record continuation pc in heapdump (format change).

Fixes #8048.

LGTM=iant, khr
R=khr, iant, dvyukov
CC=golang-codereviews, r
https://golang.org/cl/100870044

											
										
										
											2014-05-31 08:10:12 -06:00
+									targetpc = frame->continpc;
 									if(targetpc == 0) {
 										// Frame is dead.
 										return true;
 									}
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									if(Debug > 1)
 										runtime·printf("scanframe %s\n", runtime·funcname(f));
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+									if(targetpc != f->entry)
 										targetpc--;
 									pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc);
 									if(pcdata == -1) {
 										// We do not have a valid pcdata value but there might be a
 										// stackmap for this function.  It is likely that we are looking
 										// at the function prologue, assume so and hope for the best.
 										pcdata = 0;
 									}
-												runtime: adjust traceback / garbage collector boundary

The garbage collection routine addframeroots is duplicating
logic in the traceback routine that calls it, sometimes correctly,
sometimes incorrectly, sometimes incompletely.
Pass necessary information to addframeroots instead of
deriving it anew.

Should make addframeroots significantly more robust.
It's certainly smaller.

Also try to standardize on uintptr for saved pc, sp values.

Will make CL 10036044 trivial.

R=golang-dev, dave, dvyukov
CC=golang-dev
https://golang.org/cl/10169045

											
										
										
											2013-06-12 06:49:38 -06:00
+									// Scan local variables if stack frame has been allocated.
-												cmd/cc, cmd/gc, runtime: emit bitmaps for scanning locals.

Previously, all word aligned locations in the local variables
area were scanned as conservative roots.  With this change, a
bitmap is generated describing the locations of pointer values
in local variables.

With this change the argument bitmap information has been
changed to only store information about arguments.  The locals
member, has been removed.  In its place, the bitmap data for
local variables is now used to store the size of locals.  If
the size is negative, the magnitude indicates the size of the
local variables area.

R=rsc
CC=golang-dev
https://golang.org/cl/12328044

											
										
										
											2013-08-07 13:47:01 -06:00
+									// Use pointer information if known.
-												runtime: remove obsolete afterprologue check
Afterprologue check was required when did not know
about return arguments of functions and/or they were not zeroed.
Now 100% precision is required for stacks due to stack copying,
so it must work w/o afterprologue one way or another.
I can limit this change for 1.3 to merely adding a TODO,
but this check is super confusing so I don't want this knowledge to get lost.

LGTM=rsc
R=golang-codereviews, gobot, rsc, khr
CC=golang-codereviews, khr, rsc
https://golang.org/cl/96580045

											
										
										
											2014-06-19 23:04:10 -06:00
+									stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps);
 									if(stackmap == nil) {
 										// No locals information, scan everything.
 										size = frame->varp - (byte*)frame->sp;
 										if(Debug > 2)
 											runtime·printf("frame %s unsized locals %p+%p\n", runtime·funcname(f), frame->varp-size, size);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										scanblock(frame->varp - size, size, ScanConservatively);
-												runtime: remove obsolete afterprologue check
Afterprologue check was required when did not know
about return arguments of functions and/or they were not zeroed.
Now 100% precision is required for stacks due to stack copying,
so it must work w/o afterprologue one way or another.
I can limit this change for 1.3 to merely adding a TODO,
but this check is super confusing so I don't want this knowledge to get lost.

LGTM=rsc
R=golang-codereviews, gobot, rsc, khr
CC=golang-codereviews, khr, rsc
https://golang.org/cl/96580045

											
										
										
											2014-06-19 23:04:10 -06:00
+									} else if(stackmap->n < 0) {
 										// Locals size information, scan just the locals.
 										size = -stackmap->n;
 										if(Debug > 2)
 											runtime·printf("frame %s conservative locals %p+%p\n", runtime·funcname(f), frame->varp-size, size);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										scanblock(frame->varp - size, size, ScanConservatively);
-												runtime: remove obsolete afterprologue check
Afterprologue check was required when did not know
about return arguments of functions and/or they were not zeroed.
Now 100% precision is required for stacks due to stack copying,
so it must work w/o afterprologue one way or another.
I can limit this change for 1.3 to merely adding a TODO,
but this check is super confusing so I don't want this knowledge to get lost.

LGTM=rsc
R=golang-codereviews, gobot, rsc, khr
CC=golang-codereviews, khr, rsc
https://golang.org/cl/96580045

											
										
										
											2014-06-19 23:04:10 -06:00
+									} else if(stackmap->n > 0) {
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										// Locals bitmap information, scan just the pointers in locals.
-												runtime: remove obsolete afterprologue check
Afterprologue check was required when did not know
about return arguments of functions and/or they were not zeroed.
Now 100% precision is required for stacks due to stack copying,
so it must work w/o afterprologue one way or another.
I can limit this change for 1.3 to merely adding a TODO,
but this check is super confusing so I don't want this knowledge to get lost.

LGTM=rsc
R=golang-codereviews, gobot, rsc, khr
CC=golang-codereviews, khr, rsc
https://golang.org/cl/96580045

											
										
										
											2014-06-19 23:04:10 -06:00
+										if(pcdata < 0 || pcdata >= stackmap->n) {
 											// don't know where we are
 											runtime·printf("pcdata is %d and %d stack map entries for %s (targetpc=%p)\n",
 												pcdata, stackmap->n, runtime·funcname(f), targetpc);
 											runtime·throw("scanframe: bad symbol table");
-												cmd/cc, cmd/gc, runtime: emit bitmaps for scanning locals.

Previously, all word aligned locations in the local variables
area were scanned as conservative roots.  With this change, a
bitmap is generated describing the locations of pointer values
in local variables.

With this change the argument bitmap information has been
changed to only store information about arguments.  The locals
member, has been removed.  In its place, the bitmap data for
local variables is now used to store the size of locals.  If
the size is negative, the magnitude indicates the size of the
local variables area.

R=rsc
CC=golang-dev
https://golang.org/cl/12328044

											
										
										
											2013-08-07 13:47:01 -06:00
+										}
-												runtime: remove obsolete afterprologue check
Afterprologue check was required when did not know
about return arguments of functions and/or they were not zeroed.
Now 100% precision is required for stacks due to stack copying,
so it must work w/o afterprologue one way or another.
I can limit this change for 1.3 to merely adding a TODO,
but this check is super confusing so I don't want this knowledge to get lost.

LGTM=rsc
R=golang-codereviews, gobot, rsc, khr
CC=golang-codereviews, khr, rsc
https://golang.org/cl/96580045

											
										
										
											2014-06-19 23:04:10 -06:00
+										bv = runtime·stackmapdata(stackmap, pcdata);
 										size = (bv.n * PtrSize) / BitsPerPointer;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										scanblock(frame->varp - size, bv.n/BitsPerPointer*PtrSize, (byte*)bv.data);
-												runtime: use funcdata to supply garbage collection information

This CL introduces a FUNCDATA number for runtime-specific
garbage collection metadata, changes the C and Go compilers
to emit that metadata, and changes the runtime to expect it.

The old pseudo-instructions that carried this information
are gone, as is the linker code to process them.

R=golang-dev, dvyukov, cshapiro
CC=golang-dev
https://golang.org/cl/11406044

											
										
										
											2013-07-19 14:04:09 -06:00
+									}
-												runtime: adjust traceback / garbage collector boundary

The garbage collection routine addframeroots is duplicating
logic in the traceback routine that calls it, sometimes correctly,
sometimes incorrectly, sometimes incompletely.
Pass necessary information to addframeroots instead of
deriving it anew.

Should make addframeroots significantly more robust.
It's certainly smaller.

Also try to standardize on uintptr for saved pc, sp values.

Will make CL 10036044 trivial.

R=golang-dev, dave, dvyukov
CC=golang-dev
https://golang.org/cl/10169045

											
										
										
											2013-06-12 06:49:38 -06:00
 									// Scan arguments.
 									// Use pointer information if known.
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+									stackmap = runtime·funcdata(f, FUNCDATA_ArgsPointerMaps);
 									if(stackmap != nil) {
-												runtime: grow stack by copying

On stack overflow, if all frames on the stack are
copyable, we copy the frames to a new stack twice
as large as the old one.  During GC, if a G is using
less than 1/4 of its stack, copy the stack to a stack
half its size.

TODO
- Do something about C frames.  When a C frame is in the
  stack segment, it isn't copyable.  We allocate a new segment
  in this case.
  - For idempotent C code, we can abort it, copy the stack,
    then retry.  I'm working on a separate CL for this.
  - For other C code, we can raise the stackguard
    to the lowest Go frame so the next call that Go frame
    makes triggers a copy, which will then succeed.
- Pick a starting stack size?

The plan is that eventually we reach a point where the
stack contains only copyable frames.

LGTM=rsc
R=dvyukov, rsc
CC=golang-codereviews
https://golang.org/cl/54650044

											
										
										
											2014-02-27 00:28:44 -07:00
+										bv = runtime·stackmapdata(stackmap, pcdata);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										scanblock(frame->argp, bv.n/BitsPerPointer*PtrSize, (byte*)bv.data);
-												reflect, runtime: fix crash in GC due to reflect.call + precise GC

Given
        type Outer struct {
                *Inner
                ...
        }
the compiler generates the implementation of (*Outer).M dispatching to
the embedded Inner. The implementation is logically:
        func (p *Outer) M() {
                (p.Inner).M()
        }
but since the only change here is the replacement of one pointer
receiver with another, the actual generated code overwrites the
original receiver with the p.Inner pointer and then jumps to the M
method expecting the *Inner receiver.

During reflect.Value.Call, we create an argument frame and the
associated data structures to describe it to the garbage collector,
populate the frame, call reflect.call to run a function call using
that frame, and then copy the results back out of the frame. The
reflect.call function does a memmove of the frame structure onto the
stack (to set up the inputs), runs the call, and the memmoves the
stack back to the frame structure (to preserve the outputs).

Originally reflect.call did not distinguish inputs from outputs: both
memmoves were for the full stack frame. However, in the case where the
called function was one of these wrappers, the rewritten receiver is
almost certainly a different type than the original receiver. This is
not a problem on the stack, where we use the program counter to
determine the type information and understand that during (*Outer).M
the receiver is an *Outer while during (*Inner).M the receiver in the
same memory word is now an *Inner. But in the statically typed
argument frame created by reflect, the receiver is always an *Outer.
Copying the modified receiver pointer off the stack into the frame
will store an *Inner there, and then if a garbage collection happens
to scan that argument frame before it is discarded, it will scan the
*Inner memory as if it were an *Outer. If the two have different
memory layouts, the collection will intepret the memory incorrectly.

Fix by only copying back the results.

Fixes #7725.

LGTM=khr
R=khr
CC=dave, golang-codereviews
https://golang.org/cl/85180043

											
										
										
											2014-04-08 09:11:35 -06:00
+									} else {
 										if(Debug > 2)
 											runtime·printf("frame %s conservative args %p+%p\n", runtime·funcname(f), frame->argp, (uintptr)frame->arglen);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										scanblock(frame->argp, frame->arglen, ScanConservatively);
-												reflect, runtime: fix crash in GC due to reflect.call + precise GC

Given
        type Outer struct {
                *Inner
                ...
        }
the compiler generates the implementation of (*Outer).M dispatching to
the embedded Inner. The implementation is logically:
        func (p *Outer) M() {
                (p.Inner).M()
        }
but since the only change here is the replacement of one pointer
receiver with another, the actual generated code overwrites the
original receiver with the p.Inner pointer and then jumps to the M
method expecting the *Inner receiver.

During reflect.Value.Call, we create an argument frame and the
associated data structures to describe it to the garbage collector,
populate the frame, call reflect.call to run a function call using
that frame, and then copy the results back out of the frame. The
reflect.call function does a memmove of the frame structure onto the
stack (to set up the inputs), runs the call, and the memmoves the
stack back to the frame structure (to preserve the outputs).

Originally reflect.call did not distinguish inputs from outputs: both
memmoves were for the full stack frame. However, in the case where the
called function was one of these wrappers, the rewritten receiver is
almost certainly a different type than the original receiver. This is
not a problem on the stack, where we use the program counter to
determine the type information and understand that during (*Outer).M
the receiver is an *Outer while during (*Inner).M the receiver in the
same memory word is now an *Inner. But in the statically typed
argument frame created by reflect, the receiver is always an *Outer.
Copying the modified receiver pointer off the stack into the frame
will store an *Inner there, and then if a garbage collection happens
to scan that argument frame before it is discarded, it will scan the
*Inner memory as if it were an *Outer. If the two have different
memory layouts, the collection will intepret the memory incorrectly.

Fix by only copying back the results.

Fixes #7725.

LGTM=khr
R=khr
CC=dave, golang-codereviews
https://golang.org/cl/85180043

											
										
										
											2014-04-08 09:11:35 -06:00
+									}
-												runtime: grow stack by copying

On stack overflow, if all frames on the stack are
copyable, we copy the frames to a new stack twice
as large as the old one.  During GC, if a G is using
less than 1/4 of its stack, copy the stack to a stack
half its size.

TODO
- Do something about C frames.  When a C frame is in the
  stack segment, it isn't copyable.  We allocate a new segment
  in this case.
  - For idempotent C code, we can abort it, copy the stack,
    then retry.  I'm working on a separate CL for this.
  - For other C code, we can raise the stackguard
    to the lowest Go frame so the next call that Go frame
    makes triggers a copy, which will then succeed.
- Pick a starting stack size?

The plan is that eventually we reach a point where the
stack contains only copyable frames.

LGTM=rsc
R=dvyukov, rsc
CC=golang-codereviews
https://golang.org/cl/54650044

											
										
										
											2014-02-27 00:28:44 -07:00
+									return true;
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+								}
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+								static void
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+								scanstack(G *gp)
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+								{
 									M *mp;
 									int32 n;
 									Stktop *stk;
 									uintptr sp, guard;
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+									switch(gp->status){
 									default:
 										runtime·printf("unexpected G.status %d (goroutine %p %D)\n", gp->status, gp, gp->goid);
 										runtime·throw("mark - bad status");
 									case Gdead:
 										return;
 									case Grunning:
 										runtime·throw("mark - world not stopped");
 									case Grunnable:
 									case Gsyscall:
 									case Gwaiting:
 										break;
 									}
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+									if(gp == g)
 										runtime·throw("can't scan our own stack");
 									if((mp = gp->m) != nil && mp->helpgc)
 										runtime·throw("can't scan gchelper stack");
-												runtime: grow stack by copying

On stack overflow, if all frames on the stack are
copyable, we copy the frames to a new stack twice
as large as the old one.  During GC, if a G is using
less than 1/4 of its stack, copy the stack to a stack
half its size.

TODO
- Do something about C frames.  When a C frame is in the
  stack segment, it isn't copyable.  We allocate a new segment
  in this case.
  - For idempotent C code, we can abort it, copy the stack,
    then retry.  I'm working on a separate CL for this.
  - For other C code, we can raise the stackguard
    to the lowest Go frame so the next call that Go frame
    makes triggers a copy, which will then succeed.
- Pick a starting stack size?

The plan is that eventually we reach a point where the
stack contains only copyable frames.

LGTM=rsc
R=dvyukov, rsc
CC=golang-codereviews
https://golang.org/cl/54650044

											
										
										
											2014-02-27 00:28:44 -07:00
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+									if(gp->syscallstack != (uintptr)nil) {
 										// Scanning another goroutine that is about to enter or might
 										// have just exited a system call. It may be executing code such
 										// as schedlock and may have needed to start a new stack segment.
 										// Use the stack segment and stack pointer at the time of
 										// the system call instead, since that won't change underfoot.
 										sp = gp->syscallsp;
 										stk = (Stktop*)gp->syscallstack;
 										guard = gp->syscallguard;
 									} else {
 										// Scanning another goroutine's stack.
 										// The goroutine is usually asleep (the world is stopped).
 										sp = gp->sched.sp;
 										stk = (Stktop*)gp->stackbase;
 										guard = gp->stackguard;
 									}
 									if(ScanStackByFrames) {
 										USED(sp);
 										USED(stk);
 										USED(guard);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										runtime·gentraceback(~(uintptr)0, ~(uintptr)0, 0, gp, 0, nil, 0x7fffffff, scanframe, nil, false);
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+									} else {
 										n = 0;
 										while(stk) {
 											if(sp < guard-StackGuard || (uintptr)stk < sp) {
 												runtime·printf("scanstack inconsistent: g%D#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
 												runtime·throw("scanstack");
 											}
-												reflect, runtime: fix crash in GC due to reflect.call + precise GC

Given
        type Outer struct {
                *Inner
                ...
        }
the compiler generates the implementation of (*Outer).M dispatching to
the embedded Inner. The implementation is logically:
        func (p *Outer) M() {
                (p.Inner).M()
        }
but since the only change here is the replacement of one pointer
receiver with another, the actual generated code overwrites the
original receiver with the p.Inner pointer and then jumps to the M
method expecting the *Inner receiver.

During reflect.Value.Call, we create an argument frame and the
associated data structures to describe it to the garbage collector,
populate the frame, call reflect.call to run a function call using
that frame, and then copy the results back out of the frame. The
reflect.call function does a memmove of the frame structure onto the
stack (to set up the inputs), runs the call, and the memmoves the
stack back to the frame structure (to preserve the outputs).

Originally reflect.call did not distinguish inputs from outputs: both
memmoves were for the full stack frame. However, in the case where the
called function was one of these wrappers, the rewritten receiver is
almost certainly a different type than the original receiver. This is
not a problem on the stack, where we use the program counter to
determine the type information and understand that during (*Outer).M
the receiver is an *Outer while during (*Inner).M the receiver in the
same memory word is now an *Inner. But in the statically typed
argument frame created by reflect, the receiver is always an *Outer.
Copying the modified receiver pointer off the stack into the frame
will store an *Inner there, and then if a garbage collection happens
to scan that argument frame before it is discarded, it will scan the
*Inner memory as if it were an *Outer. If the two have different
memory layouts, the collection will intepret the memory incorrectly.

Fix by only copying back the results.

Fixes #7725.

LGTM=khr
R=khr
CC=dave, golang-codereviews
https://golang.org/cl/85180043

											
										
										
											2014-04-08 09:11:35 -06:00
+											if(Debug > 2)
 												runtime·printf("conservative stack %p+%p\n", (byte*)sp, (uintptr)stk-sp);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											scanblock((byte*)sp, (uintptr)stk - sp, ScanConservatively);
-												cmd/5g, cmd/5l, cmd/6g, cmd/6l, cmd/8g, cmd/8l, cmd/gc, runtime: generate pointer maps by liveness analysis

This change allows the garbage collector to examine stack
slots that are determined as live and containing a pointer
value by the garbage collector.  This results in a mean
reduction of 65% in the number of stack slots scanned during
an invocation of "GOGC=1 all.bash".

Unfortunately, this does not yet allow garbage collection to
be precise for the stack slots computed as live.  Pointers
confound the determination of what definitions reach a given
instruction.  In general, this problem is not solvable without
runtime cost but some advanced cooperation from the compiler
might mitigate common cases.

R=golang-dev, rsc, cshapiro
CC=golang-dev
https://golang.org/cl/14430048

											
										
										
											2013-12-05 18:35:22 -07:00
+											sp = stk->gobuf.sp;
 											guard = stk->stackguard;
 											stk = (Stktop*)stk->stackbase;
 											n++;
 										}
 									}
 								}
-												runtime: use special records hung off the MSpan to
record finalizers and heap profile info.  Enables
removing the special bit from the heap bitmap.  Also
provides a generic mechanism for annotating occasional
heap objects.

finalizers
        overhead      per obj
old	680 B	      80 B avg
new	16 B/span     48 B

profile
        overhead      per obj
old	32KB	      24 B + hash tables
new	16 B/span     24 B

R=cshapiro, khr, dvyukov, gobot
CC=golang-codereviews
https://golang.org/cl/13314053

											
										
										
											2014-01-07 14:45:50 -07:00
+								void
 								runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType *ot)
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+								{
 									FinBlock *block;
 									Finalizer *f;
-												runtime: refactor helpgc functionality in preparation for parallel GC
Parallel GC needs to know in advance how many helper threads will be there.
Hopefully it's the last patch before I can tackle parallel sweep phase.
The benchmarks are unaffected.

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/6200064

											
										
										
											2012-05-15 09:10:16 -06:00
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+									runtime·lock(&finlock);
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+									if(finq == nil || finq->cnt == finq->cap) {
 										if(finc == nil) {
-												runtime: increase page size to 8K
Tcmalloc uses 8K, 32K and 64K pages, and in custom setups 256K pages.
Only Chromium uses 4K pages today (in "slow but small" configuration).
The general tendency is to increase page size, because it reduces
metadata size and DTLB pressure.
This change reduces GC pause by ~10% and slightly improves other metrics.

json-1
allocated                 8037492      8038689      +0.01%
allocs                     105762       105573      -0.18%
cputime                 158400000    155800000      -1.64%
gc-pause-one              4412234      4135702      -6.27%
gc-pause-total            2647340      2398707      -9.39%
rss                      54923264     54525952      -0.72%
sys-gc                    3952624      3928048      -0.62%
sys-heap                 46399488     46006272      -0.85%
sys-other                 5597504      5290304      -5.49%
sys-stack                  393216       393216      +0.00%
sys-total                56342832     55617840      -1.29%
time                    158478890    156046916      -1.53%
virtual-mem             256548864    256593920      +0.02%

garbage-1
allocated                 2991113      2986259      -0.16%
allocs                      62844        62652      -0.31%
cputime                  16330000     15860000      -2.88%
gc-pause-one            789108229    725555211      -8.05%
gc-pause-total            3945541      3627776      -8.05%
rss                    1143660544   1132253184      -1.00%
sys-gc                   65609600     65806208      +0.30%
sys-heap               1032388608   1035599872      +0.31%
sys-other                37501632     22777664     -39.26%
sys-stack                 8650752      8781824      +1.52%
sys-total              1144150592   1132965568      -0.98%
time                     16364602     15891994      -2.89%
virtual-mem            1327296512   1313746944      -1.02%

This is the exact reincarnation of already LGTMed:
https://golang.org/cl/45770044
which must not break darwin/freebsd after:
https://golang.org/cl/56630043
TBR=iant

LGTM=khr, iant
R=iant, khr
CC=golang-codereviews
https://golang.org/cl/58230043

											
										
										
											2014-01-30 02:28:19 -07:00
+											finc = runtime·persistentalloc(FinBlockSize, 0, &mstats.gc_sys);
 											finc->cap = (FinBlockSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+											finc->alllink = allfin;
 											allfin = finc;
 										}
 										block = finc;
 										finc = block->next;
 										block->next = finq;
 										finq = block;
 									}
 									f = &finq->fin[finq->cnt];
 									finq->cnt++;
 									f->fn = fn;
 									f->nret = nret;
-												runtime: make SetFinalizer(x, f) accept any f for which f(x) is valid

Originally the requirement was f(x) where f's argument is
exactly x's type.

CL 11858043 relaxed the requirement in a non-standard
way: f's argument must be exactly x's type or interface{}.

If we're going to relax the requirement, it should be done
in a way consistent with the rest of Go. This CL allows f's
argument to have any type for which x is assignable;
that's the same requirement the compiler would impose
if compiling f(x) directly.

Fixes #5368.

R=dvyukov, bradfitz, pieter
CC=golang-dev
https://golang.org/cl/12895043

											
										
										
											2013-08-14 12:54:31 -06:00
+									f->fint = fint;
-												runtime: allow SetFinalizer with a func(interface{})

Fixes #5368.

R=golang-dev, dvyukov
CC=golang-dev, rsc
https://golang.org/cl/11858043

											
										
										
											2013-07-29 09:43:08 -06:00
+									f->ot = ot;
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+									f->arg = p;
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+									runtime·fingwake = true;
 									runtime·unlock(&finlock);
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+								}
-												runtime: WriteHeapDump dumps the heap to a file.

See http://golang.org/s/go13heapdump for the file format.

LGTM=rsc
R=rsc, bradfitz, dvyukov, khr
CC=golang-codereviews
https://golang.org/cl/37540043

											
										
										
											2014-03-25 16:09:49 -06:00
+								void
 								runtime·iterate_finq(void (*callback)(FuncVal*, byte*, uintptr, Type*, PtrType*))
 								{
 									FinBlock *fb;
 									Finalizer *f;
 									uintptr i;
 									for(fb = allfin; fb; fb = fb->alllink) {
 										for(i = 0; i < fb->cnt; i++) {
 											f = &fb->fin[i];
 											callback(f->fn, f->arg, f->nret, f->fint, f->ot);
 										}
 									}
 								}
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+								void
 								runtime·MSpan_EnsureSwept(MSpan *s)
 								{
 									uint32 sg;
-												runtime: fix potential memory corruption
Reinforce the guarantee that MSpan_EnsureSwept actually ensures that the span is swept.
I have not observed crashes related to this, but I do not see why it can't crash as well.

LGTM=rsc
R=golang-codereviews
CC=golang-codereviews, khr, rsc
https://golang.org/cl/67990043

											
										
										
											2014-02-24 09:53:20 -07:00
+									// Caller must disable preemption.
 									// Otherwise when this function returns the span can become unswept again
 									// (if GC is triggered on another goroutine).
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0)
-												runtime: fix potential memory corruption
Reinforce the guarantee that MSpan_EnsureSwept actually ensures that the span is swept.
I have not observed crashes related to this, but I do not see why it can't crash as well.

LGTM=rsc
R=golang-codereviews
CC=golang-codereviews, khr, rsc
https://golang.org/cl/67990043

											
										
										
											2014-02-24 09:53:20 -07:00
+										runtime·throw("MSpan_EnsureSwept: m is not locked");
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									sg = runtime·mheap.sweepgen;
 									if(runtime·atomicload(&s->sweepgen) == sg)
 										return;
 									if(runtime·cas(&s->sweepgen, sg-2, sg-1)) {
-												runtime: implement transfer cache
Currently we do the following dance after sweeping a span:
1. lock mcentral
2. remove the span from a list
3. unlock mcentral
4. unmark span
5. lock mheap
6. insert the span into heap
7. unlock mheap
8. lock mcentral
9. observe empty list
10. unlock mcentral
11. lock mheap
12. grab the span
13. unlock mheap
14. mark span
15. lock mcentral
16. insert the span into empty list
17. unlock mcentral

This change short-circuits this sequence to nothing,
that is, we just cache and use the span after sweeping.

This gives us functionality similar (even better) to tcmalloc's transfer cache.

benchmark            old ns/op     new ns/op     delta
BenchmarkMalloc8     22.2          19.5          -12.16%
BenchmarkMalloc16    31.0          26.6          -14.19%

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/119550043

											
										
										
											2014-08-18 06:52:31 -06:00
+										runtime·MSpan_Sweep(s, false);
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+										return;
 									}
 									// unfortunate condition, and we don't have efficient means to wait
 									while(runtime·atomicload(&s->sweepgen) != sg)
 										runtime·osyield();
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+								}
 								// Sweep frees or collects finalizers for blocks not marked in the mark phase.
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+								// It clears the mark bits in preparation for the next GC round.
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+								// Returns true if the span was returned to heap.
-												runtime: implement transfer cache
Currently we do the following dance after sweeping a span:
1. lock mcentral
2. remove the span from a list
3. unlock mcentral
4. unmark span
5. lock mheap
6. insert the span into heap
7. unlock mheap
8. lock mcentral
9. observe empty list
10. unlock mcentral
11. lock mheap
12. grab the span
13. unlock mheap
14. mark span
15. lock mcentral
16. insert the span into empty list
17. unlock mcentral

This change short-circuits this sequence to nothing,
that is, we just cache and use the span after sweeping.

This gives us functionality similar (even better) to tcmalloc's transfer cache.

benchmark            old ns/op     new ns/op     delta
BenchmarkMalloc8     22.2          19.5          -12.16%
BenchmarkMalloc16    31.0          26.6          -14.19%

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/119550043

											
										
										
											2014-08-18 06:52:31 -06:00
+								// If preserve=true, don't return it to heap nor relink in MCentral lists;
 								// caller takes care of it.
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+								bool
-												runtime: implement transfer cache
Currently we do the following dance after sweeping a span:
1. lock mcentral
2. remove the span from a list
3. unlock mcentral
4. unmark span
5. lock mheap
6. insert the span into heap
7. unlock mheap
8. lock mcentral
9. observe empty list
10. unlock mcentral
11. lock mheap
12. grab the span
13. unlock mheap
14. mark span
15. lock mcentral
16. insert the span into empty list
17. unlock mcentral

This change short-circuits this sequence to nothing,
that is, we just cache and use the span after sweeping.

This gives us functionality similar (even better) to tcmalloc's transfer cache.

benchmark            old ns/op     new ns/op     delta
BenchmarkMalloc8     22.2          19.5          -12.16%
BenchmarkMalloc16    31.0          26.6          -14.19%

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/119550043

											
										
										
											2014-08-18 06:52:31 -06:00
+								runtime·MSpan_Sweep(MSpan *s, bool preserve)
-												runtime: minor refactoring in preparation for parallel GC
factor sweepspan() out of sweep(), no logical changes

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991047

											
										
										
											2012-04-05 11:02:20 -06:00
+								{
-												runtime: fix concurrent GC sweep
The issue was that one of the MSpan_Sweep callers
was doing sweep with preemption enabled.
Additional checks are added.

LGTM=rsc
R=rsc, dave
CC=golang-codereviews
https://golang.org/cl/62990043

											
										
										
											2014-02-13 08:36:45 -07:00
+									int32 cl, n, npages, nfree;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									uintptr size, off, step;
-												runtime: fix concurrent GC sweep
The issue was that one of the MSpan_Sweep callers
was doing sweep with preemption enabled.
Additional checks are added.

LGTM=rsc
R=rsc, dave
CC=golang-codereviews
https://golang.org/cl/62990043

											
										
										
											2014-02-13 08:36:45 -07:00
+									uint32 sweepgen;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									byte *p, *bitp, shift, xbits, bits;
-												runtime: minor refactoring in preparation for parallel GC
factor sweepspan() out of sweep(), no logical changes

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991047

											
										
										
											2012-04-05 11:02:20 -06:00
+									MCache *c;
 									byte *arena_start;
-												runtime: keep objects in free lists marked as allocated.
Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043

											
										
										
											2014-08-13 10:42:55 -06:00
+									MLink head, *end, *link;
-												runtime: use special records hung off the MSpan to
record finalizers and heap profile info.  Enables
removing the special bit from the heap bitmap.  Also
provides a generic mechanism for annotating occasional
heap objects.

finalizers
        overhead      per obj
old	680 B	      80 B avg
new	16 B/span     48 B

profile
        overhead      per obj
old	32KB	      24 B + hash tables
new	16 B/span     24 B

R=cshapiro, khr, dvyukov, gobot
CC=golang-codereviews
https://golang.org/cl/13314053

											
										
										
											2014-01-07 14:45:50 -07:00
+									Special *special, **specialp, *y;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									bool res, sweepgenset;
-												runtime: tighten garbage collector
 * specialize sweepspan as sweepspan0 and sweepspan1.
 * in sweepspan1, inline "free" to avoid expensive mlookup.

R=iant
CC=golang-dev
https://golang.org/cl/206060

											
										
										
											2010-02-10 15:59:39 -07:00
-												runtime: fix concurrent GC sweep
The issue was that one of the MSpan_Sweep callers
was doing sweep with preemption enabled.
Additional checks are added.

LGTM=rsc
R=rsc, dave
CC=golang-codereviews
https://golang.org/cl/62990043

											
										
										
											2014-02-13 08:36:45 -07:00
+									// It's critical that we enter this function with preemption disabled,
 									// GC must not start while we are in the middle of this function.
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0)
-												runtime: fix concurrent GC sweep
The issue was that one of the MSpan_Sweep callers
was doing sweep with preemption enabled.
Additional checks are added.

LGTM=rsc
R=rsc, dave
CC=golang-codereviews
https://golang.org/cl/62990043

											
										
										
											2014-02-13 08:36:45 -07:00
+										runtime·throw("MSpan_Sweep: m is not locked");
 									sweepgen = runtime·mheap.sweepgen;
 									if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+										runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
-												runtime: fix concurrent GC sweep
The issue was that one of the MSpan_Sweep callers
was doing sweep with preemption enabled.
Additional checks are added.

LGTM=rsc
R=rsc, dave
CC=golang-codereviews
https://golang.org/cl/62990043

											
										
										
											2014-02-13 08:36:45 -07:00
+											s->state, s->sweepgen, sweepgen);
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+										runtime·throw("MSpan_Sweep: bad span state");
 									}
-												runtime: make mheap statically allocated again
This depends on: 9791044: runtime: allocate page table lazily
Once page table is moved out of heap, the heap becomes small.
This removes unnecessary dereferences during heap access.
No logical changes.

R=golang-dev, khr
CC=golang-dev
https://golang.org/cl/9802043

											
										
										
											2013-05-28 12:14:47 -06:00
+									arena_start = runtime·mheap.arena_start;
-												runtime: minor refactoring in preparation for parallel GC
factor sweepspan() out of sweep(), no logical changes

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991047

											
										
										
											2012-04-05 11:02:20 -06:00
+									cl = s->sizeclass;
-												runtime: add types to MSpan

R=rsc
CC=golang-dev
https://golang.org/cl/6554060

											
										
										
											2012-09-24 18:08:05 -06:00
+									size = s->elemsize;
-												runtime: minor refactoring in preparation for parallel GC
factor sweepspan() out of sweep(), no logical changes

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991047

											
										
										
											2012-04-05 11:02:20 -06:00
+									if(cl == 0) {
 										n = 1;
 									} else {
 										// Chunk full of small blocks.
 										npages = runtime·class_to_allocnpages[cl];
 										n = (npages << PageShift) / size;
 									}
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									res = false;
-												runtime: speedup GC sweep phase (batch free)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              4370050250   3779668750  -13.51%
garbage.BenchmarkParser-2            3713087000   3628771500   -2.27%
garbage.BenchmarkParser-4            3519755250   3406349750   -3.22%
garbage.BenchmarkParser-8            3386627750   3319144000   -1.99%

garbage.BenchmarkTree                 493585529    408102411  -17.32%
garbage.BenchmarkTree-2               500487176    402285176  -19.62%
garbage.BenchmarkTree-4               473238882    361484058  -23.61%
garbage.BenchmarkTree-8               486977823    368334823  -24.36%

garbage.BenchmarkTree2                 31446600     31203200   -0.77%
garbage.BenchmarkTree2-2               21469000     21077900   -1.82%
garbage.BenchmarkTree2-4               11007600     10899100   -0.99%
garbage.BenchmarkTree2-8                7692400      7032600   -8.58%

garbage.BenchmarkParserPause          241863263    163249450  -32.50%
garbage.BenchmarkParserPause-2        120135418    112981575   -5.95%
garbage.BenchmarkParserPause-4         83411552     64580700  -22.58%
garbage.BenchmarkParserPause-8         51870697     42207244  -18.63%

garbage.BenchmarkTreePause             20940474     13147011  -37.22%
garbage.BenchmarkTreePause-2           20115124     11146715  -44.59%
garbage.BenchmarkTreePause-4           17217584      7486327  -56.52%
garbage.BenchmarkTreePause-8           18258845      7400871  -59.47%

garbage.BenchmarkTree2Pause           174067190    172674190   -0.80%
garbage.BenchmarkTree2Pause-2         131175809    130615761   -0.43%
garbage.BenchmarkTree2Pause-4          95406666     93972047   -1.50%
garbage.BenchmarkTree2Pause-8          86056095     85334952   -0.84%

garbage.BenchmarkParserLastPause      329932000    324790000   -1.56%
garbage.BenchmarkParserLastPause-2    209383000    210456000   +0.51%
garbage.BenchmarkParserLastPause-4    113981000    112921000   -0.93%
garbage.BenchmarkParserLastPause-8     77967000     76625000   -1.72%

garbage.BenchmarkTreeLastPause         29752000     18444000  -38.01%
garbage.BenchmarkTreeLastPause-2       24274000     14766000  -39.17%
garbage.BenchmarkTreeLastPause-4       19565000      8726000  -55.40%
garbage.BenchmarkTreeLastPause-8       21956000     10530000  -52.04%

garbage.BenchmarkTree2LastPause       314411000    311945000   -0.78%
garbage.BenchmarkTree2LastPause-2     214641000    210836000   -1.77%
garbage.BenchmarkTree2LastPause-4     110024000    108943000   -0.98%
garbage.BenchmarkTree2LastPause-8      76873000     70263000   -8.60%

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991049

											
										
										
											2012-04-12 02:01:24 -06:00
+									nfree = 0;
-												runtime: add types to MSpan

R=rsc
CC=golang-dev
https://golang.org/cl/6554060

											
										
										
											2012-09-24 18:08:05 -06:00
+									end = &head;
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									c = g->m->mcache;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									sweepgenset = false;
-												runtime: mark objects in free lists as allocated and unscannable.

On the plus side, we don't need to change the bits when mallocing
pointerless objects.  On the other hand, we need to mark objects in the
free lists during GC.  But the free lists are small at GC time, so it
should be a net win.

benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    40           33  -17.65%
BenchmarkMalloc16                   45           38  -15.72%
BenchmarkMallocTypeInfo8            58           59   +0.85%
BenchmarkMallocTypeInfo16           63           64   +1.10%

R=golang-dev, rsc, dvyukov
CC=cshapiro, golang-dev
https://golang.org/cl/41040043

											
										
										
											2013-12-18 18:13:59 -07:00
-												runtime: keep objects in free lists marked as allocated.
Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043

											
										
										
											2014-08-13 10:42:55 -06:00
+									// Mark any free objects in this span so we don't collect them.
 									for(link = s->freelist; link != nil; link = link->next) {
 										off = (uintptr*)link - (uintptr*)arena_start;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+										bitp = arena_start - off/wordsPerBitmapByte - 1;
 										shift = (off % wordsPerBitmapByte) * gcBits;
-												runtime: keep objects in free lists marked as allocated.
Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043

											
										
										
											2014-08-13 10:42:55 -06:00
+										*bitp |= bitMarked<<shift;
 									}
-												runtime: use special records hung off the MSpan to
record finalizers and heap profile info.  Enables
removing the special bit from the heap bitmap.  Also
provides a generic mechanism for annotating occasional
heap objects.

finalizers
        overhead      per obj
old	680 B	      80 B avg
new	16 B/span     48 B

profile
        overhead      per obj
old	32KB	      24 B + hash tables
new	16 B/span     24 B

R=cshapiro, khr, dvyukov, gobot
CC=golang-codereviews
https://golang.org/cl/13314053

											
										
										
											2014-01-07 14:45:50 -07:00
+									// Unlink & free special records for any objects we're about to free.
 									specialp = &s->specials;
 									special = *specialp;
 									while(special != nil) {
-												runtime: combine small NoScan allocations
Combine NoScan allocations < 16 bytes into a single memory block.
Reduces number of allocations on json/garbage benchmarks by 10+%.

json-1
allocated                 8039872      7949194      -1.13%
allocs                     105774        93776     -11.34%
cputime                 156200000    100700000     -35.53%
gc-pause-one              4908873      3814853     -22.29%
gc-pause-total            2748969      2899288      +5.47%
rss                      52674560     43560960     -17.30%
sys-gc                    3796976      3256304     -14.24%
sys-heap                 43843584     35192832     -19.73%
sys-other                 5589312      5310784      -4.98%
sys-stack                  393216       393216      +0.00%
sys-total                53623088     44153136     -17.66%
time                    156193436    100886714     -35.41%
virtual-mem             256548864    256540672      -0.00%

garbage-1
allocated                 2996885      2932982      -2.13%
allocs                      62904        55200     -12.25%
cputime                  17470000     17400000      -0.40%
gc-pause-one            932757485    925806143      -0.75%
gc-pause-total            4663787      4629030      -0.75%
rss                    1151074304   1133670400      -1.51%
sys-gc                   66068352     65085312      -1.49%
sys-heap               1039728640   1024065536      -1.51%
sys-other                38038208     37485248      -1.45%
sys-stack                 8650752      8781824      +1.52%
sys-total              1152485952   1135417920      -1.48%
time                     17478088     17418005      -0.34%
virtual-mem            1343709184   1324204032      -1.45%

LGTM=iant, bradfitz
R=golang-codereviews, dave, iant, rsc, bradfitz
CC=golang-codereviews, khr
https://golang.org/cl/38750047

											
										
										
											2014-01-24 11:35:11 -07:00
+										// A finalizer can be set for an inner byte of an object, find object beginning.
 										p = (byte*)(s->start << PageShift) + special->offset/size*size;
-												runtime: use special records hung off the MSpan to
record finalizers and heap profile info.  Enables
removing the special bit from the heap bitmap.  Also
provides a generic mechanism for annotating occasional
heap objects.

finalizers
        overhead      per obj
old	680 B	      80 B avg
new	16 B/span     48 B

profile
        overhead      per obj
old	32KB	      24 B + hash tables
new	16 B/span     24 B

R=cshapiro, khr, dvyukov, gobot
CC=golang-codereviews
https://golang.org/cl/13314053

											
										
										
											2014-01-07 14:45:50 -07:00
+										off = (uintptr*)p - (uintptr*)arena_start;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+										bitp = arena_start - off/wordsPerBitmapByte - 1;
 										shift = (off % wordsPerBitmapByte) * gcBits;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										bits = (*bitp>>shift) & bitMask;
-												runtime: keep objects in free lists marked as allocated.
Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043

											
										
										
											2014-08-13 10:42:55 -06:00
+										if((bits&bitMarked) == 0) {
-												runtime: combine small NoScan allocations
Combine NoScan allocations < 16 bytes into a single memory block.
Reduces number of allocations on json/garbage benchmarks by 10+%.

json-1
allocated                 8039872      7949194      -1.13%
allocs                     105774        93776     -11.34%
cputime                 156200000    100700000     -35.53%
gc-pause-one              4908873      3814853     -22.29%
gc-pause-total            2748969      2899288      +5.47%
rss                      52674560     43560960     -17.30%
sys-gc                    3796976      3256304     -14.24%
sys-heap                 43843584     35192832     -19.73%
sys-other                 5589312      5310784      -4.98%
sys-stack                  393216       393216      +0.00%
sys-total                53623088     44153136     -17.66%
time                    156193436    100886714     -35.41%
virtual-mem             256548864    256540672      -0.00%

garbage-1
allocated                 2996885      2932982      -2.13%
allocs                      62904        55200     -12.25%
cputime                  17470000     17400000      -0.40%
gc-pause-one            932757485    925806143      -0.75%
gc-pause-total            4663787      4629030      -0.75%
rss                    1151074304   1133670400      -1.51%
sys-gc                   66068352     65085312      -1.49%
sys-heap               1039728640   1024065536      -1.51%
sys-other                38038208     37485248      -1.45%
sys-stack                 8650752      8781824      +1.52%
sys-total              1152485952   1135417920      -1.48%
time                     17478088     17418005      -0.34%
virtual-mem            1343709184   1324204032      -1.45%

LGTM=iant, bradfitz
R=golang-codereviews, dave, iant, rsc, bradfitz
CC=golang-codereviews, khr
https://golang.org/cl/38750047

											
										
										
											2014-01-24 11:35:11 -07:00
+											// Find the exact byte for which the special was setup
 											// (as opposed to object beginning).
 											p = (byte*)(s->start << PageShift) + special->offset;
-												runtime: use special records hung off the MSpan to
record finalizers and heap profile info.  Enables
removing the special bit from the heap bitmap.  Also
provides a generic mechanism for annotating occasional
heap objects.

finalizers
        overhead      per obj
old	680 B	      80 B avg
new	16 B/span     48 B

profile
        overhead      per obj
old	32KB	      24 B + hash tables
new	16 B/span     24 B

R=cshapiro, khr, dvyukov, gobot
CC=golang-codereviews
https://golang.org/cl/13314053

											
										
										
											2014-01-07 14:45:50 -07:00
+											// about to free object: splice out special record
 											y = special;
 											special = special->next;
 											*specialp = special;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											if(!runtime·freespecial(y, p, size, false)) {
-												runtime: use special records hung off the MSpan to
record finalizers and heap profile info.  Enables
removing the special bit from the heap bitmap.  Also
provides a generic mechanism for annotating occasional
heap objects.

finalizers
        overhead      per obj
old	680 B	      80 B avg
new	16 B/span     48 B

profile
        overhead      per obj
old	32KB	      24 B + hash tables
new	16 B/span     24 B

R=cshapiro, khr, dvyukov, gobot
CC=golang-codereviews
https://golang.org/cl/13314053

											
										
										
											2014-01-07 14:45:50 -07:00
+												// stop freeing of object if it has a finalizer
 												*bitp |= bitMarked << shift;
 											}
 										} else {
 											// object is still live: keep special record
 											specialp = &special->next;
 											special = *specialp;
 										}
 									}
-												runtime: minor refactoring in preparation for parallel GC
factor sweepspan() out of sweep(), no logical changes

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991047

											
										
										
											2012-04-05 11:02:20 -06:00
+									// Sweep through n objects of given size starting at p.
 									// This thread owns the span now, so it can manipulate
 									// the block bitmap without atomic operations.
-												runtime: use special records hung off the MSpan to
record finalizers and heap profile info.  Enables
removing the special bit from the heap bitmap.  Also
provides a generic mechanism for annotating occasional
heap objects.

finalizers
        overhead      per obj
old	680 B	      80 B avg
new	16 B/span     48 B

profile
        overhead      per obj
old	32KB	      24 B + hash tables
new	16 B/span     24 B

R=cshapiro, khr, dvyukov, gobot
CC=golang-codereviews
https://golang.org/cl/13314053

											
										
										
											2014-01-07 14:45:50 -07:00
+									p = (byte*)(s->start << PageShift);
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									// Find bits for the beginning of the span.
 									off = (uintptr*)p - (uintptr*)arena_start;
 									bitp = arena_start - off/wordsPerBitmapByte - 1;
 									shift = 0;
 									step = size/(PtrSize*wordsPerBitmapByte);
 									// Rewind to the previous quadruple as we move to the next
 									// in the beginning of the loop.
 									bitp += step;
 									if(step == 0) {
 										// 8-byte objects.
 										bitp++;
 										shift = gcBits;
 									}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									for(; n > 0; n--, p += size) {
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+										bitp -= step;
 										if(step == 0) {
 											if(shift != 0)
 												bitp--;
 											shift = gcBits - shift;
 										}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										xbits = *bitp;
 										bits = (xbits>>shift) & bitMask;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										// Allocated and marked object, reset bits to allocated.
-												runtime: keep objects in free lists marked as allocated.
Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043

											
										
										
											2014-08-13 10:42:55 -06:00
+										if((bits&bitMarked) != 0) {
 											*bitp &= ~(bitMarked<<shift);
-												runtime: minor refactoring in preparation for parallel GC
factor sweepspan() out of sweep(), no logical changes

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991047

											
										
										
											2012-04-05 11:02:20 -06:00
+											continue;
 										}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										// At this point we know that we are looking at garbage object
 										// that needs to be collected.
-												runtime: adjust GODEBUG=allocfreetrace=1 and GODEBUG=gcdead=1

GODEBUG=allocfreetrace=1:

The allocfreetrace=1 mode prints a stack trace for each block
allocated and freed, and also a stack trace for each garbage collection.

It was implemented by reusing the heap profiling support: if allocfreetrace=1
then the heap profile was effectively running at 1 sample per 1 byte allocated
(always sample). The stack being shown at allocation was the stack gathered
for profiling, meaning it was derived only from the program counters and
did not include information about function arguments or frame pointers.
The stack being shown at free was the allocation stack, not the free stack.
If you are generating this log, you can find the allocation stack yourself, but
it can be useful to see exactly the sequence that led to freeing the block:
was it the garbage collector or an explicit free? Now that the garbage collector
runs on an m0 stack, the stack trace for the garbage collector was never interesting.

Fix all these problems:

1. Decouple allocfreetrace=1 from heap profiling.
2. Print the standard goroutine stack traces instead of a custom format.
3. Print the stack trace at time of allocation for an allocation,
   and print the stack trace at time of free (not the allocation trace again)
   for a free.
4. Print all goroutine stacks at garbage collection. Having all the stacks
   means that you can see the exact point at which each goroutine was
   preempted, which is often useful for identifying liveness-related errors.

GODEBUG=gcdead=1:

This mode overwrites dead pointers with a poison value.
Detect the poison value as an invalid pointer during collection,
the same way that small integers are invalid pointers.

LGTM=khr
R=khr
CC=golang-codereviews
https://golang.org/cl/81670043

											
										
										
											2014-04-01 11:30:10 -06:00
+										if(runtime·debug.allocfreetrace)
 											runtime·tracefree(p, size);
-												runtime: keep objects in free lists marked as allocated.
Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043

											
										
										
											2014-08-13 10:42:55 -06:00
+										// Reset to allocated+noscan.
 										*bitp = (xbits & ~((bitMarked|(BitsMask<<2))<<shift)) | ((uintptr)BitsDead<<(shift+2));
-												runtime: add types to MSpan

R=rsc
CC=golang-dev
https://golang.org/cl/6554060

											
										
										
											2012-09-24 18:08:05 -06:00
+										if(cl == 0) {
-												runtime: minor refactoring in preparation for parallel GC
factor sweepspan() out of sweep(), no logical changes

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991047

											
										
										
											2012-04-05 11:02:20 -06:00
+											// Free large span.
-												runtime: implement transfer cache
Currently we do the following dance after sweeping a span:
1. lock mcentral
2. remove the span from a list
3. unlock mcentral
4. unmark span
5. lock mheap
6. insert the span into heap
7. unlock mheap
8. lock mcentral
9. observe empty list
10. unlock mcentral
11. lock mheap
12. grab the span
13. unlock mheap
14. mark span
15. lock mcentral
16. insert the span into empty list
17. unlock mcentral

This change short-circuits this sequence to nothing,
that is, we just cache and use the span after sweeping.

This gives us functionality similar (even better) to tcmalloc's transfer cache.

benchmark            old ns/op     new ns/op     delta
BenchmarkMalloc8     22.2          19.5          -12.16%
BenchmarkMalloc16    31.0          26.6          -14.19%

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/119550043

											
										
										
											2014-08-18 06:52:31 -06:00
+											if(preserve)
 												runtime·throw("can't preserve large span");
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											runtime·unmarkspan(p, s->npages<<PageShift);
-												runtime: introduce MSpan.needzero instead of writing to span data

This cleans up the code significantly, and it avoids any
possible problems with madvise zeroing out some but
not all of the data.

Fixes #6400.

LGTM=dave
R=dvyukov, dave
CC=golang-codereviews
https://golang.org/cl/57680046

											
										
										
											2014-02-13 09:10:31 -07:00
+											s->needzero = 1;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											// important to set sweepgen before returning it to heap
-												runtime: fix concurrent GC sweep
The issue was that one of the MSpan_Sweep callers
was doing sweep with preemption enabled.
Additional checks are added.

LGTM=rsc
R=rsc, dave
CC=golang-codereviews
https://golang.org/cl/62990043

											
										
										
											2014-02-13 08:36:45 -07:00
+											runtime·atomicstore(&s->sweepgen, sweepgen);
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											sweepgenset = true;
-												runtime: get rid of free
Several reasons:
1. Significantly simplifies runtime.
2. This code proved to be buggy.
3. Free is incompatible with bump-the-pointer allocation.
4. We want to write runtime in Go, Go does not have free.
5. Too much code to free env strings on startup.

LGTM=khr
R=golang-codereviews, josharian, tracey.brendan, khr
CC=bradfitz, golang-codereviews, r, rlh, rsc
https://golang.org/cl/116390043

											
										
										
											2014-07-31 02:55:40 -06:00
+											// NOTE(rsc,dvyukov): The original implementation of efence
 											// in CL 22060046 used SysFree instead of SysFault, so that
 											// the operating system would eventually give the memory
 											// back to us again, so that an efence program could run
 											// longer without running out of memory. Unfortunately,
 											// calling SysFree here without any kind of adjustment of the
 											// heap data structures means that when the memory does
 											// come back to us, we have the wrong metadata for it, either in
 											// the MSpan structures or in the garbage collection bitmap.
 											// Using SysFault here means that the program will run out of
 											// memory fairly quickly in efence mode, but at least it won't
 											// have mysterious crashes due to confused memory reuse.
 											// It should be possible to switch back to SysFree if we also
 											// implement and then call some kind of MHeap_DeleteSpan.
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											if(runtime·debug.efence) {
 												s->limit = nil;	// prevent mlookup from finding this span
-												runtime: fix malloc page alignment + efence

Two memory allocator bug fixes.

- efence is not maintaining the proper heap metadata
  to make eventual memory reuse safe, so use SysFault.

- now that our heap PageSize is 8k but most hardware
  uses 4k pages, SysAlloc and SysReserve results must be
  explicitly aligned. Do that in a few more call sites and
  document this fact in malloc.h.

Fixes #7448.

LGTM=iant
R=golang-codereviews, josharian, iant
CC=dvyukov, golang-codereviews
https://golang.org/cl/71750048

											
										
										
											2014-03-06 16:34:29 -07:00
+												runtime·SysFault(p, size);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											} else
-												runtime: add GODEBUG option for an electric fence like heap mode

When enabled this new debugging mode will allocate objects on
their own page and never recycle memory addresses.  This is an
essential tool to root cause a broad class of heap corruption.

R=golang-dev, dave, daniel.morsing, dvyukov, rsc, iant, cshapiro
CC=golang-dev
https://golang.org/cl/22060046

											
										
										
											2013-12-06 15:40:45 -07:00
+												runtime·MHeap_Free(&runtime·mheap, s, 1);
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
+											c->local_nlargefree++;
 											c->local_largefree += size;
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+											runtime·xadd64(&mstats.next_gc, -(uint64)(size * (runtime·gcpercent + 100)/100));
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											res = true;
-												runtime: minor refactoring in preparation for parallel GC
factor sweepspan() out of sweep(), no logical changes

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991047

											
										
										
											2012-04-05 11:02:20 -06:00
+										} else {
 											// Free small object.
-												runtime: zero 2-word memory blocks in-place
Currently for 2-word blocks we set the flag to clear the flag. Makes no sense.
In particular on 32-bits we call memclr always.

R=golang-codereviews, dave, iant
CC=golang-codereviews, khr, rsc
https://golang.org/cl/41170044

											
										
										
											2014-01-20 23:53:51 -07:00
+											if(size > 2*sizeof(uintptr))
-												runtime: use a distinct pattern to mark free blocks in need of zeroing

R=golang-dev, dvyukov, khr, cshapiro
CC=golang-dev
https://golang.org/cl/8392043

											
										
										
											2013-04-04 15:18:52 -06:00
+												((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll;	// mark as "needs to be zeroed"
-												runtime: zero 2-word memory blocks in-place
Currently for 2-word blocks we set the flag to clear the flag. Makes no sense.
In particular on 32-bits we call memclr always.

R=golang-codereviews, dave, iant
CC=golang-codereviews, khr, rsc
https://golang.org/cl/41170044

											
										
										
											2014-01-20 23:53:51 -07:00
+											else if(size > sizeof(uintptr))
 												((uintptr*)p)[1] = 0;
-												runtime: add types to MSpan

R=rsc
CC=golang-dev
https://golang.org/cl/6554060

											
										
										
											2012-09-24 18:08:05 -06:00
+											end->next = (MLink*)p;
-												runtime: speedup GC sweep phase (batch free)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              4370050250   3779668750  -13.51%
garbage.BenchmarkParser-2            3713087000   3628771500   -2.27%
garbage.BenchmarkParser-4            3519755250   3406349750   -3.22%
garbage.BenchmarkParser-8            3386627750   3319144000   -1.99%

garbage.BenchmarkTree                 493585529    408102411  -17.32%
garbage.BenchmarkTree-2               500487176    402285176  -19.62%
garbage.BenchmarkTree-4               473238882    361484058  -23.61%
garbage.BenchmarkTree-8               486977823    368334823  -24.36%

garbage.BenchmarkTree2                 31446600     31203200   -0.77%
garbage.BenchmarkTree2-2               21469000     21077900   -1.82%
garbage.BenchmarkTree2-4               11007600     10899100   -0.99%
garbage.BenchmarkTree2-8                7692400      7032600   -8.58%

garbage.BenchmarkParserPause          241863263    163249450  -32.50%
garbage.BenchmarkParserPause-2        120135418    112981575   -5.95%
garbage.BenchmarkParserPause-4         83411552     64580700  -22.58%
garbage.BenchmarkParserPause-8         51870697     42207244  -18.63%

garbage.BenchmarkTreePause             20940474     13147011  -37.22%
garbage.BenchmarkTreePause-2           20115124     11146715  -44.59%
garbage.BenchmarkTreePause-4           17217584      7486327  -56.52%
garbage.BenchmarkTreePause-8           18258845      7400871  -59.47%

garbage.BenchmarkTree2Pause           174067190    172674190   -0.80%
garbage.BenchmarkTree2Pause-2         131175809    130615761   -0.43%
garbage.BenchmarkTree2Pause-4          95406666     93972047   -1.50%
garbage.BenchmarkTree2Pause-8          86056095     85334952   -0.84%

garbage.BenchmarkParserLastPause      329932000    324790000   -1.56%
garbage.BenchmarkParserLastPause-2    209383000    210456000   +0.51%
garbage.BenchmarkParserLastPause-4    113981000    112921000   -0.93%
garbage.BenchmarkParserLastPause-8     77967000     76625000   -1.72%

garbage.BenchmarkTreeLastPause         29752000     18444000  -38.01%
garbage.BenchmarkTreeLastPause-2       24274000     14766000  -39.17%
garbage.BenchmarkTreeLastPause-4       19565000      8726000  -55.40%
garbage.BenchmarkTreeLastPause-8       21956000     10530000  -52.04%

garbage.BenchmarkTree2LastPause       314411000    311945000   -0.78%
garbage.BenchmarkTree2LastPause-2     214641000    210836000   -1.77%
garbage.BenchmarkTree2LastPause-4     110024000    108943000   -0.98%
garbage.BenchmarkTree2LastPause-8      76873000     70263000   -8.60%

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991049

											
										
										
											2012-04-12 02:01:24 -06:00
+											end = (MLink*)p;
 											nfree++;
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+										}
-												runtime: speedup GC sweep phase (batch free)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              4370050250   3779668750  -13.51%
garbage.BenchmarkParser-2            3713087000   3628771500   -2.27%
garbage.BenchmarkParser-4            3519755250   3406349750   -3.22%
garbage.BenchmarkParser-8            3386627750   3319144000   -1.99%

garbage.BenchmarkTree                 493585529    408102411  -17.32%
garbage.BenchmarkTree-2               500487176    402285176  -19.62%
garbage.BenchmarkTree-4               473238882    361484058  -23.61%
garbage.BenchmarkTree-8               486977823    368334823  -24.36%

garbage.BenchmarkTree2                 31446600     31203200   -0.77%
garbage.BenchmarkTree2-2               21469000     21077900   -1.82%
garbage.BenchmarkTree2-4               11007600     10899100   -0.99%
garbage.BenchmarkTree2-8                7692400      7032600   -8.58%

garbage.BenchmarkParserPause          241863263    163249450  -32.50%
garbage.BenchmarkParserPause-2        120135418    112981575   -5.95%
garbage.BenchmarkParserPause-4         83411552     64580700  -22.58%
garbage.BenchmarkParserPause-8         51870697     42207244  -18.63%

garbage.BenchmarkTreePause             20940474     13147011  -37.22%
garbage.BenchmarkTreePause-2           20115124     11146715  -44.59%
garbage.BenchmarkTreePause-4           17217584      7486327  -56.52%
garbage.BenchmarkTreePause-8           18258845      7400871  -59.47%

garbage.BenchmarkTree2Pause           174067190    172674190   -0.80%
garbage.BenchmarkTree2Pause-2         131175809    130615761   -0.43%
garbage.BenchmarkTree2Pause-4          95406666     93972047   -1.50%
garbage.BenchmarkTree2Pause-8          86056095     85334952   -0.84%

garbage.BenchmarkParserLastPause      329932000    324790000   -1.56%
garbage.BenchmarkParserLastPause-2    209383000    210456000   +0.51%
garbage.BenchmarkParserLastPause-4    113981000    112921000   -0.93%
garbage.BenchmarkParserLastPause-8     77967000     76625000   -1.72%

garbage.BenchmarkTreeLastPause         29752000     18444000  -38.01%
garbage.BenchmarkTreeLastPause-2       24274000     14766000  -39.17%
garbage.BenchmarkTreeLastPause-4       19565000      8726000  -55.40%
garbage.BenchmarkTreeLastPause-8       21956000     10530000  -52.04%

garbage.BenchmarkTree2LastPause       314411000    311945000   -0.78%
garbage.BenchmarkTree2LastPause-2     214641000    210836000   -1.77%
garbage.BenchmarkTree2LastPause-4     110024000    108943000   -0.98%
garbage.BenchmarkTree2LastPause-8      76873000     70263000   -8.60%

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991049

											
										
										
											2012-04-12 02:01:24 -06:00
+									}
-												runtime: get rid of the settype buffer and lock.

MCaches	now hold a MSpan for each sizeclass which they have
exclusive access to allocate from, so no lock is needed.

Modifying the heap bitmaps also no longer requires a cas.

runtime.free gets more expensive.  But we don't use it
much any more.

It's not much faster on 1 processor, but it's a lot
faster on multiple processors.

benchmark                 old ns/op    new ns/op    delta
BenchmarkSetTypeNoPtr1           24           23   -0.42%
BenchmarkSetTypeNoPtr2           33           34   +0.89%
BenchmarkSetTypePtr1             51           49   -3.72%
BenchmarkSetTypePtr2             55           54   -1.98%

benchmark                old ns/op    new ns/op    delta
BenchmarkAllocation          52739        50770   -3.73%
BenchmarkAllocation-2        33957        34141   +0.54%
BenchmarkAllocation-3        33326        29015  -12.94%
BenchmarkAllocation-4        38105        25795  -32.31%
BenchmarkAllocation-5        68055        24409  -64.13%
BenchmarkAllocation-6        71544        23488  -67.17%
BenchmarkAllocation-7        68374        23041  -66.30%
BenchmarkAllocation-8        70117        20758  -70.40%

LGTM=rsc, dvyukov
R=dvyukov, bradfitz, khr, rsc
CC=golang-codereviews
https://golang.org/cl/46810043

											
										
										
											2014-02-26 16:52:58 -07:00
+									// We need to set s->sweepgen = h->sweepgen only when all blocks are swept,
 									// because of the potential for a concurrent free/SetFinalizer.
 									// But we need to set it before we make the span available for allocation
 									// (return it to heap or mcentral), because allocation code assumes that a
 									// span is already swept if available for allocation.
 									if(!sweepgenset && nfree == 0) {
-												runtime: fix concurrent GC sweep
The issue was that one of the MSpan_Sweep callers
was doing sweep with preemption enabled.
Additional checks are added.

LGTM=rsc
R=rsc, dave
CC=golang-codereviews
https://golang.org/cl/62990043

											
										
										
											2014-02-13 08:36:45 -07:00
+										// The span must be in our exclusive ownership until we update sweepgen,
 										// check for potential races.
 										if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
 											runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
 												s->state, s->sweepgen, sweepgen);
 											runtime·throw("MSpan_Sweep: bad span state after sweep");
 										}
 										runtime·atomicstore(&s->sweepgen, sweepgen);
 									}
-												runtime: get rid of the settype buffer and lock.

MCaches	now hold a MSpan for each sizeclass which they have
exclusive access to allocate from, so no lock is needed.

Modifying the heap bitmaps also no longer requires a cas.

runtime.free gets more expensive.  But we don't use it
much any more.

It's not much faster on 1 processor, but it's a lot
faster on multiple processors.

benchmark                 old ns/op    new ns/op    delta
BenchmarkSetTypeNoPtr1           24           23   -0.42%
BenchmarkSetTypeNoPtr2           33           34   +0.89%
BenchmarkSetTypePtr1             51           49   -3.72%
BenchmarkSetTypePtr2             55           54   -1.98%

benchmark                old ns/op    new ns/op    delta
BenchmarkAllocation          52739        50770   -3.73%
BenchmarkAllocation-2        33957        34141   +0.54%
BenchmarkAllocation-3        33326        29015  -12.94%
BenchmarkAllocation-4        38105        25795  -32.31%
BenchmarkAllocation-5        68055        24409  -64.13%
BenchmarkAllocation-6        71544        23488  -67.17%
BenchmarkAllocation-7        68374        23041  -66.30%
BenchmarkAllocation-8        70117        20758  -70.40%

LGTM=rsc, dvyukov
R=dvyukov, bradfitz, khr, rsc
CC=golang-codereviews
https://golang.org/cl/46810043

											
										
										
											2014-02-26 16:52:58 -07:00
+									if(nfree > 0) {
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
+										c->local_nsmallfree[cl] += nfree;
-												runtime: speedup GC sweep phase (batch free)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              4370050250   3779668750  -13.51%
garbage.BenchmarkParser-2            3713087000   3628771500   -2.27%
garbage.BenchmarkParser-4            3519755250   3406349750   -3.22%
garbage.BenchmarkParser-8            3386627750   3319144000   -1.99%

garbage.BenchmarkTree                 493585529    408102411  -17.32%
garbage.BenchmarkTree-2               500487176    402285176  -19.62%
garbage.BenchmarkTree-4               473238882    361484058  -23.61%
garbage.BenchmarkTree-8               486977823    368334823  -24.36%

garbage.BenchmarkTree2                 31446600     31203200   -0.77%
garbage.BenchmarkTree2-2               21469000     21077900   -1.82%
garbage.BenchmarkTree2-4               11007600     10899100   -0.99%
garbage.BenchmarkTree2-8                7692400      7032600   -8.58%

garbage.BenchmarkParserPause          241863263    163249450  -32.50%
garbage.BenchmarkParserPause-2        120135418    112981575   -5.95%
garbage.BenchmarkParserPause-4         83411552     64580700  -22.58%
garbage.BenchmarkParserPause-8         51870697     42207244  -18.63%

garbage.BenchmarkTreePause             20940474     13147011  -37.22%
garbage.BenchmarkTreePause-2           20115124     11146715  -44.59%
garbage.BenchmarkTreePause-4           17217584      7486327  -56.52%
garbage.BenchmarkTreePause-8           18258845      7400871  -59.47%

garbage.BenchmarkTree2Pause           174067190    172674190   -0.80%
garbage.BenchmarkTree2Pause-2         131175809    130615761   -0.43%
garbage.BenchmarkTree2Pause-4          95406666     93972047   -1.50%
garbage.BenchmarkTree2Pause-8          86056095     85334952   -0.84%

garbage.BenchmarkParserLastPause      329932000    324790000   -1.56%
garbage.BenchmarkParserLastPause-2    209383000    210456000   +0.51%
garbage.BenchmarkParserLastPause-4    113981000    112921000   -0.93%
garbage.BenchmarkParserLastPause-8     77967000     76625000   -1.72%

garbage.BenchmarkTreeLastPause         29752000     18444000  -38.01%
garbage.BenchmarkTreeLastPause-2       24274000     14766000  -39.17%
garbage.BenchmarkTreeLastPause-4       19565000      8726000  -55.40%
garbage.BenchmarkTreeLastPause-8       21956000     10530000  -52.04%

garbage.BenchmarkTree2LastPause       314411000    311945000   -0.78%
garbage.BenchmarkTree2LastPause-2     214641000    210836000   -1.77%
garbage.BenchmarkTree2LastPause-4     110024000    108943000   -0.98%
garbage.BenchmarkTree2LastPause-8      76873000     70263000   -8.60%

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5991049

											
										
										
											2012-04-12 02:01:24 -06:00
+										c->local_cachealloc -= nfree * size;
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+										runtime·xadd64(&mstats.next_gc, -(uint64)(nfree * size * (runtime·gcpercent + 100)/100));
-												runtime: implement transfer cache
Currently we do the following dance after sweeping a span:
1. lock mcentral
2. remove the span from a list
3. unlock mcentral
4. unmark span
5. lock mheap
6. insert the span into heap
7. unlock mheap
8. lock mcentral
9. observe empty list
10. unlock mcentral
11. lock mheap
12. grab the span
13. unlock mheap
14. mark span
15. lock mcentral
16. insert the span into empty list
17. unlock mcentral

This change short-circuits this sequence to nothing,
that is, we just cache and use the span after sweeping.

This gives us functionality similar (even better) to tcmalloc's transfer cache.

benchmark            old ns/op     new ns/op     delta
BenchmarkMalloc8     22.2          19.5          -12.16%
BenchmarkMalloc16    31.0          26.6          -14.19%

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/119550043

											
										
										
											2014-08-18 06:52:31 -06:00
+										res = runtime·MCentral_FreeSpan(&runtime·mheap.central[cl].mcentral, s, nfree, head.next, end, preserve);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										// MCentral_FreeSpan updates sweepgen
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									}
 									return res;
 								}
 								// State of background sweep.
 								// Pretected by gclock.
 								static struct
 								{
 									G*	g;
 									bool	parked;
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+									uint32	spanidx;	// background sweeper position
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
 									uint32	nbgsweep;
 									uint32	npausesweep;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+								} sweep;
 								// background sweeping goroutine
 								static void
 								bgsweep(void)
 								{
 									g->issystem = 1;
 									for(;;) {
 										while(runtime·sweepone() != -1) {
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											sweep.nbgsweep++;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											runtime·gosched();
 										}
 										runtime·lock(&gclock);
-												runtime: fix a race in bgsweep
See the comment for description.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75670044

											
										
										
											2014-03-14 11:21:44 -06:00
+										if(!runtime·mheap.sweepdone) {
 											// It's possible if GC has happened between sweepone has
 											// returned -1 and gclock lock.
 											runtime·unlock(&gclock);
 											continue;
 										}
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+										sweep.parked = true;
-												runtime: fix program termination when main goroutine calls Goexit
Do not consider idle finalizer/bgsweep/timer goroutines as doing something useful.
We can't simply set isbackground for the whole lifetime of the goroutines,
because when finalizer goroutine calls user function, we do want to consider it
as doing something useful.
This is borken due to timers for quite some time.
With background sweep is become even more broken.
Fixes #7784.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/87960044

											
										
										
											2014-04-15 09:48:17 -06:00
+										g->isbackground = true;
-												runtime: convert common scheduler functions to Go
These are required for chans, semaphores, timers, etc.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/123640043

											
										
										
											2014-08-21 10:41:09 -06:00
+										runtime·parkunlock(&gclock, runtime·gostringnocopy((byte*)"GC sweep wait"));
-												runtime: fix program termination when main goroutine calls Goexit
Do not consider idle finalizer/bgsweep/timer goroutines as doing something useful.
We can't simply set isbackground for the whole lifetime of the goroutines,
because when finalizer goroutine calls user function, we do want to consider it
as doing something useful.
This is borken due to timers for quite some time.
With background sweep is become even more broken.
Fixes #7784.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/87960044

											
										
										
											2014-04-15 09:48:17 -06:00
+										g->isbackground = false;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									}
 								}
 								// sweeps one span
 								// returns number of pages returned to heap, or -1 if there is nothing to sweep
 								uintptr
 								runtime·sweepone(void)
 								{
 									MSpan *s;
 									uint32 idx, sg;
 									uintptr npages;
 									// increment locks to ensure that the goroutine is not preempted
 									// in the middle of sweep thus leaving the span in an inconsistent state for next GC
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->locks++;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									sg = runtime·mheap.sweepgen;
 									for(;;) {
 										idx = runtime·xadd(&sweep.spanidx, 1) - 1;
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+										if(idx >= work.nspan) {
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											runtime·mheap.sweepdone = true;
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+											g->m->locks--;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											return -1;
 										}
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+										s = work.spans[idx];
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+										if(s->state != MSpanInUse) {
 											s->sweepgen = sg;
 											continue;
 										}
 										if(s->sweepgen != sg-2 || !runtime·cas(&s->sweepgen, sg-2, sg-1))
 											continue;
 										npages = s->npages;
-												runtime: implement transfer cache
Currently we do the following dance after sweeping a span:
1. lock mcentral
2. remove the span from a list
3. unlock mcentral
4. unmark span
5. lock mheap
6. insert the span into heap
7. unlock mheap
8. lock mcentral
9. observe empty list
10. unlock mcentral
11. lock mheap
12. grab the span
13. unlock mheap
14. mark span
15. lock mcentral
16. insert the span into empty list
17. unlock mcentral

This change short-circuits this sequence to nothing,
that is, we just cache and use the span after sweeping.

This gives us functionality similar (even better) to tcmalloc's transfer cache.

benchmark            old ns/op     new ns/op     delta
BenchmarkMalloc8     22.2          19.5          -12.16%
BenchmarkMalloc16    31.0          26.6          -14.19%

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/119550043

											
										
										
											2014-08-18 06:52:31 -06:00
+										if(!runtime·MSpan_Sweep(s, false))
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											npages = 0;
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+										g->m->locks--;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+										return npages;
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+									}
 								}
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+								void
 								runtime·gchelper(void)
 								{
-												runtime: change nproc local variable to uint32

The nproc and ndone fields are uint32.  This makes the type
consistent.

LGTM=minux.ma
R=golang-codereviews, minux.ma
CC=golang-codereviews
https://golang.org/cl/79340044

											
										
										
											2014-03-25 06:18:08 -06:00
+									uint32 nproc;
-												runtime: fix data race in GC
Fixes #5139.
Update #7065.

R=golang-codereviews, bradfitz, minux.ma
CC=golang-codereviews
https://golang.org/cl/52090045

											
										
										
											2014-01-15 08:38:08 -07:00
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->traceback = 2;
-												runtime: faster parallel GC
Use per-thread work buffers instead of global mutex-protected pool. This eliminates contention from parallel scan phase.

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkTree2-8               97100768     71417553  -26.45%
garbage.BenchmarkTree2LastPause-8     970931485    714103692  -26.45%
garbage.BenchmarkTree2Pause-8         469127802    345029253  -26.45%
garbage.BenchmarkParser-8            2880950854   2715456901   -5.74%
garbage.BenchmarkParserLastPause-8    137047399    103336476  -24.60%
garbage.BenchmarkParserPause-8         80686028     58922680  -26.97%

R=golang-dev, 0xe2.0x9a.0x9b, dave, adg, rsc, iant
CC=golang-dev
https://golang.org/cl/7816044

											
										
										
											2013-03-21 02:48:02 -06:00
+									gchelperstart();
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+									// parallel mark for over gc roots
 									runtime·parfordo(work.markfor);
-												runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with a skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121

											
										
										
											2012-12-16 17:32:12 -07:00
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+									// help other threads scan secondary blocks
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									scanblock(nil, 0, nil);
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
-												runtime: fix data race in GC
Fixes #5139.
Update #7065.

R=golang-codereviews, bradfitz, minux.ma
CC=golang-codereviews
https://golang.org/cl/52090045

											
										
										
											2014-01-15 08:38:08 -07:00
+									nproc = work.nproc;  // work.nproc can change right after we increment work.ndone
 									if(runtime·xadd(&work.ndone, +1) == nproc-1)
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+										runtime·notewakeup(&work.alldone);
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->traceback = 0;
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+								}
-												runtime: use manual stack for garbage collection

Old code was using recursion to traverse object graph.
New code uses an explicit stack, cutting the per-pointer
footprint to two words during the recursion and avoiding
the standard allocator and stack splitting code.

in test/garbage:

Reduces parser runtime by 2-3%
Reduces Peano runtime by 40%
Increases tree runtime by 4-5%

R=r
CC=golang-dev
https://golang.org/cl/2150042

											
										
										
											2010-09-07 07:57:22 -06:00
+								static void
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
+								cachestats(void)
 								{
 									MCache *c;
 									P *p, **pp;
 									for(pp=runtime·allp; p=*pp; pp++) {
 										c = p->mcache;
 										if(c==nil)
 											continue;
 										runtime·purgecachedstats(c);
 									}
 								}
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+								static void
 								flushallmcaches(void)
 								{
 									P *p, **pp;
 									MCache *c;
 									// Flush MCache's to MCentral.
 									for(pp=runtime·allp; p=*pp; pp++) {
 										c = p->mcache;
 										if(c==nil)
 											continue;
 										runtime·MCache_ReleaseAll(c);
-												undo CL 101570044 / 2c57aaea79c4

redo stack allocation.  This is mostly the same as
the original CL with a few bug fixes.

1. add racemalloc() for stack allocations
2. fix poolalloc/poolfree to terminate free lists correctly.
3. adjust span ref count correctly.
4. don't use cache for sizes >= StackCacheSize.

Should fix bugs and memory leaks in original changelist.

««« original CL description
undo CL 104200047 / 318b04f28372

Breaks windows and race detector.
TBR=rsc

««« original CL description
runtime: stack allocator, separate from mallocgc

In order to move malloc to Go, we need to have a
separate stack allocator.  If we run out of stack
during malloc, malloc will not be available
to allocate a new stack.

Stacks are the last remaining FlagNoGC objects in the
GC heap.  Once they are out, we can get rid of the
distinction between the allocated/blockboundary bits.
(This will be in a separate change.)

Fixes #7468
Fixes #7424

LGTM=rsc, dvyukov
R=golang-codereviews, dvyukov, khr, dave, rsc
CC=golang-codereviews
https://golang.org/cl/104200047
»»»

TBR=rsc
CC=golang-codereviews
https://golang.org/cl/101570044
»»»

LGTM=dvyukov
R=dvyukov, dave, khr, alex.brainman
CC=golang-codereviews
https://golang.org/cl/112240044

											
										
										
											2014-07-17 15:41:46 -06:00
+										runtime·stackcache_clear(c);
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+									}
 								}
-												undo CL 101570044 / 2c57aaea79c4

redo stack allocation.  This is mostly the same as
the original CL with a few bug fixes.

1. add racemalloc() for stack allocations
2. fix poolalloc/poolfree to terminate free lists correctly.
3. adjust span ref count correctly.
4. don't use cache for sizes >= StackCacheSize.

Should fix bugs and memory leaks in original changelist.

««« original CL description
undo CL 104200047 / 318b04f28372

Breaks windows and race detector.
TBR=rsc

««« original CL description
runtime: stack allocator, separate from mallocgc

In order to move malloc to Go, we need to have a
separate stack allocator.  If we run out of stack
during malloc, malloc will not be available
to allocate a new stack.

Stacks are the last remaining FlagNoGC objects in the
GC heap.  Once they are out, we can get rid of the
distinction between the allocated/blockboundary bits.
(This will be in a separate change.)

Fixes #7468
Fixes #7424

LGTM=rsc, dvyukov
R=golang-codereviews, dvyukov, khr, dave, rsc
CC=golang-codereviews
https://golang.org/cl/104200047
»»»

TBR=rsc
CC=golang-codereviews
https://golang.org/cl/101570044
»»»

LGTM=dvyukov
R=dvyukov, dave, khr, alex.brainman
CC=golang-codereviews
https://golang.org/cl/112240044

											
										
										
											2014-07-17 15:41:46 -06:00
+								static void
 								flushallmcaches_m(G *gp)
 								{
 									flushallmcaches();
 									runtime·gogo(&gp->sched);
 								}
-												runtime: WriteHeapDump dumps the heap to a file.

See http://golang.org/s/go13heapdump for the file format.

LGTM=rsc
R=rsc, bradfitz, dvyukov, khr
CC=golang-codereviews
https://golang.org/cl/37540043

											
										
										
											2014-03-25 16:09:49 -06:00
+								void
 								runtime·updatememstats(GCStats *stats)
-												runtime: use manual stack for garbage collection

Old code was using recursion to traverse object graph.
New code uses an explicit stack, cutting the per-pointer
footprint to two words during the recursion and avoiding
the standard allocator and stack splitting code.

in test/garbage:

Reduces parser runtime by 2-3%
Reduces Peano runtime by 40%
Increases tree runtime by 4-5%

R=r
CC=golang-dev
https://golang.org/cl/2150042

											
										
										
											2010-09-07 07:57:22 -06:00
+								{
-												runtime: use "mp" and "gp" instead of "m" and "g" for local variable name to avoid confusion with the global "m" and "g".

R=golang-dev, minux.ma, rsc
CC=bradfitz, golang-dev
https://golang.org/cl/6939064

											
										
										
											2012-12-18 09:30:29 -07:00
+									M *mp;
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
+									MSpan *s;
-												runtime: add per-M caches for MemStats
Avoid touching centralized state during
memory manager opreations.

R=rsc
CC=golang-dev
https://golang.org/cl/4766042

											
										
										
											2011-07-18 12:52:57 -06:00
+									int32 i;
-												undo CL 101570044 / 2c57aaea79c4

redo stack allocation.  This is mostly the same as
the original CL with a few bug fixes.

1. add racemalloc() for stack allocations
2. fix poolalloc/poolfree to terminate free lists correctly.
3. adjust span ref count correctly.
4. don't use cache for sizes >= StackCacheSize.

Should fix bugs and memory leaks in original changelist.

««« original CL description
undo CL 104200047 / 318b04f28372

Breaks windows and race detector.
TBR=rsc

««« original CL description
runtime: stack allocator, separate from mallocgc

In order to move malloc to Go, we need to have a
separate stack allocator.  If we run out of stack
during malloc, malloc will not be available
to allocate a new stack.

Stacks are the last remaining FlagNoGC objects in the
GC heap.  Once they are out, we can get rid of the
distinction between the allocated/blockboundary bits.
(This will be in a separate change.)

Fixes #7468
Fixes #7424

LGTM=rsc, dvyukov
R=golang-codereviews, dvyukov, khr, dave, rsc
CC=golang-codereviews
https://golang.org/cl/104200047
»»»

TBR=rsc
CC=golang-codereviews
https://golang.org/cl/101570044
»»»

LGTM=dvyukov
R=dvyukov, dave, khr, alex.brainman
CC=golang-codereviews
https://golang.org/cl/112240044

											
										
										
											2014-07-17 15:41:46 -06:00
+									uint64 smallfree;
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+									uint64 *src, *dst;
-												runtime: use manual stack for garbage collection

Old code was using recursion to traverse object graph.
New code uses an explicit stack, cutting the per-pointer
footprint to two words during the recursion and avoiding
the standard allocator and stack splitting code.

in test/garbage:

Reduces parser runtime by 2-3%
Reduces Peano runtime by 40%
Increases tree runtime by 4-5%

R=r
CC=golang-dev
https://golang.org/cl/2150042

											
										
										
											2010-09-07 07:57:22 -06:00
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+									if(stats)
 										runtime·memclr((byte*)stats, sizeof(*stats));
-												runtime: use "mp" and "gp" instead of "m" and "g" for local variable name to avoid confusion with the global "m" and "g".

R=golang-dev, minux.ma, rsc
CC=bradfitz, golang-dev
https://golang.org/cl/6939064

											
										
										
											2012-12-18 09:30:29 -07:00
+									for(mp=runtime·allm; mp; mp=mp->alllink) {
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+										if(stats) {
-												runtime: use "mp" and "gp" instead of "m" and "g" for local variable name to avoid confusion with the global "m" and "g".

R=golang-dev, minux.ma, rsc
CC=bradfitz, golang-dev
https://golang.org/cl/6939064

											
										
										
											2012-12-18 09:30:29 -07:00
+											src = (uint64*)&mp->gcstats;
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+											dst = (uint64*)stats;
 											for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
 												dst[i] += src[i];
-												runtime: use "mp" and "gp" instead of "m" and "g" for local variable name to avoid confusion with the global "m" and "g".

R=golang-dev, minux.ma, rsc
CC=bradfitz, golang-dev
https://golang.org/cl/6939064

											
										
										
											2012-12-18 09:30:29 -07:00
+											runtime·memclr((byte*)&mp->gcstats, sizeof(mp->gcstats));
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+										}
-												runtime: improved scheduler
Distribute runnable queues, memory cache
and cache of dead G's per processor.
Faster non-blocking syscall enter/exit.
More conservative worker thread blocking/unblocking.

R=dave, bradfitz, remyoudompheng, rsc
CC=golang-dev
https://golang.org/cl/7314062

											
										
										
											2013-03-01 04:49:16 -07:00
+									}
-												runtime: account for all sys memory in MemStats
Currently lots of sys allocations are not accounted in any of XxxSys,
including GC bitmap, spans table, GC roots blocks, GC finalizer blocks,
iface table, netpoll descriptors and more. Up to ~20% can unaccounted.
This change introduces 2 new stats: GCSys and OtherSys for GC metadata
and all other misc allocations, respectively.
Also ensures that all XxxSys indeed sum up to Sys. All sys memory allocation
functions require the stat for accounting, so that it's impossible to miss something.
Also fix updating of mcache_sys/inuse, they were not updated after deallocation.

test/bench/garbage/parser before:
Sys		670064344
HeapSys		610271232
StackSys	65536
MSpanSys	14204928
MCacheSys	16384
BuckHashSys	1439992

after:
Sys		670064344
HeapSys		610271232
StackSys	65536
MSpanSys	14188544
MCacheSys	16384
BuckHashSys	3194304
GCSys		39198688
OtherSys	3129656

Fixes #5799.

R=rsc, dave, alex.brainman
CC=golang-dev
https://golang.org/cl/12946043

											
										
										
											2013-09-06 14:55:40 -06:00
+									mstats.mcache_inuse = runtime·mheap.cachealloc.inuse;
 									mstats.mspan_inuse = runtime·mheap.spanalloc.inuse;
 									mstats.sys = mstats.heap_sys + mstats.stacks_sys + mstats.mspan_sys +
 										mstats.mcache_sys + mstats.buckhash_sys + mstats.gc_sys + mstats.other_sys;
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
+									// Calculate memory allocator stats.
 									// During program execution we only count number of frees and amount of freed memory.
 									// Current number of alive object in the heap and amount of alive heap memory
 									// are calculated by scanning all spans.
 									// Total number of mallocs is calculated as number of frees plus number of alive objects.
 									// Similarly, total amount of allocated memory is calculated as amount of freed memory
 									// plus amount of alive heap memory.
 									mstats.alloc = 0;
 									mstats.total_alloc = 0;
 									mstats.nmalloc = 0;
 									mstats.nfree = 0;
 									for(i = 0; i < nelem(mstats.by_size); i++) {
 										mstats.by_size[i].nmalloc = 0;
 										mstats.by_size[i].nfree = 0;
 									}
 									// Flush MCache's to MCentral.
-												runtime: fix gctrace=1

updatememstats is called on both the m and g stacks.
Call into flushallmcaches correctly.  flushallmcaches
can only run on the M stack.

This is somewhat temporary.  once ReadMemStats is in
Go we can have all of this code M-only.

LGTM=dvyukov
R=golang-codereviews, dvyukov
CC=golang-codereviews
https://golang.org/cl/116880043

											
										
										
											2014-07-18 03:05:21 -06:00
+									if(g == g->m->g0)
 										flushallmcaches();
 									else
 										runtime·mcall(flushallmcaches_m);
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
 									// Aggregate local stats.
 									cachestats();
 									// Scan all spans and count number of alive objects.
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+									runtime·lock(&runtime·mheap.lock);
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
+									for(i = 0; i < runtime·mheap.nspan; i++) {
 										s = runtime·mheap.allspans[i];
 										if(s->state != MSpanInUse)
 											continue;
 										if(s->sizeclass == 0) {
 											mstats.nmalloc++;
 											mstats.alloc += s->elemsize;
 										} else {
 											mstats.nmalloc += s->ref;
 											mstats.by_size[s->sizeclass].nmalloc += s->ref;
 											mstats.alloc += s->ref*s->elemsize;
 										}
 									}
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+									runtime·unlock(&runtime·mheap.lock);
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
 									// Aggregate by size class.
 									smallfree = 0;
 									mstats.nfree = runtime·mheap.nlargefree;
 									for(i = 0; i < nelem(mstats.by_size); i++) {
 										mstats.nfree += runtime·mheap.nsmallfree[i];
 										mstats.by_size[i].nfree = runtime·mheap.nsmallfree[i];
 										mstats.by_size[i].nmalloc += runtime·mheap.nsmallfree[i];
 										smallfree += runtime·mheap.nsmallfree[i] * runtime·class_to_size[i];
 									}
 									mstats.nmalloc += mstats.nfree;
 									// Calculate derived stats.
 									mstats.total_alloc = mstats.alloc + runtime·mheap.largefree + smallfree;
 									mstats.heap_alloc = mstats.alloc;
 									mstats.heap_objects = mstats.nmalloc - mstats.nfree;
-												runtime: use manual stack for garbage collection

Old code was using recursion to traverse object graph.
New code uses an explicit stack, cutting the per-pointer
footprint to two words during the recursion and avoiding
the standard allocator and stack splitting code.

in test/garbage:

Reduces parser runtime by 2-3%
Reduces Peano runtime by 40%
Increases tree runtime by 4-5%

R=r
CC=golang-dev
https://golang.org/cl/2150042

											
										
										
											2010-09-07 07:57:22 -06:00
+								}
-												runtime: use reflect·call() to enter the function gc()

Garbage collection code (to be merged later) is calling functions
which have many local variables. This increases the probability that
the stack capacity won't be big enough to hold the local variables.
So, start gc() on a bigger stack to eliminate a potentially large number
of calls to runtime·morestack().

R=rsc, remyoudompheng, dsymonds, minux.ma, iant, iant
CC=golang-dev
https://golang.org/cl/6846044

											
										
										
											2012-11-27 11:04:59 -07:00
+								// Structure of arguments passed to function gc().
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+								// This allows the arguments to be passed via runtime·mcall.
-												runtime: use reflect·call() to enter the function gc()

Garbage collection code (to be merged later) is calling functions
which have many local variables. This increases the probability that
the stack capacity won't be big enough to hold the local variables.
So, start gc() on a bigger stack to eliminate a potentially large number
of calls to runtime·morestack().

R=rsc, remyoudompheng, dsymonds, minux.ma, iant, iant
CC=golang-dev
https://golang.org/cl/6846044

											
										
										
											2012-11-27 11:04:59 -07:00
+								struct gc_args
 								{
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+									int64 start_time; // start time of GC in ns (just before stoptheworld)
-												runtime: fix freeOSMemory to free memory immediately
Currently freeOSMemory makes only marking phase of GC, but not sweeping phase.
So recently memory is not released after freeOSMemory.
Do both marking and sweeping during freeOSMemory.
Fixes #8019.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/97550043

											
										
										
											2014-05-19 02:06:30 -06:00
+									bool  eagersweep;
-												runtime: use reflect·call() to enter the function gc()

Garbage collection code (to be merged later) is calling functions
which have many local variables. This increases the probability that
the stack capacity won't be big enough to hold the local variables.
So, start gc() on a bigger stack to eliminate a potentially large number
of calls to runtime·morestack().

R=rsc, remyoudompheng, dsymonds, minux.ma, iant, iant
CC=golang-dev
https://golang.org/cl/6846044

											
										
										
											2012-11-27 11:04:59 -07:00
+								};
 								static void gc(struct gc_args *args);
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+								static void mgc(G *gp);
-												runtime: use reflect·call() to enter the function gc()

Garbage collection code (to be merged later) is calling functions
which have many local variables. This increases the probability that
the stack capacity won't be big enough to hold the local variables.
So, start gc() on a bigger stack to eliminate a potentially large number
of calls to runtime·morestack().

R=rsc, remyoudompheng, dsymonds, minux.ma, iant, iant
CC=golang-dev
https://golang.org/cl/6846044

											
										
										
											2012-11-27 11:04:59 -07:00
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+								int32
 								runtime·readgogc(void)
-												runtime/debug: add controls for garbage collector

Fixes #4090.

R=golang-dev, iant, bradfitz, dsymonds
CC=golang-dev
https://golang.org/cl/7229070

											
										
										
											2013-02-03 22:00:55 -07:00
+								{
 									byte *p;
 									p = runtime·getenv("GOGC");
 									if(p == nil || p[0] == '\0')
 										return 100;
 									if(runtime·strcmp(p, (byte*)"off") == 0)
 										return -1;
 									return runtime·atoi(p);
 								}
-												runtime: fix dump of data/bss
Fixes #8530.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/124440043

											
										
										
											2014-08-18 06:42:24 -06:00
+								void
 								runtime·gcinit(void)
 								{
 									if(sizeof(Workbuf) != WorkbufSize)
 										runtime·throw("runtime: size of Workbuf is suboptimal");
 									work.markfor = runtime·parforalloc(MaxGcproc);
 									runtime·gcpercent = runtime·readgogc();
 									runtime·gcdatamask = unrollglobgcprog(gcdata, edata - data);
 									runtime·gcbssmask = unrollglobgcprog(gcbss, ebss - bss);
 								}
-												runtime: fix freeOSMemory to free memory immediately
Currently freeOSMemory makes only marking phase of GC, but not sweeping phase.
So recently memory is not released after freeOSMemory.
Do both marking and sweeping during freeOSMemory.
Fixes #8019.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/97550043

											
										
										
											2014-05-19 02:06:30 -06:00
+								// force = 1 - do GC regardless of current heap usage
 								// force = 2 - go GC and eager sweep
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+								void
-												runtime: ,s/[a-zA-Z0-9_]+/runtime·&/g, almost

Prefix all external symbols in runtime by runtime·,
to avoid conflicts with possible symbols of the same
name in linked-in C libraries.  The obvious conflicts
are printf, malloc, and free, but hide everything to
avoid future pain.

The symbols left alone are:

	** known to cgo **
	_cgo_free
	_cgo_malloc
	libcgo_thread_start
	initcgo
	ncgocall

	** known to linker **
	_rt0_$GOARCH
	_rt0_$GOARCH_$GOOS
	text
	etext
	data
	end
	pclntab
	epclntab
	symtab
	esymtab

	** known to C compiler **
	_divv
	_modv
	_div64by32
	etc (arch specific)

Tested on darwin/386, darwin/amd64, linux/386, linux/amd64.

Built (but not tested) for freebsd/386, freebsd/amd64, linux/arm, windows/386.

R=r, PeterGo
CC=golang-dev
https://golang.org/cl/2899041

											
										
										
											2010-11-04 12:00:19 -06:00
+								runtime·gc(int32 force)
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+								{
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+									struct gc_args a;
 									int32 i;
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
 									// The gc is turned off (via enablegc) until
 									// the bootstrap has completed.
 									// Also, malloc gets called in the guts
 									// of a number of libraries that might be
 									// holding locks.  To avoid priority inversion
 									// problems, don't bother trying to run gc
 									// while holding a lock.  The next mallocgc
 									// without a lock will do the gc instead.
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									if(!mstats.enablegc || g == g->m->g0 || g->m->locks > 0 || runtime·panicking)
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+										return;
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									if(runtime·gcpercent < 0)
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+										return;
-												net: add special netFD mutex
The mutex, fdMutex, handles locking and lifetime of sysfd,
and serializes Read and Write methods.
This allows to strip 2 sync.Mutex.Lock calls,
2 sync.Mutex.Unlock calls, 1 defer and some amount
of misc overhead from every network operation.

On linux/amd64, Intel E5-2690:
benchmark                             old ns/op    new ns/op    delta
BenchmarkTCP4Persistent                    9595         9454   -1.47%
BenchmarkTCP4Persistent-2                  8978         8772   -2.29%
BenchmarkTCP4ConcurrentReadWrite           4900         4625   -5.61%
BenchmarkTCP4ConcurrentReadWrite-2         2603         2500   -3.96%

In general it strips 70-500 ns from every network operation depending
on processor model. On my relatively new E5-2690 it accounts to ~5%
of network op cost.

Fixes #6074.

R=golang-dev, bradfitz, alex.brainman, iant, mikioh.mikioh
CC=golang-dev
https://golang.org/cl/12418043

											
										
										
											2013-08-09 11:43:00 -06:00
+									runtime·semacquire(&runtime·worldsema, false);
-												runtime: fix freeOSMemory to free memory immediately
Currently freeOSMemory makes only marking phase of GC, but not sweeping phase.
So recently memory is not released after freeOSMemory.
Do both marking and sweeping during freeOSMemory.
Fixes #8019.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/97550043

											
										
										
											2014-05-19 02:06:30 -06:00
+									if(force==0 && mstats.heap_alloc < mstats.next_gc) {
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+										// typically threads which lost the race to grab
 										// worldsema exit here when gc is done.
 										runtime·semrelease(&runtime·worldsema);
 										return;
 									}
 									// Ok, we're doing it!  Stop everybody else
 									a.start_time = runtime·nanotime();
-												runtime: fix freeOSMemory to free memory immediately
Currently freeOSMemory makes only marking phase of GC, but not sweeping phase.
So recently memory is not released after freeOSMemory.
Do both marking and sweeping during freeOSMemory.
Fixes #8019.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/97550043

											
										
										
											2014-05-19 02:06:30 -06:00
+									a.eagersweep = force >= 2;
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->gcing = 1;
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+									runtime·stoptheworld();
-												runtime: emit collection stacks in GODEBUG=allocfreetrace mode

R=khr, dvyukov
CC=golang-codereviews
https://golang.org/cl/51830043

											
										
										
											2014-01-14 08:39:50 -07:00
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									runtime·clearpools();
-												sync: add Pool type

Adds the Pool type and docs, and use it in fmt.
This is a temporary implementation, until Dmitry
makes it fast.

Uses the API proposal from Russ in http://goo.gl/cCKeb2 but
adds an optional New field, as used in fmt and elsewhere.
Almost all callers want that.

Update #4720

R=golang-dev, rsc, cshapiro, iant, r, dvyukov, khr
CC=golang-dev
https://golang.org/cl/41860043

											
										
										
											2013-12-18 12:08:34 -07:00
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+									// Run gc on the g0 stack.  We do this so that the g stack
 									// we're currently running on will no longer change.  Cuts
 									// the root set down a bit (g0 stacks are not scanned, and
 									// we don't need to scan gc's internal state).  Also an
 									// enabler for copyable stacks.
-												runtime: introduce GODEBUG env var
Currently it replaces GOGCTRACE env var (GODEBUG=gctrace=1).
The plan is to extend it with other type of debug tracing,
e.g. GODEBUG=gctrace=1,schedtrace=100.

R=rsc
CC=bradfitz, daniel.morsing, gobot, golang-dev
https://golang.org/cl/10026045

											
										
										
											2013-06-28 08:37:06 -06:00
+									for(i = 0; i < (runtime·debug.gctrace > 1 ? 2 : 1); i++) {
-												runtime: use 3x fewer nanotime calls in garbage collection

Cuts the number of calls from 6 to 2 in the non-debug case.

LGTM=iant
R=golang-codereviews, iant
CC=0intro, aram, golang-codereviews, khr
https://golang.org/cl/86040043

											
										
										
											2014-04-09 08:38:12 -06:00
+										if(i > 0)
 											a.start_time = runtime·nanotime();
-												runtime: do not trigger GC on g0
GC acquires worldsema, which is a goroutine-level semaphore
which parks goroutines. g0 can not be parked.
Fixes #6193.

R=khr, khr
CC=golang-dev
https://golang.org/cl/12880045

											
										
										
											2013-08-21 16:17:45 -06:00
+										// switch to g0, call gc(&a), then switch back
 										g->param = &a;
 										g->status = Gwaiting;
-												runtime: convert common scheduler functions to Go
These are required for chans, semaphores, timers, etc.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/123640043

											
										
										
											2014-08-21 10:41:09 -06:00
+										g->waitreason = runtime·gostringnocopy((byte*)"garbage collection");
-												runtime: do not trigger GC on g0
GC acquires worldsema, which is a goroutine-level semaphore
which parks goroutines. g0 can not be parked.
Fixes #6193.

R=khr, khr
CC=golang-dev
https://golang.org/cl/12880045

											
										
										
											2013-08-21 16:17:45 -06:00
+										runtime·mcall(mgc);
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+									}
 									// all done
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->gcing = 0;
 									g->m->locks++;
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+									runtime·semrelease(&runtime·worldsema);
 									runtime·starttheworld();
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->locks--;
-												runtime: fix non-concurrent sweep

State of the world:

CL 46430043 introduced a new concurrent sweep but is broken.

CL 62360043 made the new sweep non-concurrent
to try to fix the world while we understand what's wrong with
the concurrent version.

This CL fixes the non-concurrent form to run finalizers.
This CL is just a band-aid to get the build green again.

Dmitriy is working on understanding and then fixing what's
wrong with the concurrent sweep.

TBR=dvyukov
CC=golang-codereviews
https://golang.org/cl/62370043

											
										
										
											2014-02-12 13:54:21 -07:00
 									// now that gc is done, kick off finalizer thread if needed
 									if(!ConcurrentSweep) {
 										// give the queued finalizers, if any, a chance to run
 										runtime·gosched();
 									}
-												runtime: use reflect·call() to enter the function gc()

Garbage collection code (to be merged later) is calling functions
which have many local variables. This increases the probability that
the stack capacity won't be big enough to hold the local variables.
So, start gc() on a bigger stack to eliminate a potentially large number
of calls to runtime·morestack().

R=rsc, remyoudompheng, dsymonds, minux.ma, iant, iant
CC=golang-dev
https://golang.org/cl/6846044

											
										
										
											2012-11-27 11:04:59 -07:00
+								}
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+								static void
 								mgc(G *gp)
 								{
 									gc(gp->param);
 									gp->param = nil;
-												runtime: record proper goroutine state during stack split

Until now, the goroutine state has been scattered during the
execution of newstack and oldstack. It's all there, and those routines
know how to get back to a working goroutine, but other pieces of
the system, like stack traces, do not. If something does interrupt
the newstack or oldstack execution, the rest of the system can't
understand the goroutine. For example, if newstack decides there
is an overflow and calls throw, the stack tracer wouldn't dump the
goroutine correctly.

For newstack to save a useful state snapshot, it needs to be able
to rewind the PC in the function that triggered the split back to
the beginning of the function. (The PC is a few instructions in, just
after the call to morestack.) To make that possible, we change the
prologues to insert a jmp back to the beginning of the function
after the call to morestack. That is, the prologue used to be roughly:

        TEXT myfunc
                check for split
                jmpcond nosplit
                call morestack
        nosplit:
                sub $xxx, sp

Now an extra instruction is inserted after the call:

        TEXT myfunc
        start:
                check for split
                jmpcond nosplit
                call morestack
                jmp start
        nosplit:
                sub $xxx, sp

The jmp is not executed directly. It is decoded and simulated by
runtime.rewindmorestack to discover the beginning of the function,
and then the call to morestack returns directly to the start label
instead of to the jump instruction. So logically the jmp is still
executed, just not by the cpu.

The prologue thus repeats in the case of a function that needs a
stack split, but against the cost of the split itself, the extra few
instructions are noise. The repeated prologue has the nice effect of
making a stack split double-check that the new stack is big enough:
if morestack happens to return on a too-small stack, we'll now notice
before corruption happens.

The ability for newstack to rewind to the beginning of the function
should help preemption too. If newstack decides that it was called
for preemption instead of a stack split, it now has the goroutine state
correctly paused if rescheduling is needed, and when the goroutine
can run again, it can return to the start label on its original stack
and re-execute the split check.

Here is an example of a split stack overflow showing the full
trace, without any special cases in the stack printer.
(This one was triggered by making the split check incorrect.)

runtime: newstack framesize=0x0 argsize=0x18 sp=0x6aebd0 stack=[0x6b0000, 0x6b0fa0]
        morebuf={pc:0x69f5b sp:0x6aebd8 lr:0x0}
        sched={pc:0x68880 sp:0x6aebd0 lr:0x0 ctxt:0x34e700}
runtime: split stack overflow: 0x6aebd0 < 0x6b0000
fatal error: runtime: split stack overflow

goroutine 1 [stack split]:
runtime.mallocgc(0x290, 0x100000000, 0x1)
        /Users/rsc/g/go/src/pkg/runtime/zmalloc_darwin_amd64.c:21 fp=0x6aebd8
runtime.new()
        /Users/rsc/g/go/src/pkg/runtime/zmalloc_darwin_amd64.c:682 +0x5b fp=0x6aec08
go/build.(*Context).Import(0x5ae340, 0xc210030c71, 0xa, 0xc2100b4380, 0x1b, ...)
        /Users/rsc/g/go/src/pkg/go/build/build.go:424 +0x3a fp=0x6b00a0
main.loadImport(0xc210030c71, 0xa, 0xc2100b4380, 0x1b, 0xc2100b42c0, ...)
        /Users/rsc/g/go/src/cmd/go/pkg.go:249 +0x371 fp=0x6b01a8
main.(*Package).load(0xc21017c800, 0xc2100b42c0, 0xc2101828c0, 0x0, 0x0, ...)
        /Users/rsc/g/go/src/cmd/go/pkg.go:431 +0x2801 fp=0x6b0c98
main.loadPackage(0x369040, 0x7, 0xc2100b42c0, 0x0)
        /Users/rsc/g/go/src/cmd/go/pkg.go:709 +0x857 fp=0x6b0f80
----- stack segment boundary -----
main.(*builder).action(0xc2100902a0, 0x0, 0x0, 0xc2100e6c00, 0xc2100e5750, ...)
        /Users/rsc/g/go/src/cmd/go/build.go:539 +0x437 fp=0x6b14a0
main.(*builder).action(0xc2100902a0, 0x0, 0x0, 0xc21015b400, 0x2, ...)
        /Users/rsc/g/go/src/cmd/go/build.go:528 +0x1d2 fp=0x6b1658
main.(*builder).test(0xc2100902a0, 0xc210092000, 0x0, 0x0, 0xc21008ff60, ...)
        /Users/rsc/g/go/src/cmd/go/test.go:622 +0x1b53 fp=0x6b1f68
----- stack segment boundary -----
main.runTest(0x5a6b20, 0xc21000a020, 0x2, 0x2)
        /Users/rsc/g/go/src/cmd/go/test.go:366 +0xd09 fp=0x6a5cf0
main.main()
        /Users/rsc/g/go/src/cmd/go/main.go:161 +0x4f9 fp=0x6a5f78
runtime.main()
        /Users/rsc/g/go/src/pkg/runtime/proc.c:183 +0x92 fp=0x6a5fa0
runtime.goexit()
        /Users/rsc/g/go/src/pkg/runtime/proc.c:1266 fp=0x6a5fa8

And here is a seg fault during oldstack:

SIGSEGV: segmentation violation
PC=0x1b2a6

runtime.oldstack()
        /Users/rsc/g/go/src/pkg/runtime/stack.c:159 +0x76
runtime.lessstack()
        /Users/rsc/g/go/src/pkg/runtime/asm_amd64.s:270 +0x22

goroutine 1 [stack unsplit]:
fmt.(*pp).printArg(0x2102e64e0, 0xe5c80, 0x2102c9220, 0x73, 0x0, ...)
        /Users/rsc/g/go/src/pkg/fmt/print.go:818 +0x3d3 fp=0x221031e6f8
fmt.(*pp).doPrintf(0x2102e64e0, 0x12fb20, 0x2, 0x221031eb98, 0x1, ...)
        /Users/rsc/g/go/src/pkg/fmt/print.go:1183 +0x15cb fp=0x221031eaf0
fmt.Sprintf(0x12fb20, 0x2, 0x221031eb98, 0x1, 0x1, ...)
        /Users/rsc/g/go/src/pkg/fmt/print.go:234 +0x67 fp=0x221031eb40
flag.(*stringValue).String(0x2102c9210, 0x1, 0x0)
        /Users/rsc/g/go/src/pkg/flag/flag.go:180 +0xb3 fp=0x221031ebb0
flag.(*FlagSet).Var(0x2102f6000, 0x293d38, 0x2102c9210, 0x143490, 0xa, ...)
        /Users/rsc/g/go/src/pkg/flag/flag.go:633 +0x40 fp=0x221031eca0
flag.(*FlagSet).StringVar(0x2102f6000, 0x2102c9210, 0x143490, 0xa, 0x12fa60, ...)
        /Users/rsc/g/go/src/pkg/flag/flag.go:550 +0x91 fp=0x221031ece8
flag.(*FlagSet).String(0x2102f6000, 0x143490, 0xa, 0x12fa60, 0x0, ...)
        /Users/rsc/g/go/src/pkg/flag/flag.go:563 +0x87 fp=0x221031ed38
flag.String(0x143490, 0xa, 0x12fa60, 0x0, 0x161950, ...)
        /Users/rsc/g/go/src/pkg/flag/flag.go:570 +0x6b fp=0x221031ed80
testing.init()
        /Users/rsc/g/go/src/pkg/testing/testing.go:-531 +0xbb fp=0x221031edc0
strings_test.init()
        /Users/rsc/g/go/src/pkg/strings/strings_test.go:1115 +0x62 fp=0x221031ef70
main.init()
        strings/_test/_testmain.go:90 +0x3d fp=0x221031ef78
runtime.main()
        /Users/rsc/g/go/src/pkg/runtime/proc.c:180 +0x8a fp=0x221031efa0
runtime.goexit()
        /Users/rsc/g/go/src/pkg/runtime/proc.c:1269 fp=0x221031efa8

goroutine 2 [runnable]:
runtime.MHeap_Scavenger()
        /Users/rsc/g/go/src/pkg/runtime/mheap.c:438
runtime.goexit()
        /Users/rsc/g/go/src/pkg/runtime/proc.c:1269
created by runtime.main
        /Users/rsc/g/go/src/pkg/runtime/proc.c:166

rax     0x23ccc0
rbx     0x23ccc0
rcx     0x0
rdx     0x38
rdi     0x2102c0170
rsi     0x221032cfe0
rbp     0x221032cfa0
rsp     0x7fff5fbff5b0
r8      0x2102c0120
r9      0x221032cfa0
r10     0x221032c000
r11     0x104ce8
r12     0xe5c80
r13     0x1be82baac718
r14     0x13091135f7d69200
r15     0x0
rip     0x1b2a6
rflags  0x10246
cs      0x2b
fs      0x0
gs      0x0

Fixes #5723.

R=r, dvyukov, go.peter.90, dave, iant
CC=golang-dev
https://golang.org/cl/10360048

											
										
										
											2013-06-27 09:32:01 -06:00
+									gp->status = Grunning;
-												runtime: add lr, ctxt, ret to Gobuf

Add gostartcall and gostartcallfn.
The old gogocall = gostartcall + gogo.
The old gogocallfn = gostartcallfn + gogo.

R=dvyukov, minux.ma
CC=golang-dev
https://golang.org/cl/10036044

											
										
										
											2013-06-12 13:22:26 -06:00
+									runtime·gogo(&gp->sched);
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+								}
-												cmd/gc, reflect, runtime: switch to indirect func value representation

Step 1 of http://golang.org/s/go11func.

R=golang-dev, r, daniel.morsing, remyoudompheng
CC=golang-dev
https://golang.org/cl/7393045

											
										
										
											2013-02-21 15:01:13 -07:00
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+								void
-												runtime: clean up naming of mcallable functions.

Introduce the mFunction type to represent an mcall/onM-able function.
Name such functions using _m.

LGTM=bradfitz
R=bradfitz
CC=golang-codereviews
https://golang.org/cl/121320043

											
										
										
											2014-08-06 15:33:57 -06:00
+								runtime·gc_m(void)
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+								{
 									struct gc_args a;
 									G *gp;
 									gp = g->m->curg;
 									gp->status = Gwaiting;
-												runtime: convert common scheduler functions to Go
These are required for chans, semaphores, timers, etc.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/123640043

											
										
										
											2014-08-21 10:41:09 -06:00
+									gp->waitreason = runtime·gostringnocopy((byte*)"garbage collection");
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
-												runtime: fix MemStats on 32-bits
Int64's do not fit into uintptr's.

LGTM=khr
R=golang-codereviews, khr, rsc
CC=golang-codereviews, rlh
https://golang.org/cl/128380043

											
										
										
											2014-08-19 01:53:20 -06:00
+									a.start_time = (uint64)(g->m->scalararg[0]) | ((uint64)(g->m->scalararg[1]) << 32);
 									a.eagersweep = g->m->scalararg[2];
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									gc(&a);
 									gp->status = Grunning;
 								}
-												runtime: use reflect·call() to enter the function gc()

Garbage collection code (to be merged later) is calling functions
which have many local variables. This increases the probability that
the stack capacity won't be big enough to hold the local variables.
So, start gc() on a bigger stack to eliminate a potentially large number
of calls to runtime·morestack().

R=rsc, remyoudompheng, dsymonds, minux.ma, iant, iant
CC=golang-dev
https://golang.org/cl/6846044

											
										
										
											2012-11-27 11:04:59 -07:00
+								static void
 								gc(struct gc_args *args)
 								{
-												runtime: account stop-the-world time in the "other" GOGCTRACE section
Currently it's summed to mark phase.
The change makes it easier to diagnose long stop-the-world phases.

R=golang-dev, bradfitz, dave
CC=golang-dev
https://golang.org/cl/7182043

											
										
										
											2013-01-22 02:44:49 -07:00
+									int64 t0, t1, t2, t3, t4;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									uint64 heap0, heap1, obj;
-												runtime: use reflect·call() to enter the function gc()

Garbage collection code (to be merged later) is calling functions
which have many local variables. This increases the probability that
the stack capacity won't be big enough to hold the local variables.
So, start gc() on a bigger stack to eliminate a potentially large number
of calls to runtime·morestack().

R=rsc, remyoudompheng, dsymonds, minux.ma, iant, iant
CC=golang-dev
https://golang.org/cl/6846044

											
										
										
											2012-11-27 11:04:59 -07:00
+									GCStats stats;
-												runtime: adjust GODEBUG=allocfreetrace=1 and GODEBUG=gcdead=1

GODEBUG=allocfreetrace=1:

The allocfreetrace=1 mode prints a stack trace for each block
allocated and freed, and also a stack trace for each garbage collection.

It was implemented by reusing the heap profiling support: if allocfreetrace=1
then the heap profile was effectively running at 1 sample per 1 byte allocated
(always sample). The stack being shown at allocation was the stack gathered
for profiling, meaning it was derived only from the program counters and
did not include information about function arguments or frame pointers.
The stack being shown at free was the allocation stack, not the free stack.
If you are generating this log, you can find the allocation stack yourself, but
it can be useful to see exactly the sequence that led to freeing the block:
was it the garbage collector or an explicit free? Now that the garbage collector
runs on an m0 stack, the stack trace for the garbage collector was never interesting.

Fix all these problems:

1. Decouple allocfreetrace=1 from heap profiling.
2. Print the standard goroutine stack traces instead of a custom format.
3. Print the stack trace at time of allocation for an allocation,
   and print the stack trace at time of free (not the allocation trace again)
   for a free.
4. Print all goroutine stacks at garbage collection. Having all the stacks
   means that you can see the exact point at which each goroutine was
   preempted, which is often useful for identifying liveness-related errors.

GODEBUG=gcdead=1:

This mode overwrites dead pointers with a poison value.
Detect the poison value as an invalid pointer during collection,
the same way that small integers are invalid pointers.

LGTM=khr
R=khr
CC=golang-codereviews
https://golang.org/cl/81670043

											
										
										
											2014-04-01 11:30:10 -06:00
+									if(runtime·debug.allocfreetrace)
 										runtime·tracegc();
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->traceback = 2;
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+									t0 = args->start_time;
-												runtime: output how long goroutines are blocked
Example of output:

goroutine 4 [sleep for 3 min]:
time.Sleep(0x34630b8a000)
        src/pkg/runtime/time.goc:31 +0x31
main.func·002()
        block.go:16 +0x2c
created by main.main
        block.go:17 +0x33

Full program and output are here:
http://play.golang.org/p/NEZdADI3Td

Fixes #6809.

R=golang-codereviews, khr, kamil.kisiel, bradfitz, rsc
CC=golang-codereviews
https://golang.org/cl/50420043

											
										
										
											2014-01-16 01:54:46 -07:00
+									work.tstart = args->start_time;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
-												runtime: use 3x fewer nanotime calls in garbage collection

Cuts the number of calls from 6 to 2 in the non-debug case.

LGTM=iant
R=golang-codereviews, iant
CC=0intro, aram, golang-codereviews, khr
https://golang.org/cl/86040043

											
										
										
											2014-04-09 08:38:12 -06:00
+									t1 = 0;
 									if(runtime·debug.gctrace)
 										t1 = runtime·nanotime();
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
 									// Sweep what is not sweeped by bgsweep.
 									while(runtime·sweepone() != -1)
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										sweep.npausesweep++;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+									// Cache runtime.mheap.allspans in work.spans to avoid conflicts with
 									// resizing/freeing allspans.
 									// New spans can be created while GC progresses, but they are not garbage for
 									// this round:
 									//  - new stack spans can be created even while the world is stopped.
 									//  - new malloc spans can be created during the concurrent sweep
 									// Even if this is stop-the-world, a concurrent exitsyscall can allocate a stack from heap.
 									runtime·lock(&runtime·mheap.lock);
 									// Free the old cached sweep array if necessary.
 									if(work.spans != nil && work.spans != runtime·mheap.allspans)
 										runtime·SysFree(work.spans, work.nspan*sizeof(work.spans[0]), &mstats.other_sys);
 									// Cache the current array for marking.
 									runtime·mheap.gcspans = runtime·mheap.allspans;
 									work.spans = runtime·mheap.allspans;
 									work.nspan = runtime·mheap.nspan;
 									runtime·unlock(&runtime·mheap.lock);
-												runtime: faster GC sweep phase
benchmark                              old ns/op    new ns/op    delta

garbage.BenchmarkParser               3731065750   3715543750   -0.41%
garbage.BenchmarkParser-2             3631299750   3495248500   -3.75%
garbage.BenchmarkParser-4             3386486000   3339353000   -1.39%
garbage.BenchmarkParser-8             3267632000   3286422500   +0.58%
garbage.BenchmarkParser-16            3299203000   3316081750   +0.51%

garbage.BenchmarkTree                  977532888    919453833   -5.94%
garbage.BenchmarkTree-2                919948555    853478000   -7.23%
garbage.BenchmarkTree-4                841329000    790207000   -6.08%
garbage.BenchmarkTree-8                787792777    740380666   -6.01%
garbage.BenchmarkTree-16               899257166    846594555   -5.86%

garbage.BenchmarkTree2                 574876300    571885800   -0.52%
garbage.BenchmarkTree2-2               348162700    345888900   -0.65%
garbage.BenchmarkTree2-4               184912500    179137000   -3.22%
garbage.BenchmarkTree2-8               104243900    103485600   -0.73%
garbage.BenchmarkTree2-16               97269500     85137100  -14.25%

garbage.BenchmarkParserPause           141101976    157746974  +11.80%
garbage.BenchmarkParserPause-2         103096051     83043048  -19.45%
garbage.BenchmarkParserPause-4          52153133     45951111  -11.89%
garbage.BenchmarkParserPause-8          36730190     38901024   +5.91%
garbage.BenchmarkParserPause-16         32678875     29578585   -9.49%

garbage.BenchmarkTreePause              29487065     29648439   +0.55%
garbage.BenchmarkTreePause-2            22443494     21306159   -5.07%
garbage.BenchmarkTreePause-4            15799691     14985647   -5.15%
garbage.BenchmarkTreePause-8            10768112     9531420   -12.97%
garbage.BenchmarkTreePause-16           16329891     15205158   -6.89%

garbage.BenchmarkTree2Pause           2586957240   2577533200   -0.36%
garbage.BenchmarkTree2Pause-2         1683383760   1673923800   -0.56%
garbage.BenchmarkTree2Pause-4         1102860320   1074040280   -2.68%
garbage.BenchmarkTree2Pause-8          902627920    886122400   -1.86%
garbage.BenchmarkTree2Pause-16         856470920    804152320   -6.50%

garbage.BenchmarkParserLastPause       277316000    280839000   +1.25%
garbage.BenchmarkParserLastPause-2     179446000    163687000   -8.78%
garbage.BenchmarkParserLastPause-4     106752000     94144000  -11.81%
garbage.BenchmarkParserLastPause-8      57758000     61640000   +6.72%
garbage.BenchmarkParserLastPause-16     51235000     42552000  -16.95%

garbage.BenchmarkTreeLastPause          45244000     50786000  +12.25%
garbage.BenchmarkTreeLastPause-2        37163000     34654000   -6.75%
garbage.BenchmarkTreeLastPause-4        24178000     21967000   -9.14%
garbage.BenchmarkTreeLastPause-8        20390000     15648000  -30.30%
garbage.BenchmarkTreeLastPause-16       22398000     20180000   -9.90%

garbage.BenchmarkTree2LastPause       5748706000   5718809000   -0.52%
garbage.BenchmarkTree2LastPause-2     3481570000   3458844000   -0.65%
garbage.BenchmarkTree2LastPause-4     1849073000   1791330000   -3.22%
garbage.BenchmarkTree2LastPause-8     1042375000   1034811000   -0.73%
garbage.BenchmarkTree2LastPause-16     972637000    851323000  -14.25%

There is also visible improvement in consumed CPU time:
tree2 -heapsize=8000000000 -cpus=12
before: 248.74user 6.36system 0:52.74elapsed 483%CPU
after:  229.86user 6.33system 0:51.08elapsed 462%CPU
-1.66s of real time, but -18.91s of consumed CPU time

R=golang-dev
CC=golang-dev
https://golang.org/cl/6215065

											
										
										
											2012-05-22 11:35:52 -06:00
+									work.nwait = 0;
 									work.ndone = 0;
-												runtime: refactor helpgc functionality in preparation for parallel GC
Parallel GC needs to know in advance how many helper threads will be there.
Hopefully it's the last patch before I can tackle parallel sweep phase.
The benchmarks are unaffected.

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/6200064

											
										
										
											2012-05-15 09:10:16 -06:00
+									work.nproc = runtime·gcprocs();
-												runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase.
This hinders parallelism.
With this change we just kick off parallel for for number_of_goroutines+5 iterations.
Then parallel for callback decides whether it needs to scan stack of a goroutine
scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely.
This requires to store all goroutines in an array instead of a linked list
(to allow direct indexing).
This CL also removes DebugScan functionality. It is broken because it uses
unbounded stack, so it can not run on g0. When it was working, I've found
it helpless for debugging issues because the two algorithms are too different now.
This change would require updating the DebugScan, so it's simpler to just delete it.

With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same.

garbage-8
allocated                 2987886      2989221      +0.04%
allocs                      62885        62887      +0.00%
cputime                  21286000     21272000      -0.07%
gc-pause-one             26633247     24885421      -6.56%
gc-pause-total             873570       811264      -7.13%
rss                     242089984    242515968      +0.18%
sys-gc                   13934336     13869056      -0.47%
sys-heap                205062144    205062144      +0.00%
sys-other                12628288     12628288      +0.00%
sys-stack                11534336     11927552      +3.41%
sys-total               243159104    243487040      +0.13%
time                      2809477      2740795      -2.44%

R=golang-codereviews, rsc
CC=cshapiro, golang-codereviews, khr
https://golang.org/cl/46860043

											
										
										
											2014-01-21 02:06:57 -07:00
+									runtime·parforsetup(work.markfor, work.nproc, RootCount + runtime·allglen, nil, false, markroot);
-												runtime: refactor helpgc functionality in preparation for parallel GC
Parallel GC needs to know in advance how many helper threads will be there.
Hopefully it's the last patch before I can tackle parallel sweep phase.
The benchmarks are unaffected.

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/6200064

											
										
										
											2012-05-15 09:10:16 -06:00
+									if(work.nproc > 1) {
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+										runtime·noteclear(&work.alldone);
-												runtime: refactor helpgc functionality in preparation for parallel GC
Parallel GC needs to know in advance how many helper threads will be there.
Hopefully it's the last patch before I can tackle parallel sweep phase.
The benchmarks are unaffected.

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/6200064

											
										
										
											2012-05-15 09:10:16 -06:00
+										runtime·helpgc(work.nproc);
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
+									}
-												runtime: use 3x fewer nanotime calls in garbage collection

Cuts the number of calls from 6 to 2 in the non-debug case.

LGTM=iant
R=golang-codereviews, iant
CC=0intro, aram, golang-codereviews, khr
https://golang.org/cl/86040043

											
										
										
											2014-04-09 08:38:12 -06:00
+									t2 = 0;
 									if(runtime·debug.gctrace)
 										t2 = runtime·nanotime();
-												runtime: account stop-the-world time in the "other" GOGCTRACE section
Currently it's summed to mark phase.
The change makes it easier to diagnose long stop-the-world phases.

R=golang-dev, bradfitz, dave
CC=golang-dev
https://golang.org/cl/7182043

											
										
										
											2013-01-22 02:44:49 -07:00
-												runtime: faster parallel GC
Use per-thread work buffers instead of global mutex-protected pool. This eliminates contention from parallel scan phase.

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkTree2-8               97100768     71417553  -26.45%
garbage.BenchmarkTree2LastPause-8     970931485    714103692  -26.45%
garbage.BenchmarkTree2Pause-8         469127802    345029253  -26.45%
garbage.BenchmarkParser-8            2880950854   2715456901   -5.74%
garbage.BenchmarkParserLastPause-8    137047399    103336476  -24.60%
garbage.BenchmarkParserPause-8         80686028     58922680  -26.97%

R=golang-dev, 0xe2.0x9a.0x9b, dave, adg, rsc, iant
CC=golang-dev
https://golang.org/cl/7816044

											
										
										
											2013-03-21 02:48:02 -06:00
+									gchelperstart();
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
+									runtime·parfordo(work.markfor);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									scanblock(nil, 0, nil);
-												runtime: faster GC mark phase
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

											
										
										
											2012-05-24 00:55:50 -06:00
-												runtime: use 3x fewer nanotime calls in garbage collection

Cuts the number of calls from 6 to 2 in the non-debug case.

LGTM=iant
R=golang-codereviews, iant
CC=0intro, aram, golang-codereviews, khr
https://golang.org/cl/86040043

											
										
										
											2014-04-09 08:38:12 -06:00
+									t3 = 0;
 									if(runtime·debug.gctrace)
 										t3 = runtime·nanotime();
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
-												runtime: faster GC sweep phase
benchmark                              old ns/op    new ns/op    delta

garbage.BenchmarkParser               3731065750   3715543750   -0.41%
garbage.BenchmarkParser-2             3631299750   3495248500   -3.75%
garbage.BenchmarkParser-4             3386486000   3339353000   -1.39%
garbage.BenchmarkParser-8             3267632000   3286422500   +0.58%
garbage.BenchmarkParser-16            3299203000   3316081750   +0.51%

garbage.BenchmarkTree                  977532888    919453833   -5.94%
garbage.BenchmarkTree-2                919948555    853478000   -7.23%
garbage.BenchmarkTree-4                841329000    790207000   -6.08%
garbage.BenchmarkTree-8                787792777    740380666   -6.01%
garbage.BenchmarkTree-16               899257166    846594555   -5.86%

garbage.BenchmarkTree2                 574876300    571885800   -0.52%
garbage.BenchmarkTree2-2               348162700    345888900   -0.65%
garbage.BenchmarkTree2-4               184912500    179137000   -3.22%
garbage.BenchmarkTree2-8               104243900    103485600   -0.73%
garbage.BenchmarkTree2-16               97269500     85137100  -14.25%

garbage.BenchmarkParserPause           141101976    157746974  +11.80%
garbage.BenchmarkParserPause-2         103096051     83043048  -19.45%
garbage.BenchmarkParserPause-4          52153133     45951111  -11.89%
garbage.BenchmarkParserPause-8          36730190     38901024   +5.91%
garbage.BenchmarkParserPause-16         32678875     29578585   -9.49%

garbage.BenchmarkTreePause              29487065     29648439   +0.55%
garbage.BenchmarkTreePause-2            22443494     21306159   -5.07%
garbage.BenchmarkTreePause-4            15799691     14985647   -5.15%
garbage.BenchmarkTreePause-8            10768112     9531420   -12.97%
garbage.BenchmarkTreePause-16           16329891     15205158   -6.89%

garbage.BenchmarkTree2Pause           2586957240   2577533200   -0.36%
garbage.BenchmarkTree2Pause-2         1683383760   1673923800   -0.56%
garbage.BenchmarkTree2Pause-4         1102860320   1074040280   -2.68%
garbage.BenchmarkTree2Pause-8          902627920    886122400   -1.86%
garbage.BenchmarkTree2Pause-16         856470920    804152320   -6.50%

garbage.BenchmarkParserLastPause       277316000    280839000   +1.25%
garbage.BenchmarkParserLastPause-2     179446000    163687000   -8.78%
garbage.BenchmarkParserLastPause-4     106752000     94144000  -11.81%
garbage.BenchmarkParserLastPause-8      57758000     61640000   +6.72%
garbage.BenchmarkParserLastPause-16     51235000     42552000  -16.95%

garbage.BenchmarkTreeLastPause          45244000     50786000  +12.25%
garbage.BenchmarkTreeLastPause-2        37163000     34654000   -6.75%
garbage.BenchmarkTreeLastPause-4        24178000     21967000   -9.14%
garbage.BenchmarkTreeLastPause-8        20390000     15648000  -30.30%
garbage.BenchmarkTreeLastPause-16       22398000     20180000   -9.90%

garbage.BenchmarkTree2LastPause       5748706000   5718809000   -0.52%
garbage.BenchmarkTree2LastPause-2     3481570000   3458844000   -0.65%
garbage.BenchmarkTree2LastPause-4     1849073000   1791330000   -3.22%
garbage.BenchmarkTree2LastPause-8     1042375000   1034811000   -0.73%
garbage.BenchmarkTree2LastPause-16     972637000    851323000  -14.25%

There is also visible improvement in consumed CPU time:
tree2 -heapsize=8000000000 -cpus=12
before: 248.74user 6.36system 0:52.74elapsed 483%CPU
after:  229.86user 6.33system 0:51.08elapsed 462%CPU
-1.66s of real time, but -18.91s of consumed CPU time

R=golang-dev
CC=golang-dev
https://golang.org/cl/6215065

											
										
										
											2012-05-22 11:35:52 -06:00
+									if(work.nproc > 1)
 										runtime·notesleep(&work.alldone);
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
+									cachestats();
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									// next_gc calculation is tricky with concurrent sweep since we don't know size of live heap
 									// estimate what was live heap size after previous GC (for tracing only)
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									heap0 = mstats.next_gc*100/(runtime·gcpercent+100);
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									// conservatively set next_gc to high value assuming that everything is live
 									// concurrent/lazy sweep will reduce this number while discovering new garbage
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*runtime·gcpercent/100;
-												runtime: garbage collection + malloc performance
  * add bit tracking finalizer status, avoiding getfinalizer lookup
  * add ability to allocate uncleared memory

R=iant
CC=golang-dev
https://golang.org/cl/207044

											
										
										
											2010-02-10 01:00:12 -07:00
-												runtime: account stop-the-world time in the "other" GOGCTRACE section
Currently it's summed to mark phase.
The change makes it easier to diagnose long stop-the-world phases.

R=golang-dev, bradfitz, dave
CC=golang-dev
https://golang.org/cl/7182043

											
										
										
											2013-01-22 02:44:49 -07:00
+									t4 = runtime·nanotime();
-												runtime: fix triggering of forced GC
mstats.last_gc is unix time now, it is compared with abstract monotonic time.
On my machine GC is forced every 5 mins regardless of last_gc.

LGTM=rsc
R=golang-codereviews
CC=golang-codereviews, iant, rsc
https://golang.org/cl/91350045

											
										
										
											2014-05-12 23:53:03 -06:00
+									mstats.last_gc = runtime·unixnanotime();  // must be Unix time to make sense to user
-												runtime: account stop-the-world time in the "other" GOGCTRACE section
Currently it's summed to mark phase.
The change makes it easier to diagnose long stop-the-world phases.

R=golang-dev, bradfitz, dave
CC=golang-dev
https://golang.org/cl/7182043

											
										
										
											2013-01-22 02:44:49 -07:00
+									mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t4 - t0;
 									mstats.pause_total_ns += t4 - t0;
-												runtime: instrument malloc + garbage collector.
add simple garbage collection benchmark.

R=iant
CC=golang-dev
https://golang.org/cl/204053

											
										
										
											2010-02-08 15:32:22 -07:00
+									mstats.numgc++;
 									if(mstats.debuggc)
-												runtime: account stop-the-world time in the "other" GOGCTRACE section
Currently it's summed to mark phase.
The change makes it easier to diagnose long stop-the-world phases.

R=golang-dev, bradfitz, dave
CC=golang-dev
https://golang.org/cl/7182043

											
										
										
											2013-01-22 02:44:49 -07:00
+										runtime·printf("pause %D\n", t4-t0);
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
-												runtime: introduce GODEBUG env var
Currently it replaces GOGCTRACE env var (GODEBUG=gctrace=1).
The plan is to extend it with other type of debug tracing,
e.g. GODEBUG=gctrace=1,schedtrace=100.

R=rsc
CC=bradfitz, daniel.morsing, gobot, golang-dev
https://golang.org/cl/10026045

											
										
										
											2013-06-28 08:37:06 -06:00
+									if(runtime·debug.gctrace) {
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
+										heap1 = mstats.heap_alloc;
-												runtime: WriteHeapDump dumps the heap to a file.

See http://golang.org/s/go13heapdump for the file format.

LGTM=rsc
R=rsc, bradfitz, dvyukov, khr
CC=golang-codereviews
https://golang.org/cl/37540043

											
										
										
											2014-03-25 16:09:49 -06:00
+										runtime·updatememstats(&stats);
-												runtime: fix runaway memory usage
It was caused by mstats.heap_alloc skew.
Fixes #7430.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/69870055

											
										
										
											2014-03-06 10:33:00 -07:00
+										if(heap1 != mstats.heap_alloc) {
-												runtime: fix warnings on Plan 9

warning: pkg/runtime/mgc0.c:2352 format mismatch p UVLONG, arg 2
warning: pkg/runtime/mgc0.c:2352 format mismatch p UVLONG, arg 3

LGTM=bradfitz
R=golang-codereviews, bradfitz
CC=golang-codereviews
https://golang.org/cl/71950044

											
										
										
											2014-03-06 12:56:22 -07:00
+											runtime·printf("runtime: mstats skew: heap=%D/%D\n", heap1, mstats.heap_alloc);
-												runtime: fix runaway memory usage
It was caused by mstats.heap_alloc skew.
Fixes #7430.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/69870055

											
										
										
											2014-03-06 10:33:00 -07:00
+											runtime·throw("mstats skew");
 										}
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+										obj = mstats.nmalloc - mstats.nfree;
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+										stats.nprocyield += work.markfor->nprocyield;
 										stats.nosyield += work.markfor->nosyield;
 										stats.nsleep += work.markfor->nsleep;
-												runtime: speedup malloc stats collection
Count only number of frees, everything else is derivable
and does not need to be counted on every malloc.
benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    68           66   -3.07%
BenchmarkMalloc16                   75           70   -6.48%
BenchmarkMallocTypeInfo8           102           97   -4.80%
BenchmarkMallocTypeInfo16          108          105   -2.78%

R=golang-dev, dave, rsc
CC=golang-dev
https://golang.org/cl/9776043

											
										
										
											2013-06-06 04:56:50 -06:00
-												runtime: make times in GODEBUG=gctrace=1 output clearer

TBR=0intro
CC=golang-codereviews
https://golang.org/cl/86620043

											
										
										
											2014-04-10 12:34:48 -06:00
+										runtime·printf("gc%d(%d): %D+%D+%D+%D us, %D -> %D MB, %D (%D-%D) objects,"
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+												" %d/%d/%d sweeps,"
-												runtime: faster GC sweep phase
benchmark                              old ns/op    new ns/op    delta

garbage.BenchmarkParser               3731065750   3715543750   -0.41%
garbage.BenchmarkParser-2             3631299750   3495248500   -3.75%
garbage.BenchmarkParser-4             3386486000   3339353000   -1.39%
garbage.BenchmarkParser-8             3267632000   3286422500   +0.58%
garbage.BenchmarkParser-16            3299203000   3316081750   +0.51%

garbage.BenchmarkTree                  977532888    919453833   -5.94%
garbage.BenchmarkTree-2                919948555    853478000   -7.23%
garbage.BenchmarkTree-4                841329000    790207000   -6.08%
garbage.BenchmarkTree-8                787792777    740380666   -6.01%
garbage.BenchmarkTree-16               899257166    846594555   -5.86%

garbage.BenchmarkTree2                 574876300    571885800   -0.52%
garbage.BenchmarkTree2-2               348162700    345888900   -0.65%
garbage.BenchmarkTree2-4               184912500    179137000   -3.22%
garbage.BenchmarkTree2-8               104243900    103485600   -0.73%
garbage.BenchmarkTree2-16               97269500     85137100  -14.25%

garbage.BenchmarkParserPause           141101976    157746974  +11.80%
garbage.BenchmarkParserPause-2         103096051     83043048  -19.45%
garbage.BenchmarkParserPause-4          52153133     45951111  -11.89%
garbage.BenchmarkParserPause-8          36730190     38901024   +5.91%
garbage.BenchmarkParserPause-16         32678875     29578585   -9.49%

garbage.BenchmarkTreePause              29487065     29648439   +0.55%
garbage.BenchmarkTreePause-2            22443494     21306159   -5.07%
garbage.BenchmarkTreePause-4            15799691     14985647   -5.15%
garbage.BenchmarkTreePause-8            10768112     9531420   -12.97%
garbage.BenchmarkTreePause-16           16329891     15205158   -6.89%

garbage.BenchmarkTree2Pause           2586957240   2577533200   -0.36%
garbage.BenchmarkTree2Pause-2         1683383760   1673923800   -0.56%
garbage.BenchmarkTree2Pause-4         1102860320   1074040280   -2.68%
garbage.BenchmarkTree2Pause-8          902627920    886122400   -1.86%
garbage.BenchmarkTree2Pause-16         856470920    804152320   -6.50%

garbage.BenchmarkParserLastPause       277316000    280839000   +1.25%
garbage.BenchmarkParserLastPause-2     179446000    163687000   -8.78%
garbage.BenchmarkParserLastPause-4     106752000     94144000  -11.81%
garbage.BenchmarkParserLastPause-8      57758000     61640000   +6.72%
garbage.BenchmarkParserLastPause-16     51235000     42552000  -16.95%

garbage.BenchmarkTreeLastPause          45244000     50786000  +12.25%
garbage.BenchmarkTreeLastPause-2        37163000     34654000   -6.75%
garbage.BenchmarkTreeLastPause-4        24178000     21967000   -9.14%
garbage.BenchmarkTreeLastPause-8        20390000     15648000  -30.30%
garbage.BenchmarkTreeLastPause-16       22398000     20180000   -9.90%

garbage.BenchmarkTree2LastPause       5748706000   5718809000   -0.52%
garbage.BenchmarkTree2LastPause-2     3481570000   3458844000   -0.65%
garbage.BenchmarkTree2LastPause-4     1849073000   1791330000   -3.22%
garbage.BenchmarkTree2LastPause-8     1042375000   1034811000   -0.73%
garbage.BenchmarkTree2LastPause-16     972637000    851323000  -14.25%

There is also visible improvement in consumed CPU time:
tree2 -heapsize=8000000000 -cpus=12
before: 248.74user 6.36system 0:52.74elapsed 483%CPU
after:  229.86user 6.33system 0:51.08elapsed 462%CPU
-1.66s of real time, but -18.91s of consumed CPU time

R=golang-dev
CC=golang-dev
https://golang.org/cl/6215065

											
										
										
											2012-05-22 11:35:52 -06:00
+												" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
-												runtime: make times in GODEBUG=gctrace=1 output clearer

TBR=0intro
CC=golang-codereviews
https://golang.org/cl/86620043

											
										
										
											2014-04-10 12:34:48 -06:00
+											mstats.numgc, work.nproc, (t1-t0)/1000, (t2-t1)/1000, (t3-t2)/1000, (t4-t3)/1000,
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											heap0>>20, heap1>>20, obj,
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+											mstats.nmalloc, mstats.nfree,
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+											work.nspan, sweep.nbgsweep, sweep.npausesweep,
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+											stats.nhandoff, stats.nhandoffcnt,
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+											work.markfor->nsteal, work.markfor->nstealcnt,
-												runtime: make GC stats per-M
This is factored out part of:
https://golang.org/cl/5279048/
(Parallel GC)

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkParser              3999106750   3975026500   -0.60%
garbage.BenchmarkParser-2            3720553750   3719196500   -0.04%
garbage.BenchmarkParser-4            3502857000   3474980500   -0.80%
garbage.BenchmarkParser-8            3375448000   3341310500   -1.01%
garbage.BenchmarkParserLastPause      329401000    324097000   -1.61%
garbage.BenchmarkParserLastPause-2    208953000    214222000   +2.52%
garbage.BenchmarkParserLastPause-4    110933000    111656000   +0.65%
garbage.BenchmarkParserLastPause-8     71969000     78230000   +8.70%
garbage.BenchmarkParserPause          230808842    197237400  -14.55%
garbage.BenchmarkParserPause-2        123674365    125197595   +1.23%
garbage.BenchmarkParserPause-4         80518525     85710333   +6.45%
garbage.BenchmarkParserPause-8         58310243     56940512   -2.35%
garbage.BenchmarkTree2                 31471700     31289400   -0.58%
garbage.BenchmarkTree2-2               21536800     21086300   -2.09%
garbage.BenchmarkTree2-4               11074700     10880000   -1.76%
garbage.BenchmarkTree2-8                7568600      7351400   -2.87%
garbage.BenchmarkTree2LastPause       314664000    312840000   -0.58%
garbage.BenchmarkTree2LastPause-2     215319000    210815000   -2.09%
garbage.BenchmarkTree2LastPause-4     110698000    108751000   -1.76%
garbage.BenchmarkTree2LastPause-8      75635000     73463000   -2.87%
garbage.BenchmarkTree2Pause           174280857    173147571   -0.65%
garbage.BenchmarkTree2Pause-2         131332714    129665761   -1.27%
garbage.BenchmarkTree2Pause-4          93803095     93422904   -0.41%
garbage.BenchmarkTree2Pause-8          86242333     85146761   -1.27%

R=rsc
CC=golang-dev
https://golang.org/cl/5987045

											
										
										
											2012-04-05 10:48:28 -06:00
+											stats.nprocyield, stats.nosyield, stats.nsleep);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+										sweep.nbgsweep = sweep.npausesweep = 0;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+									}
-												runtime: faster GC sweep phase
benchmark                              old ns/op    new ns/op    delta

garbage.BenchmarkParser               3731065750   3715543750   -0.41%
garbage.BenchmarkParser-2             3631299750   3495248500   -3.75%
garbage.BenchmarkParser-4             3386486000   3339353000   -1.39%
garbage.BenchmarkParser-8             3267632000   3286422500   +0.58%
garbage.BenchmarkParser-16            3299203000   3316081750   +0.51%

garbage.BenchmarkTree                  977532888    919453833   -5.94%
garbage.BenchmarkTree-2                919948555    853478000   -7.23%
garbage.BenchmarkTree-4                841329000    790207000   -6.08%
garbage.BenchmarkTree-8                787792777    740380666   -6.01%
garbage.BenchmarkTree-16               899257166    846594555   -5.86%

garbage.BenchmarkTree2                 574876300    571885800   -0.52%
garbage.BenchmarkTree2-2               348162700    345888900   -0.65%
garbage.BenchmarkTree2-4               184912500    179137000   -3.22%
garbage.BenchmarkTree2-8               104243900    103485600   -0.73%
garbage.BenchmarkTree2-16               97269500     85137100  -14.25%

garbage.BenchmarkParserPause           141101976    157746974  +11.80%
garbage.BenchmarkParserPause-2         103096051     83043048  -19.45%
garbage.BenchmarkParserPause-4          52153133     45951111  -11.89%
garbage.BenchmarkParserPause-8          36730190     38901024   +5.91%
garbage.BenchmarkParserPause-16         32678875     29578585   -9.49%

garbage.BenchmarkTreePause              29487065     29648439   +0.55%
garbage.BenchmarkTreePause-2            22443494     21306159   -5.07%
garbage.BenchmarkTreePause-4            15799691     14985647   -5.15%
garbage.BenchmarkTreePause-8            10768112     9531420   -12.97%
garbage.BenchmarkTreePause-16           16329891     15205158   -6.89%

garbage.BenchmarkTree2Pause           2586957240   2577533200   -0.36%
garbage.BenchmarkTree2Pause-2         1683383760   1673923800   -0.56%
garbage.BenchmarkTree2Pause-4         1102860320   1074040280   -2.68%
garbage.BenchmarkTree2Pause-8          902627920    886122400   -1.86%
garbage.BenchmarkTree2Pause-16         856470920    804152320   -6.50%

garbage.BenchmarkParserLastPause       277316000    280839000   +1.25%
garbage.BenchmarkParserLastPause-2     179446000    163687000   -8.78%
garbage.BenchmarkParserLastPause-4     106752000     94144000  -11.81%
garbage.BenchmarkParserLastPause-8      57758000     61640000   +6.72%
garbage.BenchmarkParserLastPause-16     51235000     42552000  -16.95%

garbage.BenchmarkTreeLastPause          45244000     50786000  +12.25%
garbage.BenchmarkTreeLastPause-2        37163000     34654000   -6.75%
garbage.BenchmarkTreeLastPause-4        24178000     21967000   -9.14%
garbage.BenchmarkTreeLastPause-8        20390000     15648000  -30.30%
garbage.BenchmarkTreeLastPause-16       22398000     20180000   -9.90%

garbage.BenchmarkTree2LastPause       5748706000   5718809000   -0.52%
garbage.BenchmarkTree2LastPause-2     3481570000   3458844000   -0.65%
garbage.BenchmarkTree2LastPause-4     1849073000   1791330000   -3.22%
garbage.BenchmarkTree2LastPause-8     1042375000   1034811000   -0.73%
garbage.BenchmarkTree2LastPause-16     972637000    851323000  -14.25%

There is also visible improvement in consumed CPU time:
tree2 -heapsize=8000000000 -cpus=12
before: 248.74user 6.36system 0:52.74elapsed 483%CPU
after:  229.86user 6.33system 0:51.08elapsed 462%CPU
-1.66s of real time, but -18.91s of consumed CPU time

R=golang-dev
CC=golang-dev
https://golang.org/cl/6215065

											
										
										
											2012-05-22 11:35:52 -06:00
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+									// See the comment in the beginning of this function as to why we need the following.
 									// Even if this is still stop-the-world, a concurrent exitsyscall can allocate a stack from heap.
 									runtime·lock(&runtime·mheap.lock);
 									// Free the old cached mark array if necessary.
 									if(work.spans != nil && work.spans != runtime·mheap.allspans)
 										runtime·SysFree(work.spans, work.nspan*sizeof(work.spans[0]), &mstats.other_sys);
 									// Cache the current array for sweeping.
 									runtime·mheap.gcspans = runtime·mheap.allspans;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									runtime·mheap.sweepgen += 2;
 									runtime·mheap.sweepdone = false;
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+									work.spans = runtime·mheap.allspans;
 									work.nspan = runtime·mheap.nspan;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									sweep.spanidx = 0;
-												runtime: fix races on mheap.allspans

This is based on the crash dump provided by Alan
and on mental experiments:

sweep 0 74
fatal error: gc: unswept span
runtime stack:
runtime.throw(0x9df60d)
markroot(0xc208002000, 0x3)
runtime.parfordo(0xc208002000)
runtime.gchelper()

I think that when we moved all stacks into heap,
we introduced a bunch of bad data races. This was later
worsened by parallel stack shrinking.

Observation 1: exitsyscall can allocate a stack from heap at any time (including during STW).
Observation 2: parallel stack shrinking can (surprisingly) grow heap during marking.
Consider that we steadily grow stacks of a number of goroutines from 8K to 16K.
And during GC they all can be shrunk back to 8K. Shrinking will allocate lots of 8K
stacks, and we do not necessary have that many in heap at this moment. So shrinking
can grow heap as well.

Consequence: any access to mheap.allspans in GC (and otherwise) must take heap lock.
This is not true in several places.

Fix this by protecting accesses to mheap.allspans and introducing allspans cache for marking,
similar to what we use for sweeping.

LGTM=rsc
R=golang-codereviews, rsc
CC=adonovan, golang-codereviews, khr, rlh
https://golang.org/cl/126510043

											
										
										
											2014-08-24 02:05:07 -06:00
+									runtime·unlock(&runtime·mheap.lock);
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
-												runtime: temporary disable concurrent GC sweep
We see failures on builders, e.g.:
http://build.golang.org/log/70bb28cd6bcf8c4f49810a011bb4337a61977bf4

LGTM=rsc, dave
R=rsc, dave
CC=golang-codereviews
https://golang.org/cl/62360043

											
										
										
											2014-02-12 13:03:27 -07:00
+									// Temporary disable concurrent sweep, because we see failures on builders.
-												runtime: fix freeOSMemory to free memory immediately
Currently freeOSMemory makes only marking phase of GC, but not sweeping phase.
So recently memory is not released after freeOSMemory.
Do both marking and sweeping during freeOSMemory.
Fixes #8019.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/97550043

											
										
										
											2014-05-19 02:06:30 -06:00
+									if(ConcurrentSweep && !args->eagersweep) {
-												runtime: temporary disable concurrent GC sweep
We see failures on builders, e.g.:
http://build.golang.org/log/70bb28cd6bcf8c4f49810a011bb4337a61977bf4

LGTM=rsc, dave
R=rsc, dave
CC=golang-codereviews
https://golang.org/cl/62360043

											
										
										
											2014-02-12 13:03:27 -07:00
+										runtime·lock(&gclock);
 										if(sweep.g == nil)
 											sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, runtime·gc);
 										else if(sweep.parked) {
 											sweep.parked = false;
 											runtime·ready(sweep.g);
 										}
 										runtime·unlock(&gclock);
 									} else {
 										// Sweep all spans eagerly.
 										while(runtime·sweepone() != -1)
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											sweep.npausesweep++;
-												runtime: concurrent GC sweep
Moves sweep phase out of stoptheworld by adding
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time when we know size of live heap
nor consistent number of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc becomes trickier.

At the end of GC next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept
next_gc becomes what it should be.

For mprof I had to introduce 3-generation scheme (allocs, revent_allocs, prev_allocs),
because by the end of GC we know number of frees for the *previous* GC.

Significant caution is required to not cross yet-unknown real value of next_gc.
This is achieved by 2 means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to heap.
This provides quite strong guarantees that heap does not grow when it should now.

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

											
										
										
											2014-02-12 11:16:42 -07:00
+									}
-												runtime: goroutine profile, stack dumps

R=golang-dev, r, r
CC=golang-dev
https://golang.org/cl/5687076

											
										
										
											2012-02-22 19:45:01 -07:00
+									runtime·MProf_GC();
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->traceback = 0;
-												runtime: run all finalizers in a single goroutine.
eliminate second pass of mark+sweep
by scanning finalizer table specially.

R=r
CC=golang-dev
https://golang.org/cl/782041

											
										
										
											2010-03-26 15:15:30 -06:00
+								}
-												runtime: increase page size to 8K
Tcmalloc uses 8K, 32K and 64K pages, and in custom setups 256K pages.
Only Chromium uses 4K pages today (in "slow but small" configuration).
The general tendency is to increase page size, because it reduces
metadata size and DTLB pressure.
This change reduces GC pause by ~10% and slightly improves other metrics.

json-1
allocated                 8037492      8038689      +0.01%
allocs                     105762       105573      -0.18%
cputime                 158400000    155800000      -1.64%
gc-pause-one              4412234      4135702      -6.27%
gc-pause-total            2647340      2398707      -9.39%
rss                      54923264     54525952      -0.72%
sys-gc                    3952624      3928048      -0.62%
sys-heap                 46399488     46006272      -0.85%
sys-other                 5597504      5290304      -5.49%
sys-stack                  393216       393216      +0.00%
sys-total                56342832     55617840      -1.29%
time                    158478890    156046916      -1.53%
virtual-mem             256548864    256593920      +0.02%

garbage-1
allocated                 2991113      2986259      -0.16%
allocs                      62844        62652      -0.31%
cputime                  16330000     15860000      -2.88%
gc-pause-one            789108229    725555211      -8.05%
gc-pause-total            3945541      3627776      -8.05%
rss                    1143660544   1132253184      -1.00%
sys-gc                   65609600     65806208      +0.30%
sys-heap               1032388608   1035599872      +0.31%
sys-other                37501632     22777664     -39.26%
sys-stack                 8650752      8781824      +1.52%
sys-total              1144150592   1132965568      -0.98%
time                     16364602     15891994      -2.89%
virtual-mem            1327296512   1313746944      -1.02%

This is the exact reincarnation of already LGTMed:
https://golang.org/cl/45770044
which must not break darwin/freebsd after:
https://golang.org/cl/56630043
TBR=iant

LGTM=khr, iant
R=iant, khr
CC=golang-codereviews
https://golang.org/cl/58230043

											
										
										
											2014-01-30 02:28:19 -07:00
+								extern uintptr runtime·sizeof_C_MStats;
-												runtime: add UpdateMemStats, use in tests

Drops mallocrep1.go back to a reasonable
amount of time.  (154 -> 0.8 seconds on my Mac)

Fixes #2085.

R=golang-dev, dvyukov, r
CC=golang-dev
https://golang.org/cl/4811045

											
										
										
											2011-07-21 22:55:01 -06:00
+								void
-												runtime: delete UpdateMemStats, replace with ReadMemStats(&stats).

Unexports runtime.MemStats and rename MemStatsType to MemStats.
The new accessor requires passing a pointer to a user-allocated
MemStats structure.

Fixes #2572.

R=bradfitz, rsc, bradfitz, gustavo
CC=golang-dev, remy
https://golang.org/cl/5616072

											
										
										
											2012-02-06 11:16:26 -07:00
+								runtime·ReadMemStats(MStats *stats)
-												runtime: add UpdateMemStats, use in tests

Drops mallocrep1.go back to a reasonable
amount of time.  (154 -> 0.8 seconds on my Mac)

Fixes #2085.

R=golang-dev, dvyukov, r
CC=golang-dev
https://golang.org/cl/4811045

											
										
										
											2011-07-21 22:55:01 -06:00
+								{
-												runtime: goroutine profile, stack dumps

R=golang-dev, r, r
CC=golang-dev
https://golang.org/cl/5687076

											
										
										
											2012-02-22 19:45:01 -07:00
+									// Have to acquire worldsema to stop the world,
-												runtime: add UpdateMemStats, use in tests

Drops mallocrep1.go back to a reasonable
amount of time.  (154 -> 0.8 seconds on my Mac)

Fixes #2085.

R=golang-dev, dvyukov, r
CC=golang-dev
https://golang.org/cl/4811045

											
										
										
											2011-07-21 22:55:01 -06:00
+									// because stoptheworld can only be used by
 									// one goroutine at a time, and there might be
 									// a pending garbage collection already calling it.
-												net: add special netFD mutex
The mutex, fdMutex, handles locking and lifetime of sysfd,
and serializes Read and Write methods.
This allows to strip 2 sync.Mutex.Lock calls,
2 sync.Mutex.Unlock calls, 1 defer and some amount
of misc overhead from every network operation.

On linux/amd64, Intel E5-2690:
benchmark                             old ns/op    new ns/op    delta
BenchmarkTCP4Persistent                    9595         9454   -1.47%
BenchmarkTCP4Persistent-2                  8978         8772   -2.29%
BenchmarkTCP4ConcurrentReadWrite           4900         4625   -5.61%
BenchmarkTCP4ConcurrentReadWrite-2         2603         2500   -3.96%

In general it strips 70-500 ns from every network operation depending
on processor model. On my relatively new E5-2690 it accounts to ~5%
of network op cost.

Fixes #6074.

R=golang-dev, bradfitz, alex.brainman, iant, mikioh.mikioh
CC=golang-dev
https://golang.org/cl/12418043

											
										
										
											2013-08-09 11:43:00 -06:00
+									runtime·semacquire(&runtime·worldsema, false);
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->gcing = 1;
-												runtime: add UpdateMemStats, use in tests

Drops mallocrep1.go back to a reasonable
amount of time.  (154 -> 0.8 seconds on my Mac)

Fixes #2085.

R=golang-dev, dvyukov, r
CC=golang-dev
https://golang.org/cl/4811045

											
										
										
											2011-07-21 22:55:01 -06:00
+									runtime·stoptheworld();
-												runtime: WriteHeapDump dumps the heap to a file.

See http://golang.org/s/go13heapdump for the file format.

LGTM=rsc
R=rsc, bradfitz, dvyukov, khr
CC=golang-codereviews
https://golang.org/cl/37540043

											
										
										
											2014-03-25 16:09:49 -06:00
+									runtime·updatememstats(nil);
-												runtime: increase page size to 8K
Tcmalloc uses 8K, 32K and 64K pages, and in custom setups 256K pages.
Only Chromium uses 4K pages today (in "slow but small" configuration).
The general tendency is to increase page size, because it reduces
metadata size and DTLB pressure.
This change reduces GC pause by ~10% and slightly improves other metrics.

json-1
allocated                 8037492      8038689      +0.01%
allocs                     105762       105573      -0.18%
cputime                 158400000    155800000      -1.64%
gc-pause-one              4412234      4135702      -6.27%
gc-pause-total            2647340      2398707      -9.39%
rss                      54923264     54525952      -0.72%
sys-gc                    3952624      3928048      -0.62%
sys-heap                 46399488     46006272      -0.85%
sys-other                 5597504      5290304      -5.49%
sys-stack                  393216       393216      +0.00%
sys-total                56342832     55617840      -1.29%
time                    158478890    156046916      -1.53%
virtual-mem             256548864    256593920      +0.02%

garbage-1
allocated                 2991113      2986259      -0.16%
allocs                      62844        62652      -0.31%
cputime                  16330000     15860000      -2.88%
gc-pause-one            789108229    725555211      -8.05%
gc-pause-total            3945541      3627776      -8.05%
rss                    1143660544   1132253184      -1.00%
sys-gc                   65609600     65806208      +0.30%
sys-heap               1032388608   1035599872      +0.31%
sys-other                37501632     22777664     -39.26%
sys-stack                 8650752      8781824      +1.52%
sys-total              1144150592   1132965568      -0.98%
time                     16364602     15891994      -2.89%
virtual-mem            1327296512   1313746944      -1.02%

This is the exact reincarnation of already LGTMed:
https://golang.org/cl/45770044
which must not break darwin/freebsd after:
https://golang.org/cl/56630043
TBR=iant

LGTM=khr, iant
R=iant, khr
CC=golang-codereviews
https://golang.org/cl/58230043

											
										
										
											2014-01-30 02:28:19 -07:00
+									// Size of the trailing by_size array differs between Go and C,
 									// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility.
-												runtime: use memmove rather than memcopy in mgc0.c

For consistency with other code, as that was the only use of
memcopy outside of alg.goc.

LGTM=bradfitz
R=golang-codereviews, bradfitz
CC=golang-codereviews
https://golang.org/cl/122030044

											
										
										
											2014-08-04 21:40:44 -06:00
+									runtime·memmove(stats, &mstats, runtime·sizeof_C_MStats);
-												undo CL 101570044 / 2c57aaea79c4

redo stack allocation.  This is mostly the same as
the original CL with a few bug fixes.

1. add racemalloc() for stack allocations
2. fix poolalloc/poolfree to terminate free lists correctly.
3. adjust span ref count correctly.
4. don't use cache for sizes >= StackCacheSize.

Should fix bugs and memory leaks in original changelist.

««« original CL description
undo CL 104200047 / 318b04f28372

Breaks windows and race detector.
TBR=rsc

««« original CL description
runtime: stack allocator, separate from mallocgc

In order to move malloc to Go, we need to have a
separate stack allocator.  If we run out of stack
during malloc, malloc will not be available
to allocate a new stack.

Stacks are the last remaining FlagNoGC objects in the
GC heap.  Once they are out, we can get rid of the
distinction between the allocated/blockboundary bits.
(This will be in a separate change.)

Fixes #7468
Fixes #7424

LGTM=rsc, dvyukov
R=golang-codereviews, dvyukov, khr, dave, rsc
CC=golang-codereviews
https://golang.org/cl/104200047
»»»

TBR=rsc
CC=golang-codereviews
https://golang.org/cl/101570044
»»»

LGTM=dvyukov
R=dvyukov, dave, khr, alex.brainman
CC=golang-codereviews
https://golang.org/cl/112240044

											
										
										
											2014-07-17 15:41:46 -06:00
 									// Stack numbers are part of the heap numbers, separate those out for user consumption
 									stats->stacks_sys = stats->stacks_inuse;
 									stats->heap_inuse -= stats->stacks_inuse;
 									stats->heap_sys -= stats->stacks_inuse;
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->gcing = 0;
 									g->m->locks++;
-												runtime: goroutine profile, stack dumps

R=golang-dev, r, r
CC=golang-dev
https://golang.org/cl/5687076

											
										
										
											2012-02-22 19:45:01 -07:00
+									runtime·semrelease(&runtime·worldsema);
-												runtime: refactor helpgc functionality in preparation for parallel GC
Parallel GC needs to know in advance how many helper threads will be there.
Hopefully it's the last patch before I can tackle parallel sweep phase.
The benchmarks are unaffected.

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/6200064

											
										
										
											2012-05-15 09:10:16 -06:00
+									runtime·starttheworld();
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									g->m->locks--;
-												runtime: add UpdateMemStats, use in tests

Drops mallocrep1.go back to a reasonable
amount of time.  (154 -> 0.8 seconds on my Mac)

Fixes #2085.

R=golang-dev, dvyukov, r
CC=golang-dev
https://golang.org/cl/4811045

											
										
										
											2011-07-21 22:55:01 -06:00
+								}
-												runtime/debug: add controls for garbage collector

Fixes #4090.

R=golang-dev, iant, bradfitz, dsymonds
CC=golang-dev
https://golang.org/cl/7229070

											
										
										
											2013-02-03 22:00:55 -07:00
+								void
 								runtime∕debug·readGCStats(Slice *pauses)
 								{
 									uint64 *p;
 									uint32 i, n;
 									// Calling code in runtime/debug should make the slice large enough.
 									if(pauses->cap < nelem(mstats.pause_ns)+3)
 										runtime·throw("runtime: short slice passed to readGCStats");
 									// Pass back: pauses, last gc (absolute time), number of gc, total pause ns.
 									p = (uint64*)pauses->array;
-												cmd/cc, runtime: eliminate use of the unnamed substructure C extension

Eliminating use of this extension makes it easier to port the Go runtime
to other compilers. This CL also disables the extension in cc to prevent
accidental use.

LGTM=rsc, khr
R=rsc, aram, khr, dvyukov
CC=axwalk, golang-codereviews
https://golang.org/cl/106790044

											
										
										
											2014-08-07 07:00:02 -06:00
+									runtime·lock(&runtime·mheap.lock);
-												runtime/debug: add controls for garbage collector

Fixes #4090.

R=golang-dev, iant, bradfitz, dsymonds
CC=golang-dev
https://golang.org/cl/7229070

											
										
										
											2013-02-03 22:00:55 -07:00
+									n = mstats.numgc;
 									if(n > nelem(mstats.pause_ns))
 										n = nelem(mstats.pause_ns);
 									// The pause buffer is circular. The most recent pause is at
 									// pause_ns[(numgc-1)%nelem(pause_ns)], and then backward
 									// from there to go back farther in time. We deliver the times
 									// most recent first (in p[0]).
 									for(i=0; i<n; i++)
 										p[i] = mstats.pause_ns[(mstats.numgc-1-i)%nelem(mstats.pause_ns)];
 									p[n] = mstats.last_gc;
 									p[n+1] = mstats.numgc;
 									p[n+2] = mstats.pause_total_ns;
-												cmd/cc, runtime: eliminate use of the unnamed substructure C extension

Eliminating use of this extension makes it easier to port the Go runtime
to other compilers. This CL also disables the extension in cc to prevent
accidental use.

LGTM=rsc, khr
R=rsc, aram, khr, dvyukov
CC=axwalk, golang-codereviews
https://golang.org/cl/106790044

											
										
										
											2014-08-07 07:00:02 -06:00
+									runtime·unlock(&runtime·mheap.lock);
-												runtime/debug: add controls for garbage collector

Fixes #4090.

R=golang-dev, iant, bradfitz, dsymonds
CC=golang-dev
https://golang.org/cl/7229070

											
										
										
											2013-02-03 22:00:55 -07:00
+									pauses->len = n+3;
 								}
-												runtime,runtime/debug: Converted some functions from goc to Go.

LGTM=khr
R=golang-codereviews, khr
CC=dvyukov, golang-codereviews
https://golang.org/cl/131010044

											
										
										
											2014-08-24 21:27:00 -06:00
+								void
 								runtime·setgcpercent_m(void) {
 									int32 in;
-												runtime: use goc2c as much as possible

Package runtime's C functions written to be called from Go
started out written in C using carefully constructed argument
lists and the FLUSH macro to write a result back to memory.

For some functions, the appropriate parameter list ended up
being architecture-dependent due to differences in alignment,
so we added 'goc2c', which takes a .goc file containing Go func
declarations but C bodies, rewrites the Go func declaration to
equivalent C declarations for the target architecture, adds the
needed FLUSH statements, and writes out an equivalent C file.
That C file is compiled as part of package runtime.

Native Client's x86-64 support introduces the most complex
alignment rules yet, breaking many functions that could until
now be portably written in C. Using goc2c for those avoids the
breakage.

Separately, Keith's work on emitting stack information from
the C compiler would require the hand-written functions
to add #pragmas specifying how many arguments are result
parameters. Using goc2c for those avoids maintaining #pragmas.

For both reasons, use goc2c for as many Go-called C functions
as possible.

This CL is a replay of the bulk of CL 15400047 and CL 15790043,
both of which were reviewed as part of the NaCl port and are
checked in to the NaCl branch. This CL is part of bringing the
NaCl code into the main tree.

No new code here, just reformatting and occasional movement
into .h files.

LGTM=r
R=dave, alex.brainman, r
CC=golang-codereviews
https://golang.org/cl/65220044

											
										
										
											2014-02-20 13:58:47 -07:00
+									int32 out;
-												runtime,runtime/debug: Converted some functions from goc to Go.

LGTM=khr
R=golang-codereviews, khr
CC=dvyukov, golang-codereviews
https://golang.org/cl/131010044

											
										
										
											2014-08-24 21:27:00 -06:00
+									in = (int32)(intptr)g->m->scalararg[0];
-												cmd/cc, runtime: eliminate use of the unnamed substructure C extension

Eliminating use of this extension makes it easier to port the Go runtime
to other compilers. This CL also disables the extension in cc to prevent
accidental use.

LGTM=rsc, khr
R=rsc, aram, khr, dvyukov
CC=axwalk, golang-codereviews
https://golang.org/cl/106790044

											
										
										
											2014-08-07 07:00:02 -06:00
+									runtime·lock(&runtime·mheap.lock);
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									out = runtime·gcpercent;
-												runtime/debug: add controls for garbage collector

Fixes #4090.

R=golang-dev, iant, bradfitz, dsymonds
CC=golang-dev
https://golang.org/cl/7229070

											
										
										
											2013-02-03 22:00:55 -07:00
+									if(in < 0)
 										in = -1;
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									runtime·gcpercent = in;
-												cmd/cc, runtime: eliminate use of the unnamed substructure C extension

Eliminating use of this extension makes it easier to port the Go runtime
to other compilers. This CL also disables the extension in cc to prevent
accidental use.

LGTM=rsc, khr
R=rsc, aram, khr, dvyukov
CC=axwalk, golang-codereviews
https://golang.org/cl/106790044

											
										
										
											2014-08-07 07:00:02 -06:00
+									runtime·unlock(&runtime·mheap.lock);
-												runtime,runtime/debug: Converted some functions from goc to Go.

LGTM=khr
R=golang-codereviews, khr
CC=dvyukov, golang-codereviews
https://golang.org/cl/131010044

											
										
										
											2014-08-24 21:27:00 -06:00
 									g->m->scalararg[0] = (uintptr)(intptr)out;
-												runtime/debug: add controls for garbage collector

Fixes #4090.

R=golang-dev, iant, bradfitz, dsymonds
CC=golang-dev
https://golang.org/cl/7229070

											
										
										
											2013-02-03 22:00:55 -07:00
+								}
-												runtime: faster parallel GC
Use per-thread work buffers instead of global mutex-protected pool. This eliminates contention from parallel scan phase.

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkTree2-8               97100768     71417553  -26.45%
garbage.BenchmarkTree2LastPause-8     970931485    714103692  -26.45%
garbage.BenchmarkTree2Pause-8         469127802    345029253  -26.45%
garbage.BenchmarkParser-8            2880950854   2715456901   -5.74%
garbage.BenchmarkParserLastPause-8    137047399    103336476  -24.60%
garbage.BenchmarkParserPause-8         80686028     58922680  -26.97%

R=golang-dev, 0xe2.0x9a.0x9b, dave, adg, rsc, iant
CC=golang-dev
https://golang.org/cl/7816044

											
										
										
											2013-03-21 02:48:02 -06:00
+								static void
 								gchelperstart(void)
 								{
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									if(g->m->helpgc < 0 || g->m->helpgc >= MaxGcproc)
-												runtime: faster parallel GC
Use per-thread work buffers instead of global mutex-protected pool. This eliminates contention from parallel scan phase.

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkTree2-8               97100768     71417553  -26.45%
garbage.BenchmarkTree2LastPause-8     970931485    714103692  -26.45%
garbage.BenchmarkTree2Pause-8         469127802    345029253  -26.45%
garbage.BenchmarkParser-8            2880950854   2715456901   -5.74%
garbage.BenchmarkParserLastPause-8    137047399    103336476  -24.60%
garbage.BenchmarkParserPause-8         80686028     58922680  -26.97%

R=golang-dev, 0xe2.0x9a.0x9b, dave, adg, rsc, iant
CC=golang-dev
https://golang.org/cl/7816044

											
										
										
											2013-03-21 02:48:02 -06:00
+										runtime·throw("gchelperstart: bad m->helpgc");
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previous reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									if(g != g->m->g0)
-												runtime/gc: Run garbage collector on g0 stack
instead of regular g stack. We do this so that the g stack
we're currently running on is no longer changing.  Cuts
the root set down a bit (g0 stacks are not scanned, and
we don't need to scan gc's internal state).  Also an
enabler for copyable stacks.

R=golang-dev, cshapiro, khr, 0xe2.0x9a.0x9b, dvyukov, rsc, iant
CC=golang-dev
https://golang.org/cl/9754044

											
										
										
											2013-05-31 21:43:33 -06:00
+										runtime·throw("gchelper not running on g0 stack");
-												runtime: faster parallel GC
Use per-thread work buffers instead of global mutex-protected pool. This eliminates contention from parallel scan phase.

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkTree2-8               97100768     71417553  -26.45%
garbage.BenchmarkTree2LastPause-8     970931485    714103692  -26.45%
garbage.BenchmarkTree2Pause-8         469127802    345029253  -26.45%
garbage.BenchmarkParser-8            2880950854   2715456901   -5.74%
garbage.BenchmarkParserLastPause-8    137047399    103336476  -24.60%
garbage.BenchmarkParserPause-8         80686028     58922680  -26.97%

R=golang-dev, 0xe2.0x9a.0x9b, dave, adg, rsc, iant
CC=golang-dev
https://golang.org/cl/7816044

											
										
										
											2013-03-21 02:48:02 -06:00
+								}
-												runtime: run all finalizers in a single goroutine.
eliminate second pass of mark+sweep
by scanning finalizer table specially.

R=r
CC=golang-dev
https://golang.org/cl/782041

											
										
										
											2010-03-26 15:15:30 -06:00
+								static void
 								runfinq(void)
 								{
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+									Finalizer *f;
 									FinBlock *fb, *next;
-												runtime: run all finalizers in a single goroutine.
eliminate second pass of mark+sweep
by scanning finalizer table specially.

R=r
CC=golang-dev
https://golang.org/cl/782041

											
										
										
											2010-03-26 15:15:30 -06:00
+									byte *frame;
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+									uint32 framesz, framecap, i;
-												runtime: make SetFinalizer(x, f) accept any f for which f(x) is valid

Originally the requirement was f(x) where f's argument is
exactly x's type.

CL 11858043 relaxed the requirement in a non-standard
way: f's argument must be exactly x's type or interface{}.

If we're going to relax the requirement, it should be done
in a way consistent with the rest of Go. This CL allows f's
argument to have any type for which x is assignable;
that's the same requirement the compiler would impose
if compiling f(x) directly.

Fixes #5368.

R=dvyukov, bradfitz, pieter
CC=golang-dev
https://golang.org/cl/12895043

											
										
										
											2013-08-14 12:54:31 -06:00
+									Eface *ef, ef1;
-												runtime: run all finalizers in a single goroutine.
eliminate second pass of mark+sweep
by scanning finalizer table specially.

R=r
CC=golang-dev
https://golang.org/cl/782041

											
										
										
											2010-03-26 15:15:30 -06:00
-												runtime: fix memory leak in runfinq

One reason the sync.Pool finalizer test can fail is that
this function's ef1 contains uninitialized data that just
happens to point at some of the old pool. I've seen this cause
retention of a single pool cache line (32 elements) on arm.

Really we need liveness information for C functions, but
for now we can be more careful about data in long-lived
C functions that block.

LGTM=bradfitz, dvyukov
R=golang-codereviews, bradfitz, dvyukov
CC=golang-codereviews, iant, khr
https://golang.org/cl/72490043

											
										
										
											2014-03-07 09:27:01 -07:00
+									// This function blocks for long periods of time, and because it is written in C
 									// we have no liveness information. Zero everything so that uninitialized pointers
 									// do not cause memory leaks.
 									f = nil;
 									fb = nil;
 									next = nil;
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+									frame = nil;
 									framecap = 0;
-												runtime: fix memory leak in runfinq

One reason the sync.Pool finalizer test can fail is that
this function's ef1 contains uninitialized data that just
happens to point at some of the old pool. I've seen this cause
retention of a single pool cache line (32 elements) on arm.

Really we need liveness information for C functions, but
for now we can be more careful about data in long-lived
C functions that block.

LGTM=bradfitz, dvyukov
R=golang-codereviews, bradfitz, dvyukov
CC=golang-codereviews, iant, khr
https://golang.org/cl/72490043

											
										
										
											2014-03-07 09:27:01 -07:00
+									framesz = 0;
 									i = 0;
 									ef = nil;
 									ef1.type = nil;
 									ef1.data = nil;
 									// force flush to memory
 									USED(&f);
 									USED(&fb);
 									USED(&next);
 									USED(&framesz);
 									USED(&i);
 									USED(&ef);
 									USED(&ef1);
-												runtime: run all finalizers in a single goroutine.
eliminate second pass of mark+sweep
by scanning finalizer table specially.

R=r
CC=golang-dev
https://golang.org/cl/782041

											
										
										
											2010-03-26 15:15:30 -06:00
+									for(;;) {
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+										runtime·lock(&finlock);
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+										fb = finq;
-												runtime: run all finalizers in a single goroutine.
eliminate second pass of mark+sweep
by scanning finalizer table specially.

R=r
CC=golang-dev
https://golang.org/cl/782041

											
										
										
											2010-03-26 15:15:30 -06:00
+										finq = nil;
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+										if(fb == nil) {
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+											runtime·fingwait = true;
-												runtime: fix program termination when main goroutine calls Goexit
Do not consider idle finalizer/bgsweep/timer goroutines as doing something useful.
We can't simply set isbackground for the whole lifetime of the goroutines,
because when finalizer goroutine calls user function, we do want to consider it
as doing something useful.
This is borken due to timers for quite some time.
With background sweep is become even more broken.
Fixes #7784.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/87960044

											
										
										
											2014-04-15 09:48:17 -06:00
+											g->isbackground = true;
-												runtime: convert common scheduler functions to Go
These are required for chans, semaphores, timers, etc.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/123640043

											
										
										
											2014-08-21 10:41:09 -06:00
+											runtime·parkunlock(&finlock, runtime·gostringnocopy((byte*)"finalizer wait"));
-												runtime: fix program termination when main goroutine calls Goexit
Do not consider idle finalizer/bgsweep/timer goroutines as doing something useful.
We can't simply set isbackground for the whole lifetime of the goroutines,
because when finalizer goroutine calls user function, we do want to consider it
as doing something useful.
This is borken due to timers for quite some time.
With background sweep is become even more broken.
Fixes #7784.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/87960044

											
										
										
											2014-04-15 09:48:17 -06:00
+											g->isbackground = false;
-												runtime: run all finalizers in a single goroutine.
eliminate second pass of mark+sweep
by scanning finalizer table specially.

R=r
CC=golang-dev
https://golang.org/cl/782041

											
										
										
											2010-03-26 15:15:30 -06:00
+											continue;
 										}
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+										runtime·unlock(&finlock);
-												runtime/race: more precise handling of finalizers
Currently race detector runtime just disables race detection in the finalizer goroutine.
It has false positives when a finalizer writes to shared memory -- the race with finalizer is reported in a normal goroutine that accesses the same memory.
After this change I am going to synchronize the finalizer goroutine with the rest of the world in racefingo(). This is closer to what happens in reality and so
does not have false positives.
And also add README file with instructions how to build the runtime.

R=golang-dev, minux.ma, rsc
CC=golang-dev
https://golang.org/cl/6810095

											
										
										
											2012-11-14 05:58:10 -07:00
+										if(raceenabled)
 											runtime·racefingo();
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+										for(; fb; fb=next) {
 											next = fb->next;
 											for(i=0; i<fb->cnt; i++) {
 												f = &fb->fin[i];
-												runtime: allow SetFinalizer with a func(interface{})

Fixes #5368.

R=golang-dev, dvyukov
CC=golang-dev, rsc
https://golang.org/cl/11858043

											
										
										
											2013-07-29 09:43:08 -06:00
+												framesz = sizeof(Eface) + f->nret;
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+												if(framecap < framesz) {
-												runtime: prevent GC from seeing the contents of a frame in runfinq
This holds the last finalized object and arguments to its finalizer.
Fixes #5348.

R=golang-dev, iant
CC=golang-dev
https://golang.org/cl/11454044

											
										
										
											2013-07-19 08:01:33 -06:00
+													// The frame does not contain pointers interesting for GC,
-												runtime: fix heap memory corruption
With concurrent sweeping finc if modified by runfinq and queuefinalizer concurrently.
Fixes crashes like this one:
http://build.golang.org/log/6ad7b59ef2e93e3c9347eabfb4c4bd66df58fd5a
Fixes #7324.
Update #7396

LGTM=rsc
R=golang-codereviews, minux.ma, rsc
CC=golang-codereviews, khr
https://golang.org/cl/67980043

											
										
										
											2014-02-24 09:53:50 -07:00
+													// all not yet finalized objects are stored in finq.
-												runtime: rename FlagNoPointers to FlagNoScan.  Better represents
the use of the flag, especially for objects which actually do have
pointers but we don't want the GC to scan them.

R=golang-dev, cshapiro
CC=golang-dev
https://golang.org/cl/13181045

											
										
										
											2013-08-23 18:28:47 -06:00
+													// If we do not mark it as FlagNoScan,
-												runtime: prevent GC from seeing the contents of a frame in runfinq
This holds the last finalized object and arguments to its finalizer.
Fixes #5348.

R=golang-dev, iant
CC=golang-dev
https://golang.org/cl/11454044

											
										
										
											2013-07-19 08:01:33 -06:00
+													// the last finalized object is not collected.
-												runtime: remove mal/malloc/FlagNoGC/FlagNoInvokeGC
FlagNoGC is unused now.
FlagNoInvokeGC is unneeded as we don't invoke GC
on g0 and when holding locks anyway.
mal/malloc have very few uses and you never remember
the exact set of flags they use and the difference between them.
Moreover, eventually we need to give exact types to all allocations,
something what mal/malloc do not support.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/117580043

											
										
										
											2014-08-07 03:04:04 -06:00
+													frame = runtime·mallocgc(framesz, 0, FlagNoScan);
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+													framecap = framesz;
 												}
-												runtime: make SetFinalizer(x, f) accept any f for which f(x) is valid

Originally the requirement was f(x) where f's argument is
exactly x's type.

CL 11858043 relaxed the requirement in a non-standard
way: f's argument must be exactly x's type or interface{}.

If we're going to relax the requirement, it should be done
in a way consistent with the rest of Go. This CL allows f's
argument to have any type for which x is assignable;
that's the same requirement the compiler would impose
if compiling f(x) directly.

Fixes #5368.

R=dvyukov, bradfitz, pieter
CC=golang-dev
https://golang.org/cl/12895043

											
										
										
											2013-08-14 12:54:31 -06:00
+												if(f->fint == nil)
 													runtime·throw("missing type in runfinq");
-												cmd/gc, runtime: refactor interface inlining decision into compiler

We need to change the interface value representation for
concurrent garbage collection, so that there is no ambiguity
about whether the data word holds a pointer or scalar.

This CL does NOT make any representation changes.

Instead, it removes representation assumptions from
various pieces of code throughout the tree.
The isdirectiface function in cmd/gc/subr.c is now
the only place that decides that policy.
The policy propagates out from there in the reflect
metadata, as a new flag in the internal kind value.

A follow-up CL will change the representation by
changing the isdirectiface function. If that CL causes
problems, it will be easy to roll back.

Update #8405.

LGTM=iant
R=golang-codereviews, iant
CC=golang-codereviews, r
https://golang.org/cl/129090043

											
										
										
											2014-08-18 19:13:11 -06:00
+												if((f->fint->kind&KindMask) == KindPtr) {
-												runtime: make SetFinalizer(x, f) accept any f for which f(x) is valid

Originally the requirement was f(x) where f's argument is
exactly x's type.

CL 11858043 relaxed the requirement in a non-standard
way: f's argument must be exactly x's type or interface{}.

If we're going to relax the requirement, it should be done
in a way consistent with the rest of Go. This CL allows f's
argument to have any type for which x is assignable;
that's the same requirement the compiler would impose
if compiling f(x) directly.

Fixes #5368.

R=dvyukov, bradfitz, pieter
CC=golang-dev
https://golang.org/cl/12895043

											
										
										
											2013-08-14 12:54:31 -06:00
+													// direct use of pointer
-												runtime: allow SetFinalizer with a func(interface{})

Fixes #5368.

R=golang-dev, dvyukov
CC=golang-dev, rsc
https://golang.org/cl/11858043

											
										
										
											2013-07-29 09:43:08 -06:00
+													*(void**)frame = f->arg;
-												runtime: make SetFinalizer(x, f) accept any f for which f(x) is valid

Originally the requirement was f(x) where f's argument is
exactly x's type.

CL 11858043 relaxed the requirement in a non-standard
way: f's argument must be exactly x's type or interface{}.

If we're going to relax the requirement, it should be done
in a way consistent with the rest of Go. This CL allows f's
argument to have any type for which x is assignable;
that's the same requirement the compiler would impose
if compiling f(x) directly.

Fixes #5368.

R=dvyukov, bradfitz, pieter
CC=golang-dev
https://golang.org/cl/12895043

											
										
										
											2013-08-14 12:54:31 -06:00
+												} else if(((InterfaceType*)f->fint)->mhdr.len == 0) {
 													// convert to empty interface
-												runtime: allow SetFinalizer with a func(interface{})

Fixes #5368.

R=golang-dev, dvyukov
CC=golang-dev, rsc
https://golang.org/cl/11858043

											
										
										
											2013-07-29 09:43:08 -06:00
+													ef = (Eface*)frame;
-												cmd/cc, runtime: eliminate use of the unnamed substructure C extension

Eliminating use of this extension makes it easier to port the Go runtime
to other compilers. This CL also disables the extension in cc to prevent
accidental use.

LGTM=rsc, khr
R=rsc, aram, khr, dvyukov
CC=axwalk, golang-codereviews
https://golang.org/cl/106790044

											
										
										
											2014-08-07 07:00:02 -06:00
+													ef->type = &f->ot->typ;
-												runtime: allow SetFinalizer with a func(interface{})

Fixes #5368.

R=golang-dev, dvyukov
CC=golang-dev, rsc
https://golang.org/cl/11858043

											
										
										
											2013-07-29 09:43:08 -06:00
+													ef->data = f->arg;
-												runtime: make SetFinalizer(x, f) accept any f for which f(x) is valid

Originally the requirement was f(x) where f's argument is
exactly x's type.

CL 11858043 relaxed the requirement in a non-standard
way: f's argument must be exactly x's type or interface{}.

If we're going to relax the requirement, it should be done
in a way consistent with the rest of Go. This CL allows f's
argument to have any type for which x is assignable;
that's the same requirement the compiler would impose
if compiling f(x) directly.

Fixes #5368.

R=dvyukov, bradfitz, pieter
CC=golang-dev
https://golang.org/cl/12895043

											
										
										
											2013-08-14 12:54:31 -06:00
+												} else {
 													// convert to interface with methods, via empty interface.
-												cmd/cc, runtime: eliminate use of the unnamed substructure C extension

Eliminating use of this extension makes it easier to port the Go runtime
to other compilers. This CL also disables the extension in cc to prevent
accidental use.

LGTM=rsc, khr
R=rsc, aram, khr, dvyukov
CC=axwalk, golang-codereviews
https://golang.org/cl/106790044

											
										
										
											2014-08-07 07:00:02 -06:00
+													ef1.type = &f->ot->typ;
-												runtime: make SetFinalizer(x, f) accept any f for which f(x) is valid

Originally the requirement was f(x) where f's argument is
exactly x's type.

CL 11858043 relaxed the requirement in a non-standard
way: f's argument must be exactly x's type or interface{}.

If we're going to relax the requirement, it should be done
in a way consistent with the rest of Go. This CL allows f's
argument to have any type for which x is assignable;
that's the same requirement the compiler would impose
if compiling f(x) directly.

Fixes #5368.

R=dvyukov, bradfitz, pieter
CC=golang-dev
https://golang.org/cl/12895043

											
										
										
											2013-08-14 12:54:31 -06:00
+													ef1.data = f->arg;
 													if(!runtime·ifaceE2I2((InterfaceType*)f->fint, ef1, (Iface*)frame))
 														runtime·throw("invalid type conversion in runfinq");
-												runtime: allow SetFinalizer with a func(interface{})

Fixes #5368.

R=golang-dev, dvyukov
CC=golang-dev, rsc
https://golang.org/cl/11858043

											
										
										
											2013-07-29 09:43:08 -06:00
+												}
-												reflect, runtime: fix crash in GC due to reflect.call + precise GC

Given
        type Outer struct {
                *Inner
                ...
        }
the compiler generates the implementation of (*Outer).M dispatching to
the embedded Inner. The implementation is logically:
        func (p *Outer) M() {
                (p.Inner).M()
        }
but since the only change here is the replacement of one pointer
receiver with another, the actual generated code overwrites the
original receiver with the p.Inner pointer and then jumps to the M
method expecting the *Inner receiver.

During reflect.Value.Call, we create an argument frame and the
associated data structures to describe it to the garbage collector,
populate the frame, call reflect.call to run a function call using
that frame, and then copy the results back out of the frame. The
reflect.call function does a memmove of the frame structure onto the
stack (to set up the inputs), runs the call, and the memmoves the
stack back to the frame structure (to preserve the outputs).

Originally reflect.call did not distinguish inputs from outputs: both
memmoves were for the full stack frame. However, in the case where the
called function was one of these wrappers, the rewritten receiver is
almost certainly a different type than the original receiver. This is
not a problem on the stack, where we use the program counter to
determine the type information and understand that during (*Outer).M
the receiver is an *Outer while during (*Inner).M the receiver in the
same memory word is now an *Inner. But in the statically typed
argument frame created by reflect, the receiver is always an *Outer.
Copying the modified receiver pointer off the stack into the frame
will store an *Inner there, and then if a garbage collection happens
to scan that argument frame before it is discarded, it will scan the
*Inner memory as if it were an *Outer. If the two have different
memory layouts, the collection will intepret the memory incorrectly.

Fix by only copying back the results.

Fixes #7725.

LGTM=khr
R=khr
CC=dave, golang-codereviews
https://golang.org/cl/85180043

											
										
										
											2014-04-08 09:11:35 -06:00
+												reflect·call(f->fn, frame, framesz, framesz);
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+												f->fn = nil;
 												f->arg = nil;
-												runtime: allow SetFinalizer with a func(interface{})

Fixes #5368.

R=golang-dev, dvyukov
CC=golang-dev, rsc
https://golang.org/cl/11858043

											
										
										
											2013-07-29 09:43:08 -06:00
+												f->ot = nil;
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+											}
 											fb->cnt = 0;
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+											runtime·lock(&finlock);
-												runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057

											
										
										
											2011-10-06 09:42:51 -06:00
+											fb->next = finc;
 											finc = fb;
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+											runtime·unlock(&finlock);
-												runtime: run all finalizers in a single goroutine.
eliminate second pass of mark+sweep
by scanning finalizer table specially.

R=r
CC=golang-dev
https://golang.org/cl/782041

											
										
										
											2010-03-26 15:15:30 -06:00
+										}
-												runtime: fix memory leak in runfinq

One reason the sync.Pool finalizer test can fail is that
this function's ef1 contains uninitialized data that just
happens to point at some of the old pool. I've seen this cause
retention of a single pool cache line (32 elements) on arm.

Really we need liveness information for C functions, but
for now we can be more careful about data in long-lived
C functions that block.

LGTM=bradfitz, dvyukov
R=golang-codereviews, bradfitz, dvyukov
CC=golang-codereviews, iant, khr
https://golang.org/cl/72490043

											
										
										
											2014-03-07 09:27:01 -07:00
 										// Zero everything that's dead, to avoid memory leaks.
 										// See comment at top of function.
 										f = nil;
 										fb = nil;
 										next = nil;
 										i = 0;
 										ef = nil;
 										ef1.type = nil;
 										ef1.data = nil;
-												runtime: ,s/[a-zA-Z0-9_]+/runtime·&/g, almost

Prefix all external symbols in runtime by runtime·,
to avoid conflicts with possible symbols of the same
name in linked-in C libraries.  The obvious conflicts
are printf, malloc, and free, but hide everything to
avoid future pain.

The symbols left alone are:

	** known to cgo **
	_cgo_free
	_cgo_malloc
	libcgo_thread_start
	initcgo
	ncgocall

	** known to linker **
	_rt0_$GOARCH
	_rt0_$GOARCH_$GOOS
	text
	etext
	data
	end
	pclntab
	epclntab
	symtab
	esymtab

	** known to C compiler **
	_divv
	_modv
	_div64by32
	etc (arch specific)

Tested on darwin/386, darwin/amd64, linux/386, linux/amd64.

Built (but not tested) for freebsd/386, freebsd/amd64, linux/arm, windows/386.

R=r, PeterGo
CC=golang-dev
https://golang.org/cl/2899041

											
										
										
											2010-11-04 12:00:19 -06:00
+										runtime·gc(1);	// trigger another gc to clean up the finalized objects, if possible
-												runtime: run all finalizers in a single goroutine.
eliminate second pass of mark+sweep
by scanning finalizer table specially.

R=r
CC=golang-dev
https://golang.org/cl/782041

											
										
										
											2010-03-26 15:15:30 -06:00
+									}
-												gc #0.  mark and sweep collector.

R=r,gri
DELTA=472  (423 added, 2 deleted, 47 changed)
OCL=23522
CL=23541

											
										
										
											2009-01-26 18:37:05 -07:00
+								}
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+								void
 								runtime·createfing(void)
-												runtime: fix another race in bgsweep
It's possible that bgsweep constantly does not catch up for some reason,
in this case runfinq was not woken at all.

R=rsc
CC=golang-codereviews
https://golang.org/cl/75940043

											
										
										
											2014-03-14 13:32:12 -06:00
+								{
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									if(runtime·fing != nil)
-												runtime: fix another race in bgsweep
It's possible that bgsweep constantly does not catch up for some reason,
in this case runfinq was not woken at all.

R=rsc
CC=golang-codereviews
https://golang.org/cl/75940043

											
										
										
											2014-03-14 13:32:12 -06:00
+										return;
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+									// Here we use gclock instead of finlock,
 									// because newproc1 can allocate, which can cause on-demand span sweep,
 									// which can queue finalizers, which would deadlock.
-												runtime: fix another race in bgsweep
It's possible that bgsweep constantly does not catch up for some reason,
in this case runfinq was not woken at all.

R=rsc
CC=golang-codereviews
https://golang.org/cl/75940043

											
										
										
											2014-03-14 13:32:12 -06:00
+									runtime·lock(&gclock);
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+									if(runtime·fing == nil)
 										runtime·fing = runtime·newproc1(&runfinqv, nil, 0, 0, runtime·gc);
-												runtime: fix another race in bgsweep
It's possible that bgsweep constantly does not catch up for some reason,
in this case runfinq was not woken at all.

R=rsc
CC=golang-codereviews
https://golang.org/cl/75940043

											
										
										
											2014-03-14 13:32:12 -06:00
+									runtime·unlock(&gclock);
 								}
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+								void
 								runtime·createfingM(G *gp)
 								{
 									runtime·createfing();
 									runtime·gogo(&gp->sched);
 								}
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+								G*
 								runtime·wakefing(void)
 								{
 									G *res;
 									res = nil;
 									runtime·lock(&finlock);
 									if(runtime·fingwait && runtime·fingwake) {
 										runtime·fingwait = false;
 										runtime·fingwake = false;
-												runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046

											
										
										
											2014-07-30 10:01:52 -06:00
+										res = runtime·fing;
-												runtime: fix yet another race in bgsweep
Currently it's possible that bgsweep finishes before all spans
have been swept (we only know that sweeping of all spans has *started*).
In such case bgsweep may fail wake up runfinq goroutine when it needs to.
finq may still be nil at this point, but some finalizers may be queued later.
Make bgsweep to wait for sweeping to *complete*, then it can decide
whether it needs to wake up runfinq for sure.
Update #7533

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/75960043

											
										
										
											2014-03-26 05:11:36 -06:00
+									}
 									runtime·unlock(&finlock);
 									return res;
 								}
-												runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043

											
										
										
											2014-08-07 03:34:30 -06:00
+								// Recursively unrolls GC program in prog.
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+								// mask is where to store the result.
 								// ppos is a pointer to position in mask, in bits.
 								// sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
 								static byte*
 								unrollgcprog1(byte *mask, byte *prog, uintptr *ppos, bool inplace, bool sparse)
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+								{
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									uintptr pos, siz, i, off;
 									byte *arena_start, *prog1, v, *bitp, shift;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									arena_start = runtime·mheap.arena_start;
 									pos = *ppos;
 									for(;;) {
 										switch(prog[0]) {
 										case insData:
 											prog++;
 											siz = prog[0];
 											prog++;
 											for(i = 0; i < siz; i++) {
 												v = prog[i/PointersPerByte];
 												v >>= (i%PointersPerByte)*BitsPerPointer;
 												v &= BitsMask;
 												if(inplace) {
 													// Store directly into GC bitmap.
 													off = (uintptr*)(mask+pos) - (uintptr*)arena_start;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+													bitp = arena_start - off/wordsPerBitmapByte - 1;
 													shift = (off % wordsPerBitmapByte) * gcBits;
 													if(shift==0)
 														*bitp = 0;
 													*bitp |= v<<(shift+2);
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+													pos += PtrSize;
 												} else if(sparse) {
 													// 4-bits per word
 													v <<= (pos%8)+2;
 													mask[pos/8] |= v;
 													pos += gcBits;
 												} else {
 													// 2-bits per word
 													v <<= pos%8;
 													mask[pos/8] |= v;
 													pos += BitsPerPointer;
 												}
 											}
 											prog += ROUND(siz*BitsPerPointer, 8)/8;
 											break;
 										case insArray:
 											prog++;
 											siz = 0;
 											for(i = 0; i < PtrSize; i++)
 												siz = (siz<<8) + prog[PtrSize-i-1];
 											prog += PtrSize;
 											prog1 = nil;
 											for(i = 0; i < siz; i++)
 												prog1 = unrollgcprog1(mask, prog, &pos, inplace, sparse);
 											if(prog1[0] != insArrayEnd)
 												runtime·throw("unrollgcprog: array does not end with insArrayEnd");
 											prog = prog1+1;
 											break;
 										case insArrayEnd:
 										case insEnd:
 											*ppos = pos;
 											return prog;
 										default:
 											runtime·throw("unrollgcprog: unknown instruction");
 										}
 									}
 								}
 								// Unrolls GC program prog for data/bss, returns dense GC mask.
 								static byte*
 								unrollglobgcprog(byte *prog, uintptr size)
 								{
 									byte *mask;
 									uintptr pos, masksize;
 									masksize = ROUND(ROUND(size, PtrSize)/PtrSize*BitsPerPointer, 8)/8;
 									mask = runtime·persistentalloc(masksize+1, 0, &mstats.gc_sys);
 									mask[masksize] = 0xa1;
 									pos = 0;
 									prog = unrollgcprog1(mask, prog, &pos, false, false);
 									if(pos != size/PtrSize*BitsPerPointer) {
 										runtime·printf("unrollglobgcprog: bad program size, got %D, expect %D\n",
 											(uint64)pos, (uint64)size/PtrSize*BitsPerPointer);
 										runtime·throw("unrollglobgcprog: bad program size");
 									}
 									if(prog[0] != insEnd)
 										runtime·throw("unrollglobgcprog: program does not end with insEnd");
 									if(mask[masksize] != 0xa1)
 										runtime·throw("unrollglobgcprog: overflow");
 									return mask;
 								}
-												runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043

											
										
										
											2014-08-07 03:34:30 -06:00
+								void
 								runtime·unrollgcproginplace_m(void)
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+								{
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									uintptr size, size0, pos, off;
 									byte *arena_start, *prog, *bitp, shift;
-												runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043

											
										
										
											2014-08-07 03:34:30 -06:00
+									Type *typ;
 									void *v;
 									v = g->m->ptrarg[0];
 									typ = g->m->ptrarg[1];
 									size = g->m->scalararg[0];
 									size0 = g->m->scalararg[1];
 									g->m->ptrarg[0] = nil;
 									g->m->ptrarg[1] = nil;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
 									pos = 0;
 									prog = (byte*)typ->gc[1];
 									while(pos != size0)
 										unrollgcprog1(v, prog, &pos, true, true);
 									// Mark first word as bitAllocated.
 									arena_start = runtime·mheap.arena_start;
 									off = (uintptr*)v - (uintptr*)arena_start;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									bitp = arena_start - off/wordsPerBitmapByte - 1;
 									shift = (off % wordsPerBitmapByte) * gcBits;
 									*bitp |= bitBoundary<<shift;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									// Mark word after last as BitsDead.
 									if(size0 < size) {
 										off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+										bitp = arena_start - off/wordsPerBitmapByte - 1;
 										shift = (off % wordsPerBitmapByte) * gcBits;
 										*bitp &= ~(bitPtrMask<<shift) | ((uintptr)BitsDead<<(shift+2));
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									}
 								}
 								// Unrolls GC program in typ->gc[1] into typ->gc[0]
-												runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043

											
										
										
											2014-08-07 03:34:30 -06:00
+								void
 								runtime·unrollgcprog_m(void)
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+								{
 									static Lock lock;
-												runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043

											
										
										
											2014-08-07 03:34:30 -06:00
+									Type *typ;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									byte *mask, *prog;
 									uintptr pos;
 									uint32 x;
-												runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043

											
										
										
											2014-08-07 03:34:30 -06:00
+									typ = g->m->ptrarg[0];
 									g->m->ptrarg[0] = nil;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+									runtime·lock(&lock);
 									mask = (byte*)typ->gc[0];
 									if(mask[0] == 0) {
 										pos = 8;  // skip the unroll flag
 										prog = (byte*)typ->gc[1];
 										prog = unrollgcprog1(mask, prog, &pos, false, true);
 										if(prog[0] != insEnd)
 											runtime·throw("unrollgcprog: program does not end with insEnd");
 										if(((typ->size/PtrSize)%2) != 0) {
 											// repeat the program twice
 											prog = (byte*)typ->gc[1];
 											unrollgcprog1(mask, prog, &pos, false, true);
 										}
 										// atomic way to say mask[0] = 1
 										x = ((uint32*)mask)[0];
 										runtime·atomicstore((uint32*)mask, x|1);
 									}
 									runtime·unlock(&lock);
-												runtime: mark objects in free lists as allocated and unscannable.

On the plus side, we don't need to change the bits when mallocing
pointerless objects.  On the other hand, we need to mark objects in the
free lists during GC.  But the free lists are small at GC time, so it
should be a net win.

benchmark                    old ns/op    new ns/op    delta
BenchmarkMalloc8                    40           33  -17.65%
BenchmarkMalloc16                   45           38  -15.72%
BenchmarkMallocTypeInfo8            58           59   +0.85%
BenchmarkMallocTypeInfo16           63           64   +1.10%

R=golang-dev, rsc, dvyukov
CC=cshapiro, golang-dev
https://golang.org/cl/41040043

											
										
										
											2013-12-18 18:13:59 -07:00
+								}
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+								// mark the span of memory at v as having n blocks of the given size.
 								// if leftover is true, there is left over space at the end of the span.
 								void
 								runtime·markspan(void *v, uintptr size, uintptr n, bool leftover)
 								{
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									uintptr i, off, step;
 									byte *b;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
-												runtime: make mheap statically allocated again
This depends on: 9791044: runtime: allocate page table lazily
Once page table is moved out of heap, the heap becomes small.
This removes unnecessary dereferences during heap access.
No logical changes.

R=golang-dev, khr
CC=golang-dev
https://golang.org/cl/9802043

											
										
										
											2013-05-28 12:14:47 -06:00
+									if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+										runtime·throw("markspan: bad pointer");
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									// Find bits of the beginning of the span.
 									off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
 									b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
 									if((off%wordsPerBitmapByte) != 0)
 										runtime·throw("markspan: unaligned length");
 									// Okay to use non-atomic ops here, because we control
 									// the entire span, and each bitmap byte has bits for only
 									// one span, so no other goroutines are changing these bitmap words.
 									if(size == PtrSize) {
 										// Possible only on 64-bits (minimal size class is 8 bytes).
 										// Poor man's memset(0x11).
 										if(0x11 != ((bitBoundary+BitsDead)<<gcBits) + (bitBoundary+BitsDead))
 											runtime·throw("markspan: bad bits");
 										if((n%(wordsPerBitmapByte*PtrSize)) != 0)
 											runtime·throw("markspan: unaligned length");
 										b = b - n/wordsPerBitmapByte + 1;	// find first byte
 										if(((uintptr)b%PtrSize) != 0)
 											runtime·throw("markspan: unaligned pointer");
 										for(i = 0; i != n; i += wordsPerBitmapByte*PtrSize, b += PtrSize)
 											*(uintptr*)b = (uintptr)0x1111111111111111ULL;  // bitBoundary+BitsDead
 										return;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+									}
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
 									if(leftover)
 										n++;	// mark a boundary just past end of last block too
 									step = size/(PtrSize*wordsPerBitmapByte);
 									for(i = 0; i != n; i++, b -= step)
 										*b = bitBoundary|(BitsDead<<2);
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+								}
 								// unmark the span of memory at v of length n bytes.
 								void
 								runtime·unmarkspan(void *v, uintptr n)
 								{
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									uintptr off;
 									byte *b;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
-												runtime: make mheap statically allocated again
This depends on: 9791044: runtime: allocate page table lazily
Once page table is moved out of heap, the heap becomes small.
This removes unnecessary dereferences during heap access.
No logical changes.

R=golang-dev, khr
CC=golang-dev
https://golang.org/cl/9802043

											
										
										
											2013-05-28 12:14:47 -06:00
+									if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+										runtime·throw("markspan: bad pointer");
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
 									if((off % (PtrSize*wordsPerBitmapByte)) != 0)
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+										runtime·throw("markspan: unaligned pointer");
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+									n /= PtrSize;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									if(n%(PtrSize*wordsPerBitmapByte) != 0)
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+										runtime·throw("unmarkspan: unaligned length");
-												runtime: fix memory allocator for GOMAXPROCS > 1

Bitmaps were not being updated safely.
Depends on 4188053.

Fixes #1504.
May fix issue 1479.

R=r, r2
CC=golang-dev
https://golang.org/cl/4184048

											
										
										
											2011-02-16 11:21:20 -07:00
+									// Okay to use non-atomic ops here, because we control
 									// the entire span, and each bitmap word has bits for only
 									// one span, so no other goroutines are changing these
 									// bitmap words.
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									n /= wordsPerBitmapByte;
 									runtime·memclr(b - n + 1, n);
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+								}
 								void
 								runtime·MHeap_MapBits(MHeap *h)
 								{
 									// Caller has added extra mappings to the arena.
 									// Add extra mappings of bitmap words as needed.
 									// We allocate extra bitmap pieces in chunks of bitmapChunk.
 									enum {
 										bitmapChunk = 8192
 									};
 									uintptr n;
-												runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to so something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

											
										
										
											2011-09-30 07:40:01 -06:00
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									n = (h->arena_used - h->arena_start) / (PtrSize*wordsPerBitmapByte);
-												runtime: allocate page table lazily
This removes the 256MB memory allocation at startup,
which conflicts with ulimit.
Also will allow to eliminate an unnecessary memory dereference in GC,
because the page table is usually mapped at known address.
Update #5049.
Update #5236.

R=golang-dev, khr, r, khr, rsc
CC=golang-dev
https://golang.org/cl/9791044

											
										
										
											2013-05-28 12:04:34 -06:00
+									n = ROUND(n, bitmapChunk);
-												all: merge NaCl branch (part 1)

See golang.org/s/go13nacl for design overview.

This CL is the mostly mechanical changes from rsc's Go 1.2 based NaCl branch, specifically 39cb35750369 to 500771b477cf from https://code.google.com/r/rsc-go13nacl. This CL does not include working NaCl support, there are probably two or three more large merges to come.

CL 15750044 is not included as it involves more invasive changes to the linker which will need to be merged separately.

The exact change lists included are

15050047: syscall: support for Native Client
15360044: syscall: unzip implementation for Native Client
15370044: syscall: Native Client SRPC implementation
15400047: cmd/dist, cmd/go, go/build, test: support for Native Client
15410048: runtime: support for Native Client
15410049: syscall: file descriptor table for Native Client
15410050: syscall: in-memory file system for Native Client
15440048: all: update +build lines for Native Client port
15540045: cmd/6g, cmd/8g, cmd/gc: support for Native Client
15570045: os: support for Native Client
15680044: crypto/..., hash/crc32, reflect, sync/atomic: support for amd64p32
15690044: net: support for Native Client
15690048: runtime: support for fake time like on Go Playground
15690051: build: disable various tests on Native Client

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/68150047

											
										
										
											2014-02-25 07:47:42 -07:00
+									n = ROUND(n, PhysPageSize);
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+									if(h->bitmap_mapped >= n)
 										return;
-												runtime: accurately record whether heap memory is reserved

The existing code did not have a clear notion of whether
memory has been actually reserved.  It checked based on
whether in 32-bit mode or 64-bit mode and (on GNU/Linux) the
requested address, but it confused the requested address and
the returned address.

LGTM=rsc
R=rsc, dvyukov
CC=golang-codereviews, michael.hudson
https://golang.org/cl/79610043

											
										
										
											2014-03-25 14:22:19 -06:00
+									runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped, h->arena_reserved, &mstats.gc_sys);
-												runtime: faster allocator, garbage collector

GC is still single-threaded.
Multiple threads will happen in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046

											
										
										
											2011-02-02 21:03:47 -07:00
+									h->bitmap_mapped = n;
 								}
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
 								static bool
 								getgcmaskcb(Stkframe *frame, void *ctxt)
 								{
 									Stkframe *frame0;
 									frame0 = ctxt;
 									if(frame0->sp >= (uintptr)frame->varp - frame->sp && frame0->sp < (uintptr)frame->varp) {
 										*frame0 = *frame;
 										return false;
 									}
 									return true;
 								}
 								// Returns GC type info for object p for testing.
 								void
 								runtime·getgcmask(byte *p, Type *t, byte **mask, uintptr *len)
 								{
 									Stkframe frame;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+									uintptr i, n, off;
 									byte *base, bits, shift, *b;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
 									*mask = nil;
 									*len = 0;
 									// data
 									if(p >= data && p < edata) {
 										n = ((PtrType*)t)->elem->size;
 										*len = n/PtrSize;
 										*mask = runtime·mallocgc(*len, nil, 0);
 										for(i = 0; i < n; i += PtrSize) {
 											off = (p+i-data)/PtrSize;
-												runtime: fix dump of data/bss
Fixes #8530.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/124440043

											
										
										
											2014-08-18 06:42:24 -06:00
+											bits = (runtime·gcdatamask[off/PointersPerByte] >> ((off%PointersPerByte)*BitsPerPointer))&BitsMask;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											(*mask)[i/PtrSize] = bits;
 										}
 										return;
 									}
 									// bss
 									if(p >= bss && p < ebss) {
 										n = ((PtrType*)t)->elem->size;
 										*len = n/PtrSize;
 										*mask = runtime·mallocgc(*len, nil, 0);
 										for(i = 0; i < n; i += PtrSize) {
 											off = (p+i-bss)/PtrSize;
-												runtime: fix dump of data/bss
Fixes #8530.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/124440043

											
										
										
											2014-08-18 06:42:24 -06:00
+											bits = (runtime·gcbssmask[off/PointersPerByte] >> ((off%PointersPerByte)*BitsPerPointer))&BitsMask;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											(*mask)[i/PtrSize] = bits;
 										}
 										return;
 									}
 									// heap
 									if(runtime·mlookup(p, &base, &n, nil)) {
 										*len = n/PtrSize;
 										*mask = runtime·mallocgc(*len, nil, 0);
 										for(i = 0; i < n; i += PtrSize) {
 											off = (uintptr*)(base+i) - (uintptr*)runtime·mheap.arena_start;
-												runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion,
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043

											
										
										
											2014-08-19 07:38:00 -06:00
+											b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
 											shift = (off % wordsPerBitmapByte) * gcBits;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											bits = (*b >> (shift+2))&BitsMask;
 											(*mask)[i/PtrSize] = bits;
 										}
 										return;
 									}
 									// stack
 									frame.fn = nil;
 									frame.sp = (uintptr)p;
 									runtime·gentraceback((uintptr)runtime·getcallerpc(&p), (uintptr)runtime·getcallersp(&p), 0, g, 0, nil, 1000, getgcmaskcb, &frame, false);
 									if(frame.fn != nil) {
 										Func *f;
 										StackMap *stackmap;
 										BitVector bv;
 										uintptr size;
 										uintptr targetpc;
 										int32 pcdata;
 										f = frame.fn;
 										targetpc = frame.continpc;
 										if(targetpc == 0)
 											return;
 										if(targetpc != f->entry)
 											targetpc--;
 										pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc);
 										if(pcdata == -1)
 											return;
 										stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps);
 										if(stackmap == nil || stackmap->n <= 0)
 											return;
 										bv = runtime·stackmapdata(stackmap, pcdata);
 										size = bv.n/BitsPerPointer*PtrSize;
 										n = ((PtrType*)t)->elem->size;
 										*len = n/PtrSize;
 										*mask = runtime·mallocgc(*len, nil, 0);
 										for(i = 0; i < n; i += PtrSize) {
 											off = (p+i-frame.varp+size)/PtrSize;
-												runtime: fix getgcmask
bv.data is an array of uint32s but the code was using
offsets computed for an array of bytes.
Add a test for stack GC info.
Fixes #8531.

LGTM=rsc
R=golang-codereviews
CC=golang-codereviews, khr, rsc
https://golang.org/cl/124450043

											
										
										
											2014-08-15 12:36:12 -06:00
+											bits = (bv.data[off*BitsPerPointer/32] >> ((off*BitsPerPointer)%32))&BitsMask;
-												runtime: simpler and faster GC

Implement the design described in:
https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub

Summary of the changes:
GC uses "2-bits per word" pointer type info embed directly into bitmap.
Scanning of stacks/data/heap is unified.
The old spans types go away.
Compiler generates "sparse" 4-bits type info for GC (directly for GC bitmap).
Linker generates "dense" 2-bits type info for data/bss (the same as stacks use).

Summary of results:
-1680 lines of code total (-1000+ in mgc0.c only)
-25% memory consumption
-3-7% binary size
-15% GC pause reduction
-7% run time reduction

LGTM=khr
R=golang-codereviews, rsc, christoph, khr
CC=golang-codereviews, rlh
https://golang.org/cl/106260045

											
										
										
											2014-07-29 01:01:02 -06:00
+											(*mask)[i/PtrSize] = bits;
 										}
 									}
 								}
-												runtime: convert timers to Go

LGTM=rsc
R=golang-codereviews, ruiu, rsc, daniel.morsing
CC=golang-codereviews, khr
https://golang.org/cl/123700044

											
										
										
											2014-08-25 10:25:22 -06:00
 								void runtime·gc_unixnanotime(int64 *now);
 								int64 runtime·unixnanotime(void)
 								{
 									int64 now;
 									runtime·gc_unixnanotime(&now);
 									return now;
 								}