go/src/pkg/runtime/mgc0.c
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Garbage collector (GC).
//
// GC is:
// - mark&sweep
// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
// - parallel (up to MaxGcproc threads)
// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
// - non-moving/non-compacting
// - full (non-partial)
//
// GC rate.
// Next GC is after we've allocated an extra amount of memory proportional to
// the amount already in use. The proportion is controlled by GOGC environment variable
// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
// (and also the amount of extra memory used).
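// For example (an illustrative sketch only, using the MStats field names found
// elsewhere in the runtime, not the exact code):
//
//	next_gc = mstats.heap_alloc + mstats.heap_alloc*gcpercent/100;
//	// GOGC=100, heap_alloc=4MB  =>  next_gc=8MB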
//
// Concurrent sweep.
// The sweep phase proceeds concurrently with normal program execution.
// The heap is swept span-by-span both lazily (when a goroutine needs another span)
// and concurrently in a background goroutine (this helps programs that are not CPU bound).
// However, at the end of the stop-the-world GC phase we don't know the size of the live heap,
// and so next_gc calculation is tricky and happens as follows.
// At the end of the stop-the-world phase next_gc is conservatively set based on total
// heap size; all spans are marked as "needs sweeping".
// Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory.
// The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc
// closer to the target value. However, this is not enough to avoid over-allocating memory.
// Consider that a goroutine wants to allocate a new span for a large object and
// there are no free swept spans, but there are small-object unswept spans.
// If the goroutine naively allocates a new span, it can surpass the yet-unknown
// target next_gc value. In order to prevent such cases (1) when a goroutine needs
// to allocate a new small-object span, it sweeps small-object spans for the same
// object size until it frees at least one object; (2) when a goroutine needs to
// allocate large-object span from heap, it sweeps spans until it frees at least
// that many pages into heap. Together these two measures ensure that we don't surpass
// target next_gc value by a large margin. There is an exception: if a goroutine sweeps
// and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span,
// but there can still be other one-page unswept spans which could be combined into a two-page span.
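//
// As a rough sketch (pseudocode only, not the actual allocation code), the two
// measures above amount to:
//
//	// (1) before allocating a small-object span of some size class:
//	while(no object of that size class has been freed && unswept spans of that class remain)
//		sweep one span of that size class;
//
//	// (2) before allocating npage pages from MHeap:
//	while(fewer than npage pages have been returned to the heap && unswept spans remain)
//		sweep one unswept span;
//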
// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
// The finalizer goroutine is kicked off only when all spans are swept.
// When the next GC starts, it sweeps all not-yet-swept spans (if any).
#include "runtime.h"
#include "arch_GOARCH.h"
#include "malloc.h"
#include "stack.h"
#include "mgc0.h"
#include "race.h"
#include "type.h"
#include "typekind.h"
#include "funcdata.h"
#include "../../cmd/ld/textflag.h"
enum {
Debug = 0,
CollectStats = 0,
ScanStackByFrames = 1,
IgnorePreciseGC = 0,
ConcurrentSweep = 1,
// Four bits per word (see #defines below).
wordsPerBitmapWord = sizeof(void*)*8/4,
bitShift = sizeof(void*)*8/4,
WorkbufSize = 16*1024,
RootBlockSize = 4*1024,
FinBlockSize = 4*1024,
handoffThreshold = 4,
IntermediateBufferCapacity = 64,
// Bits in type information
PRECISE = 1,
LOOP = 2,
PC_BITS = PRECISE | LOOP,
// Pointer map
BitsPerPointer = 2,
BitsNoPointer = 0,
BitsPointer = 1,
BitsIface = 2,
BitsEface = 3,
RootData = 0,
RootBss = 1,
RootFinalizers = 2,
RootSpanTypes = 3,
RootFlushCaches = 4,
RootCount = 5,
};
#define GcpercentUnknown (-2)
// Initialized from $GOGC. GOGC=off means no gc.
static int32 gcpercent = GcpercentUnknown;
static struct
{
Lock;
void* head;
} pools;
void
sync·runtime_registerPool(void **p)
{
runtime·lock(&pools);
p[0] = pools.head;
pools.head = p;
runtime·unlock(&pools);
}
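// Each pool registers a small block of pointer-sized words; its first word is
// reused as the "next" link, so registered pools form an intrusive
// singly-linked list rooted at pools.head. clearpools (below) walks this list
// during GC and clears each pool's local, localSize and global-slice words.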
static void
clearpools(void)
{
void **pool, **next;
P *p, **pp;
MCache *c;
uintptr off;
int32 i;
// clear sync.Pool's
for(pool = pools.head; pool != nil; pool = next) {
next = pool[0];
pool[0] = nil; // next
pool[1] = nil; // local
pool[2] = nil; // localSize
off = (uintptr)pool[3] / sizeof(void*);
pool[off+0] = nil; // global slice
pool[off+1] = nil;
pool[off+2] = nil;
}
pools.head = nil;
for(pp=runtime·allp; p=*pp; pp++) {
// clear tinyalloc pool
c = p->mcache;
if(c != nil) {
c->tiny = nil;
c->tinysize = 0;
}
// clear defer pools
for(i=0; i<nelem(p->deferpool); i++)
p->deferpool[i] = nil;
}
}
// Bits in per-word bitmap.
// #defines because enum might not be able to hold the values.
//
// Each word in the bitmap describes wordsPerBitmapWord words
// of heap memory. There are 4 bitmap bits dedicated to each heap word,
// so on a 64-bit system there is one bitmap word per 16 heap words.
// The bits in the word are packed together by type first, then by
// heap location, so each 64-bit bitmap word consists of, from top to bottom,
// the 16 bitSpecial bits for the corresponding heap words, then the 16 bitMarked bits,
// then the 16 bitScan/bitBlockBoundary bits, then the 16 bitAllocated bits.
// This layout makes it easier to iterate over the bits of a given type.
//
// The bitmap starts at mheap.arena_start and extends *backward* from
// there. On a 64-bit system the off'th word in the arena is tracked by
// the off/16+1'th word before mheap.arena_start. (On a 32-bit system,
// the only difference is that the divisor is 8.)
//
// To pull out the bits corresponding to a given pointer p, we use:
//
// off = p - (uintptr*)mheap.arena_start; // word offset
// b = (uintptr*)mheap.arena_start - off/wordsPerBitmapWord - 1;
// shift = off % wordsPerBitmapWord
// bits = *b >> shift;
// /* then test bits & bitAllocated, bits & bitMarked, etc. */
//
#define bitAllocated ((uintptr)1<<(bitShift*0)) /* block start; eligible for garbage collection */
#define bitScan ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */
#define bitMarked ((uintptr)1<<(bitShift*2)) /* when bitAllocated is set */
#define bitSpecial ((uintptr)1<<(bitShift*3)) /* when bitAllocated is set - has finalizer or being profiled */
#define bitBlockBoundary ((uintptr)1<<(bitShift*1)) /* when bitAllocated is NOT set - mark for FlagNoGC objects */
#define bitMask (bitAllocated | bitScan | bitMarked | bitSpecial)
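// A minimal sketch (illustrative only, not used by the collector) of how the
// macros above are applied: given a word-aligned pointer p to the start of an
// allocated block, find its bitmap word and test whether the block is both
// allocated and marked. Compare markonly below, which also handles interior
// pointers and falls back to the span table.
static bool
sketch_allocated_and_marked(void *p)
{
	uintptr off, shift, *bitp;

	off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;	// word offset into arena
	bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
	shift = off % wordsPerBitmapWord;
	return ((*bitp >> shift) & (bitAllocated|bitMarked)) == (bitAllocated|bitMarked);
}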
// Holding worldsema grants an M the right to try to stop the world.
// The procedure is:
//
// runtime·semacquire(&runtime·worldsema);
// m->gcing = 1;
// runtime·stoptheworld();
//
// ... do stuff ...
//
// m->gcing = 0;
// runtime·semrelease(&runtime·worldsema);
// runtime·starttheworld();
//
uint32 runtime·worldsema = 1;
typedef struct Obj Obj;
struct Obj
{
byte *p; // data pointer
uintptr n; // size of data in bytes
uintptr ti; // type info
};
typedef struct Workbuf Workbuf;
struct Workbuf
{
#define SIZE (WorkbufSize-sizeof(LFNode)-sizeof(uintptr))
LFNode node; // must be first
uintptr nobj;
Obj obj[SIZE/sizeof(Obj) - 1];
uint8 _padding[SIZE%sizeof(Obj) + sizeof(Obj)];
#undef SIZE
};
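// The obj array and trailing padding above are sized so that sizeof(Workbuf)
// works out to exactly WorkbufSize. Workbufs are linked through their leading
// LFNode onto the lock-free work.full and work.empty lists (see getempty,
// getfull and handoff below, and the work struct further down).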
typedef struct Finalizer Finalizer;
struct Finalizer
{
FuncVal *fn;
void *arg;
uintptr nret;
Type *fint;
PtrType *ot;
};
typedef struct FinBlock FinBlock;
struct FinBlock
{
FinBlock *alllink;
FinBlock *next;
int32 cnt;
int32 cap;
Finalizer fin[1];
};
extern byte data[];
extern byte edata[];
extern byte bss[];
extern byte ebss[];
extern byte gcdata[];
extern byte gcbss[];
static G *fing;
static FinBlock *finq; // list of finalizers that are to be executed
static FinBlock *finc; // cache of free blocks
static FinBlock *allfin; // list of all blocks
static int32 fingwait;
static Lock gclock;
static void runfinq(void);
static void bgsweep(void);
static Workbuf* getempty(Workbuf*);
static Workbuf* getfull(Workbuf*);
static void putempty(Workbuf*);
static Workbuf* handoff(Workbuf*);
static void gchelperstart(void);
static void addfinroots(void *wbufp, void *v);
static void flushallmcaches(void);
static void scanframe(Stkframe *frame, void *wbufp);
static void addstackroots(G *gp, Workbuf **wbufp);
static FuncVal runfinqv = {runfinq};
static FuncVal bgsweepv = {bgsweep};
static struct {
uint64 full; // lock-free list of full blocks
uint64 empty; // lock-free list of empty blocks
byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
uint32 nproc;
int64 tstart;
volatile uint32 nwait;
volatile uint32 ndone;
Note alldone;
ParFor *markfor;
Lock;
byte *chunk;
uintptr nchunk;
} work;
enum {
GC_DEFAULT_PTR = GC_NUM_INSTR,
GC_CHAN,
GC_NUM_INSTR2
};
static struct {
struct {
uint64 sum;
uint64 cnt;
} ptr;
uint64 nbytes;
struct {
uint64 sum;
uint64 cnt;
uint64 notype;
uint64 typelookup;
} obj;
uint64 rescan;
uint64 rescanbytes;
uint64 instr[GC_NUM_INSTR2];
uint64 putempty;
uint64 getfull;
struct {
uint64 foundbit;
uint64 foundword;
uint64 foundspan;
} flushptrbuf;
struct {
uint64 foundbit;
uint64 foundword;
uint64 foundspan;
} markonly;
uint32 nbgsweep;
uint32 npausesweep;
} gcstats;
// markonly marks an object. It returns true if the object
// has been marked by this function, false otherwise.
// This function doesn't append the object to any buffer.
static bool
markonly(void *obj)
{
byte *p;
uintptr *bitp, bits, shift, x, xbits, off, j;
MSpan *s;
PageID k;
// Words outside the arena cannot be pointers.
if(obj < runtime·mheap.arena_start || obj >= runtime·mheap.arena_used)
return false;
// obj may be a pointer to a live object.
// Try to find the beginning of the object.
// Round down to word boundary.
obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
// Find bits for this word.
off = (uintptr*)obj - (uintptr*)runtime·mheap.arena_start;
bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
xbits = *bitp;
bits = xbits >> shift;
// Pointing at the beginning of a block?
if((bits & (bitAllocated|bitBlockBoundary)) != 0) {
if(CollectStats)
runtime·xadd64(&gcstats.markonly.foundbit, 1);
goto found;
}
// Pointing just past the beginning?
// Scan backward a little to find a block boundary.
for(j=shift; j-->0; ) {
if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
shift = j;
bits = xbits>>shift;
if(CollectStats)
runtime·xadd64(&gcstats.markonly.foundword, 1);
goto found;
}
}
// Otherwise consult span table to find beginning.
// (Manually inlined copy of MHeap_LookupMaybe.)
k = (uintptr)obj>>PageShift;
x = k;
x -= (uintptr)runtime·mheap.arena_start>>PageShift;
s = runtime·mheap.spans[x];
if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse)
return false;
p = (byte*)((uintptr)s->start<<PageShift);
if(s->sizeclass == 0) {
obj = p;
} else {
uintptr size = s->elemsize;
int32 i = ((byte*)obj - p)/size;
obj = p+i*size;
}
// Now that we know the object header, reload bits.
off = (uintptr*)obj - (uintptr*)runtime·mheap.arena_start;
bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
xbits = *bitp;
bits = xbits >> shift;
if(CollectStats)
runtime·xadd64(&gcstats.markonly.foundspan, 1);
found:
// Now we have bits, bitp, and shift correct for
// obj pointing at the base of the object.
// Only care about allocated and not marked.
if((bits & (bitAllocated|bitMarked)) != bitAllocated)
return false;
if(work.nproc == 1)
*bitp |= bitMarked<<shift;
else {
for(;;) {
x = *bitp;
if(x & (bitMarked<<shift))
return false;
if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
break;
}
}
// The object is now marked
return true;
}
// PtrTarget is a structure used by intermediate buffers.
// The intermediate buffers hold GC data before it
// is moved/flushed to the work buffer (Workbuf).
// The size of an intermediate buffer is very small,
// such as 32 or 64 elements.
typedef struct PtrTarget PtrTarget;
struct PtrTarget
{
void *p;
uintptr ti;
};
typedef struct Scanbuf Scanbuf;
struct Scanbuf
{
struct {
PtrTarget *begin;
PtrTarget *end;
PtrTarget *pos;
} ptr;
struct {
Obj *begin;
Obj *end;
Obj *pos;
} obj;
Workbuf *wbuf;
Obj *wp;
uintptr nobj;
};
typedef struct BufferList BufferList;
struct BufferList
{
PtrTarget ptrtarget[IntermediateBufferCapacity];
Obj obj[IntermediateBufferCapacity];
uint32 busy;
byte pad[CacheLineSize];
};
#pragma dataflag NOPTR
static BufferList bufferList[MaxGcproc];
static Type *itabtype;
static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj);
// flushptrbuf moves data from the PtrTarget buffer to the work buffer.
// The PtrTarget buffer contains blocks irrespective of whether the blocks have been marked or scanned,
// while the work buffer contains blocks which have been marked
// and are prepared to be scanned by the garbage collector.
//
// _wp, _wbuf, _nobj are input/output parameters; they specify the current work buffer.
//
// A simplified drawing of how the todo-list moves from one structure to another:
//
//        scanblock
//     (find pointers)
//       Obj ------> PtrTarget (pointer targets)
//        ↑          |
//        |          |
//        `----------'
//        flushptrbuf
//     (find block start, mark and enqueue)
static void
flushptrbuf(Scanbuf *sbuf)
{
byte *p, *arena_start, *obj;
uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti, n;
MSpan *s;
PageID k;
Obj *wp;
Workbuf *wbuf;
PtrTarget *ptrbuf;
PtrTarget *ptrbuf_end;
arena_start = runtime·mheap.arena_start;
wp = sbuf->wp;
wbuf = sbuf->wbuf;
nobj = sbuf->nobj;
ptrbuf = sbuf->ptr.begin;
ptrbuf_end = sbuf->ptr.pos;
n = ptrbuf_end - sbuf->ptr.begin;
sbuf->ptr.pos = sbuf->ptr.begin;
if(CollectStats) {
runtime·xadd64(&gcstats.ptr.sum, n);
runtime·xadd64(&gcstats.ptr.cnt, 1);
}
// If buffer is nearly full, get a new one.
if(wbuf == nil || nobj+n >= nelem(wbuf->obj)) {
if(wbuf != nil)
wbuf->nobj = nobj;
wbuf = getempty(wbuf);
wp = wbuf->obj;
nobj = 0;
if(n >= nelem(wbuf->obj))
runtime·throw("ptrbuf has to be smaller than WorkBuf");
}
while(ptrbuf < ptrbuf_end) {
obj = ptrbuf->p;
ti = ptrbuf->ti;
ptrbuf++;
// obj belongs to interval [mheap.arena_start, mheap.arena_used).
if(Debug > 1) {
if(obj < runtime·mheap.arena_start || obj >= runtime·mheap.arena_used)
runtime·throw("object is outside of mheap");
}
// obj may be a pointer to a live object.
// Try to find the beginning of the object.
// Round down to word boundary.
if(((uintptr)obj & ((uintptr)PtrSize-1)) != 0) {
obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
ti = 0;
}
// Find bits for this word.
off = (uintptr*)obj - (uintptr*)arena_start;
bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
xbits = *bitp;
bits = xbits >> shift;
// Pointing at the beginning of a block?
if((bits & (bitAllocated|bitBlockBoundary)) != 0) {
if(CollectStats)
runtime·xadd64(&gcstats.flushptrbuf.foundbit, 1);
goto found;
}
ti = 0;
// Pointing just past the beginning?
// Scan backward a little to find a block boundary.
for(j=shift; j-->0; ) {
if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
obj = (byte*)obj - (shift-j)*PtrSize;
shift = j;
bits = xbits>>shift;
if(CollectStats)
runtime·xadd64(&gcstats.flushptrbuf.foundword, 1);
goto found;
}
}
// Otherwise consult span table to find beginning.
// (Manually inlined copy of MHeap_LookupMaybe.)
k = (uintptr)obj>>PageShift;
x = k;
x -= (uintptr)arena_start>>PageShift;
s = runtime·mheap.spans[x];
if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse)
continue;
p = (byte*)((uintptr)s->start<<PageShift);
if(s->sizeclass == 0) {
obj = p;
} else {
size = s->elemsize;
int32 i = ((byte*)obj - p)/size;
obj = p+i*size;
}
// Now that we know the object header, reload bits.
off = (uintptr*)obj - (uintptr*)arena_start;
bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
xbits = *bitp;
bits = xbits >> shift;
if(CollectStats)
runtime·xadd64(&gcstats.flushptrbuf.foundspan, 1);
found:
// Now we have bits, bitp, and shift correct for
// obj pointing at the base of the object.
// Only care about allocated and not marked.
if((bits & (bitAllocated|bitMarked)) != bitAllocated)
continue;
if(work.nproc == 1)
*bitp |= bitMarked<<shift;
else {
for(;;) {
x = *bitp;
if(x & (bitMarked<<shift))
goto continue_obj;
if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
break;
}
}
// If object has no pointers, don't need to scan further.
if((bits & bitScan) == 0)
continue;
// Ask span about size class.
// (Manually inlined copy of MHeap_Lookup.)
x = (uintptr)obj >> PageShift;
x -= (uintptr)arena_start>>PageShift;
s = runtime·mheap.spans[x];
PREFETCH(obj);
*wp = (Obj){obj, s->elemsize, ti};
wp++;
nobj++;
continue_obj:;
}
// If another proc wants a pointer, give it some.
if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
sbuf->wp = wp;
sbuf->wbuf = wbuf;
sbuf->nobj = nobj;
}
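// flushobjbuf drains the Scanbuf object buffer filled by the GC program
// interpreter (e.g. by GC_REGION and GC_CHAN) into the current work buffer,
// word-aligning each object and handing work off to idle procs when the
// buffer grows large enough.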
static void
flushobjbuf(Scanbuf *sbuf)
{
uintptr nobj, off;
Obj *wp, obj;
Workbuf *wbuf;
Obj *objbuf;
Obj *objbuf_end;
wp = sbuf->wp;
wbuf = sbuf->wbuf;
nobj = sbuf->nobj;
objbuf = sbuf->obj.begin;
objbuf_end = sbuf->obj.pos;
sbuf->obj.pos = sbuf->obj.begin;
while(objbuf < objbuf_end) {
obj = *objbuf++;
// Align obj.p to a word boundary.
off = (uintptr)obj.p & (PtrSize-1);
if(off != 0) {
obj.p += PtrSize - off;
obj.n -= PtrSize - off;
obj.ti = 0;
}
if(obj.p == nil || obj.n == 0)
continue;
// If buffer is full, get a new one.
if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
if(wbuf != nil)
wbuf->nobj = nobj;
wbuf = getempty(wbuf);
wp = wbuf->obj;
nobj = 0;
}
*wp = obj;
wp++;
nobj++;
}
// If another proc wants a pointer, give it some.
if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
sbuf->wp = wp;
sbuf->wbuf = wbuf;
sbuf->nobj = nobj;
}
// Program that scans the whole block and treats every block element as a potential pointer
static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR};
// Hchan program
static uintptr chanProg[2] = {0, GC_CHAN};
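// A program is a sequence of machine words: the first word is the element
// size, the rest are opcodes and their operands, terminated by GC_END.
// Roughly, a 16-byte struct whose only pointer field is at offset 8 would
// be described as {16, GC_PTR, 8, <type info of the pointee>, GC_END}.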
// Local variables of a program fragment or loop
typedef struct Frame Frame;
struct Frame {
uintptr count, elemsize, b;
uintptr *loop_or_ret;
};
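// The interpreter below keeps a small stack of Frames: GC_ARRAY_START and
// GC_CALL push the current frame, while GC_ARRAY_NEXT and GC_END either
// loop back via loop_or_ret or pop it again.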
// Sanity check for the derived type info objti.
static void
checkptr(void *obj, uintptr objti)
{
uintptr *pc1, *pc2, type, tisize, i, j, x;
byte *objstart;
Type *t;
MSpan *s;
if(!Debug)
runtime·throw("checkptr is debug only");
if(obj < runtime·mheap.arena_start || obj >= runtime·mheap.arena_used)
return;
type = runtime·gettype(obj);
t = (Type*)(type & ~(uintptr)(PtrSize-1));
if(t == nil)
return;
x = (uintptr)obj >> PageShift;
x -= (uintptr)(runtime·mheap.arena_start)>>PageShift;
s = runtime·mheap.spans[x];
objstart = (byte*)((uintptr)s->start<<PageShift);
if(s->sizeclass != 0) {
i = ((byte*)obj - objstart)/s->elemsize;
objstart += i*s->elemsize;
}
tisize = *(uintptr*)objti;
// Sanity check for object size: it should fit into the memory block.
if((byte*)obj + tisize > objstart + s->elemsize) {
runtime·printf("object of type '%S' at %p/%p does not fit in block %p/%p\n",
*t->string, obj, tisize, objstart, s->elemsize);
runtime·throw("invalid gc type info");
}
if(obj != objstart)
return;
// If obj points to the beginning of the memory block,
// check type info as well.
if(t->string == nil ||
// Gob allocates unsafe pointers for indirection.
(runtime·strcmp(t->string->str, (byte*)"unsafe.Pointer") &&
// Runtime and gc think differently about closures.
runtime·strstr(t->string->str, (byte*)"struct { F uintptr") != t->string->str)) {
pc1 = (uintptr*)objti;
pc2 = (uintptr*)t->gc;
// A simple best-effort check until first GC_END.
for(j = 1; pc1[j] != GC_END && pc2[j] != GC_END; j++) {
if(pc1[j] != pc2[j]) {
runtime·printf("invalid gc type info for '%s' at %p, type info %p, block info %p\n",
t->string ? (int8*)t->string->str : (int8*)"?", j, pc1[j], pc2[j]);
runtime·throw("invalid gc type info");
}
}
}
}
// scanblock scans a block of n bytes starting at pointer b for references
// to other objects, scanning any it finds recursively until there are no
// unscanned objects left. Instead of using an explicit recursion, it keeps
// a work list in the Workbuf* structures and loops in the main function
// body. Keeping an explicit work list is easier on the stack allocator and
// more efficient.
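// Roughly, each iteration takes the next queued Obj, picks a GC program
// for it (precise type info when available, defaultProg otherwise) and
// interprets that program, buffering discovered pointers in a per-M
// Scanbuf that is flushed through flushptrbuf/flushobjbuf as it fills.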
static void
scanblock(Workbuf *wbuf, bool keepworking)
{
byte *b, *arena_start, *arena_used;
uintptr n, i, end_b, elemsize, size, ti, objti, count, type, nobj;
uintptr *pc, precise_type, nominal_size;
uintptr *chan_ret, chancap;
void *obj;
Type *t;
Slice *sliceptr;
Frame *stack_ptr, stack_top, stack[GC_STACK_CAPACITY+4];
BufferList *scanbuffers;
Scanbuf sbuf;
Eface *eface;
Iface *iface;
Hchan *chan;
ChanType *chantype;
Obj *wp;
if(sizeof(Workbuf) % WorkbufSize != 0)
runtime·throw("scanblock: size of Workbuf is suboptimal");
// Memory arena parameters.
arena_start = runtime·mheap.arena_start;
arena_used = runtime·mheap.arena_used;
stack_ptr = stack+nelem(stack)-1;
precise_type = false;
nominal_size = 0;
if(wbuf) {
nobj = wbuf->nobj;
wp = &wbuf->obj[nobj];
} else {
nobj = 0;
wp = nil;
}
// Initialize sbuf
scanbuffers = &bufferList[m->helpgc];
sbuf.ptr.begin = sbuf.ptr.pos = &scanbuffers->ptrtarget[0];
sbuf.ptr.end = sbuf.ptr.begin + nelem(scanbuffers->ptrtarget);
sbuf.obj.begin = sbuf.obj.pos = &scanbuffers->obj[0];
sbuf.obj.end = sbuf.obj.begin + nelem(scanbuffers->obj);
sbuf.wbuf = wbuf;
sbuf.wp = wp;
sbuf.nobj = nobj;
// (Silence the compiler)
chan = nil;
chantype = nil;
chan_ret = nil;
goto next_block;
for(;;) {
// Each iteration scans the block b of length n, queueing pointers in
// the work buffer.
if(Debug > 1) {
runtime·printf("scanblock %p %D\n", b, (int64)n);
}
if(CollectStats) {
runtime·xadd64(&gcstats.nbytes, n);
runtime·xadd64(&gcstats.obj.sum, sbuf.nobj);
runtime·xadd64(&gcstats.obj.cnt, 1);
}
if(ti != 0) {
pc = (uintptr*)(ti & ~(uintptr)PC_BITS);
precise_type = (ti & PRECISE);
stack_top.elemsize = pc[0];
if(!precise_type)
nominal_size = pc[0];
if(ti & LOOP) {
stack_top.count = 0; // 0 means an infinite number of iterations
stack_top.loop_or_ret = pc+1;
} else {
stack_top.count = 1;
}
if(Debug) {
// Simple sanity check for provided type info ti:
// The declared size of the object must not be larger than the actual size
// (it can be smaller due to inferior pointers).
// It's difficult to make a comprehensive check due to inferior pointers,
// reflection, gob, etc.
if(pc[0] > n) {
runtime·printf("invalid gc type info: type info size %p, block size %p\n", pc[0], n);
runtime·throw("invalid gc type info");
}
}
} else if(UseSpanType) {
if(CollectStats)
runtime·xadd64(&gcstats.obj.notype, 1);
type = runtime·gettype(b);
if(type != 0) {
if(CollectStats)
runtime·xadd64(&gcstats.obj.typelookup, 1);
t = (Type*)(type & ~(uintptr)(PtrSize-1));
switch(type & (PtrSize-1)) {
case TypeInfo_SingleObject:
pc = (uintptr*)t->gc;
precise_type = true; // type information about 'b' is precise
stack_top.count = 1;
stack_top.elemsize = pc[0];
break;
case TypeInfo_Array:
pc = (uintptr*)t->gc;
if(pc[0] == 0)
goto next_block;
precise_type = true; // type information about 'b' is precise
stack_top.count = 0; // 0 means an infinite number of iterations
stack_top.elemsize = pc[0];
stack_top.loop_or_ret = pc+1;
break;
case TypeInfo_Chan:
chan = (Hchan*)b;
chantype = (ChanType*)t;
chan_ret = nil;
pc = chanProg;
break;
default:
runtime·throw("scanblock: invalid type");
return;
}
} else {
pc = defaultProg;
}
} else {
pc = defaultProg;
}
if(IgnorePreciseGC)
pc = defaultProg;
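// pc[0] holds the element size of the block; skip it so that pc points
// at the first opcode of the program.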
pc++;
stack_top.b = (uintptr)b;
end_b = (uintptr)b + n - PtrSize;
for(;;) {
if(CollectStats)
runtime·xadd64(&gcstats.instr[pc[0]], 1);
obj = nil;
objti = 0;
switch(pc[0]) {
case GC_PTR:
obj = *(void**)(stack_top.b + pc[1]);
objti = pc[2];
pc += 3;
if(Debug)
checkptr(obj, objti);
break;
case GC_SLICE:
sliceptr = (Slice*)(stack_top.b + pc[1]);
if(sliceptr->cap != 0) {
obj = sliceptr->array;
// Can't use slice element type for scanning,
// because if it points to an array embedded
// in the beginning of a struct,
// we will scan the whole struct as the slice.
// So just obtain type info from heap.
}
pc += 3;
break;
case GC_APTR:
obj = *(void**)(stack_top.b + pc[1]);
pc += 2;
break;
case GC_STRING:
obj = *(void**)(stack_top.b + pc[1]);
markonly(obj);
pc += 2;
continue;
case GC_EFACE:
eface = (Eface*)(stack_top.b + pc[1]);
pc += 2;
if(eface->type == nil)
continue;
// eface->type
t = eface->type;
if((void*)t >= arena_start && (void*)t < arena_used) {
*sbuf.ptr.pos++ = (PtrTarget){t, 0};
if(sbuf.ptr.pos == sbuf.ptr.end)
flushptrbuf(&sbuf);
}
// eface->data
if(eface->data >= arena_start && eface->data < arena_used) {
if(t->size <= sizeof(void*)) {
if((t->kind & KindNoPointers))
continue;
obj = eface->data;
if((t->kind & ~KindNoPointers) == KindPtr)
objti = (uintptr)((PtrType*)t)->elem->gc;
} else {
obj = eface->data;
objti = (uintptr)t->gc;
}
}
break;
case GC_IFACE:
iface = (Iface*)(stack_top.b + pc[1]);
pc += 2;
if(iface->tab == nil)
continue;
// iface->tab
if((void*)iface->tab >= arena_start && (void*)iface->tab < arena_used) {
*sbuf.ptr.pos++ = (PtrTarget){iface->tab, (uintptr)itabtype->gc};
if(sbuf.ptr.pos == sbuf.ptr.end)
flushptrbuf(&sbuf);
}
// iface->data
if(iface->data >= arena_start && iface->data < arena_used) {
t = iface->tab->type;
if(t->size <= sizeof(void*)) {
if((t->kind & KindNoPointers))
continue;
obj = iface->data;
if((t->kind & ~KindNoPointers) == KindPtr)
objti = (uintptr)((PtrType*)t)->elem->gc;
} else {
obj = iface->data;
objti = (uintptr)t->gc;
}
}
break;
case GC_DEFAULT_PTR:
while(stack_top.b <= end_b) {
obj = *(byte**)stack_top.b;
stack_top.b += PtrSize;
if(obj >= arena_start && obj < arena_used) {
*sbuf.ptr.pos++ = (PtrTarget){obj, 0};
if(sbuf.ptr.pos == sbuf.ptr.end)
flushptrbuf(&sbuf);
}
}
goto next_block;
case GC_END:
if(--stack_top.count != 0) {
// Next iteration of a loop if possible.
stack_top.b += stack_top.elemsize;
if(stack_top.b + stack_top.elemsize <= end_b+PtrSize) {
pc = stack_top.loop_or_ret;
continue;
}
i = stack_top.b;
} else {
// Stack pop if possible.
if(stack_ptr+1 < stack+nelem(stack)) {
pc = stack_top.loop_or_ret;
stack_top = *(++stack_ptr);
continue;
}
i = (uintptr)b + nominal_size;
}
if(!precise_type) {
// Quickly scan [b+i,b+n) for possible pointers.
for(; i<=end_b; i+=PtrSize) {
if(*(byte**)i != nil) {
// Found a value that may be a pointer.
// Do a rescan of the entire block.
enqueue((Obj){b, n, 0}, &sbuf.wbuf, &sbuf.wp, &sbuf.nobj);
if(CollectStats) {
runtime·xadd64(&gcstats.rescan, 1);
runtime·xadd64(&gcstats.rescanbytes, n);
}
break;
}
}
}
goto next_block;
case GC_ARRAY_START:
i = stack_top.b + pc[1];
count = pc[2];
elemsize = pc[3];
pc += 4;
// Stack push.
*stack_ptr-- = stack_top;
stack_top = (Frame){count, elemsize, i, pc};
continue;
case GC_ARRAY_NEXT:
if(--stack_top.count != 0) {
stack_top.b += stack_top.elemsize;
pc = stack_top.loop_or_ret;
} else {
// Stack pop.
stack_top = *(++stack_ptr);
pc += 1;
}
continue;
case GC_CALL:
// Stack push.
*stack_ptr-- = stack_top;
stack_top = (Frame){1, 0, stack_top.b + pc[1], pc+3 /*return address*/};
pc = (uintptr*)((byte*)pc + *(int32*)(pc+2)); // target of the CALL instruction
continue;
case GC_REGION:
obj = (void*)(stack_top.b + pc[1]);
size = pc[2];
objti = pc[3];
pc += 4;
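// GC_REGION queues an embedded sub-block (a field with its own type info)
// as a separate Obj rather than scanning it inline.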
*sbuf.obj.pos++ = (Obj){obj, size, objti};
if(sbuf.obj.pos == sbuf.obj.end)
flushobjbuf(&sbuf);
continue;
case GC_CHAN_PTR:
chan = *(Hchan**)(stack_top.b + pc[1]);
if(chan == nil) {
pc += 3;
continue;
}
if(markonly(chan)) {
chantype = (ChanType*)pc[2];
if(!(chantype->elem->kind & KindNoPointers)) {
// Start chanProg.
chan_ret = pc+3;
pc = chanProg+1;
continue;
}
}
pc += 3;
continue;
case GC_CHAN:
// There are no heap pointers in struct Hchan,
// so we can ignore the leading sizeof(Hchan) bytes.
if(!(chantype->elem->kind & KindNoPointers)) {
// Channel's buffer follows Hchan immediately in memory.
// Size of buffer (cap(c)) is second int in the chan struct.
chancap = ((uintgo*)chan)[1];
if(chancap > 0) {
// TODO(atom): split into two chunks so that only the
// in-use part of the circular buffer is scanned.
// (Channel routines zero the unused part, so the current
// code does not lead to leaks; it's just a little inefficient.)
*sbuf.obj.pos++ = (Obj){(byte*)chan+runtime·Hchansize, chancap*chantype->elem->size,
(uintptr)chantype->elem->gc | PRECISE | LOOP};
if(sbuf.obj.pos == sbuf.obj.end)
flushobjbuf(&sbuf);
}
}
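// chan_ret is nil when the channel was reached via its span type info
// (TypeInfo_Chan); in that case this block is done. Otherwise return to
// the instruction after the GC_CHAN_PTR that started chanProg.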
if(chan_ret == nil)
goto next_block;
pc = chan_ret;
continue;
default:
runtime·printf("runtime: invalid GC instruction %p at %p\n", pc[0], pc);
runtime·throw("scanblock: invalid GC instruction");
return;
}
if(obj >= arena_start && obj < arena_used) {
*sbuf.ptr.pos++ = (PtrTarget){obj, objti};
if(sbuf.ptr.pos == sbuf.ptr.end)
flushptrbuf(&sbuf);
}
}
next_block:
// Done scanning [b, b+n). Prepare for the next iteration of
// the loop by setting b, n, ti to the parameters for the next block.
if(sbuf.nobj == 0) {
flushptrbuf(&sbuf);
flushobjbuf(&sbuf);
if(sbuf.nobj == 0) {
if(!keepworking) {
if(sbuf.wbuf)
putempty(sbuf.wbuf);
return;
}
// Emptied our buffer: refill.
sbuf.wbuf = getfull(sbuf.wbuf);
if(sbuf.wbuf == nil)
return;
sbuf.nobj = sbuf.wbuf->nobj;
sbuf.wp = sbuf.wbuf->obj + sbuf.wbuf->nobj;
}
}
// Fetch b from the work buffer.
--sbuf.wp;
b = sbuf.wp->p;
n = sbuf.wp->n;
ti = sbuf.wp->ti;
sbuf.nobj--;
}
}
// Append obj to the work buffer.
// _wbuf, _wp, _nobj are input/output parameters describing the current work buffer.
static void
enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj)
{
uintptr nobj, off;
Obj *wp;
Workbuf *wbuf;
if(Debug > 1)
runtime·printf("append obj(%p %D %p)\n", obj.p, (int64)obj.n, obj.ti);
// Align obj.p to a word boundary.
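// If it is not already aligned, the type info no longer matches the
// adjusted start address, so drop it and scan the block conservatively.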
off = (uintptr)obj.p & (PtrSize-1);
if(off != 0) {
obj.p += PtrSize - off;
obj.n -= PtrSize - off;
obj.ti = 0;
}
if(obj.p == nil || obj.n == 0)
return;
// Load work buffer state
wp = *_wp;
wbuf = *_wbuf;
nobj = *_nobj;
// If another proc wants a pointer, give it some.
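// handoff splits the current buffer: we keep one half, and the other half
// is published on work.full for an idle proc to steal.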
if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
// If buffer is full, get a new one.
if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
if(wbuf != nil)
wbuf->nobj = nobj;
wbuf = getempty(wbuf);
wp = wbuf->obj;
nobj = 0;
}
*wp = obj;
wp++;
nobj++;
// Save work buffer state
*_wp = wp;
*_wbuf = wbuf;
*_nobj = nobj;
}
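// enqueue1 appends obj to the work buffer pointed to by *wbufp,
// fetching a fresh buffer when the current one is full.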
static void
enqueue1(Workbuf **wbufp, Obj obj)
{
Workbuf *wbuf;
wbuf = *wbufp;
if(wbuf->nobj >= nelem(wbuf->obj))
*wbufp = wbuf = getempty(wbuf);
wbuf->obj[wbuf->nobj++] = obj;
}
static void
markroot(ParFor *desc, uint32 i)
{
Workbuf *wbuf;
FinBlock *fb;
MHeap *h;
MSpan **allspans, *s;
uint32 spanidx, sg;
G *gp;
void *p;
USED(&desc);
wbuf = getempty(nil);
switch(i) {
case RootData:
enqueue1(&wbuf, (Obj){data, edata - data, (uintptr)gcdata});
break;
case RootBss:
enqueue1(&wbuf, (Obj){bss, ebss - bss, (uintptr)gcbss});
break;
case RootFinalizers:
for(fb=allfin; fb; fb=fb->alllink)
enqueue1(&wbuf, (Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0});
break;
case RootSpanTypes:
// mark span types and MSpan.specials (to walk spans only once)
h = &runtime·mheap;
sg = h->sweepgen;
allspans = h->allspans;
for(spanidx=0; spanidx<runtime·mheap.nspan; spanidx++) {
Special *sp;
SpecialFinalizer *spf;
s = allspans[spanidx];
if(s->sweepgen != sg) {
runtime·printf("sweep %d %d\n", s->sweepgen, sg);
runtime·throw("gc: unswept span");
}
if(s->state != MSpanInUse)
continue;
// The garbage collector ignores type pointers stored in MSpan.types:
// - Compiler-generated types are stored outside of the heap.
// - The reflect package has runtime-generated types cached in its data structures.
// The garbage collector relies on finding the references via that cache.
if(s->types.compression == MTypes_Words || s->types.compression == MTypes_Bytes)
markonly((byte*)s->types.data);
for(sp = s->specials; sp != nil; sp = sp->next) {
if(sp->kind != KindSpecialFinalizer)
continue;
// don't mark the finalized object, but scan it so we
// retain everything it points to.
spf = (SpecialFinalizer*)sp;
// A finalizer can be set for an inner byte of an object, find object beginning.
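// Round the offset down to a multiple of elemsize to recover the object's start address within the span.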
p = (void*)((s->start << PageShift) + spf->offset/s->elemsize*s->elemsize);
enqueue1(&wbuf, (Obj){p, s->elemsize, 0});
enqueue1(&wbuf, (Obj){(void*)&spf->fn, PtrSize, 0});
enqueue1(&wbuf, (Obj){(void*)&spf->fint, PtrSize, 0});
enqueue1(&wbuf, (Obj){(void*)&spf->ot, PtrSize, 0});
}
}
break;
case RootFlushCaches:
flushallmcaches();
break;
default:
// the rest is scanning goroutine stacks
if(i - RootCount >= runtime·allglen)
runtime·throw("markroot: bad index");
gp = runtime·allg[i - RootCount];
// remember when we first observed the G blocked;
// needed only for traceback output
if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince == 0)
gp->waitsince = work.tstart;
addstackroots(gp, &wbuf);
break;
}
if(wbuf)
scanblock(wbuf, false);
}
// Get an empty work buffer off the work.empty list,
// allocating new buffers as needed.
static Workbuf*
getempty(Workbuf *b)
{
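// Trade in the full buffer b (if any) by publishing it on work.full so other procs can drain it.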
if(b != nil)
runtime·lfstackpush(&work.full, &b->node);
b = (Workbuf*)runtime·lfstackpop(&work.empty);
if(b == nil) {
// Need to allocate.
runtime·lock(&work);
if(work.nchunk < sizeof *b) {
work.nchunk = 1<<20;
work.chunk = runtime·SysAlloc(work.nchunk, &mstats.gc_sys);
if(work.chunk == nil)
runtime·throw("runtime: cannot allocate memory");
}
b = (Workbuf*)work.chunk;
work.chunk += sizeof *b;
work.nchunk -= sizeof *b;
runtime·unlock(&work);
}
b->nobj = 0;
return b;
}
static void
putempty(Workbuf *b)
{
if(CollectStats)
runtime·xadd64(&gcstats.putempty, 1);
runtime·lfstackpush(&work.empty, &b->node);
}
// Get a full work buffer off the work.full list, or return nil.
static Workbuf*
getfull(Workbuf *b)
{
int32 i;
if(CollectStats)
runtime·xadd64(&gcstats.getfull, 1);
if(b != nil)
runtime·lfstackpush(&work.empty, &b->node);
b = (Workbuf*)runtime·lfstackpop(&work.full);
if(b != nil || work.nproc == 1)
return b;
runtime·xadd(&work.nwait, +1);
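// No work immediately available: wait for another proc to produce a full
// buffer, backing off progressively (spin, then yield the OS thread, then sleep).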
for(i=0;; i++) {
if(work.full != 0) {
runtime·xadd(&work.nwait, -1);
b = (Workbuf*)runtime·lfstackpop(&work.full);
if(b != nil)
return b;
runtime·xadd(&work.nwait, +1);
}
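// If all procs are waiting and no full buffers remain, marking is done.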
if(work.nwait == work.nproc)
return nil;
if(i < 10) {
m->gcstats.nprocyield++;
runtime·procyield(20);
} else if(i < 20) {
m->gcstats.nosyield++;
runtime·osyield();
} else {
m->gcstats.nsleep++;
runtime·usleep(100);
}
}
}
static Workbuf*
handoff(Workbuf *b)
{
int32 n;
Workbuf *b1;
// Make new buffer with half of b's pointers.
b1 = getempty(nil);
n = b->nobj/2;
b->nobj -= n;
b1->nobj = n;
runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
m->gcstats.nhandoff++;
m->gcstats.nhandoffcnt += n;
// Put b on full list - let first half of b get stolen.
runtime·lfstackpush(&work.full, &b->node);
return b1;
}
extern byte pclntab[]; // base for f->ptrsoff
typedef struct BitVector BitVector;
struct BitVector
{
int32 n;
uint32 data[];
};
typedef struct StackMap StackMap;
struct StackMap
{
int32 n;
uint32 data[];
};
static BitVector*
stackmapdata(StackMap *stackmap, int32 n)
{
BitVector *bv;
uint32 *ptr;
uint32 words;
int32 i;
if(n < 0 || n >= stackmap->n) {
runtime·throw("stackmapdata: index out of range");
}
ptr = stackmap->data;
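// Each BitVector takes one word for its length plus (n+31)/32 data words;
// walk past the first n bitmaps to reach the one requested.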
for(i = 0; i < n; i++) {
bv = (BitVector*)ptr;
words = ((bv->n + 31) / 32) + 1;
ptr += words;
}
return (BitVector*)ptr;
}
// Scans an interface data value when the interface type indicates
// that it is a pointer.
static void
scaninterfacedata(uintptr bits, byte *scanp, bool afterprologue, void *wbufp)
{
Itab *tab;
Type *type;
if(runtime·precisestack && afterprologue) {
if(bits == BitsIface) {
tab = *(Itab**)scanp;
if(tab->type->size <= sizeof(void*) && (tab->type->kind & KindNoPointers))
return;
} else { // bits == BitsEface
type = *(Type**)scanp;
if(type->size <= sizeof(void*) && (type->kind & KindNoPointers))
return;
}
}
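// Enqueue the interface's data word (the second word of the iface/eface pair).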
enqueue1(wbufp, (Obj){scanp+PtrSize, PtrSize, 0});
}
// Starting from scanp, scans words corresponding to set bits.
static void
scanbitvector(byte *scanp, BitVector *bv, bool afterprologue, void *wbufp)
{
uintptr word, bits;
uint32 *wordp;
int32 i, remptrs;
wordp = bv->data;
for(remptrs = bv->n; remptrs > 0; remptrs -= 32) {
word = *wordp++;
if(remptrs < 32)
i = remptrs;
else
i = 32;
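// Each pointer-sized slot is described by BitsPerPointer bits of the word.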
i /= BitsPerPointer;
for(; i > 0; i--) {
bits = word & 3;
if(bits != BitsNoPointer && *(void**)scanp != nil)
if(bits == BitsPointer)
enqueue1(wbufp, (Obj){scanp, PtrSize, 0});
else
scaninterfacedata(bits, scanp, afterprologue, wbufp);
word >>= BitsPerPointer;
scanp += PtrSize;
}
}
}
// Scan a stack frame: local variables and function arguments/results.
static void
scanframe(Stkframe *frame, void *wbufp)
{
Func *f;
StackMap *stackmap;
BitVector *bv;
uintptr size;
uintptr targetpc;
int32 pcdata;
bool afterprologue;
f = frame->fn;
targetpc = frame->pc;
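// Unless we are at the function entry, frame->pc is a return address;
// back up by one so the PCDATA lookup describes the call instruction.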
if(targetpc != f->entry)
targetpc--;
pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc);
if(pcdata == -1) {
// We do not have a valid pcdata value but there might be a
// stackmap for this function. It is likely that we are looking
// at the function prologue; assume so and hope for the best.
pcdata = 0;
}
// Scan local variables if stack frame has been allocated.
// Use pointer information if known.
afterprologue = (frame->varp > (byte*)frame->sp);
if(afterprologue) {
stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps);
if(stackmap == nil) {
// No locals information, scan everything.
size = frame->varp - (byte*)frame->sp;
enqueue1(wbufp, (Obj){frame->varp - size, size, 0});
} else if(stackmap->n < 0) {
// Locals size information, scan just the locals.
size = -stackmap->n;
enqueue1(wbufp, (Obj){frame->varp - size, size, 0});
} else if(stackmap->n > 0) {
// Locals bitmap information, scan just the pointers in
// locals.
if(pcdata < 0 || pcdata >= stackmap->n) {
// don't know where we are
runtime·printf("pcdata is %d and %d stack map entries for %s (targetpc=%p)\n",
pcdata, stackmap->n, runtime·funcname(f), targetpc);
runtime·throw("scanframe: bad symbol table");
}
bv = stackmapdata(stackmap, pcdata);
size = (bv->n * PtrSize) / BitsPerPointer;
scanbitvector(frame->varp - size, bv, afterprologue, wbufp);
}
}
// Scan arguments.
// Use pointer information if known.
stackmap = runtime·funcdata(f, FUNCDATA_ArgsPointerMaps);
if(stackmap != nil) {
bv = stackmapdata(stackmap, pcdata);
scanbitvector(frame->argp, bv, true, wbufp);
} else
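// No argument pointer map; conservatively scan the whole argument area.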
enqueue1(wbufp, (Obj){frame->argp, frame->arglen, 0});
}
static void
addstackroots(G *gp, Workbuf **wbufp)
{
M *mp;
int32 n;
Stktop *stk;
uintptr sp, guard;
void *base;
uintptr size;
switch(gp->status){
default:
runtime·printf("unexpected G.status %d (goroutine %p %D)\n", gp->status, gp, gp->goid);
runtime·throw("mark - bad status");
case Gdead:
return;
case Grunning:
runtime·throw("mark - world not stopped");
case Grunnable:
case Gsyscall:
case Gwaiting:
break;
}
if(gp == g)
runtime·throw("can't scan our own stack");
if((mp = gp->m) != nil && mp->helpgc)
runtime·throw("can't scan gchelper stack");
if(gp->syscallstack != (uintptr)nil) {
// Scanning another goroutine that is about to enter or might
// have just exited a system call. It may be executing code such
// as schedlock and may have needed to start a new stack segment.
// Use the stack segment and stack pointer at the time of
// the system call instead, since that won't change underfoot.
sp = gp->syscallsp;
stk = (Stktop*)gp->syscallstack;
guard = gp->syscallguard;
} else {
// Scanning another goroutine's stack.
// The goroutine is usually asleep (the world is stopped).
sp = gp->sched.sp;
stk = (Stktop*)gp->stackbase;
guard = gp->stackguard;
// For a function about to start, the context argument is a root too.
if(gp->sched.ctxt != 0 && runtime·mlookup(gp->sched.ctxt, &base, &size, nil))
enqueue1(wbufp, (Obj){base, size, 0});
}
if(ScanStackByFrames) {
USED(sp);
USED(stk);
USED(guard);
runtime·gentraceback(~(uintptr)0, ~(uintptr)0, 0, gp, 0, nil, 0x7fffffff, scanframe, wbufp, false);
} else {
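// Conservative scan: walk the chain of stack segments and enqueue
// each segment's active region [sp, stackbase) as a single block.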
n = 0;
while(stk) {
if(sp < guard-StackGuard || (uintptr)stk < sp) {
runtime·printf("scanstack inconsistent: g%D#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
runtime·throw("scanstack");
}
enqueue1(wbufp, (Obj){(byte*)sp, (uintptr)stk - sp, (uintptr)defaultProg | PRECISE | LOOP});
sp = stk->gobuf.sp;
guard = stk->stackguard;
stk = (Stktop*)stk->stackbase;
n++;
}
}
}
void
runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType *ot)
{
FinBlock *block;
Finalizer *f;
runtime·lock(&gclock);
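// Finalizers are queued in fixed-size FinBlocks: finq is the list of blocks
// holding pending finalizers, finc caches spare blocks for reuse.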
if(finq == nil || finq->cnt == finq->cap) {
if(finc == nil) {
finc = runtime·persistentalloc(FinBlockSize, 0, &mstats.gc_sys);
finc->cap = (FinBlockSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
finc->alllink = allfin;
allfin = finc;
}
block = finc;
finc = block->next;
block->next = finq;
finq = block;
}
f = &finq->fin[finq->cnt];
finq->cnt++;
f->fn = fn;
f->nret = nret;
f->fint = fint;
f->ot = ot;
f->arg = p;
runtime·unlock(&gclock);
}
void
runtime·MSpan_EnsureSwept(MSpan *s)
{
uint32 sg;
// Caller must disable preemption.
// Otherwise when this function returns the span can become unswept again
// (if GC is triggered on another goroutine).
if(m->locks == 0 && m->mallocing == 0)
runtime·throw("MSpan_EnsureSwept: m is not locked");
sg = runtime·mheap.sweepgen;
if(runtime·atomicload(&s->sweepgen) == sg)
return;
if(runtime·cas(&s->sweepgen, sg-2, sg-1)) {
runtime·MSpan_Sweep(s);
return;
}
// Another goroutine is sweeping this span; we have no efficient way to wait,
// so spin until the sweep completes.
while(runtime·atomicload(&s->sweepgen) != sg)
runtime·osyield();
}
// Sweep frees or collects finalizers for blocks not marked in the mark phase.
// It clears the mark bits in preparation for the next GC round.
// Returns true if the span was returned to heap.
bool
runtime·MSpan_Sweep(MSpan *s)
{
int32 cl, n, npages, nfree;
uintptr size, off, *bitp, shift, bits;
uint32 sweepgen;
byte *p;
MCache *c;
byte *arena_start;
MLink head, *end;
byte *type_data;
byte compression;
uintptr type_data_inc;
MLink *x;
Special *special, **specialp, *y;
bool res, sweepgenset;
// It's critical that we enter this function with preemption disabled;
// GC must not start while we are in the middle of this function.
if(m->locks == 0 && m->mallocing == 0 && g != m->g0)
runtime·throw("MSpan_Sweep: m is not locked");
sweepgen = runtime·mheap.sweepgen;
if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
s->state, s->sweepgen, sweepgen);
runtime·throw("MSpan_Sweep: bad span state");
}
arena_start = runtime·mheap.arena_start;
cl = s->sizeclass;
size = s->elemsize;
if(cl == 0) {
n = 1;
} else {
// Chunk full of small blocks.
npages = runtime·class_to_allocnpages[cl];
n = (npages << PageShift) / size;
}
res = false;
nfree = 0;
end = &head;
c = m->mcache;
sweepgenset = false;
// mark any free objects in this span so we don't collect them
for(x = s->freelist; x != nil; x = x->next) {
// This is markonly(x) but faster because we don't need
// atomic access and we're guaranteed to be pointing at
// the head of a valid object.
off = (uintptr*)x - (uintptr*)runtime·mheap.arena_start;
bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
*bitp |= bitMarked<<shift;
}
// Unlink & free special records for any objects we're about to free.
specialp = &s->specials;
special = *specialp;
while(special != nil) {
// A finalizer can be set for an inner byte of an object, find object beginning.
p = (byte*)(s->start << PageShift) + special->offset/size*size;
off = (uintptr*)p - (uintptr*)arena_start;
bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
bits = *bitp>>shift;
if((bits & (bitAllocated|bitMarked)) == bitAllocated) {
// Find the exact byte for which the special was setup
// (as opposed to object beginning).
p = (byte*)(s->start << PageShift) + special->offset;
// about to free object: splice out special record
y = special;
special = special->next;
*specialp = special;
if(!runtime·freespecial(y, p, size, false)) {
// stop freeing of object if it has a finalizer
*bitp |= bitMarked << shift;
}
} else {
// object is still live: keep special record
specialp = &special->next;
special = *specialp;
}
}
type_data = (byte*)s->types.data;
type_data_inc = sizeof(uintptr);
compression = s->types.compression;
switch(compression) {
case MTypes_Bytes:
type_data += 8*sizeof(uintptr);
type_data_inc = 1;
break;
}
// Sweep through n objects of given size starting at p.
// This thread owns the span now, so it can manipulate
// the block bitmap without atomic operations.
p = (byte*)(s->start << PageShift);
for(; n > 0; n--, p += size, type_data+=type_data_inc) {
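// Locate this object's bits in the heap bitmap: the bitmap is laid out
// just below arena_start and grows downward, packing the bits for
// wordsPerBitmapWord heap words into each bitmap word.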
off = (uintptr*)p - (uintptr*)arena_start;
bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
bits = *bitp>>shift;
if((bits & bitAllocated) == 0)
continue;
if((bits & bitMarked) != 0) {
*bitp &= ~(bitMarked<<shift);
continue;
}
// Clear mark, scan, and special bits.
*bitp &= ~((bitScan|bitMarked|bitSpecial)<<shift);
if(cl == 0) {
// Free large span.
runtime·unmarkspan(p, 1<<PageShift);
s->needzero = 1;
// Important to set sweepgen before returning the span to the heap.
runtime·atomicstore(&s->sweepgen, sweepgen);
sweepgenset = true;
if(runtime·debug.efence)
runtime·SysFree(p, size, &mstats.gc_sys);
else
runtime·MHeap_Free(&runtime·mheap, s, 1);
c->local_nlargefree++;
c->local_largefree += size;
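// Credit the freed bytes back against next_gc, scaled by GOGC (gcpercent).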
runtime·xadd64(&mstats.next_gc, -(uint64)(size * (gcpercent + 100)/100));
res = true;
} else {
// Free small object.
switch(compression) {
case MTypes_Words:
*(uintptr*)type_data = 0;
break;
case MTypes_Bytes:
*(byte*)type_data = 0;
break;
}
if(size > 2*sizeof(uintptr))
((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll; // mark as "needs to be zeroed"
else if(size > sizeof(uintptr))
((uintptr*)p)[1] = 0;
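// Link the freed object onto the local free list; the whole batch is
// returned to MCentral in a single MCentral_FreeSpan call below.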
end->next = (MLink*)p;
end = (MLink*)p;
nfree++;
}
}
if(!sweepgenset) {
// The span must remain in our exclusive ownership until we update sweepgen;
// check for potential races.
if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
s->state, s->sweepgen, sweepgen);
runtime·throw("MSpan_Sweep: bad span state after sweep");
}
runtime·atomicstore(&s->sweepgen, sweepgen);
}
if(nfree) {
c->local_nsmallfree[cl] += nfree;
c->local_cachealloc -= nfree * size;
runtime·xadd64(&mstats.next_gc, -(uint64)(nfree * size * (gcpercent + 100)/100));
res = runtime·MCentral_FreeSpan(&runtime·mheap.central[cl], s, nfree, head.next, end);
}
return res;
}
// State of background sweep.
// Protected by gclock.
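// sweep.spans holds the spans to sweep this cycle; spanidx is the next
// index to sweep and is claimed with an atomic add in sweepone.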
static struct
{
G* g;
bool parked;
MSpan** spans;
uint32 nspan;
uint32 spanidx;
} sweep;
// background sweeping goroutine
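// It sweeps spans one at a time, yielding between spans, and parks on
// gclock once there is nothing left to sweep.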
static void
bgsweep(void)
{
g->issystem = 1;
for(;;) {
while(runtime·sweepone() != -1) {
gcstats.nbgsweep++;
runtime·gosched();
}
runtime·lock(&gclock);
if(finq != nil) {
// kick off or wake up goroutine to run queued finalizers
if(fing == nil)
fing = runtime·newproc1(&runfinqv, nil, 0, 0, runtime·gc);
else if(fingwait) {
fingwait = 0;
runtime·ready(fing);
}
}
sweep.parked = true;
runtime·parkunlock(&gclock, "GC sweep wait");
}
}
// Sweeps one span.
// Returns the number of pages returned to the heap, or -1 if there is nothing to sweep.
uintptr
runtime·sweepone(void)
{
MSpan *s;
uint32 idx, sg;
uintptr npages;
// Increment m->locks to ensure that the goroutine is not preempted
// in the middle of a sweep, which would leave the span in an inconsistent state for the next GC.
m->locks++;
sg = runtime·mheap.sweepgen;
for(;;) {
idx = runtime·xadd(&sweep.spanidx, 1) - 1;
if(idx >= sweep.nspan) {
runtime·mheap.sweepdone = true;
m->locks--;
return -1;
}
s = sweep.spans[idx];
if(s->state != MSpanInUse) {
s->sweepgen = sg;
continue;
}
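// A sweepgen of sg-2 means the span still needs sweeping this cycle;
// advancing it to sg-1 with cas claims the span for this sweeper.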
if(s->sweepgen != sg-2 || !runtime·cas(&s->sweepgen, sg-2, sg-1))
continue;
npages = s->npages;
if(!runtime·MSpan_Sweep(s))
npages = 0;
m->locks--;
return npages;
}
}
static void
dumpspan(uint32 idx)
{
int32 sizeclass, n, npages, i, column;
uintptr size;
byte *p;
byte *arena_start;
MSpan *s;
bool allocated, special;
s = runtime·mheap.allspans[idx];
if(s->state != MSpanInUse)
return;
arena_start = runtime·mheap.arena_start;
p = (byte*)(s->start << PageShift);
sizeclass = s->sizeclass;
size = s->elemsize;
if(sizeclass == 0) {
n = 1;
} else {
npages = runtime·class_to_allocnpages[sizeclass];
n = (npages << PageShift) / size;
}
runtime·printf("%p .. %p:\n", p, p+n*size);
column = 0;
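// Print each object as a run of pointer-sized words: '(' ... ')' for
// allocated objects, '[' ... ']' for free ones, '@' when the special
// bit is set, eight words per output line.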
for(; n>0; n--, p+=size) {
uintptr off, *bitp, shift, bits;
off = (uintptr*)p - (uintptr*)arena_start;
bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
bits = *bitp>>shift;
allocated = ((bits & bitAllocated) != 0);
special = ((bits & bitSpecial) != 0);
for(i=0; i<size; i+=sizeof(void*)) {
if(column == 0) {
runtime·printf("\t");
}
if(i == 0) {
runtime·printf(allocated ? "(" : "[");
runtime·printf(special ? "@" : "");
runtime·printf("%p: ", p+i);
} else {
runtime·printf(" ");
}
runtime·printf("%p", *(void**)(p+i));
if(i+sizeof(void*) >= size) {
runtime·printf(allocated ? ") " : "] ");
}
column++;
if(column == 8) {
runtime·printf("\n");
column = 0;
}
}
}
runtime·printf("\n");
}
// A debugging function to dump the contents of memory
void
runtime·memorydump(void)
{
uint32 spanidx;
for(spanidx=0; spanidx<runtime·mheap.nspan; spanidx++) {
dumpspan(spanidx);
}
}
void
runtime·gchelper(void)
{
int32 nproc;
gchelperstart();
	// parallel mark over the gc roots
runtime·parfordo(work.markfor);
// help other threads scan secondary blocks
scanblock(nil, true);
bufferList[m->helpgc].busy = 0;
nproc = work.nproc; // work.nproc can change right after we increment work.ndone
if(runtime·xadd(&work.ndone, +1) == nproc-1)
runtime·notewakeup(&work.alldone);
}
static void
cachestats(void)
{
MCache *c;
P *p, **pp;
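	// Fold each P's locally cached allocation counters into the global mstats.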
for(pp=runtime·allp; p=*pp; pp++) {
c = p->mcache;
if(c==nil)
continue;
runtime·purgecachedstats(c);
}
}
static void
flushallmcaches(void)
{
P *p, **pp;
MCache *c;
// Flush MCache's to MCentral.
for(pp=runtime·allp; p=*pp; pp++) {
c = p->mcache;
if(c==nil)
continue;
runtime·MCache_ReleaseAll(c);
}
}
static void
updatememstats(GCStats *stats)
{
M *mp;
MSpan *s;
int32 i;
uint64 stacks_inuse, smallfree;
uint64 *src, *dst;
if(stats)
runtime·memclr((byte*)stats, sizeof(*stats));
stacks_inuse = 0;
for(mp=runtime·allm; mp; mp=mp->alllink) {
stacks_inuse += mp->stackinuse*FixedStack;
if(stats) {
src = (uint64*)&mp->gcstats;
dst = (uint64*)stats;
for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
dst[i] += src[i];
runtime·memclr((byte*)&mp->gcstats, sizeof(mp->gcstats));
}
}
mstats.stacks_inuse = stacks_inuse;
mstats.mcache_inuse = runtime·mheap.cachealloc.inuse;
mstats.mspan_inuse = runtime·mheap.spanalloc.inuse;
mstats.sys = mstats.heap_sys + mstats.stacks_sys + mstats.mspan_sys +
mstats.mcache_sys + mstats.buckhash_sys + mstats.gc_sys + mstats.other_sys;
// Calculate memory allocator stats.
// During program execution we only count number of frees and amount of freed memory.
	// Current number of alive objects in the heap and amount of alive heap memory
// are calculated by scanning all spans.
// Total number of mallocs is calculated as number of frees plus number of alive objects.
// Similarly, total amount of allocated memory is calculated as amount of freed memory
// plus amount of alive heap memory.
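	// For example (illustrative numbers only): if the span scan below finds
	// 1000 live objects occupying 64KB and the heap has recorded 4000 frees
	// totalling 256KB, then nmalloc = 5000, total_alloc = 320KB and
	// heap_objects = 1000.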
mstats.alloc = 0;
mstats.total_alloc = 0;
mstats.nmalloc = 0;
mstats.nfree = 0;
for(i = 0; i < nelem(mstats.by_size); i++) {
mstats.by_size[i].nmalloc = 0;
mstats.by_size[i].nfree = 0;
}
// Flush MCache's to MCentral.
flushallmcaches();
// Aggregate local stats.
cachestats();
// Scan all spans and count number of alive objects.
for(i = 0; i < runtime·mheap.nspan; i++) {
s = runtime·mheap.allspans[i];
if(s->state != MSpanInUse)
continue;
if(s->sizeclass == 0) {
mstats.nmalloc++;
mstats.alloc += s->elemsize;
} else {
mstats.nmalloc += s->ref;
mstats.by_size[s->sizeclass].nmalloc += s->ref;
mstats.alloc += s->ref*s->elemsize;
}
}
// Aggregate by size class.
smallfree = 0;
mstats.nfree = runtime·mheap.nlargefree;
for(i = 0; i < nelem(mstats.by_size); i++) {
mstats.nfree += runtime·mheap.nsmallfree[i];
mstats.by_size[i].nfree = runtime·mheap.nsmallfree[i];
mstats.by_size[i].nmalloc += runtime·mheap.nsmallfree[i];
smallfree += runtime·mheap.nsmallfree[i] * runtime·class_to_size[i];
}
mstats.nmalloc += mstats.nfree;
// Calculate derived stats.
mstats.total_alloc = mstats.alloc + runtime·mheap.largefree + smallfree;
mstats.heap_alloc = mstats.alloc;
mstats.heap_objects = mstats.nmalloc - mstats.nfree;
}
// Structure of arguments passed to function gc().
// This allows the arguments to be passed via runtime·mcall.
struct gc_args
{
int64 start_time; // start time of GC in ns (just before stoptheworld)
};
static void gc(struct gc_args *args);
static void mgc(G *gp);
static int32
readgogc(void)
{
byte *p;
p = runtime·getenv("GOGC");
if(p == nil || p[0] == '\0')
return 100;
if(runtime·strcmp(p, (byte*)"off") == 0)
return -1;
return runtime·atoi(p);
}
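// Examples: an unset or empty GOGC yields the default of 100, GOGC=off
// yields -1 (collection disabled), and GOGC=200 yields 200.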
void
runtime·gc(int32 force)
{
struct gc_args a;
int32 i;
// The atomic operations are not atomic if the uint64s
// are not aligned on uint64 boundaries. This has been
// a problem in the past.
if((((uintptr)&work.empty) & 7) != 0)
runtime·throw("runtime: gc work buffer is misaligned");
if((((uintptr)&work.full) & 7) != 0)
runtime·throw("runtime: gc work buffer is misaligned");
// The gc is turned off (via enablegc) until
// the bootstrap has completed.
// Also, malloc gets called in the guts
// of a number of libraries that might be
// holding locks. To avoid priority inversion
// problems, don't bother trying to run gc
// while holding a lock. The next mallocgc
// without a lock will do the gc instead.
if(!mstats.enablegc || g == m->g0 || m->locks > 0 || runtime·panicking)
return;
if(gcpercent == GcpercentUnknown) { // first time through
runtime·lock(&runtime·mheap);
if(gcpercent == GcpercentUnknown)
gcpercent = readgogc();
runtime·unlock(&runtime·mheap);
}
if(gcpercent < 0)
return;
runtime·semacquire(&runtime·worldsema, false);
if(!force && mstats.heap_alloc < mstats.next_gc) {
// typically threads which lost the race to grab
// worldsema exit here when gc is done.
runtime·semrelease(&runtime·worldsema);
return;
}
// Ok, we're doing it! Stop everybody else
a.start_time = runtime·nanotime();
m->gcing = 1;
runtime·stoptheworld();
if(runtime·debug.allocfreetrace)
runtime·MProf_TraceGC();
clearpools();
// Run gc on the g0 stack. We do this so that the g stack
// we're currently running on will no longer change. Cuts
// the root set down a bit (g0 stacks are not scanned, and
// we don't need to scan gc's internal state). Also an
// enabler for copyable stacks.
for(i = 0; i < (runtime·debug.gctrace > 1 ? 2 : 1); i++) {
// switch to g0, call gc(&a), then switch back
g->param = &a;
g->status = Gwaiting;
g->waitreason = "garbage collection";
runtime·mcall(mgc);
// record a new start time in case we're going around again
a.start_time = runtime·nanotime();
}
// all done
m->gcing = 0;
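	// Holding m->locks>0 keeps this M non-preemptible while the semaphore is
	// released and the world is restarted.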
m->locks++;
runtime·semrelease(&runtime·worldsema);
runtime·starttheworld();
m->locks--;
// now that gc is done, kick off finalizer thread if needed
if(!ConcurrentSweep) {
if(finq != nil) {
runtime·lock(&gclock);
// kick off or wake up goroutine to run queued finalizers
if(fing == nil)
fing = runtime·newproc1(&runfinqv, nil, 0, 0, runtime·gc);
else if(fingwait) {
fingwait = 0;
runtime·ready(fing);
}
runtime·unlock(&gclock);
}
// give the queued finalizers, if any, a chance to run
runtime·gosched();
}
}
static void
mgc(G *gp)
{
gc(gp->param);
gp->param = nil;
gp->status = Grunning;
runtime·gogo(&gp->sched);
}
static void
gc(struct gc_args *args)
{
int64 t0, t1, t2, t3, t4;
uint64 heap0, heap1, obj, ninstr;
GCStats stats;
M *mp;
uint32 i;
Eface eface;
t0 = args->start_time;
work.tstart = args->start_time;
if(CollectStats)
runtime·memclr((byte*)&gcstats, sizeof(gcstats));
for(mp=runtime·allm; mp; mp=mp->alllink)
runtime·settype_flush(mp);
m->locks++; // disable gc during mallocs in parforalloc
if(work.markfor == nil)
work.markfor = runtime·parforalloc(MaxGcproc);
m->locks--;
if(itabtype == nil) {
// get C pointer to the Go type "itab"
runtime·gc_itab_ptr(&eface);
itabtype = ((PtrType*)eface.type)->elem;
}
t1 = runtime·nanotime();
	// Sweep whatever was not swept by bgsweep.
while(runtime·sweepone() != -1)
gcstats.npausesweep++;
work.nwait = 0;
work.ndone = 0;
work.nproc = runtime·gcprocs();
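	// The mark work is divided into RootCount fixed roots (data and bss
	// segments, finalizers, ...) plus one unit per goroutine stack; markroot
	// processes a single unit.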
runtime·parforsetup(work.markfor, work.nproc, RootCount + runtime·allglen, nil, false, markroot);
if(work.nproc > 1) {
runtime·noteclear(&work.alldone);
runtime·helpgc(work.nproc);
}
t2 = runtime·nanotime();
gchelperstart();
runtime·parfordo(work.markfor);
scanblock(nil, true);
t3 = runtime·nanotime();
bufferList[m->helpgc].busy = 0;
if(work.nproc > 1)
runtime·notesleep(&work.alldone);
cachestats();
	// The next_gc calculation is tricky with concurrent sweep since we don't know the size of the live heap.
	// Estimate what the live heap size was after the previous GC (for tracing only).
	heap0 = mstats.next_gc*100/(gcpercent+100);
	// Conservatively set next_gc to a high value assuming that everything is live.
	// Concurrent/lazy sweep will reduce this number as it discovers new garbage.
mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
t4 = runtime·nanotime();
mstats.last_gc = t4;
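	// Record this pause in the fixed-size ring buffer of recent pause times.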
mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t4 - t0;
mstats.pause_total_ns += t4 - t0;
mstats.numgc++;
if(mstats.debuggc)
runtime·printf("pause %D\n", t4-t0);
if(runtime·debug.gctrace) {
updatememstats(&stats);
heap1 = mstats.heap_alloc;
obj = mstats.nmalloc - mstats.nfree;
stats.nprocyield += work.markfor->nprocyield;
stats.nosyield += work.markfor->nosyield;
stats.nsleep += work.markfor->nsleep;
runtime·printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB, %D (%D-%D) objects,"
" %d/%d/%d sweeps,"
" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
mstats.numgc, work.nproc, (t3-t2)/1000000, (t2-t1)/1000000, (t1-t0+t4-t3)/1000000,
heap0>>20, heap1>>20, obj,
mstats.nmalloc, mstats.nfree,
sweep.nspan, gcstats.nbgsweep, gcstats.npausesweep,
stats.nhandoff, stats.nhandoffcnt,
work.markfor->nsteal, work.markfor->nstealcnt,
stats.nprocyield, stats.nosyield, stats.nsleep);
gcstats.nbgsweep = gcstats.npausesweep = 0;
if(CollectStats) {
runtime·printf("scan: %D bytes, %D objects, %D untyped, %D types from MSpan\n",
gcstats.nbytes, gcstats.obj.cnt, gcstats.obj.notype, gcstats.obj.typelookup);
if(gcstats.ptr.cnt != 0)
runtime·printf("avg ptrbufsize: %D (%D/%D)\n",
gcstats.ptr.sum/gcstats.ptr.cnt, gcstats.ptr.sum, gcstats.ptr.cnt);
if(gcstats.obj.cnt != 0)
runtime·printf("avg nobj: %D (%D/%D)\n",
gcstats.obj.sum/gcstats.obj.cnt, gcstats.obj.sum, gcstats.obj.cnt);
runtime·printf("rescans: %D, %D bytes\n", gcstats.rescan, gcstats.rescanbytes);
runtime·printf("instruction counts:\n");
ninstr = 0;
for(i=0; i<nelem(gcstats.instr); i++) {
runtime·printf("\t%d:\t%D\n", i, gcstats.instr[i]);
ninstr += gcstats.instr[i];
}
runtime·printf("\ttotal:\t%D\n", ninstr);
runtime·printf("putempty: %D, getfull: %D\n", gcstats.putempty, gcstats.getfull);
runtime·printf("markonly base lookup: bit %D word %D span %D\n", gcstats.markonly.foundbit, gcstats.markonly.foundword, gcstats.markonly.foundspan);
runtime·printf("flushptrbuf base lookup: bit %D word %D span %D\n", gcstats.flushptrbuf.foundbit, gcstats.flushptrbuf.foundword, gcstats.flushptrbuf.foundspan);
}
}
// We cache the current runtime·mheap.allspans array in sweep.spans,
// because the former can be resized and freed.
// Otherwise we would need to take the heap lock every time
// we want to convert a span index to a span pointer.
// Free the old cached array if necessary.
if(sweep.spans && sweep.spans != runtime·mheap.allspans)
runtime·SysFree(sweep.spans, sweep.nspan*sizeof(sweep.spans[0]), &mstats.other_sys);
// Cache the current array.
runtime·mheap.sweepspans = runtime·mheap.allspans;
runtime·mheap.sweepgen += 2;
runtime·mheap.sweepdone = false;
sweep.spans = runtime·mheap.allspans;
sweep.nspan = runtime·mheap.nspan;
sweep.spanidx = 0;
// Temporarily disable concurrent sweep, because we see failures on builders.
if(ConcurrentSweep) {
runtime·lock(&gclock);
if(sweep.g == nil)
sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, runtime·gc);
else if(sweep.parked) {
sweep.parked = false;
runtime·ready(sweep.g);
}
runtime·unlock(&gclock);
} else {
// Sweep all spans eagerly.
while(runtime·sweepone() != -1)
gcstats.npausesweep++;
}
runtime·MProf_GC();
}
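// Illustrative note (not part of the original source): when ConcurrentSweep is
// compiled out, the loop above sweeps every span inside the pause and counts
// them in gcstats.npausesweep while gcstats.nbgsweep stays zero; when it is
// enabled, the background sweeper goroutine sweep.g (created or readied above)
// takes over that work, and both counters are printed with the other GC
// statistics above before being reset for the next cycle.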
extern uintptr runtime·sizeof_C_MStats;
void
runtime·ReadMemStats(MStats *stats)
{
// Have to acquire worldsema to stop the world,
// because stoptheworld can only be used by
// one goroutine at a time, and there might be
// a pending garbage collection already calling it.
runtime·semacquire(&runtime·worldsema, false);
m->gcing = 1;
runtime·stoptheworld();
updatememstats(nil);
// The size of the trailing by_size array differs between Go and C:
// NumSizeClasses was changed, but we cannot change the Go struct because of backward compatibility.
runtime·memcopy(runtime·sizeof_C_MStats, stats, &mstats);
m->gcing = 0;
m->locks++;
runtime·semrelease(&runtime·worldsema);
runtime·starttheworld();
m->locks--;
}
void
runtime∕debug·readGCStats(Slice *pauses)
{
uint64 *p;
uint32 i, n;
// Calling code in runtime/debug should make the slice large enough.
if(pauses->cap < nelem(mstats.pause_ns)+3)
runtime·throw("runtime: short slice passed to readGCStats");
// Pass back: pauses, last gc (absolute time), number of gc, total pause ns.
p = (uint64*)pauses->array;
runtime·lock(&runtime·mheap);
n = mstats.numgc;
if(n > nelem(mstats.pause_ns))
n = nelem(mstats.pause_ns);
// The pause buffer is circular. The most recent pause is at
// pause_ns[(numgc-1)%nelem(pause_ns)]; walking backward from there
// moves farther back in time. We deliver the times most recent
// first, so the latest pause lands in p[0].
for(i=0; i<n; i++)
p[i] = mstats.pause_ns[(mstats.numgc-1-i)%nelem(mstats.pause_ns)];
p[n] = mstats.last_gc;
p[n+1] = mstats.numgc;
p[n+2] = mstats.pause_total_ns;
runtime·unlock(&runtime·mheap);
pauses->len = n+3;
}
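// Worked example (sketch, not in the original source; assumes pause_ns holds
// 256 entries): if mstats.numgc == 260, then n == 256 and the most recent
// pause is at pause_ns[(260-1)%256] == pause_ns[3], so the loop copies
//	p[0] = pause_ns[3]	// most recent pause
//	p[1] = pause_ns[2]
//	p[2] = pause_ns[1]
//	p[3] = pause_ns[0]
//	p[4] = pause_ns[255]	// wraps around the circular buffer
//	...
// and then stores p[256] = last_gc, p[257] = numgc, p[258] = pause_total_ns,
// setting pauses->len = 259.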
int32
runtime·setgcpercent(int32 in)
{
int32 out;
runtime·lock(&runtime·mheap);
if(gcpercent == GcpercentUnknown)
gcpercent = readgogc();
out = gcpercent;
if(in < 0)
in = -1;
gcpercent = in;
runtime·unlock(&runtime·mheap);
return out;
}
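// Usage sketch (illustrative, not in the original source; "old" is a
// hypothetical variable): the previous value is returned so callers can
// restore it, and any negative input is stored as -1, which by the usual GOGC
// semantics disables automatic collection until a non-negative value is set:
//	old = runtime·setgcpercent(-1);	// turn automatic GC off, remember old value
//	// ... allocation-heavy phase ...
//	runtime·setgcpercent(old);	// restore the previous setting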
static void
gchelperstart(void)
{
if(m->helpgc < 0 || m->helpgc >= MaxGcproc)
runtime·throw("gchelperstart: bad m->helpgc");
if(runtime·xchg(&bufferList[m->helpgc].busy, 1))
runtime·throw("gchelperstart: already busy");
if(g != m->g0)
runtime·throw("gchelper not running on g0 stack");
}
static void
runfinq(void)
{
Finalizer *f;
FinBlock *fb, *next;
byte *frame;
uint32 framesz, framecap, i;
Eface *ef, ef1;
frame = nil;
framecap = 0;
for(;;) {
runtime·lock(&gclock);
fb = finq;
finq = nil;
if(fb == nil) {
fingwait = 1;
runtime·parkunlock(&gclock, "finalizer wait");
continue;
}
runtime·unlock(&gclock);
if(raceenabled)
runtime·racefingo();
for(; fb; fb=next) {
next = fb->next;
for(i=0; i<fb->cnt; i++) {
f = &fb->fin[i];
framesz = sizeof(Eface) + f->nret;
if(framecap < framesz) {
runtime·free(frame);
// The frame does not contain pointers interesting for GC;
// all not-yet-finalized objects are stored in finq.
// If we did not mark it as FlagNoScan,
// the last finalized object would not be collected.
frame = runtime·mallocgc(framesz, 0, FlagNoScan|FlagNoInvokeGC);
framecap = framesz;
}
if(f->fint == nil)
runtime·throw("missing type in runfinq");
if(f->fint->kind == KindPtr) {
// direct use of pointer
*(void**)frame = f->arg;
} else if(((InterfaceType*)f->fint)->mhdr.len == 0) {
// convert to empty interface
ef = (Eface*)frame;
ef->type = f->ot;
ef->data = f->arg;
} else {
// convert to interface with methods, via empty interface.
ef1.type = f->ot;
ef1.data = f->arg;
if(!runtime·ifaceE2I2((InterfaceType*)f->fint, ef1, (Iface*)frame))
runtime·throw("invalid type conversion in runfinq");
}
reflect·call(f->fn, frame, framesz);
f->fn = nil;
f->arg = nil;
f->ot = nil;
}
fb->cnt = 0;
runtime·lock(&gclock);
fb->next = finc;
finc = fb;
runtime·unlock(&gclock);
}
runtime·gc(1); // trigger another gc to clean up the finalized objects, if possible
}
}
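// Frame layout sketch (illustrative, not in the original source; assumes a
// 64-bit system where sizeof(Eface) == 16): for a finalizer declared as
// func(*T) with no results, f->nret == 0 and framesz == 16. Because *T has
// kind KindPtr, the argument is stored directly at the start of the frame:
//	*(void**)frame = f->arg;	// frame[0..7] holds the pointer to the object
//	reflect·call(f->fn, frame, 16);	// invokes the finalizer with that frame
// For a finalizer taking an interface with methods, the object is first boxed
// into the Eface ef1 and then converted with runtime·ifaceE2I2 into the Iface
// stored at the start of the frame.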
void
runtime·marknogc(void *v)
{
uintptr *b, obits, bits, off, shift;
off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset
b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
for(;;) {
obits = *b;
if((obits>>shift & bitMask) != bitAllocated)
runtime·throw("bad initial state for marknogc");
bits = (obits & ~(bitAllocated<<shift)) | bitBlockBoundary<<shift;
if(runtime·gomaxprocs == 1) {
*b = bits;
break;
} else {
// more than one goroutine is potentially running: use atomic op
if(runtime·casp((void**)b, (void*)obits, (void*)bits))
break;
}
}
}
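// Bitmap addressing sketch (illustrative, not in the original source; assumes
// a 64-bit system, so PtrSize == 8 and wordsPerBitmapWord == 16). The bitmap
// grows down from arena_start, and for a heap pointer v:
//	off   = (uintptr*)v - (uintptr*)arena_start;	// heap word index
//	b     = (uintptr*)arena_start - off/16 - 1;	// bitmap word below the arena
//	shift = off % 16;	// bit position within that word
// For example, v == arena_start + 0x250 gives off == 74, so its bits live in
// the fifth bitmap word below arena_start (b == arena_start - 5 words) at
// shift 10. runtime·marknogc above and runtime·markscan/runtime·markfreed
// below all perform this same calculation.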
void
runtime·markscan(void *v)
{
uintptr *b, obits, bits, off, shift;
off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset
b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
for(;;) {
obits = *b;
if((obits>>shift & bitMask) != bitAllocated)
runtime·throw("bad initial state for markscan");
bits = obits | bitScan<<shift;
if(runtime·gomaxprocs == 1) {
*b = bits;
break;
} else {
// more than one goroutine is potentially running: use atomic op
if(runtime·casp((void**)b, (void*)obits, (void*)bits))
break;
}
}
}
// mark the block at v of size n as freed.
void
runtime·markfreed(void *v, uintptr n)
{
uintptr *b, obits, bits, off, shift;
if(0)
runtime·printf("markfreed %p+%p\n", v, n);
if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
runtime·throw("markfreed: bad pointer");
off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset
b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
for(;;) {
obits = *b;
// This could be a free of a gc-eligible object (bitAllocated + others) or
// a FlagNoGC object (bitBlockBoundary set). In either case, we revert to
// a simple no-scan allocated object because it is going on a free list.
bits = (obits & ~(bitMask<<shift)) | (bitAllocated<<shift);
if(runtime·gomaxprocs == 1) {
*b = bits;
break;
} else {
// more than one goroutine is potentially running: use atomic op
if(runtime·casp((void**)b, (void*)obits, (void*)bits))
break;
}
}
}
// check that the block at v of size n is marked freed.
void
runtime·checkfreed(void *v, uintptr n)
{
uintptr *b, bits, off, shift;
if(!runtime·checking)
return;
if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
return; // not allocated, so okay
off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset
b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
bits = *b>>shift;
if((bits & bitAllocated) != 0) {
runtime·printf("checkfreed %p+%p: off=%p have=%p\n",
v, n, off, bits & bitMask);
runtime·throw("checkfreed: not freed");
}
}
// mark the span of memory at v as having n blocks of the given size.
// if leftover is true, there is left over space at the end of the span.
void
runtime·markspan(void *v, uintptr size, uintptr n, bool leftover)
{
uintptr *b, off, shift, i;
byte *p;
if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
runtime·throw("markspan: bad pointer");
if(runtime·checking) {
// bits should be all zero at the start
off = (byte*)v + size - runtime·mheap.arena_start;
b = (uintptr*)(runtime·mheap.arena_start - off/wordsPerBitmapWord);
for(i = 0; i < size/PtrSize/wordsPerBitmapWord; i++) {
if(b[i] != 0)
runtime·throw("markspan: span bits not zero");
}
}
p = v;
if(leftover) // mark a boundary just past end of last block too
n++;
for(; n-- > 0; p += size) {
// Okay to use non-atomic ops here, because we control
// the entire span, and each bitmap word has bits for only
// one span, so no other goroutines are changing these
// bitmap words.
off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start; // word offset
b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
*b = (*b & ~(bitMask<<shift)) | (bitAllocated<<shift);
}
}
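// Worked example (illustrative, not in the original source): marking a span at
// v that holds n == 3 blocks of size == 48 bytes plus leftover space sets a
// boundary bit at each block start and one just past the last block:
//	runtime·markspan(v, 48, 3, true);
//	// marks v+0, v+48 and v+96 as allocated block boundaries,
//	// plus v+144 for the leftover tail of the span.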
// unmark the span of memory at v of length n bytes.
void
runtime·unmarkspan(void *v, uintptr n)
{
uintptr *p, *b, off;
if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
runtime·throw("markspan: bad pointer");
p = v;
off = p - (uintptr*)runtime·mheap.arena_start; // word offset
if(off % wordsPerBitmapWord != 0)
runtime·throw("markspan: unaligned pointer");
b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
n /= PtrSize;
if(n%wordsPerBitmapWord != 0)
runtime·throw("unmarkspan: unaligned length");
// Okay to use non-atomic ops here, because we control
// the entire span, and each bitmap word has bits for only
// one span, so no other goroutines are changing these
// bitmap words.
n /= wordsPerBitmapWord;
while(n-- > 0)
*b-- = 0;
}
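// Worked example (illustrative, not in the original source; assumes a 64-bit
// system with PtrSize == 8 and wordsPerBitmapWord == 16): unmarking an
// 8192-byte span covers 8192/8 = 1024 heap words, i.e. 1024/16 = 64 bitmap
// words, which the loop above zeroes while walking b downward, since the
// bitmap grows down from arena_start:
//	runtime·unmarkspan(v, 8192);	// zeroes 64 consecutive bitmap words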
void
runtime·MHeap_MapBits(MHeap *h)
{
// Caller has added extra mappings to the arena.
// Add extra mappings of bitmap words as needed.
// We allocate extra bitmap pieces in chunks of bitmapChunk.
enum {
bitmapChunk = 8192
};
uintptr n;
n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
n = ROUND(n, bitmapChunk);
n = ROUND(n, PhysPageSize);
if(h->bitmap_mapped >= n)
return;
runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped, &mstats.gc_sys);
h->bitmap_mapped = n;
}
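// Worked example (illustrative, not in the original source; assumes a 64-bit
// system with wordsPerBitmapWord == 16 and PhysPageSize == 4096): if the arena
// has grown so that arena_used - arena_start == 100<<10 bytes, the bitmap
// needs 102400/16 = 6400 bytes, which rounds up to 8192 via bitmapChunk and
// stays 8192 after page rounding. Only n - bitmap_mapped new bytes are then
// mapped, extending the bitmap downward from arena_start.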