diff --git a/src/pkg/runtime/debug.go b/src/pkg/runtime/debug.go index d09db1be6ae..5117e1a551e 100644 --- a/src/pkg/runtime/debug.go +++ b/src/pkg/runtime/debug.go @@ -69,7 +69,8 @@ type MemStatsType struct { // Per-size allocation statistics. // Not locked during update; approximate. - BySize [67]struct { + // 61 is NumSizeClasses in the C code. + BySize [61]struct { Size uint32 Mallocs uint64 Frees uint64 diff --git a/src/pkg/runtime/iface.c b/src/pkg/runtime/iface.c index aa36df68eee..3dec45e2b8e 100644 --- a/src/pkg/runtime/iface.c +++ b/src/pkg/runtime/iface.c @@ -702,7 +702,7 @@ unsafe·New(Eface typ, void *ret) t = (Type*)((Eface*)typ.data-1); if(t->kind&KindNoPointers) - ret = runtime·mallocgc(t->size, RefNoPointers, 1, 1); + ret = runtime·mallocgc(t->size, FlagNoPointers, 1, 1); else ret = runtime·mal(t->size); FLUSH(&ret); @@ -722,7 +722,7 @@ unsafe·NewArray(Eface typ, uint32 n, void *ret) size = n*t->size; if(t->kind&KindNoPointers) - ret = runtime·mallocgc(size, RefNoPointers, 1, 1); + ret = runtime·mallocgc(size, FlagNoPointers, 1, 1); else ret = runtime·mal(size); FLUSH(&ret); diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc index cc28b943df5..8899b01195d 100644 --- a/src/pkg/runtime/malloc.goc +++ b/src/pkg/runtime/malloc.goc @@ -36,14 +36,13 @@ fastrand1(void) // Small objects are allocated from the per-thread cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. void* -runtime·mallocgc(uintptr size, uint32 refflag, int32 dogc, int32 zeroed) +runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) { int32 sizeclass, rate; MCache *c; uintptr npages; MSpan *s; void *v; - uint32 *ref; if(runtime·gcwaiting && g != m->g0 && m->locks == 0) runtime·gosched(); @@ -65,12 +64,6 @@ runtime·mallocgc(uintptr size, uint32 refflag, int32 dogc, int32 zeroed) mstats.alloc += size; mstats.total_alloc += size; mstats.by_size[sizeclass].nmalloc++; - - if(!runtime·mlookup(v, nil, nil, nil, &ref)) { - runtime·printf("malloc %D; runtime·mlookup failed\n", (uint64)size); - runtime·throw("malloc runtime·mlookup"); - } - *ref = RefNone | refflag; } else { // TODO(rsc): Report tracebacks for very large allocations. @@ -87,13 +80,14 @@ runtime·mallocgc(uintptr size, uint32 refflag, int32 dogc, int32 zeroed) v = (void*)(s->start << PageShift); // setup for mark sweep - s->gcref0 = RefNone | refflag; - ref = &s->gcref0; + runtime·markspan(v, 0, 0, true); } + if(!(flag & FlagNoGC)) + runtime·markallocated(v, size, (flag&FlagNoPointers) != 0); m->mallocing = 0; - if(!(refflag & RefNoProfiling) && (rate = runtime·MemProfileRate) > 0) { + if(!(flag & FlagNoProfiling) && (rate = runtime·MemProfileRate) > 0) { if(size >= rate) goto profile; if(m->mcache->next_sample > size) @@ -104,7 +98,7 @@ runtime·mallocgc(uintptr size, uint32 refflag, int32 dogc, int32 zeroed) rate = 0x3fffffff; m->mcache->next_sample = fastrand1() % (2*rate); profile: - *ref |= RefProfiled; + runtime·setblockspecial(v); runtime·MProf_Malloc(v, size); } } @@ -124,33 +118,35 @@ runtime·malloc(uintptr size) void runtime·free(void *v) { - int32 sizeclass, size; + int32 sizeclass; MSpan *s; MCache *c; - uint32 prof, *ref; + uint32 prof; + uintptr size; if(v == nil) return; + + // If you change this also change mgc0.c:/^sweepspan, + // which has a copy of the guts of free. 
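The refflag/Ref* word stored beside each object is gone: callers now describe an allocation with the Flag* bits defined later in malloc.h, and the collector keeps its state in a separate heap bitmap. For illustration, a standalone C sketch (not part of the patch) of how those flag bits combine and are tested:

#include <stdint.h>
#include <stdio.h>

enum {
    FlagNoPointers  = 1<<0,  /* block contains no pointers: GC need not scan it */
    FlagNoProfiling = 1<<1,  /* do not sample this allocation for memory profiles */
    FlagNoGC        = 1<<2,  /* never free or scan (e.g. stack segments) */
};

static void describe(uint32_t flag)
{
    printf("scan=%d profile=%d collect=%d\n",
        (flag & FlagNoPointers) == 0,
        (flag & FlagNoProfiling) == 0,
        (flag & FlagNoGC) == 0);
}

int main(void)
{
    describe(FlagNoPointers);              /* e.g. a []byte backing array */
    describe(FlagNoProfiling | FlagNoGC);  /* e.g. a stack segment */
    return 0;
}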
if(m->mallocing) runtime·throw("malloc/free - deadlock"); m->mallocing = 1; - if(!runtime·mlookup(v, nil, nil, &s, &ref)) { + if(!runtime·mlookup(v, nil, nil, &s)) { runtime·printf("free %p: not an allocated block\n", v); runtime·throw("free runtime·mlookup"); } - prof = *ref & RefProfiled; - *ref = RefFree; + prof = runtime·blockspecial(v); // Find size class for v. sizeclass = s->sizeclass; if(sizeclass == 0) { // Large object. - if(prof) - runtime·MProf_Free(v, s->npages<npages<npages<npages<start< sizeof(uintptr)) ((uintptr*)v)[1] = 1; // mark as "needs to be zeroed" - if(prof) - runtime·MProf_Free(v, size); - mstats.alloc -= size; mstats.by_size[sizeclass].nfree++; runtime·MCache_Free(c, v, sizeclass, size); } + runtime·markfreed(v, size); + mstats.alloc -= size; + if(prof) + runtime·MProf_Free(v, size); m->mallocing = 0; } int32 -runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp, uint32 **ref) +runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp) { - uintptr n, nobj, i; + uintptr n, i; byte *p; MSpan *s; @@ -179,12 +176,11 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp, uint32 **ref) if(sp) *sp = s; if(s == nil) { + runtime·checkfreed(v, 1); if(base) *base = nil; if(size) *size = 0; - if(ref) - *ref = 0; return 0; } @@ -195,14 +191,11 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp, uint32 **ref) *base = p; if(size) *size = s->npages<gcref0; return 1; } - if((byte*)v >= (byte*)s->gcref) { - // pointers into the gc ref counts - // do not count as pointers. + if((byte*)v >= (byte*)s->limit) { + // pointers past the last block do not count as pointers. return 0; } @@ -213,21 +206,6 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp, uint32 **ref) if(size) *size = n; - // good for error checking, but expensive - if(0) { - nobj = (s->npages << PageShift) / (n + RefcountOverhead); - if((byte*)s->gcref < p || (byte*)(s->gcref+nobj) > p+(s->npages<state, s, p, s->sizeclass, (uint64)nobj, (uint64)n, (uint64)s->npages); - runtime·printf("s->base sizeclass %d v=%p base=%p gcref=%p blocksize=%D nobj=%D size=%D end=%p end=%p\n", - s->sizeclass, v, p, s->gcref, (uint64)s->npages<gcref + nobj, p+(s->npages<gcref[i]; - return 1; } @@ -246,14 +224,20 @@ runtime·allocmcache(void) int32 runtime·sizeof_C_MStats = sizeof(MStats); +#define MaxArena32 (2U<<30) + void runtime·mallocinit(void) { byte *p; - uintptr arena_size; + uintptr arena_size, bitmap_size; + extern byte end[]; runtime·InitSizes(); + // Set up the allocation arena, a contiguous area of memory where + // allocated data will be found. The arena begins with a bitmap large + // enough to hold 4 bits per allocated word. if(sizeof(void*) == 8) { // On a 64-bit machine, allocate from a single contiguous reservation. // 16 GB should be big enough for now. @@ -273,19 +257,44 @@ runtime·mallocinit(void) // odds of the conservative garbage collector not collecting memory // because some non-pointer block of memory had a bit pattern // that matched a memory address. + // + // Actually we reserve 17 GB (because the bitmap ends up being 1 GB) + // but it hardly matters: fc is not valid UTF-8 either, and we have to + // allocate 15 GB before we get that far. 
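Both the new mlookup and scanblock recover the base of a block from an interior pointer by dividing the offset within the span by the block's size class. A standalone sketch of that arithmetic, with assumed addresses and sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uintptr_t span_base = 0x200000;         /* start of the span (assumed) */
    uintptr_t size = 48;                    /* block size for this size class (assumed) */
    uintptr_t v = span_base + 5*size + 20;  /* pointer into the middle of block 5 */

    uintptr_t i = (v - span_base) / size;   /* block index, as in mlookup/scanblock */
    uintptr_t base = span_base + i*size;    /* base of the block containing v */

    printf("block %lu, base %#lx, offset in block %lu\n",
        (unsigned long)i, (unsigned long)base, (unsigned long)(v - base));
    return 0;
}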
arena_size = 16LL<<30; - p = runtime·SysReserve((void*)(0x00f8ULL<<32), arena_size); + bitmap_size = arena_size / (sizeof(void*)*8/4); + p = runtime·SysReserve((void*)(0x00f8ULL<<32), bitmap_size + arena_size); if(p == nil) runtime·throw("runtime: cannot reserve arena virtual address space"); - runtime·mheap.arena_start = p; - runtime·mheap.arena_used = p; - runtime·mheap.arena_end = p + arena_size; } else { - // On a 32-bit machine, we'll take what we can get for each allocation - // and maintain arena_start and arena_end as min, max we've seen. - runtime·mheap.arena_start = (byte*)0xffffffff; - runtime·mheap.arena_end = 0; + // On a 32-bit machine, we can't typically get away + // with a giant virtual address space reservation. + // Instead we map the memory information bitmap + // immediately after the data segment, large enough + // to handle another 2GB of mappings (256 MB), + // along with a reservation for another 512 MB of memory. + // When that gets used up, we'll start asking the kernel + // for any memory anywhere and hope it's in the 2GB + // following the bitmap (presumably the executable begins + // near the bottom of memory, so we'll have to use up + // most of memory before the kernel resorts to giving out + // memory before the beginning of the text segment). + // + // Alternatively we could reserve 512 MB bitmap, enough + // for 4GB of mappings, and then accept any memory the + // kernel threw at us, but normally that's a waste of 512 MB + // of address space, which is probably too much in a 32-bit world. + bitmap_size = MaxArena32 / (sizeof(void*)*8/4); + arena_size = 512<<20; + + p = (void*)(((uintptr)end + 64*1024 - 1) & ~(64*1024-1)); + if(runtime·SysReserve(p, bitmap_size + arena_size) != p) + runtime·throw("runtime: cannot reserve memory bitmap virtual address space"); } + runtime·mheap.bitmap = p; + runtime·mheap.arena_start = p + bitmap_size; + runtime·mheap.arena_used = runtime·mheap.arena_start; + runtime·mheap.arena_end = runtime·mheap.arena_start + arena_size; // Initialize the rest of the allocator. runtime·MHeap_Init(&runtime·mheap, runtime·SysAlloc); @@ -299,26 +308,41 @@ void* runtime·MHeap_SysAlloc(MHeap *h, uintptr n) { byte *p; - - if(sizeof(void*) == 8) { + + if(n <= h->arena_end - h->arena_used) { // Keep taking from our reservation. - if(h->arena_end - h->arena_used < n) - return nil; p = h->arena_used; runtime·SysMap(p, n); h->arena_used += n; + runtime·MHeap_MapBits(h); return p; - } else { - // Take what we can get from the OS. - p = runtime·SysAlloc(n); - if(p == nil) - return nil; - if(p+n > h->arena_used) - h->arena_used = p+n; - if(p > h->arena_end) - h->arena_end = p; - return p; } + + // On 64-bit, our reservation is all we have. + if(sizeof(void*) == 8) + return nil; + + // On 32-bit, once the reservation is gone we can + // try to get memory at a location chosen by the OS + // and hope that it is in the range we allocated bitmap for. + p = runtime·SysAlloc(n); + if(p == nil) + return nil; + + if(p < h->arena_start || p+n - h->arena_start >= MaxArena32) { + runtime·printf("runtime: memory allocated by OS not in usable range"); + runtime·SysFree(p, n); + return nil; + } + + if(p+n > h->arena_used) { + h->arena_used = p+n; + if(h->arena_used > h->arena_end) + h->arena_end = h->arena_used; + runtime·MHeap_MapBits(h); + } + + return p; } // Runtime stubs. 
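The reservation above pairs the arena with a bitmap of 4 bits per arena word, so bitmap_size = arena_size / (wordsize*8/4): 1 GB of bitmap for the 16 GB arena on 64-bit, 256 MB for the 2 GB of possible 32-bit mappings. A standalone sketch of that arithmetic:

#include <stdint.h>
#include <stdio.h>

static uint64_t bitmap_bytes(uint64_t arena_size, int ptrsize)
{
    return arena_size / (ptrsize*8/4);   /* 4 bitmap bits per arena word */
}

int main(void)
{
    uint64_t a64 = 16ULL<<30;            /* 16 GB arena on 64-bit */
    uint64_t a32 = 2ULL<<30;             /* MaxArena32: 2 GB of mappings on 32-bit */

    printf("64-bit: arena %llu GB, bitmap %llu GB\n",
        (unsigned long long)(a64>>30), (unsigned long long)(bitmap_bytes(a64, 8)>>30));
    printf("32-bit: arena %llu GB, bitmap %llu MB\n",
        (unsigned long long)(a32>>30), (unsigned long long)(bitmap_bytes(a32, 4)>>20));
    return 0;
}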
@@ -353,7 +377,6 @@ void* runtime·stackalloc(uint32 n) { void *v; - uint32 *ref; if(m->mallocing || m->gcing || n == FixedStack) { runtime·lock(&stacks); @@ -369,11 +392,7 @@ runtime·stackalloc(uint32 n) runtime·unlock(&stacks); return v; } - v = runtime·mallocgc(n, RefNoProfiling, 0, 0); - if(!runtime·mlookup(v, nil, nil, nil, &ref)) - runtime·throw("stackalloc runtime·mlookup"); - *ref = RefStack; - return v; + return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0); } void @@ -399,7 +418,7 @@ func Free(p *byte) { } func Lookup(p *byte) (base *byte, size uintptr) { - runtime·mlookup(p, &base, &size, nil, nil); + runtime·mlookup(p, &base, &size, nil); } func GC() { @@ -422,7 +441,7 @@ func SetFinalizer(obj Eface, finalizer Eface) { runtime·printf("runtime.SetFinalizer: first argument is %S, not pointer\n", *obj.type->string); goto throw; } - if(!runtime·mlookup(obj.data, &base, &size, nil, nil) || obj.data != base) { + if(!runtime·mlookup(obj.data, &base, &size, nil) || obj.data != base) { runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n"); goto throw; } diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h index e2472e8d238..4e2794570de 100644 --- a/src/pkg/runtime/malloc.h +++ b/src/pkg/runtime/malloc.h @@ -97,8 +97,14 @@ typedef uintptr PageID; // address >> PageShift enum { + // Computed constant. The definition of MaxSmallSize and the + // algorithm in msize.c produce some number of different allocation + // size classes. NumSizeClasses is that number. It's needed here + // because there are static arrays of this length; when msize runs its + // size choosing algorithm it double-checks that NumSizeClasses agrees. + NumSizeClasses = 61, + // Tunable constants. - NumSizeClasses = 67, // Number of size classes (must match msize.c) MaxSmallSize = 32<<10, FixAllocChunk = 128<<10, // Chunk size for FixAlloc @@ -290,10 +296,7 @@ struct MSpan uint32 ref; // number of allocated objects in this span uint32 sizeclass; // size class uint32 state; // MSpanInUse etc - union { - uint32 *gcref; // sizeclass > 0 - uint32 gcref0; // sizeclass == 0 - }; + byte *limit; // end of data in span }; void runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages); @@ -336,6 +339,7 @@ struct MHeap // range of addresses we might see in the heap byte *bitmap; + uintptr bitmap_mapped; byte *arena_start; byte *arena_used; byte *arena_end; @@ -359,26 +363,29 @@ MSpan* runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct void runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct); MSpan* runtime·MHeap_Lookup(MHeap *h, void *v); MSpan* runtime·MHeap_LookupMaybe(MHeap *h, void *v); -void runtime·MGetSizeClassInfo(int32 sizeclass, int32 *size, int32 *npages, int32 *nobj); +void runtime·MGetSizeClassInfo(int32 sizeclass, uintptr *size, int32 *npages, int32 *nobj); void* runtime·MHeap_SysAlloc(MHeap *h, uintptr n); +void runtime·MHeap_MapBits(MHeap *h); void* runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed); -int32 runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s, uint32 **ref); +int32 runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s); void runtime·gc(int32 force); +void runtime·markallocated(void *v, uintptr n, bool noptr); +void runtime·checkallocated(void *v, uintptr n); +void runtime·markfreed(void *v, uintptr n); +void runtime·checkfreed(void *v, uintptr n); +int32 runtime·checking; +void runtime·markspan(void *v, uintptr size, uintptr n, bool leftover); +void runtime·unmarkspan(void *v, uintptr 
size); +bool runtime·blockspecial(void*); +void runtime·setblockspecial(void*); enum { - RefcountOverhead = 4, // one uint32 per object - - RefFree = 0, // must be zero - RefStack, // stack segment - don't free and don't scan for pointers - RefNone, // no references - RefSome, // some references - RefNoPointers = 0x80000000U, // flag - no pointers here - RefHasFinalizer = 0x40000000U, // flag - has finalizer - RefProfiled = 0x20000000U, // flag - is in profiling table - RefNoProfiling = 0x10000000U, // flag - must not profile - RefFlags = 0xFFFF0000U, + // flags to malloc + FlagNoPointers = 1<<0, // no pointers here + FlagNoProfiling = 1<<1, // must not profile + FlagNoGC = 1<<2, // must not free or scan for pointers }; void runtime·MProf_Malloc(void*, uintptr); diff --git a/src/pkg/runtime/mcentral.c b/src/pkg/runtime/mcentral.c index f1ad119d3a4..29b03b58f88 100644 --- a/src/pkg/runtime/mcentral.c +++ b/src/pkg/runtime/mcentral.c @@ -113,8 +113,7 @@ static void MCentral_Free(MCentral *c, void *v) { MSpan *s; - PageID page; - MLink *p, *next; + MLink *p; int32 size; // Find span for v. @@ -138,16 +137,8 @@ MCentral_Free(MCentral *c, void *v) if(--s->ref == 0) { size = runtime·class_to_size[c->sizeclass]; runtime·MSpanList_Remove(s); - // The second word of each freed block indicates - // whether it needs to be zeroed. The first word - // is the link pointer and must always be cleared. - for(p=s->freelist; p; p=next) { - next = p->next; - if(size > sizeof(uintptr) && ((uintptr*)p)[1] != 0) - runtime·memclr((byte*)p, size); - else - p->next = nil; - } + runtime·unmarkspan((byte*)(s->start<npages<start<freelist = nil; c->nfree -= (s->npages << PageShift) / size; runtime·unlock(c); @@ -157,7 +148,7 @@ MCentral_Free(MCentral *c, void *v) } void -runtime·MGetSizeClassInfo(int32 sizeclass, int32 *sizep, int32 *npagesp, int32 *nobj) +runtime·MGetSizeClassInfo(int32 sizeclass, uintptr *sizep, int32 *npagesp, int32 *nobj) { int32 size; int32 npages; @@ -166,7 +157,7 @@ runtime·MGetSizeClassInfo(int32 sizeclass, int32 *sizep, int32 *npagesp, int32 size = runtime·class_to_size[sizeclass]; *npagesp = npages; *sizep = size; - *nobj = (npages << PageShift) / (size + RefcountOverhead); + *nobj = (npages << PageShift) / size; } // Fetch a new span from the heap and @@ -174,7 +165,8 @@ runtime·MGetSizeClassInfo(int32 sizeclass, int32 *sizep, int32 *npagesp, int32 static bool MCentral_Grow(MCentral *c) { - int32 i, n, npages, size; + int32 i, n, npages; + uintptr size; MLink **tailp, *v; byte *p; MSpan *s; @@ -191,7 +183,7 @@ MCentral_Grow(MCentral *c) // Carve span into sequence of blocks. tailp = &s->freelist; p = (byte*)(s->start << PageShift); - s->gcref = (uint32*)(p + size*n); + s->limit = p + size*n; for(i=0; istart<npages<nfree += n; diff --git a/src/pkg/runtime/mfinal.c b/src/pkg/runtime/mfinal.c index f73561b3cd9..03ee777c0be 100644 --- a/src/pkg/runtime/mfinal.c +++ b/src/pkg/runtime/mfinal.c @@ -5,6 +5,7 @@ #include "runtime.h" #include "malloc.h" +// TODO(rsc): Why not just use mheap.Lock? static Lock finlock; // Finalizer hash table. Direct hash, linear scan, at most 3/4 full. 
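With the per-object reference word gone, MGetSizeClassInfo divides the span by the object size alone, and MCentral_Grow records the end of the carved blocks in s->limit rather than placing a gcref array there. A standalone sketch, with assumed numbers, of how the block count and limit change:

#include <stdint.h>
#include <stdio.h>

enum { PageShift = 12, RefcountOverhead = 4 };  /* the old 4-byte per-object ref word */

int main(void)
{
    uintptr_t npages = 1, size = 48;        /* one size class (assumed numbers) */
    uintptr_t span = npages << PageShift;   /* bytes in the span */

    uintptr_t nobj_old = span / (size + RefcountOverhead); /* blocks plus trailing gcref array */
    uintptr_t nobj_new = span / size;                      /* blocks only; marks live in the bitmap */
    uintptr_t limit_off = nobj_new * size;  /* s->limit = span start + this offset */

    printf("old: %lu blocks   new: %lu blocks, s->limit at +%lu, %lu leftover bytes\n",
        (unsigned long)nobj_old, (unsigned long)nobj_new,
        (unsigned long)limit_off, (unsigned long)(span - limit_off));
    return 0;
}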
@@ -101,24 +102,21 @@ runtime·addfinalizer(void *p, void (*f)(void*), int32 nret) } runtime·lock(&finlock); - if(!runtime·mlookup(p, &base, nil, nil, &ref) || p != base) { + if(!runtime·mlookup(p, &base, nil, nil) || p != base) { runtime·unlock(&finlock); runtime·throw("addfinalizer on invalid pointer"); } if(f == nil) { - if(*ref & RefHasFinalizer) { - lookfintab(&fintab, p, 1); - *ref &= ~RefHasFinalizer; - } + lookfintab(&fintab, p, 1); runtime·unlock(&finlock); return; } - if(*ref & RefHasFinalizer) { + if(lookfintab(&fintab, p, 0)) { runtime·unlock(&finlock); runtime·throw("double finalizer"); } - *ref |= RefHasFinalizer; + runtime·setblockspecial(p); if(fintab.nkey >= fintab.max/2+fintab.max/4) { // keep table at most 3/4 full: @@ -134,7 +132,7 @@ runtime·addfinalizer(void *p, void (*f)(void*), int32 nret) newtab.max *= 3; } - newtab.key = runtime·mallocgc(newtab.max*sizeof newtab.key[0], RefNoPointers, 0, 1); + newtab.key = runtime·mallocgc(newtab.max*sizeof newtab.key[0], FlagNoPointers, 0, 1); newtab.val = runtime·mallocgc(newtab.max*sizeof newtab.val[0], 0, 0, 1); for(i=0; i> shift; +// /* then test bits & bitAllocated, bits & bitMarked, etc. */ +// +#define bitAllocated ((uintptr)1<<(bitShift*0)) +#define bitNoPointers ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */ +#define bitMarked ((uintptr)1<<(bitShift*2)) /* when bitAllocated is set */ +#define bitSpecial ((uintptr)1<<(bitShift*3)) /* when bitAllocated is set - has finalizer or being profiled */ +#define bitBlockBoundary ((uintptr)1<<(bitShift*1)) /* when bitAllocated is NOT set */ + +#define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial) + +static uint64 nlookup; +static uint64 nsizelookup; +static uint64 naddrlookup; +static uint64 nhandoff; +static int32 gctrace; + +typedef struct Workbuf Workbuf; +struct Workbuf { - byte *obj; - uintptr size; + Workbuf *next; + uintptr nw; + byte *w[2048-2]; }; extern byte data[]; @@ -33,72 +71,258 @@ extern byte end[]; static G *fing; static Finalizer *finq; static int32 fingwait; -static BlockList *bl, *ebl; +static uint32 nfullwait; static void runfinq(void); +static bool bitlookup(void*, uintptr**, uintptr*, int32*); +static Workbuf* getempty(Workbuf*); +static Workbuf* getfull(Workbuf*); -enum { - PtrSize = sizeof(void*) -}; - +// scanblock scans a block of n bytes starting at pointer b for references +// to other objects, scanning any it finds recursively until there are no +// unscanned objects left. Instead of using an explicit recursion, it keeps +// a work list in the Workbuf* structures and loops in the main function +// body. Keeping an explicit work list is easier on the stack allocator and +// more efficient. static void scanblock(byte *b, int64 n) { - int32 off; - void *obj; - uintptr size; - uint32 *refp, ref; + byte *obj, *arena_start, *p; void **vp; - int64 i; - BlockList *w; + uintptr size, *bitp, bits, shift, i, j, x, xbits, off; + MSpan *s; + PageID k; + void **bw, **w, **ew; + Workbuf *wbuf; - w = bl; - w->obj = b; - w->size = n; - w++; + // Memory arena parameters. + arena_start = runtime·mheap.arena_start; + + wbuf = nil; // current work buffer + ew = nil; // end of work buffer + bw = nil; // beginning of work buffer + w = nil; // current pointer into work buffer - while(w > bl) { - w--; - b = w->obj; - n = w->size; + // Align b to a word boundary. 
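The bitmap sits immediately below arena_start and grows toward lower addresses; each heap word owns 4 bits, spread across a bitmap word in columns bitShift apart (bitAllocated, bitNoPointers/bitBlockBoundary, bitMarked, bitSpecial). A standalone sketch of the off/bitp/shift arithmetic that scanblock, markallocated and friends all repeat, using toy addresses rather than the real heap:

#include <stdint.h>
#include <stdio.h>

enum { wordsPerBitmapWord = sizeof(void*)*8/4 };   /* 16 on 64-bit, 8 on 32-bit */

int main(void)
{
    uintptr_t arena_start = 0x10000000;             /* assumed arena base */
    uintptr_t v = arena_start + 1000*sizeof(void*); /* some word inside the arena */

    /* Same arithmetic as scanblock/markallocated: the bitmap grows down from arena_start. */
    uintptr_t off   = (v - arena_start) / sizeof(void*);      /* word offset into the arena */
    uintptr_t *bitp = (uintptr_t*)arena_start - off/wordsPerBitmapWord - 1;
    uintptr_t shift = off % wordsPerBitmapWord;

    printf("word #%lu: bitmap word %lu below arena_start, shift %lu\n",
        (unsigned long)off,
        (unsigned long)((uintptr_t*)arena_start - bitp),
        (unsigned long)shift);
    /* bits = *bitp >> shift;  then test bits & bitAllocated, bits & bitMarked, ... */
    return 0;
}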
+ off = (uintptr)b & (PtrSize-1); + if(off != 0) { + b += PtrSize - off; + n -= PtrSize - off; + } + for(;;) { + // Each iteration scans the block b of length n, queueing pointers in + // the work buffer. if(Debug > 1) runtime·printf("scanblock %p %D\n", b, n); - off = (uint32)(uintptr)b & (PtrSize-1); - if(off) { - b += PtrSize - off; - n -= PtrSize - off; - } - + vp = (void**)b; n /= PtrSize; for(i=0; i= runtime·mheap.arena_used) continue; - if(runtime·mheap.arena_start <= (byte*)obj && (byte*)obj < runtime·mheap.arena_end) { - if(runtime·mlookup(obj, &obj, &size, nil, &refp)) { - ref = *refp; - switch(ref & ~RefFlags) { - case RefNone: - if(Debug > 1) - runtime·printf("found at %p: ", &vp[i]); - *refp = RefSome | (ref & RefFlags); - if(!(ref & RefNoPointers)) { - if(w >= ebl) - runtime·throw("scanblock: garbage collection stack overflow"); - w->obj = obj; - w->size = size; - w++; - } - break; - } + + // obj may be a pointer to a live object. + // Try to find the beginning of the object. + + // Round down to word boundary. + obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); + + // Find bits for this word. + off = (uintptr*)obj - (uintptr*)arena_start; + bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + xbits = *bitp; + bits = xbits >> shift; + + // Pointing at the beginning of a block? + if((bits & (bitAllocated|bitBlockBoundary)) != 0) + goto found; + + // Pointing just past the beginning? + // Scan backward a little to find a block boundary. + for(j=shift; j-->0; ) { + if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) { + obj = (byte*)obj - (shift-j)*PtrSize; + shift = j; + bits = xbits>>shift; + goto found; } } + + // Otherwise consult span table to find beginning. + // (Manually inlined copy of MHeap_LookupMaybe.) + nlookup++; + naddrlookup++; + k = (uintptr)obj>>PageShift; + x = k; + if(sizeof(void*) == 8) + x -= (uintptr)arena_start>>PageShift; + s = runtime·mheap.map[x]; + if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) + continue; + p = (byte*)((uintptr)s->start<sizeclass == 0) { + obj = p; + } else { + if((byte*)obj >= (byte*)s->limit) + continue; + size = runtime·class_to_size[s->sizeclass]; + int32 i = ((byte*)obj - p)/size; + obj = p+i*size; + } + + // Now that we know the object header, reload bits. + off = (uintptr*)obj - (uintptr*)arena_start; + bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + xbits = *bitp; + bits = xbits >> shift; + + found: + // Now we have bits, bitp, and shift correct for + // obj pointing at the base of the object. + // If not allocated or already marked, done. + if((bits & bitAllocated) == 0 || (bits & bitMarked) != 0) + continue; + *bitp |= bitMarked<= ew) { + wbuf = getempty(wbuf); + bw = wbuf->w; + w = bw; + ew = bw + nelem(wbuf->w); + } + *w++ = obj; } + + // Done scanning [b, b+n). Prepare for the next iteration of + // the loop by setting b and n to the parameters for the next block. + + // Fetch b from the work buffers. + if(w <= bw) { + // Emptied our buffer: refill. + wbuf = getfull(wbuf); + if(wbuf == nil) + break; + bw = wbuf->w; + ew = wbuf->w + nelem(wbuf->w); + w = bw+wbuf->nw; + } + b = *--w; + + // Figure out n = size of b. Start by loading bits for b. + off = (uintptr*)b - (uintptr*)arena_start; + bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + xbits = *bitp; + bits = xbits >> shift; + + // Might be small; look for nearby block boundary. 
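The shape of scanblock, reduced to its essentials: mark every reachable object, but only queue it for scanning when its no-pointers bit is clear, and drive the whole thing from an explicit work list instead of recursion. A toy, self-contained sketch using plain arrays in place of the bitmap:

#include <stdio.h>

#define NOBJ 6

/* Toy heap: each object has up to two outgoing edges and a no-pointers flag. */
static int edge[NOBJ][2]   = { {1,2}, {3,-1}, {-1,-1}, {4,-1}, {-1,-1}, {-1,-1} };
static int nopointers[NOBJ] = { 0, 0, 1, 0, 1, 0 };
static int marked[NOBJ];

int main(void)
{
    int work[NOBJ], nw = 0, i, j, o;

    marked[0] = 1;                        /* object 0 is the root */
    work[nw++] = 0;
    while(nw > 0) {                       /* loop instead of recursing */
        o = work[--nw];
        for(j = 0; j < 2; j++) {
            i = edge[o][j];
            if(i >= 0 && !marked[i]) {
                marked[i] = 1;            /* like setting bitMarked */
                if(!nopointers[i])        /* like bitNoPointers: nothing inside to scan */
                    work[nw++] = i;
            }
        }
    }
    for(i = 0; i < NOBJ; i++)
        printf("obj %d %s\n", i, marked[i] ? "reachable" : "garbage");
    return 0;
}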
+ // A block boundary is marked by either bitBlockBoundary + // or bitAllocated being set (see notes near their definition). + enum { + boundary = bitBlockBoundary|bitAllocated + }; + // Look for a block boundary both after and before b + // in the same bitmap word. + // + // A block boundary j words after b is indicated by + // bits>>j & boundary + // assuming shift+j < bitShift. (If shift+j >= bitShift then + // we'll be bleeding other bit types like bitMarked into our test.) + // Instead of inserting the conditional shift+j < bitShift into the loop, + // we can let j range from 1 to bitShift as long as we first + // apply a mask to keep only the bits corresponding + // to shift+j < bitShift aka j < bitShift-shift. + bits &= (boundary<<(bitShift-shift)) - boundary; + + // A block boundary j words before b is indicated by + // xbits>>(shift-j) & boundary + // (assuming shift >= j). There is no cleverness here + // avoid the test, because when j gets too large the shift + // turns negative, which is undefined in C. + + for(j=1; j>j)&boundary) != 0 || shift>=j && ((xbits>>(shift-j))&boundary) != 0) { + n = j*PtrSize; + goto scan; + } + } + + // Fall back to asking span about size class. + // (Manually inlined copy of MHeap_Lookup.) + nlookup++; + nsizelookup++; + x = (uintptr)b>>PageShift; + if(sizeof(void*) == 8) + x -= (uintptr)arena_start>>PageShift; + s = runtime·mheap.map[x]; + if(s->sizeclass == 0) + n = s->npages<sizeclass]; + scan:; } } +static struct { + Workbuf *full; + Workbuf *empty; + byte *chunk; + uintptr nchunk; +} work; + +// Get an empty work buffer off the work.empty list, +// allocating new buffers as needed. +static Workbuf* +getempty(Workbuf *b) +{ + if(b != nil) { + b->nw = nelem(b->w); + b->next = work.full; + work.full = b; + } + b = work.empty; + if(b != nil) { + work.empty = b->next; + return b; + } + + if(work.nchunk < sizeof *b) { + work.nchunk = 1<<20; + work.chunk = runtime·SysAlloc(work.nchunk); + } + b = (Workbuf*)work.chunk; + work.chunk += sizeof *b; + work.nchunk -= sizeof *b; + return b; +} + +// Get a full work buffer off the work.full list, or return nil. +static Workbuf* +getfull(Workbuf *b) +{ + if(b != nil) { + b->nw = 0; + b->next = work.empty; + work.empty = b; + } + b = work.full; + if(b != nil) + work.full = b->next; + return b; +} + +// Scanstack calls scanblock on each of gp's stack segments. static void scanstack(G *gp) { @@ -119,46 +343,26 @@ scanstack(G *gp) } } +// Markfin calls scanblock on the blocks that have finalizers: +// the things pointed at cannot be freed until the finalizers have run. static void markfin(void *v) { uintptr size; - uint32 *refp; size = 0; - refp = nil; - if(!runtime·mlookup(v, &v, &size, nil, &refp) || !(*refp & RefHasFinalizer)) + if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v)) runtime·throw("mark - finalizer inconsistency"); - + // do not mark the finalizer block itself. just mark the things it points at. scanblock(v, size); } +// Mark static void mark(void) { G *gp; - uintptr blsize, nobj; - - // Figure out how big an object stack we need. - // Get a new one if we need more than we have - // or we need significantly less than we have. - nobj = mstats.heap_objects; - if(nobj > ebl - bl || nobj < (ebl-bl)/4) { - if(bl != nil) - runtime·SysFree(bl, (byte*)ebl - (byte*)bl); - - // While we're allocated a new object stack, - // add 20% headroom and also round up to - // the nearest page boundary, since mmap - // will anyway. 
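getempty and getfull implement the work list as fixed-size Workbufs shuttled between a full list and an empty list; marking is finished when getfull has nothing left to hand back. A stripped-down sketch of that recycling, using malloc in place of the chunked SysAlloc:

#include <stdlib.h>
#include <stdio.h>

typedef struct Workbuf Workbuf;
struct Workbuf {
    Workbuf *next;
    int nw;                 /* number of valid pointers in w */
    void *w[4];             /* tiny capacity, just for the example */
};

static Workbuf *full, *empty;

/* Retire b (now full) and return an empty buffer, allocating one if needed. */
static Workbuf* getempty(Workbuf *b)
{
    if(b != NULL) {
        b->nw = sizeof b->w / sizeof b->w[0];
        b->next = full;
        full = b;
    }
    b = empty;
    if(b != NULL) {
        empty = b->next;
        return b;
    }
    return calloc(1, sizeof(Workbuf));
}

/* Retire b (now drained) and return a full buffer, or NULL when no work is left. */
static Workbuf* getfull(Workbuf *b)
{
    if(b != NULL) {
        b->nw = 0;
        b->next = empty;
        empty = b;
    }
    b = full;
    if(b != NULL)
        full = b->next;
    return b;
}

int main(void)
{
    Workbuf *b = getempty(NULL);          /* start with a fresh empty buffer */
    /* ... fill b->w with pointers during scanning ... */
    b = getempty(b);                      /* b filled up: push it, get another */
    b = getfull(b);                       /* current one drained: fetch a full one */
    printf("got %s buffer back\n", b ? "a full" : "no");
    b = getfull(b);                       /* nothing left: marking is done */
    printf("work left: %s\n", b ? "yes" : "no");
    return 0;
}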
- nobj = nobj * 12/10; - blsize = nobj * sizeof *bl; - blsize = (blsize + 4095) & ~4095; - nobj = blsize / sizeof *bl; - bl = runtime·SysAlloc(blsize); - ebl = bl + nobj; - } // mark data+bss. // skip runtime·mheap itself, which has no interesting pointers @@ -192,95 +396,83 @@ mark(void) runtime·walkfintab(markfin); } -// free RefNone, free & queue finalizers for RefNone|RefHasFinalizer, reset RefSome -static void -sweepspan(MSpan *s) -{ - int32 n, npages, size; - byte *p; - uint32 ref, *gcrefp, *gcrefep; - MCache *c; - Finalizer *f; - - p = (byte*)(s->start << PageShift); - if(s->sizeclass == 0) { - // Large block. - ref = s->gcref0; - switch(ref & ~(RefFlags^RefHasFinalizer)) { - case RefNone: - // Free large object. - mstats.alloc -= s->npages<npages<npages<gcref0 = RefFree; - runtime·MHeap_Free(&runtime·mheap, s, 1); - break; - case RefNone|RefHasFinalizer: - f = runtime·getfinalizer(p, 1); - if(f == nil) - runtime·throw("finalizer inconsistency"); - f->arg = p; - f->next = finq; - finq = f; - ref &= ~RefHasFinalizer; - // fall through - case RefSome: - case RefSome|RefHasFinalizer: - s->gcref0 = RefNone | (ref&RefFlags); - break; - } - return; - } - - // Chunk full of small blocks. - runtime·MGetSizeClassInfo(s->sizeclass, &size, &npages, &n); - gcrefp = s->gcref; - gcrefep = s->gcref + n; - for(; gcrefp < gcrefep; gcrefp++, p += size) { - ref = *gcrefp; - if(ref < RefNone) // RefFree or RefStack - continue; - switch(ref & ~(RefFlags^RefHasFinalizer)) { - case RefNone: - // Free small object. - if(ref & RefProfiled) - runtime·MProf_Free(p, size); - *gcrefp = RefFree; - c = m->mcache; - if(size > sizeof(uintptr)) - ((uintptr*)p)[1] = 1; // mark as "needs to be zeroed" - mstats.alloc -= size; - mstats.nfree++; - mstats.by_size[s->sizeclass].nfree++; - runtime·MCache_Free(c, p, s->sizeclass, size); - break; - case RefNone|RefHasFinalizer: - f = runtime·getfinalizer(p, 1); - if(f == nil) - runtime·throw("finalizer inconsistency"); - f->arg = p; - f->next = finq; - finq = f; - ref &= ~RefHasFinalizer; - // fall through - case RefSome: - case RefSome|RefHasFinalizer: - *gcrefp = RefNone | (ref&RefFlags); - break; - } - } -} - +// Sweep frees or calls finalizers for blocks not marked in the mark phase. +// It clears the mark bits in preparation for the next GC round. static void sweep(void) { MSpan *s; + int32 cl, n, npages; + uintptr size; + byte *p; + MCache *c; + Finalizer *f; - for(s = runtime·mheap.allspans; s != nil; s = s->allnext) - if(s->state == MSpanInUse) - sweepspan(s); + for(s = runtime·mheap.allspans; s != nil; s = s->allnext) { + if(s->state != MSpanInUse) + continue; + + p = (byte*)(s->start << PageShift); + cl = s->sizeclass; + if(cl == 0) { + size = s->npages< 0; n--, p += size) { + uintptr off, *bitp, shift, bits; + + off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start; + bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + bits = *bitp>>shift; + + if((bits & bitAllocated) == 0) + continue; + + if((bits & bitMarked) != 0) { + *bitp &= ~(bitMarked<arg = p; + f->next = finq; + finq = f; + continue; + } + runtime·MProf_Free(p, size); + } + + // Mark freed; restore block boundary bit. + *bitp = (*bitp & ~(bitMask<sizeclass == 0) { + // Free large span. 
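Sweep now walks each in-use span block by block: unallocated slots are skipped, marked blocks merely have their mark cleared, and unmarked blocks are freed or, if bitSpecial says they carry a finalizer, queued instead of freed. A compact sketch of that per-block decision over a plain flag array rather than the packed bitmap:

#include <stdio.h>

enum { bitAllocated = 1, bitMarked = 2, bitSpecial = 4 };

int main(void)
{
    /* One toy span of 6 blocks, with per-block flags instead of the real bitmap. */
    int bits[6] = {
        bitAllocated|bitMarked,            /* survives: mark cleared */
        bitAllocated,                      /* garbage: freed */
        0,                                 /* never allocated: skipped */
        bitAllocated|bitSpecial,           /* garbage with finalizer: queued */
        bitAllocated|bitMarked|bitSpecial, /* survives */
        bitAllocated,                      /* garbage: freed */
    };
    int i;

    for(i = 0; i < 6; i++) {
        if(!(bits[i] & bitAllocated))
            continue;
        if(bits[i] & bitMarked) {
            bits[i] &= ~bitMarked;         /* reset for the next GC round */
            printf("block %d: live\n", i);
            continue;
        }
        if(bits[i] & bitSpecial) {
            printf("block %d: unreachable, queue finalizer\n", i);
            continue;                      /* freed on a later cycle, after the finalizer runs */
        }
        printf("block %d: unreachable, freed\n", i);
        bits[i] = 0;                       /* like markfreed + restoring the boundary bit */
    }
    return 0;
}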
+ runtime·unmarkspan(p, 1<mcache; + if(size > sizeof(uintptr)) + ((uintptr*)p)[1] = 1; // mark as "needs to be zeroed" + mstats.by_size[s->sizeclass].nfree++; + runtime·MCache_Free(c, p, s->sizeclass, size); + } + mstats.alloc -= size; + mstats.nfree++; + } + } } // Semaphore, not Lock, so that the goroutine @@ -326,7 +518,8 @@ cachestats(void) void runtime·gc(int32 force) { - int64 t0, t1; + int64 t0, t1, t2, t3; + uint64 heap0, heap1, obj0, obj1; byte *p; Finalizer *fp; @@ -349,23 +542,41 @@ runtime·gc(int32 force) gcpercent = -1; else gcpercent = runtime·atoi(p); + + p = runtime·getenv("GOGCTRACE"); + if(p != nil) + gctrace = runtime·atoi(p); } if(gcpercent < 0) return; runtime·semacquire(&gcsema); + if(!force && mstats.heap_alloc < mstats.next_gc) { + runtime·semrelease(&gcsema); + return; + } + t0 = runtime·nanotime(); + nlookup = 0; + nsizelookup = 0; + naddrlookup = 0; + m->gcing = 1; runtime·stoptheworld(); if(runtime·mheap.Lock.key != 0) runtime·throw("runtime·mheap locked during gc"); - if(force || mstats.heap_alloc >= mstats.next_gc) { - cachestats(); - mark(); - sweep(); - stealcache(); - mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; - } + + cachestats(); + heap0 = mstats.heap_alloc; + obj0 = mstats.nmalloc - mstats.nfree; + + mark(); + t1 = runtime·nanotime(); + sweep(); + t2 = runtime·nanotime(); + stealcache(); + + mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; m->gcing = 0; m->locks++; // disable gc during the mallocs in newproc @@ -381,18 +592,34 @@ runtime·gc(int32 force) } m->locks--; - t1 = runtime·nanotime(); + cachestats(); + heap1 = mstats.heap_alloc; + obj1 = mstats.nmalloc - mstats.nfree; + + t3 = runtime·nanotime(); + mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0; + mstats.pause_total_ns += t3 - t0; mstats.numgc++; - mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t1 - t0; - mstats.pause_total_ns += t1 - t0; if(mstats.debuggc) - runtime·printf("pause %D\n", t1-t0); + runtime·printf("pause %D\n", t3-t0); + + if(gctrace) { + runtime·printf("gc%d: %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D pointer lookups (%D size, %D addr)\n", + mstats.numgc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000, + heap0>>20, heap1>>20, obj0, obj1, + mstats.nmalloc, mstats.nfree, + nlookup, nsizelookup, naddrlookup); + } + runtime·semrelease(&gcsema); runtime·starttheworld(); // give the queued finalizers, if any, a chance to run if(fp != nil) runtime·gosched(); + + if(gctrace > 1 && !force) + runtime·gc(1); } static void @@ -430,3 +657,157 @@ runfinq(void) runtime·gc(1); // trigger another gc to clean up the finalized objects, if possible } } + +// mark the block at v of size n as allocated. +// If noptr is true, mark it as having no pointers. 
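The pacing rule is unchanged in spirit but now sits next to the new GOGCTRACE reporting: the next collection triggers once the live heap has grown by gcpercent percent over what survived this one. A tiny sketch of that arithmetic, with assumed numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int gcpercent = 100;                  /* default GOGC: let the heap double between GCs */
    uint64_t heap_alloc = 52ULL<<20;      /* live heap after this collection (assumed) */
    uint64_t next_gc = heap_alloc + heap_alloc*gcpercent/100;

    printf("heap now %llu MB, next collection at %llu MB\n",
        (unsigned long long)(heap_alloc>>20), (unsigned long long)(next_gc>>20));
    return 0;
}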
+void +runtime·markallocated(void *v, uintptr n, bool noptr) +{ + uintptr *b, bits, off, shift; + + if(0) + runtime·printf("markallocated %p+%p\n", v, n); + + if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + runtime·throw("markallocated: bad pointer"); + + off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset + b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + + bits = (*b & ~(bitMask< (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + runtime·throw("markallocated: bad pointer"); + + off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset + b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + + *b = (*b & ~(bitMask< (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + return; // not allocated, so okay + + off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset + b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + + bits = *b>>shift; + if((bits & bitAllocated) != 0) { + runtime·printf("checkfreed %p+%p: off=%p have=%p\n", + v, n, off, bits & bitMask); + runtime·throw("checkfreed: not freed"); + } +} + +// mark the span of memory at v as having n blocks of the given size. +// if leftover is true, there is left over space at the end of the span. +void +runtime·markspan(void *v, uintptr size, uintptr n, bool leftover) +{ + uintptr *b, off, shift; + byte *p; + + if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + runtime·throw("markspan: bad pointer"); + + p = v; + if(leftover) // mark a boundary just past end of last block too + n++; + for(; n-- > 0; p += size) { + off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start; // word offset + b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + *b = (*b & ~(bitMask< (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + runtime·throw("markspan: bad pointer"); + + p = v; + off = p - (uintptr*)runtime·mheap.arena_start; // word offset + if(off % wordsPerBitmapWord != 0) + runtime·throw("markspan: unaligned pointer"); + b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + n /= PtrSize; + if(n%wordsPerBitmapWord != 0) + runtime·throw("unmarkspan: unaligned length"); + n /= wordsPerBitmapWord; + while(n-- > 0) + *b-- = 0; +} + +bool +runtime·blockspecial(void *v) +{ + uintptr *b, off, shift; + + off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; + b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + + return (*b & (bitSpecial<arena_used - h->arena_start) / wordsPerBitmapWord; + n = (n+bitmapChunk-1) & ~(bitmapChunk-1); + if(h->bitmap_mapped >= n) + return; + + runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped); + h->bitmap_mapped = n; +} diff --git a/src/pkg/runtime/mheap.c b/src/pkg/runtime/mheap.c index 0c9ac0a0968..8061b7cf888 100644 --- a/src/pkg/runtime/mheap.c +++ b/src/pkg/runtime/mheap.c @@ -180,7 +180,9 @@ MHeap_Grow(MHeap *h, uintptr npage) // Allocate a multiple of 64kB (16 pages). 
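MHeap_MapBits backs the bitmap lazily: only the prefix that describes [arena_start, arena_used) is mapped, rounded up to a chunk so SysMap is not called for every small arena growth. A standalone sketch of the rounding; the chunk size used here is an assumption, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

enum {
    wordsPerBitmapWord = sizeof(void*)*8/4,
    bitmapChunk = 8192,                 /* assumed mapping granularity, in bytes */
};

int main(void)
{
    uintptr_t arena_used = 3u<<20;      /* bytes of arena handed out so far (assumed) */
    uintptr_t bitmap_mapped = 16384;    /* bitmap bytes already mapped (assumed) */

    /* Bitmap bytes needed to describe the used arena, rounded up to a chunk. */
    uintptr_t n = arena_used / wordsPerBitmapWord;
    n = (n + bitmapChunk - 1) & ~(uintptr_t)(bitmapChunk - 1);

    if(n <= bitmap_mapped)
        printf("bitmap already covers the arena\n");
    else
        printf("map %lu more bitmap bytes (ending at arena_start, growing down)\n",
            (unsigned long)(n - bitmap_mapped));
    return 0;
}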
npage = (npage+15)&~15; ask = npage< h->arena_end - h->arena_used) + return false; + if(ask < HeapAllocChunk && HeapAllocChunk <= h->arena_end - h->arena_used) ask = HeapAllocChunk; v = runtime·MHeap_SysAlloc(h, ask); @@ -194,11 +196,6 @@ MHeap_Grow(MHeap *h, uintptr npage) } mstats.heap_sys += ask; - if((byte*)v < h->arena_start || h->arena_start == nil) - h->arena_start = v; - if((byte*)v+ask > h->arena_end) - h->arena_end = (byte*)v+ask; - // Create a fake "in use" span and free it, so that the // right coalescing happens. s = runtime·FixAlloc_Alloc(&h->spanalloc); @@ -370,10 +367,14 @@ runtime·MSpanList_IsEmpty(MSpan *list) void runtime·MSpanList_Insert(MSpan *list, MSpan *span) { - if(span->next != nil || span->prev != nil) + if(span->next != nil || span->prev != nil) { + runtime·printf("failed MSpanList_Insert %p %p %p\n", span, span->next, span->prev); runtime·throw("MSpanList_Insert"); + } span->next = list->next; span->prev = list; span->next->prev = span; span->prev->next = span; } + + diff --git a/src/pkg/runtime/mprof.goc b/src/pkg/runtime/mprof.goc index f4581e98d55..aae3d183fef 100644 --- a/src/pkg/runtime/mprof.goc +++ b/src/pkg/runtime/mprof.goc @@ -65,7 +65,7 @@ stkbucket(uintptr *stk, int32 nstk) runtime·mcmp((byte*)b->stk, (byte*)stk, nstk*sizeof stk[0]) == 0) return b; - b = runtime·mallocgc(sizeof *b + nstk*sizeof stk[0], RefNoProfiling, 0, 1); + b = runtime·mallocgc(sizeof *b + nstk*sizeof stk[0], FlagNoProfiling, 0, 1); bucketmem += sizeof *b + nstk*sizeof stk[0]; runtime·memmove(b->stk, stk, nstk*sizeof stk[0]); b->hash = h; @@ -132,7 +132,7 @@ setaddrbucket(uintptr addr, Bucket *b) if(ah->addr == (addr>>20)) goto found; - ah = runtime·mallocgc(sizeof *ah, RefNoProfiling, 0, 1); + ah = runtime·mallocgc(sizeof *ah, FlagNoProfiling, 0, 1); addrmem += sizeof *ah; ah->next = addrhash[h]; ah->addr = addr>>20; @@ -140,7 +140,7 @@ setaddrbucket(uintptr addr, Bucket *b) found: if((e = addrfree) == nil) { - e = runtime·mallocgc(64*sizeof *e, RefNoProfiling, 0, 0); + e = runtime·mallocgc(64*sizeof *e, FlagNoProfiling, 0, 0); addrmem += 64*sizeof *e; for(i=0; i+1<64; i++) e[i].next = &e[i+1]; diff --git a/src/pkg/runtime/msize.c b/src/pkg/runtime/msize.c index ec85eb37317..770ef38ceff 100644 --- a/src/pkg/runtime/msize.c +++ b/src/pkg/runtime/msize.c @@ -57,7 +57,7 @@ runtime·SizeToClass(int32 size) void runtime·InitSizes(void) { - int32 align, sizeclass, size, osize, nextsize, n; + int32 align, sizeclass, size, nextsize, n; uint32 i; uintptr allocsize, npages; @@ -81,8 +81,7 @@ runtime·InitSizes(void) // the leftover is less than 1/8 of the total, // so wasted space is at most 12.5%. allocsize = PageSize; - osize = size + RefcountOverhead; - while(allocsize%osize > (allocsize/8)) + while(allocsize%size > allocsize/8) allocsize += PageSize; npages = allocsize >> PageShift; @@ -93,7 +92,7 @@ runtime·InitSizes(void) // different sizes. 
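Without the 4-byte ref word, msize.c sizes spans purely from the object size: keep adding pages until the wasted tail is at most 1/8 of the span, then merge neighboring classes that end up with the same block count per span. A standalone sketch of the waste rule for one candidate size, with assumed constants:

#include <stdint.h>
#include <stdio.h>

enum { PageSize = 4096, PageShift = 12 };

int main(void)
{
    uintptr_t size = 1152;              /* candidate size class (assumed) */
    uintptr_t allocsize = PageSize;

    /* Grow the span until the unused tail is at most 1/8 of the span. */
    while(allocsize%size > allocsize/8)
        allocsize += PageSize;

    printf("size %lu: span %lu pages, %lu objects, %lu bytes wasted\n",
        (unsigned long)size, (unsigned long)(allocsize>>PageShift),
        (unsigned long)(allocsize/size), (unsigned long)(allocsize%size));
    return 0;
}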
if(sizeclass > 1 && npages == runtime·class_to_allocnpages[sizeclass-1] - && allocsize/osize == allocsize/(runtime·class_to_size[sizeclass-1]+RefcountOverhead)) { + && allocsize/size == allocsize/runtime·class_to_size[sizeclass-1]) { runtime·class_to_size[sizeclass-1] = size; continue; } diff --git a/src/pkg/runtime/slice.c b/src/pkg/runtime/slice.c index 05107547922..1fee923e43b 100644 --- a/src/pkg/runtime/slice.c +++ b/src/pkg/runtime/slice.c @@ -41,7 +41,7 @@ makeslice1(SliceType *t, int32 len, int32 cap, Slice *ret) ret->cap = cap; if((t->elem->kind&KindNoPointers)) - ret->array = runtime·mallocgc(size, RefNoPointers, 1, 1); + ret->array = runtime·mallocgc(size, FlagNoPointers, 1, 1); else ret->array = runtime·mal(size); } diff --git a/src/pkg/runtime/string.goc b/src/pkg/runtime/string.goc index 916559eb2d8..b72aa937c33 100644 --- a/src/pkg/runtime/string.goc +++ b/src/pkg/runtime/string.goc @@ -225,7 +225,7 @@ func slicebytetostring(b Slice) (s String) { } func stringtoslicebyte(s String) (b Slice) { - b.array = runtime·mallocgc(s.len, RefNoPointers, 1, 1); + b.array = runtime·mallocgc(s.len, FlagNoPointers, 1, 1); b.len = s.len; b.cap = s.len; runtime·mcpy(b.array, s.str, s.len); @@ -268,7 +268,7 @@ func stringtosliceint(s String) (b Slice) { n++; } - b.array = runtime·mallocgc(n*sizeof(r[0]), RefNoPointers, 1, 1); + b.array = runtime·mallocgc(n*sizeof(r[0]), FlagNoPointers, 1, 1); b.len = n; b.cap = n; p = s.str; diff --git a/src/pkg/runtime/windows/mem.c b/src/pkg/runtime/windows/mem.c index 19d11ce8d3e..c523195a46a 100644 --- a/src/pkg/runtime/windows/mem.c +++ b/src/pkg/runtime/windows/mem.c @@ -48,7 +48,7 @@ runtime·SysFree(void *v, uintptr n) void* runtime·SysReserve(void *v, uintptr n) { - return runtime·stdcall(runtime·VirtualAlloc, 4, v, n, MEM_RESERVE, 0); + return runtime·stdcall(runtime·VirtualAlloc, 4, v, n, MEM_RESERVE, PAGE_EXECUTE_READWRITE); } void