From ff3fa1b32dba4448bed45282b1038a4f8e9c23dc Mon Sep 17 00:00:00 2001
From: Dmitriy Vyukov <dvyukov@google.com>
Date: Tue, 19 Aug 2014 17:38:00 +0400
Subject: [PATCH] runtime: make the GC bitmap a byte array

Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion;
it breaks on big-endian systems.
Make the bitmap a byte array.
Simplifies markallocated, scanblock and span sweep along the way,
as we don't need to recalculate bitmap position for each word.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043
---
 src/pkg/runtime/asm_386.s    |   8 ++
 src/pkg/runtime/asm_amd64.s  |   8 ++
 src/pkg/runtime/atomic_arm.c |  16 +++
 src/pkg/runtime/heapdump.c   |  11 +-
 src/pkg/runtime/malloc.go    |  22 ++--
 src/pkg/runtime/mgc0.c       | 225 +++++++++++++++++++----------------
 src/pkg/runtime/mgc0.h       |   2 +-
 src/pkg/runtime/runtime.h    |   1 +
 8 files changed, 168 insertions(+), 125 deletions(-)

diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s
index 0607bc8021..16e3f3136a 100644
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -619,6 +619,14 @@ TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
 	XADDL	AX, (SP)
 	RET
 
+// void	runtime·atomicor8(byte volatile*, byte);
+TEXT runtime·atomicor8(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), AX
+	MOVB	val+4(FP), BX
+	LOCK
+	ORB	BX, (AX)
+	RET
+
 // void jmpdefer(fn, sp);
 // called from deferreturn.
 // 1. pop the caller
diff --git a/src/pkg/runtime/asm_amd64.s b/src/pkg/runtime/asm_amd64.s
index d94df0bf8d..6446b5d832 100644
--- a/src/pkg/runtime/asm_amd64.s
+++ b/src/pkg/runtime/asm_amd64.s
@@ -702,6 +702,14 @@ TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
 	XCHGQ	AX, 0(BX)
 	RET
 
+// void	runtime·atomicor8(byte volatile*, byte);
+TEXT runtime·atomicor8(SB), NOSPLIT, $0-16
+	MOVQ	ptr+0(FP), AX
+	MOVB	val+8(FP), BX
+	LOCK
+	ORB	BX, (AX)
+	RET
+
 // void jmpdefer(fn, sp);
 // called from deferreturn.
 // 1. pop the caller
diff --git a/src/pkg/runtime/atomic_arm.c b/src/pkg/runtime/atomic_arm.c
index d914475c7f..537bf18331 100644
--- a/src/pkg/runtime/atomic_arm.c
+++ b/src/pkg/runtime/atomic_arm.c
@@ -167,3 +167,19 @@ runtime·atomicstore64(uint64 volatile *addr, uint64 v)
 	*addr = v;
 	runtime·unlock(LOCK(addr));
 }
+
+#pragma textflag NOSPLIT
+void
+runtime·atomicor8(byte volatile *addr, byte v)
+{
+	uint32 *addr32, old, word, shift;
+
+	// Align down to 4 bytes and use 32-bit CAS.
+	addr32 = (uint32*)((uintptr)addr & ~3);
+	word = ((uint32)v) << (((uintptr)addr & 3) * 8);
+	for(;;) {
+		old = *addr32;
+		if(runtime·cas(addr32, old, old|word))
+			break;
+	}
+}
diff --git a/src/pkg/runtime/heapdump.c b/src/pkg/runtime/heapdump.c
index babb32fe5a..63d80b8d0e 100644
--- a/src/pkg/runtime/heapdump.c
+++ b/src/pkg/runtime/heapdump.c
@@ -820,7 +820,8 @@ dumpbvtypes(BitVector *bv, byte *base)
 static BitVector
 makeheapobjbv(byte *p, uintptr size)
 {
-	uintptr off, shift, *bitp, bits, nptr, i;
+	uintptr off, nptr, i;
+	byte shift, *bitp, bits;
 	bool mw;
 
 	// Extend the temp buffer if necessary.
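The portability problem named in the commit message is easy to see in isolation: when the bitmap is stored as uintptr words, code that aliases those words as bytes finds a given 4-bit entry at different byte offsets depending on byte order. The following standalone C sketch is illustrative only, not runtime code:

	/* endian.c: why byte-aliasing a word-based bitmap is not portable. */
	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	int main(void) {
		uintptr_t word = 0;
		unsigned char bytes[sizeof word];

		/* Store the 4-bit entry for "heap word 0" the way the old
		 * word-based bitmap did: at bit shift 0 of a uintptr. */
		word |= (uintptr_t)0xF << 0;

		/* Reinterpret the word as bytes, as half the collector did. */
		memcpy(bytes, &word, sizeof word);

		/* Little-endian: the entry is in bytes[0].
		 * Big-endian: it is in bytes[sizeof word - 1], so byte-wise
		 * readers and word-wise writers disagree about the layout. */
		printf("bytes[0] = %#x, bytes[%u] = %#x\n",
		       (unsigned)bytes[0], (unsigned)(sizeof word - 1),
		       (unsigned)bytes[sizeof word - 1]);
		return 0;
	}

A bitmap that is a byte array from the start has a single layout on every machine, which is what the rest of this patch switches to.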
@@ -838,13 +839,13 @@ makeheapobjbv(byte *p, uintptr size)
 	mw = false;
 	for(i = 0; i < nptr; i++) {
 		off = (uintptr*)(p + i*PtrSize) - (uintptr*)runtime·mheap.arena_start;
-		bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		bits = (*bitp >> (shift + 2)) & 3;
+		bitp = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
+		bits = (*bitp >> (shift + 2)) & BitsMask;
 		if(!mw && bits == BitsDead)
 			break;	// end of heap object
 		mw = !mw && bits == BitsMultiWord;
-		tmpbuf[i*BitsPerPointer/8] &= ~(3<<((i*BitsPerPointer)%8));
+		tmpbuf[i*BitsPerPointer/8] &= ~(BitsMask<<((i*BitsPerPointer)%8));
 		tmpbuf[i*BitsPerPointer/8] |= bits<<((i*BitsPerPointer)%8);
 	}
 	return (BitVector){i*BitsPerPointer, (uint32*)tmpbuf};
diff --git a/src/pkg/runtime/malloc.go b/src/pkg/runtime/malloc.go
index 152b3b6b68..8ee460755f 100644
--- a/src/pkg/runtime/malloc.go
+++ b/src/pkg/runtime/malloc.go
@@ -22,8 +22,8 @@ const (
 	pageSize = 1 << pageShift
 	pageMask = pageSize - 1
 
-	wordsPerBitmapWord = ptrSize * 8 / 4
 	gcBits             = 4
+	wordsPerBitmapByte = 8 / gcBits
 	bitsPerPointer     = 2
 	bitsMask           = 1<<bitsPerPointer - 1
…
 		if debugMalloc && ((*xbits>>shift)&(bitMask|bitPtrMask)) != bitBoundary {
 			println("runtime: bits =", (*xbits>>shift)&(bitMask|bitPtrMask))
 			gothrow("bad bits in markallocated")
@@ -260,8 +260,7 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 			ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
 		}
 		if size == 2*ptrSize {
-			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-			*xbitsb = *ptrmask | bitBoundary
+			*xbits = *ptrmask | bitBoundary
 			goto marked
 		}
 		te = uintptr(typ.size) / ptrSize
@@ -283,19 +282,12 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 				v &^= uint8(bitPtrMask << 4)
 			}
 
-			off := (uintptr(x) + i - arena_start) / ptrSize
-			xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
-			shift := (off % wordsPerBitmapWord) * gcBits
-			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-			*xbitsb = v
+			*xbits = v
+			xbits = (*byte)(add(unsafe.Pointer(xbits), ^uintptr(0)))
 		}
 		if size0%(2*ptrSize) == 0 && size0 < size {
 			// Mark the word after last object's word as bitsDead.
-			off := (uintptr(x) + size0 - arena_start) / ptrSize
-			xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
-			shift := (off % wordsPerBitmapWord) * gcBits
-			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-			*xbitsb = bitsDead << 2
+			*xbits = bitsDead << 2
 		}
 	}
 marked:
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 60a6181fc9..14743c2838 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -212,8 +212,8 @@ static struct {
 static void
 scanblock(byte *b, uintptr n, byte *ptrmask)
 {
-	byte *obj, *p, *arena_start, *arena_used, **wp, *scanbuf[8], bits8;
-	uintptr i, nobj, size, idx, *bitp, bits, xbits, shift, x, off, cached, scanbufpos;
+	byte *obj, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp, bits, xbits, shift, cached;
+	uintptr i, nobj, size, idx, x, off, scanbufpos;
 	intptr ncached;
 	Workbuf *wbuf;
 	String *str;
@@ -237,6 +237,10 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 	for(i = 0; i < nelem(scanbuf); i++)
 		scanbuf[i] = nil;
 
+	ptrbitp = nil;
+	cached = 0;
+	ncached = 0;
+
 	// ptrmask can have 3 possible values:
 	// 1. nil - obtain pointer mask from GC bitmap.
 	// 2. ScanConservatively - don't use any mask, scan conservatively.
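For reference, here is the byte-based addressing rule that the hunks above repeat: the bitmap still grows down from arena_start, and each byte covers wordsPerBitmapByte = 2 heap words, with an even word's entry in the low nibble and an odd word's entry in the high nibble. A minimal standalone C sketch; the helper name bitmap_byte is hypothetical, not a runtime function:

	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	enum { gcBits = 4, wordsPerBitmapByte = 8/gcBits, PtrSize = sizeof(void*) };

	/* Bitmap byte and in-byte shift for the heap word at p
	 * (mirrors "arena_start - off/wordsPerBitmapByte - 1" in the patch). */
	static uint8_t*
	bitmap_byte(uint8_t *arena_start, void *p, unsigned *shift)
	{
		uintptr_t off = (uintptr_t*)p - (uintptr_t*)arena_start; /* word offset */
		*shift = (off % wordsPerBitmapByte) * gcBits;            /* 0 or 4 */
		return arena_start - off/wordsPerBitmapByte - 1;         /* grows down */
	}

	int main(void) {
		uint8_t mem[64];                 /* fake: bitmap below, "heap" above */
		uint8_t *arena_start = mem + 32;
		unsigned shift;
		int word;

		for(word = 0; word < 4; word++) {
			uint8_t *b = bitmap_byte(arena_start, arena_start + word*PtrSize, &shift);
			printf("heap word %d -> bitmap byte %td, shift %u\n",
			       word, b - arena_start, shift);
		}
		return 0;
	}

Compare this with the deleted lines: the old code loaded a whole uintptr and could need a shift of up to 60 bits inside it, which is exactly where the byte-order dependence crept in.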
@@ -295,8 +299,15 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 		}
 		ptrmask = ScanConservatively;
 	}
-	cached = 0;
-	ncached = 0;
+	// Find bits of the beginning of the object.
+	if(ptrmask == nil) {
+		off = (uintptr*)b - (uintptr*)arena_start;
+		ptrbitp = arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
+		cached = *ptrbitp >> shift;
+		cached &= ~bitBoundary;
+		ncached = (8 - shift)/gcBits;
+	}
 	for(i = 0; i < n; i += PtrSize) {
 		obj = nil;
 		// Find bits for this word.
@@ -308,16 +319,13 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 			// Consult GC bitmap.
 			if(ncached <= 0) {
 				// Refill cache.
-				off = (uintptr*)(b+i) - (uintptr*)arena_start;
-				bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-				shift = (off % wordsPerBitmapWord) * gcBits;
-				cached = *bitp >> shift;
-				ncached = (PtrSize*8 - shift)/gcBits;
+				cached = *--ptrbitp;
+				ncached = 2;
 			}
 			bits = cached;
 			cached >>= gcBits;
 			ncached--;
-			if(i != 0 && (bits&bitBoundary) != 0)
+			if((bits&bitBoundary) != 0)
 				break; // reached beginning of the next object
 			bits = (bits>>2)&BitsMask;
 			if(bits == BitsDead)
@@ -336,11 +344,9 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 		// Find the next pair of bits.
 		if(ptrmask == nil) {
 			if(ncached <= 0) {
-				off = (uintptr*)(b+i+PtrSize) - (uintptr*)arena_start;
-				bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-				shift = (off % wordsPerBitmapWord) * gcBits;
-				cached = *bitp >> shift;
-				ncached = (PtrSize*8 - shift)/gcBits;
+				// Refill cache.
+				cached = *--ptrbitp;
+				ncached = 2;
 			}
 			bits = (cached>>2)&BitsMask;
 		} else
@@ -383,8 +389,14 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 
 		if(bits == BitsSlice) {
 			i += 2*PtrSize;
-			cached >>= 2*gcBits;
-			ncached -= 2;
+			if(ncached == 2)
+				ncached = 0;
+			else if(ptrmask == nil) {
+				// Refill cache and consume one quadruple.
+				cached = *--ptrbitp;
+				cached >>= gcBits;
+				ncached = 1;
+			}
 		} else {
 			i += PtrSize;
 			cached >>= gcBits;
@@ -398,20 +410,12 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 			continue;
 		// Mark the object.
 		off = (uintptr*)obj - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
+		bitp = arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
 		xbits = *bitp;
 		bits = (xbits >> shift) & bitMask;
 		if((bits&bitBoundary) == 0) {
-			// Not a beginning of a block, check if we have block boundary in xbits.
-			while(shift > 0) {
-				obj -= PtrSize;
-				shift -= gcBits;
-				bits = (xbits >> shift) & bitMask;
-				if((bits&bitBoundary) != 0)
-					goto havebits;
-			}
-			// Otherwise consult span table to find the block beginning.
+			// Not a beginning of a block, consult span table to find the block beginning.
 			k = (uintptr)obj>>PageShift;
 			x = k;
 			x -= (uintptr)arena_start>>PageShift;
@@ -433,7 +437,6 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 			goto markobj;
 		}
 
-	havebits:
 		// Now we have bits, bitp, and shift correct for
 		// obj pointing at the base of the object.
 		// Only care about not marked objects.
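The pattern above, cached = *--ptrbitp; ncached = 2, is the payoff of the byte layout: one bitmap byte holds exactly two 4-bit entries and the bitmap grows downward, so walking an object's words just pre-decrements a byte pointer instead of redoing the off/bitp/shift arithmetic the deleted lines performed for every word. A standalone C sketch of the same walk over a fabricated bitmap (entry values made up for illustration):

	#include <stdio.h>
	#include <stdint.h>

	enum { gcBits = 4, BitsMask = 3 };

	int main(void) {
		/* Entries for 4 heap words: 0x1, 0x5, 0x9, 0xD, i.e. a boundary
		 * bit plus type bits 0..3. Word i lives at byte -(i/2)-1 from
		 * the "arena start": low nibble for even i, high for odd i. */
		uint8_t bitmap[2] = { 0xD9 /* words 2,3 */, 0x51 /* words 0,1 */ };
		uint8_t *ptrbitp = bitmap + 2;   /* bitmap end = "arena start" */
		uint8_t cached = 0, bits;
		int ncached = 0, i;

		for(i = 0; i < 4; i++) {
			if(ncached <= 0) {
				cached = *--ptrbitp;  /* refill: one byte, two entries */
				ncached = 2;
			}
			bits = cached & ((1<<gcBits)-1);
			cached >>= gcBits;    /* consume one quadruple */
			ncached--;
			printf("word %d: entry %#x, type bits %d\n",
			       i, (unsigned)bits, (bits>>2)&BitsMask);
		}
		return 0;
	}

The BitsSlice case in the hunk above is the one wrinkle: it consumes two entries at once, so when the cache holds only one of them the code must refill and then discard a quadruple, which is what the ncached == 2 / refill-and-shift branch handles.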
@@ -449,22 +452,12 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 		// For 8-byte objects we use non-atomic store, if the other
 		// quadruple is already marked. Otherwise we resort to CAS
 		// loop for marking.
-		bits8 = xbits>>(shift&~7);
-		if((bits8&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits))) {
-			((byte*)bitp)[shift/8] = bits8 | (bitMarked<<(shift&7));
-		} else {
-			for(;;) {
-				if(runtime·casp((void**)bitp, (void*)xbits, (void*)(xbits|(bitMarked<<shift))))
-					break;
-				xbits = *bitp;
-				bits = (xbits>>shift) & bitMask;
-				if((bits&bitMarked) != 0)
-					break;
-			}
-			if((bits&bitMarked) != 0)
-				continue;
-		}
+		if((xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)))
+			*bitp = xbits | (bitMarked<<shift);
+		else
+			runtime·atomicor8(bitp, bitMarked<<shift);
 
 		if(((xbits>>(shift+2))&BitsMask) == BitsDead)
 			continue;	// noscan object
@@ -901,9 +894,9 @@ bool
 runtime·MSpan_Sweep(MSpan *s, bool preserve)
 {
 	int32 cl, n, npages, nfree;
-	uintptr size, off, *bitp, shift, xbits, bits;
+	uintptr size, off, step;
 	uint32 sweepgen;
-	byte *p;
+	byte *p, *bitp, shift, xbits, bits;
 	MCache *c;
 	byte *arena_start;
 	MLink head, *end, *link;
@@ -939,8 +932,8 @@ runtime·MSpan_Sweep(MSpan *s, bool preserve)
 	// Mark any free objects in this span so we don't collect them.
 	for(link = s->freelist; link != nil; link = link->next) {
 		off = (uintptr*)link - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
+		bitp = arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
 		*bitp |= bitMarked<<shift;
 	}
…
 		// A finalizer can be set for an inner byte of an object, find object beginning.
 		p = (byte*)(s->start << PageShift) + special->offset/size*size;
 		off = (uintptr*)p - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
+		bitp = arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
 		bits = (*bitp>>shift) & bitMask;
 		if((bits&bitMarked) == 0) {
 			// Find the exact byte for which the special was setup
@@ -977,10 +970,27 @@ runtime·MSpan_Sweep(MSpan *s, bool preserve)
 	// This thread owns the span now, so it can manipulate
 	// the block bitmap without atomic operations.
 	p = (byte*)(s->start << PageShift);
+	// Find bits for the beginning of the span.
+	off = (uintptr*)p - (uintptr*)arena_start;
+	bitp = arena_start - off/wordsPerBitmapByte - 1;
+	shift = 0;
+	step = size/(PtrSize*wordsPerBitmapByte);
+	// Rewind to the previous quadruple as we move to the next
+	// in the beginning of the loop.
+	bitp += step;
+	if(step == 0) {
+		// 8-byte objects.
+		bitp++;
+		shift = gcBits;
+	}
 	for(; n > 0; n--, p += size) {
-		off = (uintptr*)p - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
+		bitp -= step;
+		if(step == 0) {
+			if(shift != 0)
+				bitp--;
+			shift = gcBits - shift;
+		}
+
 		xbits = *bitp;
 		bits = (xbits>>shift) & bitMask;
@@ -1759,8 +1769,8 @@ runtime·wakefing(void)
 static byte*
 unrollgcprog1(byte *mask, byte *prog, uintptr *ppos, bool inplace, bool sparse)
 {
-	uintptr *b, off, shift, pos, siz, i;
-	byte *arena_start, *prog1, v;
+	uintptr pos, siz, i, off;
+	byte *arena_start, *prog1, v, *bitp, shift;
 
 	arena_start = runtime·mheap.arena_start;
 	pos = *ppos;
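The sweep-loop rewrite above is the same idea applied with a stride: object size is constant within a span, so the distance between consecutive objects' bitmap entries is a constant step = size/(PtrSize*wordsPerBitmapByte) bytes and the loop can walk the bitmap by subtraction. step == 0 is the 8-byte-object case, where two objects share one byte and the code alternates the nibble shift instead. A standalone C sketch of the walk (fake byte indices; the real code uses a pointer into the bitmap):

	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	enum { gcBits = 4, PtrSize = 8, wordsPerBitmapByte = 2 };

	int main(void) {
		uintptr_t size = 8;                                  /* object size */
		uintptr_t step = size/(PtrSize*wordsPerBitmapByte);  /* 0 here */
		ptrdiff_t bitp = 100;      /* stand-in for the initial bitmap byte */
		unsigned shift = 0;
		int n;

		/* Rewind once; the loop moves to the next entry up front. */
		bitp += step;
		if(step == 0) {
			/* 8-byte objects: two per bitmap byte. */
			bitp++;
			shift = gcBits;
		}
		for(n = 4; n > 0; n--) {
			bitp -= step;
			if(step == 0) {
				if(shift != 0)
					bitp--;
				shift = gcBits - shift;
			}
			printf("object -> bitmap byte %td, shift %u\n", bitp, shift);
		}
		return 0;
	}

With size = 8 this prints byte 100/shift 0, byte 100/shift 4, byte 99/shift 0, byte 99/shift 4; with size = 32, step is 2 and the shift stays 0. Either way the per-object division and modulo from the deleted lines are gone.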
@@ -1777,11 +1787,11 @@ unrollgcprog1(byte *mask, byte *prog, uintptr *ppos, bool inplace, bool sparse)
 		if(inplace) {
 			// Store directly into GC bitmap.
 			off = (uintptr*)(mask+pos) - (uintptr*)arena_start;
-			b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-			shift = (off % wordsPerBitmapWord) * gcBits;
-			if((shift%8)==0)
-				((byte*)b)[shift/8] = 0;
-			((byte*)b)[shift/8] |= v<<((shift%8)+2);
+			bitp = arena_start - off/wordsPerBitmapByte - 1;
+			shift = (off % wordsPerBitmapByte) * gcBits;
+			if(shift==0)
+				*bitp = 0;
+			*bitp |= v<<(shift+2);
 			pos += PtrSize;
 		} else if(sparse) {
 			// 4-bits per word
@@ -1847,8 +1857,8 @@ unrollglobgcprog(byte *prog, uintptr size)
 void
 runtime·unrollgcproginplace_m(void)
 {
-	uintptr size, size0, *b, off, shift, pos;
-	byte *arena_start, *prog;
+	uintptr size, size0, pos, off;
+	byte *arena_start, *prog, *bitp, shift;
 	Type *typ;
 	void *v;
@@ -1866,15 +1876,15 @@ runtime·unrollgcproginplace_m(void)
 	// Mark first word as bitAllocated.
 	arena_start = runtime·mheap.arena_start;
 	off = (uintptr*)v - (uintptr*)arena_start;
-	b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-	shift = (off % wordsPerBitmapWord) * gcBits;
-	*b |= bitBoundary<<shift;
+	bitp = arena_start - off/wordsPerBitmapByte - 1;
+	shift = (off % wordsPerBitmapByte) * gcBits;
+	*bitp |= bitBoundary<<shift;
…
 void
 runtime·markspan(void *v, uintptr size, uintptr n, bool leftover)
 {
…
 	if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
 		runtime·throw("markspan: bad pointer");
 
-	p = v;
-	if(leftover)	// mark a boundary just past end of last block too
-		n++;
+	// Find bits of the beginning of the span.
+	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;	// word offset
+	b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
+	if((off%wordsPerBitmapByte) != 0)
+		runtime·throw("markspan: unaligned length");
 
-	b0 = nil;
-	x = 0;
-	for(; n-- > 0; p += size) {
-		// Okay to use non-atomic ops here, because we control
-		// the entire span, and each bitmap word has bits for only
-		// one span, so no other goroutines are changing these
-		// bitmap words.
-		off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;	// word offset
-		b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		if(b0 != b) {
-			if(b0 != nil)
-				*b0 = x;
-			b0 = b;
-			x = 0;
-		}
-		x |= (bitBoundary<<shift);
…
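The v<<(shift+2) stores above encode the layout of one 4-bit bitmap entry: the low two bits are the boundary and mark bits, and the two bits above them (hence the +2) hold the per-word type code that BitsMask extracts. A standalone C sketch of the encoding; the constant values match mgc0.h of this era but should be treated as assumptions:

	#include <stdio.h>
	#include <stdint.h>

	enum {
		bitBoundary = 1,	/* entry starts an object */
		bitMarked   = 2,	/* object has been marked */
		bitMask     = bitBoundary|bitMarked,

		BitsDead      = 0,	/* type codes stored at shift+2 */
		BitsScalar    = 1,
		BitsPointer   = 2,
		BitsMultiWord = 3,
		BitsMask      = 3,
	};

	int main(void) {
		uint8_t entry = 0;
		entry |= bitBoundary;          /* object starts here */
		entry |= BitsPointer << 2;     /* type bits live above the low pair */

		printf("entry=%#x boundary=%d marked=%d type=%d\n",
		       (unsigned)entry, entry&bitBoundary,
		       (entry&bitMarked) != 0, (entry>>2)&BitsMask);
		return 0;
	}

This is why markallocated can now write a whole entry with a single byte store, and why scanblock's noscan test reads ((xbits>>(shift+2))&BitsMask) == BitsDead.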
 void
 runtime·unmarkspan(void *v, uintptr n)
 {
-	uintptr *p, *b, off;
+	uintptr off;
+	byte *b;
 
 	if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
 		runtime·throw("markspan: bad pointer");
 
-	p = v;
-	off = p - (uintptr*)runtime·mheap.arena_start;	// word offset
-	if((off % wordsPerBitmapWord) != 0)
+	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;	// word offset
+	if((off % (PtrSize*wordsPerBitmapByte)) != 0)
 		runtime·throw("markspan: unaligned pointer");
-	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+	b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
 	n /= PtrSize;
-	if(n%wordsPerBitmapWord != 0)
+	if(n%(PtrSize*wordsPerBitmapByte) != 0)
 		runtime·throw("unmarkspan: unaligned length");
 	// Okay to use non-atomic ops here, because we control
 	// the entire span, and each bitmap word has bits for only
 	// one span, so no other goroutines are changing these
 	// bitmap words.
-	n /= wordsPerBitmapWord;
-	runtime·memclr((byte*)(b - n + 1), n*PtrSize);
+	n /= wordsPerBitmapByte;
+	runtime·memclr(b - n + 1, n);
 }
 
 void
@@ -1983,7 +2000,7 @@ runtime·MHeap_MapBits(MHeap *h)
 	};
 	uintptr n;
 
-	n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
+	n = (h->arena_used - h->arena_start) / (PtrSize*wordsPerBitmapByte);
 	n = ROUND(n, bitmapChunk);
 	n = ROUND(n, PhysPageSize);
 	if(h->bitmap_mapped >= n)
@@ -2011,8 +2028,8 @@ void
 runtime·getgcmask(byte *p, Type *t, byte **mask, uintptr *len)
 {
 	Stkframe frame;
-	uintptr i, n, off, bits, shift, *b;
-	byte *base;
+	uintptr i, n, off;
+	byte *base, bits, shift, *b;
 
 	*mask = nil;
 	*len = 0;
@@ -2047,8 +2064,8 @@ runtime·getgcmask(byte *p, Type *t, byte **mask, uintptr *len)
 	*mask = runtime·mallocgc(*len, nil, 0);
 	for(i = 0; i < n; i += PtrSize) {
 		off = (uintptr*)(base+i) - (uintptr*)runtime·mheap.arena_start;
-		b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
+		b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
 		bits = (*b >> (shift+2))&BitsMask;
 		(*mask)[i/PtrSize] = bits;
 	}
diff --git a/src/pkg/runtime/mgc0.h b/src/pkg/runtime/mgc0.h
index 893819c128..7449398b9f 100644
--- a/src/pkg/runtime/mgc0.h
+++ b/src/pkg/runtime/mgc0.h
@@ -8,8 +8,8 @@ enum {
 	ScanStackByFrames = 1,
 
 	// Four bits per word (see #defines below).
-	wordsPerBitmapWord = sizeof(void*)*8/4,
 	gcBits = 4,
+	wordsPerBitmapByte = 8/gcBits,
 
 	// GC type info programs.
 	// The programs allow to store type info required for GC in a compact form.
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index 0dc60b286b..867da3f46a 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -913,6 +913,7 @@ void	runtime·atomicstore64(uint64 volatile*, uint64);
 uint64	runtime·atomicload64(uint64 volatile*);
 void*	runtime·atomicloadp(void* volatile*);
 void	runtime·atomicstorep(void* volatile*, void*);
+void	runtime·atomicor8(byte volatile*, byte);
 void	runtime·setg(G*);
 void	runtime·newextram(void);
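One consequence of the new layout worth noting: the bitmap's total size is unchanged, only the unit of access is. MHeap_MapBits above now divides by PtrSize*wordsPerBitmapByte, which is 16 on 64-bit, so the bitmap is still 1/16th of the arena, just counted in bytes. A quick standalone check, assuming a hypothetical 1 GiB arena:

	#include <stdio.h>
	#include <stdint.h>

	enum { gcBits = 4, wordsPerBitmapByte = 8/gcBits, PtrSize = 8 };

	int main(void) {
		uintptr_t arena = (uintptr_t)1 << 30;	/* assumed 1 GiB arena */
		uintptr_t n = arena / (PtrSize*wordsPerBitmapByte);

		/* gcBits bits of bitmap per PtrSize-byte word: heap/16 on 64-bit. */
		printf("bitmap for 1 GiB arena: %lu bytes (%lu MiB)\n",
		       (unsigned long)n, (unsigned long)(n >> 20));
		return 0;
	}

The old expression divided the arena size by wordsPerBitmapWord and landed on the same byte count; writing it as PtrSize*wordsPerBitmapByte makes the units line up with the rest of the patch.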