From 122384e4893edeccb34c3faa4c1378b3bcb55531 Mon Sep 17 00:00:00 2001
From: Rick Hudson
Date: Tue, 3 Mar 2015 16:55:14 -0500
Subject: [PATCH] runtime: Remove boundary bit logic.

This is an experiment to see if removing the boundary bit logic will
lead to fewer cache misses and improved performance. Instead of using
boundary bits, we use the span information to get the element size and
some bit whacking to compute the boundary, without having to touch the
random heap bits, which cause cache misses.

Furthermore, once the boundary bit is removed, we can either use that
bit for a simpler checkmark routine or reduce the number of bits in the
GC bitmap to 2 bits per pointer-sized word. For example, the 2 bits at
the boundary can be used for marking and pointer/scalar
differentiation. Since we don't need the mark bit except in the
boundary nibble of the object, the other nibbles can use this bit as a
noscan bit to indicate that there are no more pointers in the object.

Currently the changes included in this CL slow down the garbage
benchmark: with the boundary bits, garbage gives 5.78; without them
(this CL) it gives 5.88, a 2% slowdown.

Change-Id: Id68f831ad668176f7dc9f7b57b339e4ebb6dc4c2
Reviewed-on: https://go-review.googlesource.com/6665
Reviewed-by: Austin Clements
---
 src/runtime/mbitmap.go  | 137 +++++++---------------------------------
 src/runtime/mcentral.go |   2 +-
 src/runtime/mgcmark.go  |  23 +++----
 src/runtime/mgcsweep.go |   2 +-
 4 files changed, 34 insertions(+), 130 deletions(-)

diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 6b46ad18cbf..7dd3637665f 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -32,7 +32,7 @@
 // describe p and the high 4 bits describe p+ptrSize.
 //
 // The 4 bits for each word are:
-//	0001 - bitBoundary: this is the start of an object
+//	0001 - not used
 //	0010 - bitMarked: this object has been marked by GC
 //	tt00 - word type bits, as in a type bitmap.
 //
@@ -77,7 +77,6 @@ const (
 	heapBitsWidth   = 4
 	heapBitmapScale = ptrSize * (8 / heapBitsWidth) // number of data bytes per heap bitmap byte
 
-	bitBoundary = 1
 	bitMarked   = 2
 	typeShift   = 2
 )
@@ -151,30 +150,21 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) {
 
 // heapBitsForObject returns the base address for the heap object
 // containing the address p, along with the heapBits for base.
-// If p does not point into a heap object, heapBitsForObject returns base == 0.
-func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits) {
+// If p does not point into a heap object,
+// return base == 0;
+// otherwise return the base of the object.
+func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
 	if p < mheap_.arena_start || p >= mheap_.arena_used {
 		return
 	}
-	// If heap bits for the pointer-sized word containing p have bitBoundary set,
-	// then we know this is the base of the object, and we can stop now.
-	// This handles the case where p is the base and, due to rounding
-	// when looking up the heap bits, also the case where p points beyond
-	// the base but still into the first pointer-sized word of the object.
-	hbits = heapBitsForAddr(p)
-	if hbits.isBoundary() {
-		base = p &^ (ptrSize - 1)
-		return
-	}
-
-	// Otherwise, p points into the middle of an object.
+	// p points into the heap, but possibly to the middle of an object.
 	// Consult the span table to find the block beginning.
 	// TODO(rsc): Factor this out.
 	k := p >> _PageShift
 	x := k
 	x -= mheap_.arena_start >> _PageShift
-	s := h_spans[x]
+	s = h_spans[x]
 	if s == nil || pageID(k) < s.start || p >= s.limit || s.state != mSpanInUse {
 		if s == nil || s.state == _MSpanStack {
 			// If s is nil, the virtual address has never been part of the heap.
@@ -216,19 +206,16 @@ func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
 		base += n * s.elemsize
 	}
-	if base == p {
-		print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n")
-		throw("failed to find block beginning")
-	}
-
 	// Now that we know the actual base, compute heapBits to return to caller.
 	hbits = heapBitsForAddr(base)
-	if !hbits.isBoundary() {
-		throw("missing boundary at computed object start")
-	}
 	return
 }
 
+// prefetch prefetches the heap bitmap byte that holds h's bits.
+func (h heapBits) prefetch() {
+	prefetchnta(uintptr(unsafe.Pointer(h.bitp)))
+}
+
 // next returns the heapBits describing the next pointer-sized word in memory.
 // That is, if h describes address p, h.next() describes p+ptrSize.
 // Note that next does not modify h. The caller must record the result.
@@ -258,14 +245,6 @@ func (h heapBits) setMarkedNonAtomic() {
 	*h.bitp |= bitMarked << h.shift
 }
 
-// isBoundary reports whether the heap bits have the boundary bit set.
-func (h heapBits) isBoundary() bool {
-	return *h.bitp&(bitBoundary<<h.shift) != 0
-}
-
-// Note that there is no setBoundary or setBoundaryNonAtomic.
-// Boundaries are always set in bulk, by initSpan.
-
 // typeBits returns the heap bits' type bits.
 func (h heapBits) typeBits() uint8 {
 	return (*h.bitp >> (h.shift + typeShift)) & typeMask
@@ -299,60 +278,8 @@ func (h heapBits) setCheckmarked() {
 
 // initSpan initializes the heap bitmap for a span.
 func (h heapBits) initSpan(size, n, total uintptr) {
-	if size == ptrSize {
-		// Only possible on 64-bit system, since minimum size is 8.
-		// Set all nibbles to bitBoundary using uint64 writes.
-		nbyte := n * ptrSize / heapBitmapScale
-		nuint64 := nbyte / 8
-		bitp := subtractb(h.bitp, nbyte-1)
-		for i := uintptr(0); i < nuint64; i++ {
-			const boundary64 = bitBoundary |
-				bitBoundary<<4 |
-				bitBoundary<<8 |
-				bitBoundary<<12 |
-				bitBoundary<<16 |
-				bitBoundary<<20 |
-				bitBoundary<<24 |
-				bitBoundary<<28 |
-				bitBoundary<<32 |
-				bitBoundary<<36 |
-				bitBoundary<<40 |
-				bitBoundary<<44 |
-				bitBoundary<<48 |
-				bitBoundary<<52 |
-				bitBoundary<<56 |
-				bitBoundary<<60
-
-			*(*uint64)(unsafe.Pointer(bitp)) = boundary64
-			bitp = addb(bitp, 8)
-		}
-		return
-	}
-
-	if size*n < total {
-		// To detect end of object during GC object scan,
-		// add boundary just past end of last block.
-		// The object scan knows to stop when it reaches
-		// the end of the span, but in this case the object
-		// ends before the end of the span.
-		//
-		// TODO(rsc): If the bitmap bits were going to be typeDead
-		// otherwise, what's the point of this?
-		// Can we delete this logic?
-		n++
-	}
-	step := size / heapBitmapScale
-	bitp := h.bitp
-	for i := uintptr(0); i < n; i++ {
-		*bitp = bitBoundary
-		bitp = subtractb(bitp, step)
-	}
-}
-
-// clearSpan clears the heap bitmap bytes for the span.
-func (h heapBits) clearSpan(size, n, total uintptr) {
 	if total%heapBitmapScale != 0 {
-		throw("clearSpan: unaligned length")
+		throw("initSpan: unaligned length")
 	}
 	nbyte := total / heapBitmapScale
 	memclr(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
@@ -371,9 +298,7 @@ func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
 		bitp := h.bitp
 		for i := uintptr(0); i < n; i += 2 {
 			x := int(*bitp)
-			if x&0x11 != 0x11 {
-				throw("missing bitBoundary")
-			}
+
 			if (x>>typeShift)&typeMask == typeDead {
 				x += (typeScalar - typeDead) << typeShift
 			}
@@ -392,9 +317,6 @@ func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
 	bitp := h.bitp
 	step := size / heapBitmapScale
 	for i := uintptr(0); i < n; i++ {
-		if *bitp&bitBoundary == 0 {
-			throw("missing bitBoundary")
-		}
 		x := *bitp
 		if (x>>typeShift)&typeMask == typeDead {
 			x += (typeScalar - typeDead) << typeShift
@@ -416,10 +338,6 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 		bitp := h.bitp
 		for i := uintptr(0); i < n; i += 2 {
 			x := int(*bitp)
-			if x&(bitBoundary|bitBoundary<<4) != (bitBoundary | bitBoundary<<4) {
-				throw("missing bitBoundary")
-			}
-
 			switch typ := (x >> typeShift) & typeMask; typ {
 			case typeScalar:
 				x += (typeDead - typeScalar) << typeShift
@@ -448,10 +366,6 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 	step := size / heapBitmapScale
 	for i := uintptr(0); i < n; i++ {
 		x := int(*bitp)
-		if x&bitBoundary == 0 {
-			throw("missing bitBoundary")
-		}
-
 		switch typ := (x >> typeShift) & typeMask; {
 		case typ == typeScalarCheckmarked && (x>>(4+typeShift))&typeMask != typeDead:
 			x += (typeScalar - typeScalarCheckmarked) << typeShift
@@ -503,7 +417,7 @@ func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
 			if x&bitMarked != 0 {
 				x &^= bitMarked
 			} else {
-				x = bitBoundary // clear marked bit, set type bits to typeDead
+				x = 0
 				f(base + i*size)
 			}
 			*bitp = uint8(x)
@@ -522,10 +436,6 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	// From here till marked label marking the object as allocated
 	// and storing type info in the GC bitmap.
 	h := heapBitsForAddr(x)
-	if debugMalloc && (*h.bitp>>h.shift)&0x0f != bitBoundary {
-		println("runtime: bits =", (*h.bitp>>h.shift)&0x0f)
-		throw("bad bits in markallocated")
-	}
 
 	var ti, te uintptr
 	var ptrmask *uint8
@@ -572,7 +482,8 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
 	}
 	if size == 2*ptrSize {
-		*h.bitp = *ptrmask | bitBoundary
+		// h.shift is 0 for all sizes > ptrSize.
+		*h.bitp = *ptrmask
 		return
 	}
 	te = uintptr(typ.size) / ptrSize
@@ -581,15 +492,17 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		te /= 2
 	}
 	// Copy pointer bitmask into the bitmap.
+	// TODO(rlh): add comment addressing the following concerns:
+	// If size > 2*ptrSize, is x guaranteed to be at least 2*ptrSize-aligned?
+	// And if the type occupies an odd number of words, why do we only go through
+	// half of ptrmask, and why don't we have to shift everything by 4 on odd iterations?
+
 	for i := uintptr(0); i < dataSize; i += 2 * ptrSize {
 		v := *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
 		ti++
 		if ti == te {
 			ti = 0
 		}
-		if i == 0 {
-			v |= bitBoundary
-		}
 		if i+ptrSize == dataSize {
 			v &^= typeMask << (4 + typeShift)
 		}
@@ -783,12 +696,6 @@ func unrollgcproginplace_m(v unsafe.Pointer, typ *_type, size, size0 uintptr) {
 
 	// Mark first word as bitAllocated.
 	// Mark word after last as typeDead.
-	// TODO(rsc): Explain why we need to set this boundary.
-	// Aren't the boundaries always set for the whole span?
-	// Did unrollgcproc1 overwrite the boundary bit?
-	// Is that okay?
-	h := heapBitsForAddr(uintptr(v))
-	*h.bitp |= bitBoundary << h.shift
 	if size0 < size {
 		h := heapBitsForAddr(uintptr(v) + size0)
 		*h.bitp &^= typeMask << typeShift
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index 965cd10586d..a6dbe45ba17 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -173,7 +173,7 @@ func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start gclinkptr, end gcli
 		s.needzero = 1
 		s.freelist = 0
 		unlock(&c.lock)
-		heapBitsForSpan(s.base()).clearSpan(s.layout())
+		heapBitsForSpan(s.base()).initSpan(s.layout())
 		mHeap_Free(&mheap_, s, 0)
 		return true
 	}
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 494c3c16214..3276ab83444 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -404,7 +404,7 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWorkProducer) {
 	}
 }
 
-// Scan the object b of size n, adding pointers to wbuf.
+// Scan the object b of size n bytes, adding pointers to wbuf.
 // Return possibly new wbuf to use.
 // If ptrmask != nil, it specifies where pointers are in b.
 // If ptrmask == nil, the GC bitmap should be consulted.
@@ -417,13 +417,16 @@ func scanobject(b, n uintptr, ptrmask *uint8, gcw *gcWorkProducer) {
 
 	// Find bits of the beginning of the object.
 	var hbits heapBits
+
 	if ptrmask == nil {
-		b, hbits = heapBitsForObject(b)
+		var s *mspan
+		b, hbits, s = heapBitsForObject(b)
 		if b == 0 {
 			return
 		}
+		n = s.elemsize
 		if n == 0 {
-			n = mheap_.arena_used - b
+			throw("scanobject n == 0")
 		}
 	}
 	for i := uintptr(0); i < n; i += ptrSize {
@@ -433,15 +436,9 @@ func scanobject(b, n uintptr, ptrmask *uint8, gcw *gcWorkProducer) {
 			// dense mask (stack or data)
 			bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * typeBitsWidth)) & typeMask
 		} else {
-			// Check if we have reached end of span.
-			// n is an overestimate of the size of the object.
-			if (b+i)%_PageSize == 0 && h_spans[(b-arena_start)>>_PageShift] != h_spans[(b+i-arena_start)>>_PageShift] {
-				break
-			}
-
 			bits = uintptr(hbits.typeBits())
-			if i > 0 && (hbits.isBoundary() || bits == typeDead) {
-				break // reached beginning of the next object
+			if bits == typeDead {
+				break // no more pointers in this object
 			}
 			hbits = hbits.next()
 		}
@@ -468,7 +465,7 @@ func scanobject(b, n uintptr, ptrmask *uint8, gcw *gcWorkProducer) {
 		}
 
 		// Mark the object.
-		if obj, hbits := heapBitsForObject(obj); obj != 0 {
+		if obj, hbits, _ := heapBitsForObject(obj); obj != 0 {
 			greyobject(obj, b, i, hbits, gcw)
 		}
 	}
@@ -481,7 +478,7 @@ func shade(b uintptr) {
 	if !inheap(b) {
 		throw("shade: passed an address not in the heap")
 	}
-	if obj, hbits := heapBitsForObject(b); obj != 0 {
+	if obj, hbits, _ := heapBitsForObject(b); obj != 0 {
 		// TODO: this would be a great place to put a check to see
 		// if we are harvesting and if we are then we should
 		// figure out why there is a call to shade when the
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index ab18d5ff881..8a1ced9f285 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -218,7 +218,7 @@ func mSpan_Sweep(s *mspan, preserve bool) bool {
 		if preserve {
 			throw("can't preserve large span")
 		}
-		heapBitsForSpan(p).clearSpan(s.layout())
+		heapBitsForSpan(p).initSpan(s.layout())
 		s.needzero = 1
 
 		// important to set sweepgen before returning it to heap
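
Editor's note (not part of the patch): the "bit whacking" the commit message
refers to is plain span arithmetic. Once heapBitsForObject has found the span
containing p, the object base follows from the span's element size, so no heap
bitmap byte needs to be read at all. Below is a minimal, self-contained sketch
of that rounding step; the helper name baseOfObject and the constants in main
are made up for illustration, and the real code does this inline in
heapBitsForObject, with additional handling for large spans, stack spans, and
pointers outside the heap.

	package main

	import "fmt"

	// baseOfObject rounds p down to the start of the elemsize-sized slot
	// that contains it, using only span metadata (spanBase, elemsize).
	// This mirrors the "base += n * s.elemsize" computation in the patch.
	func baseOfObject(p, spanBase, elemsize uintptr) uintptr {
		n := (p - spanBase) / elemsize // index of the object containing p
		return spanBase + n*elemsize   // base address of that object
	}

	func main() {
		const spanBase, elemsize = 0x1000, 48
		p := uintptr(spanBase + 2*elemsize + 7) // interior pointer into object #2
		fmt.Printf("base(%#x) = %#x\n", p, baseOfObject(p, spanBase, elemsize))
		// prints: base(0x1067) = 0x1060
	}

Because the span descriptor is consulted during the lookup anyway, this trades
the cache miss on a random heap bitmap byte for arithmetic on data already at
hand, which is exactly the trade-off this CL sets out to measure.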