From ad0287f4960610c01c12ea99da50d7138509c0b0 Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 9 Aug 2022 15:44:48 +0000 Subject: [PATCH] Revert "runtime: redo heap bitmap" This reverts commit b589208c8cc6e08239868f47e12c1449cd797bac. Reason for revert: Bug somewhere in this code, causing wasm and maybe linux/386 to fail. Change-Id: I5e1e501d839584e0219271bb937e94348f83c11f Reviewed-on: https://go-review.googlesource.com/c/go/+/422395 Reviewed-by: Than McIntosh Run-TryBot: Keith Randall Reviewed-by: Michael Knyszek TryBot-Result: Gopher Robot --- src/cmd/compile/internal/test/inl_test.go | 8 +- src/reflect/all_test.go | 21 +- src/runtime/cgocall.go | 11 +- src/runtime/cgocheck.go | 18 +- src/runtime/heapdump.go | 20 +- src/runtime/malloc.go | 10 +- src/runtime/mbitmap.go | 1512 +++++++++++++++------ src/runtime/mcache.go | 2 +- src/runtime/mcentral.go | 2 +- src/runtime/mgcmark.go | 31 +- src/runtime/mheap.go | 19 +- src/runtime/slice.go | 2 - 12 files changed, 1126 insertions(+), 530 deletions(-) diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go index 9926985c58..49ee88eaec 100644 --- a/src/cmd/compile/internal/test/inl_test.go +++ b/src/cmd/compile/internal/test/inl_test.go @@ -72,7 +72,11 @@ func TestIntendedInlining(t *testing.T) { "cgoInRange", "gclinkptr.ptr", "guintptr.ptr", - "writeHeapBitsForAddr", + "heapBits.bits", + "heapBits.isPointer", + "heapBits.morePointers", + "heapBits.next", + "heapBitsForAddr", "markBits.isMarked", "muintptr.ptr", "puintptr.ptr", @@ -220,8 +224,6 @@ func TestIntendedInlining(t *testing.T) { // On loong64, mips64x and riscv64, Ctz64 is not intrinsified and causes nextFreeFast too expensive // to inline (Issue 22239). want["runtime"] = append(want["runtime"], "nextFreeFast") - // Same behavior for heapBits.nextFast. - want["runtime"] = append(want["runtime"], "heapBits.nextFast") } if runtime.GOARCH != "386" { // As explained above, Ctz64 and Ctz32 are not Go code on 386. diff --git a/src/reflect/all_test.go b/src/reflect/all_test.go index aa620bf0ee..0e493fb5e2 100644 --- a/src/reflect/all_test.go +++ b/src/reflect/all_test.go @@ -6989,21 +6989,8 @@ func TestFuncLayout(t *testing.T) { } } -// trimBitmap removes trailing 0 elements from b and returns the result. -func trimBitmap(b []byte) []byte { - for len(b) > 0 && b[len(b)-1] == 0 { - b = b[:len(b)-1] - } - return b -} - func verifyGCBits(t *testing.T, typ Type, bits []byte) { heapBits := GCBits(New(typ).Interface()) - - // Trim scalars at the end, as bits might end in zero, - // e.g. with rep(2, lit(1, 0)). - bits = trimBitmap(bits) - if !bytes.Equal(heapBits, bits) { _, _, line, _ := runtime.Caller(1) t.Errorf("line %d: heapBits incorrect for %v\nhave %v\nwant %v", line, typ, heapBits, bits) @@ -7020,10 +7007,12 @@ func verifyGCBitsSlice(t *testing.T, typ Type, cap int, bits []byte) { heapBits := GCBits(data.Interface()) // Repeat the bitmap for the slice size, trimming scalars in // the last element. 
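An illustrative sketch, not part of this patch: the expected bitmap here is the per-element bitmap repeated cap times, with trailing scalar (zero) bytes of the final element trimmed, since GCBits stops reporting at the last pointer. The rep helper below is a standalone stand-in for the one in reflect's tests.

	package main

	import "fmt"

	// rep returns n concatenated copies of b, like the rep helper in
	// reflect's tests.
	func rep(n int, b []byte) []byte {
		var out []byte
		for i := 0; i < n; i++ {
			out = append(out, b...)
		}
		return out
	}

	func main() {
		// One element = pointer word, scalar word. For cap=3, repeat,
		// then trim the trailing scalar of the final element.
		bits := rep(3, []byte{1, 0})
		for len(bits) > 0 && bits[len(bits)-1] == 0 {
			bits = bits[:len(bits)-1]
		}
		fmt.Println(bits) // [1 0 1 0 1]
	}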
- bits = trimBitmap(rep(cap, bits)) + bits = rep(cap, bits) + for len(bits) > 0 && bits[len(bits)-1] == 0 { + bits = bits[:len(bits)-1] + } if !bytes.Equal(heapBits, bits) { - _, _, line, _ := runtime.Caller(1) - t.Errorf("line %d: heapBits incorrect for make(%v, 0, %v)\nhave %v\nwant %v", line, typ, cap, heapBits, bits) + t.Errorf("heapBits incorrect for make(%v, 0, %v)\nhave %v\nwant %v", typ, cap, heapBits, bits) } } diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index dd9de9d247..892654ed5b 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -568,16 +568,17 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) { if base == 0 { return } + hbits := heapBitsForAddr(base) n := span.elemsize - hbits := heapBitsForAddr(base, n) - for { - var addr uintptr - if hbits, addr = hbits.next(); addr == 0 { + for i = uintptr(0); i < n; i += goarch.PtrSize { + if !hbits.morePointers() { + // No more possible pointers. break } - if cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(addr))) { + if hbits.isPointer() && cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) { panic(errorString(msg)) } + hbits = hbits.next() } return diff --git a/src/runtime/cgocheck.go b/src/runtime/cgocheck.go index 84e7516758..6b492093ea 100644 --- a/src/runtime/cgocheck.go +++ b/src/runtime/cgocheck.go @@ -153,16 +153,16 @@ func cgoCheckTypedBlock(typ *_type, src unsafe.Pointer, off, size uintptr) { // src must be in the regular heap. - hbits := heapBitsForAddr(uintptr(src), size) - for { - var addr uintptr - if hbits, addr = hbits.next(); addr == 0 { - break - } - v := *(*unsafe.Pointer)(unsafe.Pointer(addr)) - if cgoIsGoPointer(v) { - throw(cgoWriteBarrierFail) + hbits := heapBitsForAddr(uintptr(src)) + for i := uintptr(0); i < off+size; i += goarch.PtrSize { + bits := hbits.bits() + if i >= off && bits&bitPointer != 0 { + v := *(*unsafe.Pointer)(add(src, i)) + if cgoIsGoPointer(v) { + throw(cgoWriteBarrierFail) + } } + hbits = hbits.next() } } diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 4123c5194c..543efeded4 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -737,16 +737,16 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector { for i := uintptr(0); i < nptr/8+1; i++ { tmpbuf[i] = 0 } - - hbits := heapBitsForAddr(p, size) - for { - var addr uintptr - hbits, addr = hbits.next() - if addr == 0 { - break + i := uintptr(0) + hbits := heapBitsForAddr(p) + for ; i < nptr; i++ { + if !hbits.morePointers() { + break // end of object } - i := (addr - p) / goarch.PtrSize - tmpbuf[i/8] |= 1 << (i % 8) + if hbits.isPointer() { + tmpbuf[i/8] |= 1 << (i % 8) + } + hbits = hbits.next() } - return bitvector{int32(nptr), &tmpbuf[0]} + return bitvector{int32(i), &tmpbuf[0]} } diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 0219401c83..b044e29d95 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -247,15 +247,13 @@ const ( // memory. heapArenaBytes = 1 << logHeapArenaBytes - heapArenaWords = heapArenaBytes / goarch.PtrSize - // logHeapArenaBytes is log_2 of heapArenaBytes. For clarity, // prefer using heapArenaBytes where possible (we need the // constant to compute some other constants). logHeapArenaBytes = (6+20)*(_64bit*(1-goos.IsWindows)*(1-goarch.IsWasm)*(1-goos.IsIos*goarch.IsArm64)) + (2+20)*(_64bit*goos.IsWindows) + (2+20)*(1-_64bit) + (2+20)*goarch.IsWasm + (2+20)*goos.IsIos*goarch.IsArm64 - // heapArenaBitmapWords is the size of each heap arena's bitmap in uintptrs. 
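For scale, a sketch of the two bitmap-size constants on a typical 64-bit (non-Windows, non-wasm) layout, where heapArenaBytes is 64 MiB: the restored 2-bit-per-word scheme needs twice the bitmap of the 1-bit scheme being reverted.

	package main

	import "fmt"

	func main() {
		const ptrSize = 8              // 64-bit platform assumed
		const heapArenaBytes = 1 << 26 // 64 MiB arena

		// Restored layout: 2 bitmap bits per heap word.
		bitmapBytes := heapArenaBytes / (ptrSize * 8 / 2)
		// Reverted layout: 1 bit per word, stored as uintptrs.
		bitmapWords := heapArenaBytes / ptrSize / (8 * ptrSize)

		fmt.Println(bitmapBytes)           // 2097152 (2 MiB, ~3.1% of the arena)
		fmt.Println(bitmapWords * ptrSize) // 1048576 (1 MiB, ~1.6% of the arena)
	}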
- heapArenaBitmapWords = heapArenaWords / (8 * goarch.PtrSize) + // heapArenaBitmapBytes is the size of each heap arena's bitmap. + heapArenaBitmapBytes = heapArenaBytes / (goarch.PtrSize * 8 / 2) pagesPerArena = heapArenaBytes / pageSize @@ -355,10 +353,10 @@ func mallocinit() { throw("bad TinySizeClass") } - if heapArenaBitmapWords&(heapArenaBitmapWords-1) != 0 { + if heapArenaBitmapBytes&(heapArenaBitmapBytes-1) != 0 { // heapBits expects modular arithmetic on bitmap // addresses to work. - throw("heapArenaBitmapWords not a power of 2") + throw("heapArenaBitmapBytes not a power of 2") } // Check physPageSize. diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 1c7ae8a68e..fcf59b8b3c 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -14,30 +14,34 @@ // // Heap bitmap // -// The heap bitmap comprises 1 bit for each pointer-sized word in the heap, -// recording whether a pointer is stored in that word or not. This bitmap -// is stored in the heapArena metadata backing each heap arena. -// That is, if ha is the heapArena for the arena starting at "start", -// then ha.bitmap[0] holds the 64 bits for the 64 words "start" -// through start+63*ptrSize, ha.bitmap[1] holds the entries for -// start+64*ptrSize through start+127*ptrSize, and so on. -// Bits correspond to words in little-endian order. ha.bitmap[0]&1 represents -// the word at "start", ha.bitmap[0]>>1&1 represents the word at start+8, etc. -// (For 32-bit platforms, s/64/32/.) +// The heap bitmap comprises 2 bits for each pointer-sized word in the heap, +// stored in the heapArena metadata backing each heap arena. +// That is, if ha is the heapArena for the arena starting a start, +// then ha.bitmap[0] holds the 2-bit entries for the four words start +// through start+3*ptrSize, ha.bitmap[1] holds the entries for +// start+4*ptrSize through start+7*ptrSize, and so on. // -// We also keep a noMorePtrs bitmap which allows us to stop scanning -// the heap bitmap early in certain situations. If ha.noMorePtrs[i]>>j&1 -// is 1, then the object containing the last word described by ha.bitmap[8*i+j] -// has no more pointers beyond those described by ha.bitmap[8*i+j]. -// If ha.noMorePtrs[i]>>j&1 is set, the entries in ha.bitmap[8*i+j+1] and -// beyond must all be zero until the start of the next object. +// In each 2-bit entry, the lower bit is a pointer/scalar bit, just +// like in the stack/data bitmaps described above. The upper bit +// indicates scan/dead: a "1" value ("scan") indicates that there may +// be pointers in later words of the allocation, and a "0" value +// ("dead") indicates there are no more pointers in the allocation. If +// the upper bit is 0, the lower bit must also be 0, and this +// indicates scanning can ignore the rest of the allocation. // -// The bitmap for noscan spans is not maintained (can be junk). Code must -// ensure that an object is scannable before consulting its bitmap by +// The 2-bit entries are split when written into the byte, so that the top half +// of the byte contains 4 high (scan) bits and the bottom half contains 4 low +// (pointer) bits. This form allows a copy from the 1-bit to the 4-bit form to +// keep the pointer bits contiguous, instead of having to space them out. +// +// The code makes use of the fact that the zero value for a heap +// bitmap means scalar/dead. This property must be preserved when +// modifying the encoding. +// +// The bitmap for noscan spans is not maintained. 
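A minimal decoder for the encoding just described, using the bitPointer and bitScan values defined in this file: one bitmap byte covers four heap words, with the four pointer bits in the low nibble and the four scan bits in the high nibble, so shifting the byte right by the word's slot lines both bits up for masking.

	package main

	import "fmt"

	const (
		bitPointer = 1 << 0
		bitScan    = 1 << 4
	)

	func main() {
		// Words 0 and 2 are pointers; words 0-2 are "scan" (more
		// pointers may follow); word 3 is scalar/dead.
		b := uint8(0x05 | 0x70) // pointer bits 0101, scan bits 0111

		for word := uint(0); word < 4; word++ {
			bits := uint32(b) >> word // mirrors heapBits.bits with shift=word
			fmt.Printf("word %d: pointer=%v scan=%v\n",
				word, bits&bitPointer != 0, bits&bitScan != 0)
		}
	}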
Code must ensure +// that an object is scannable before consulting its bitmap by // checking either the noscan bit in the span or by consulting its // type's information. -// -// The bitmap for unallocated objects is also not maintained. package runtime @@ -48,6 +52,18 @@ import ( "unsafe" ) +const ( + bitPointer = 1 << 0 + bitScan = 1 << 4 + + heapBitsShift = 1 // shift offset between successive bitPointer or bitScan entries + wordsPerBitmapByte = 8 / 2 // heap words described by one bitmap byte + + // all scan/pointer bits in a byte + bitScanAll = bitScan | bitScan<>63)*32)) @@ -376,132 +433,121 @@ func reflect_verifyNotInHeapPtr(p uintptr) bool { return spanOf(p) == nil && p != clobberdeadPtr } -const ptrBits = 8 * goarch.PtrSize - -// heapBits provides access to the bitmap bits for a single heap word. -// The methods on heapBits take value receivers so that the compiler -// can more easily inline calls to those methods and registerize the -// struct fields independently. -type heapBits struct { - // heapBits will report on pointers in the range [addr,addr+size). - // The low bit of mask contains the pointerness of the word at addr - // (assuming valid>0). - addr, size uintptr - - // The next few pointer bits representing words starting at addr. - // Those bits already returned by next() are zeroed. - mask uintptr - // Number of bits in mask that are valid. mask is always less than 1<> off - valid := ptrBits - off - - // Process depending on where the object ends. - nptr := size / goarch.PtrSize - if nptr < valid { - // Bits for this object end before the end of this bitmap word. - // Squash bits for the following objects. - mask &= 1<<(nptr&(ptrBits-1)) - 1 - valid = nptr - } else if nptr == valid { - // Bits for this object end at exactly the end of this bitmap word. - // All good. - } else { - // Bits for this object extend into the next bitmap word. See if there - // may be any pointers recorded there. - if uintptr(ha.noMorePtrs[idx/8])>>(idx%8)&1 != 0 { - // No more pointers in this object after this bitmap word. - // Update size so we know not to look there. - size = valid * goarch.PtrSize - } - } - - return heapBits{addr: addr, size: size, mask: mask, valid: valid} -} - -// Returns the (absolute) address of the next known pointer and -// a heapBits iterator representing any remaining pointers. -// If there are no more pointers, returns address 0. +// next returns the heapBits describing the next pointer-sized word in memory. +// That is, if h describes address p, h.next() describes p+ptrSize. // Note that next does not modify h. The caller must record the result. // // nosplit because it is used during write barriers and must not be preempted. // //go:nosplit -func (h heapBits) next() (heapBits, uintptr) { - for { - if h.mask != 0 { - var i int - if goarch.PtrSize == 8 { - i = sys.Ctz64(uint64(h.mask)) - } else { - i = sys.Ctz32(uint32(h.mask)) - } - h.mask ^= uintptr(1) << (i & (ptrBits - 1)) - return h, h.addr + uintptr(i)*goarch.PtrSize - } - - // Skip words that we've already processed. - h.addr += h.valid * goarch.PtrSize - h.size -= h.valid * goarch.PtrSize - if h.size == 0 { - return h, 0 // no more pointers - } - - // Grab more bits and try again. - h = heapBitsForAddr(h.addr, h.size) +func (h heapBits) next() heapBits { + if h.shift < 3*heapBitsShift { + h.shift += heapBitsShift + } else if h.bitp != h.last { + h.bitp, h.shift = add1(h.bitp), 0 + } else { + // Move to the next arena. 
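In outline, a toy version of the stepping that next performs (not runtime code): four 2-bit entries per byte, so the shift walks 0 through 3 before the byte pointer advances, and only the end of the arena's bitmap takes the out-of-line slow path below.

	package main

	import "fmt"

	// Toy heapBits stepping; arena crossing is elided.
	type heapBits struct {
		index int    // byte index into a flat bitmap
		shift uint32 // 2-bit entry within the byte (0..3)
	}

	func (h heapBits) next() heapBits {
		if h.shift < 3 {
			h.shift++
		} else {
			h.index, h.shift = h.index+1, 0
		}
		return h
	}

	func main() {
		h := heapBits{}
		for word := 0; word < 6; word++ {
			fmt.Printf("word %d -> byte %d, slot %d\n", word, h.index, h.shift)
			h = h.next()
		}
	}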
+ return h.nextArena() } + return h } -// nextFast is like next, but can return 0 even when there are more pointers -// to be found. Callers should call next if nextFast returns 0 as its second -// return value. -// if addr, h = h.nextFast(); addr == 0 { -// if addr, h = h.next(); addr == 0 { -// ... no more pointers ... -// } -// } -// ... process pointer at addr ... -// nextFast is designed to be inlineable. +// nextArena advances h to the beginning of the next heap arena. +// +// This is a slow-path helper to next. gc's inliner knows that +// heapBits.next can be inlined even though it calls this. This is +// marked noinline so it doesn't get inlined into next and cause next +// to be too big to inline. // //go:nosplit -func (h heapBits) nextFast() (heapBits, uintptr) { - // TESTQ/JEQ - if h.mask == 0 { - return h, 0 +//go:noinline +func (h heapBits) nextArena() heapBits { + h.arena++ + ai := arenaIdx(h.arena) + l2 := mheap_.arenas[ai.l1()] + if l2 == nil { + // We just passed the end of the object, which + // was also the end of the heap. Poison h. It + // should never be dereferenced at this point. + return heapBits{} } - // BSFQ - var i int - if goarch.PtrSize == 8 { - i = sys.Ctz64(uint64(h.mask)) + ha := l2[ai.l2()] + if ha == nil { + return heapBits{} + } + h.bitp, h.shift = &ha.bitmap[0], 0 + h.last = &ha.bitmap[len(ha.bitmap)-1] + return h +} + +// forward returns the heapBits describing n pointer-sized words ahead of h in memory. +// That is, if h describes address p, h.forward(n) describes p+n*ptrSize. +// h.forward(1) is equivalent to h.next(), just slower. +// Note that forward does not modify h. The caller must record the result. +// bits returns the heap bits for the current word. +// +//go:nosplit +func (h heapBits) forward(n uintptr) heapBits { + n += uintptr(h.shift) / heapBitsShift + nbitp := uintptr(unsafe.Pointer(h.bitp)) + n/4 + h.shift = uint32(n%4) * heapBitsShift + if nbitp <= uintptr(unsafe.Pointer(h.last)) { + h.bitp = (*uint8)(unsafe.Pointer(nbitp)) + return h + } + + // We're in a new heap arena. + past := nbitp - (uintptr(unsafe.Pointer(h.last)) + 1) + h.arena += 1 + uint32(past/heapArenaBitmapBytes) + ai := arenaIdx(h.arena) + if l2 := mheap_.arenas[ai.l1()]; l2 != nil && l2[ai.l2()] != nil { + a := l2[ai.l2()] + h.bitp = &a.bitmap[past%heapArenaBitmapBytes] + h.last = &a.bitmap[len(a.bitmap)-1] } else { - i = sys.Ctz32(uint32(h.mask)) + h.bitp, h.last = nil, nil } - // BTCQ - h.mask ^= uintptr(1) << (i & (ptrBits - 1)) - // LEAQ (XX)(XX*8) - return h, h.addr + uintptr(i)*goarch.PtrSize + return h +} + +// forwardOrBoundary is like forward, but stops at boundaries between +// contiguous sections of the bitmap. It returns the number of words +// advanced over, which will be <= n. +func (h heapBits) forwardOrBoundary(n uintptr) (heapBits, uintptr) { + maxn := 4 * ((uintptr(unsafe.Pointer(h.last)) + 1) - uintptr(unsafe.Pointer(h.bitp))) + if n > maxn { + n = maxn + } + return h.forward(n), n +} + +// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer. +// The result includes in its higher bits the bits for subsequent words +// described by the same bitmap byte. +// +// nosplit because it is used during write barriers and must not be preempted. +// +//go:nosplit +func (h heapBits) bits() uint32 { + // The (shift & 31) eliminates a test and conditional branch + // from the generated code. 
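The "& 31" makes the shift amount provably in range, so the compiler can emit a bare shift instead of the bounds test Go otherwise needs for variable shift counts (a shift of 32 or more must yield 0). A minimal illustration; semantics are unchanged because h.shift is always 0..3:

	package main

	import "fmt"

	// With "s & 31" the compiler knows s < 32 and drops the
	// oversized-shift check.
	func bitsMasked(b uint8, s uint32) uint32 { return uint32(b) >> (s & 31) }

	func main() {
		fmt.Println(bitsMasked(0xa5, 2)) // 41 (0xa5 >> 2 = 0x29)
	}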
+ return uint32(*h.bitp) >> (h.shift & 31) +} + +// morePointers reports whether this word and all remaining words in this object +// are scalars. +// h must not describe the second word of the object. +func (h heapBits) morePointers() bool { + return h.bits()&bitScan != 0 +} + +// isPointer reports whether the heap bits describe a pointer word. +// +// nosplit because it is used during write barriers and must not be preempted. +// +//go:nosplit +func (h heapBits) isPointer() bool { + return h.bits()&bitPointer != 0 } // bulkBarrierPreWrite executes a write barrier @@ -565,29 +611,27 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { } buf := &getg().m.p.ptr().wbBuf - h := heapBitsForAddr(dst, size) + h := heapBitsForAddr(dst) if src == 0 { - for { - var addr uintptr - if h, addr = h.next(); addr == 0 { - break - } - dstx := (*uintptr)(unsafe.Pointer(addr)) - if !buf.putFast(*dstx, 0) { - wbBufFlush(nil, 0) + for i := uintptr(0); i < size; i += goarch.PtrSize { + if h.isPointer() { + dstx := (*uintptr)(unsafe.Pointer(dst + i)) + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) + } } + h = h.next() } } else { - for { - var addr uintptr - if h, addr = h.next(); addr == 0 { - break - } - dstx := (*uintptr)(unsafe.Pointer(addr)) - srcx := (*uintptr)(unsafe.Pointer(src + (addr - dst))) - if !buf.putFast(*dstx, *srcx) { - wbBufFlush(nil, 0) + for i := uintptr(0); i < size; i += goarch.PtrSize { + if h.isPointer() { + dstx := (*uintptr)(unsafe.Pointer(dst + i)) + srcx := (*uintptr)(unsafe.Pointer(src + i)) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } } + h = h.next() } } } @@ -610,16 +654,15 @@ func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) { return } buf := &getg().m.p.ptr().wbBuf - h := heapBitsForAddr(dst, size) - for { - var addr uintptr - if h, addr = h.next(); addr == 0 { - break - } - srcx := (*uintptr)(unsafe.Pointer(addr - dst + src)) - if !buf.putFast(0, *srcx) { - wbBufFlush(nil, 0) + h := heapBitsForAddr(dst) + for i := uintptr(0); i < size; i += goarch.PtrSize { + if h.isPointer() { + srcx := (*uintptr)(unsafe.Pointer(src + i)) + if !buf.putFast(0, *srcx) { + wbBufFlush(nil, 0) + } } + h = h.next() } } @@ -716,21 +759,43 @@ func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) { } } -// initHeapBits initializes the heap bitmap for a span. -// If this is a span of single pointer allocations, it initializes all -// words to pointer. -func (s *mspan) initHeapBits() { +// The methods operating on spans all require that h has been returned +// by heapBitsForSpan and that size, n, total are the span layout description +// returned by the mspan's layout method. +// If total > size*n, it means that there is extra leftover memory in the span, +// usually due to rounding. +// +// TODO(rsc): Perhaps introduce a different heapBitsSpan type. + +// initSpan initializes the heap bitmap for a span. +// If this is a span of pointer-sized objects, it initializes all +// words to pointer/scan. +// Otherwise, it initializes all words to scalar/dead. +func (h heapBits) initSpan(s *mspan) { + // Clear bits corresponding to objects. 
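Worked numbers for the sizing below, assuming 8 KiB pages and 64-bit words: a one-page span covers 1024 heap words, hence 256 bitmap bytes, and a span of pointer-sized objects is filled with bytes that have every pointer and scan bit set.

	package main

	import "fmt"

	func main() {
		const (
			pageSize           = 8192
			ptrSize            = 8
			wordsPerBitmapByte = 4
			bitPointerAll      = 0x0f // bitPointer in all four slots
			bitScanAll         = 0xf0 // bitScan in all four slots
		)
		nw := pageSize / ptrSize               // heap words in a one-page span
		fmt.Println(nw, nw/wordsPerBitmapByte) // 1024 256
		// Pointer-sized objects: every word is pointer+scan.
		fmt.Printf("%#02x\n", bitPointerAll|bitScanAll) // 0xff
	}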
+ nw := (s.npages << _PageShift) / goarch.PtrSize + if nw%wordsPerBitmapByte != 0 { + throw("initSpan: unaligned length") + } + if h.shift != 0 { + throw("initSpan: unaligned base") + } isPtrs := goarch.PtrSize == 8 && s.elemsize == goarch.PtrSize - if !isPtrs { - return // nothing to do + for nw > 0 { + hNext, anw := h.forwardOrBoundary(nw) + nbyte := anw / wordsPerBitmapByte + if isPtrs { + bitp := h.bitp + for i := uintptr(0); i < nbyte; i++ { + *bitp = bitPointerAll | bitScanAll + bitp = add1(bitp) + } + } else { + memclrNoHeapPointers(unsafe.Pointer(h.bitp), nbyte) + } + h = hNext + nw -= anw } - h := writeHeapBitsForAddr(s.base()) - size := s.npages * pageSize - nptrs := size / goarch.PtrSize - for i := uintptr(0); i < nptrs; i += ptrBits { - h = h.write(^uintptr(0), ptrBits) - } - h.flush(s.base(), size) } // countAlloc returns the number of objects allocated in span s by @@ -753,146 +818,6 @@ func (s *mspan) countAlloc() int { return count } -type writeHeapBits struct { - addr uintptr // address that the low bit of mask represents the pointer state of. - mask uintptr // some pointer bits starting at the address addr. - valid uintptr // number of bits in buf that are valid (including low) - low uintptr // number of low-order bits to not overwrite -} - -func writeHeapBitsForAddr(addr uintptr) (h writeHeapBits) { - // We start writing bits maybe in the middle of a heap bitmap word. - // Remember how many bits into the word we started, so we can be sure - // not to overwrite the previous bits. - h.low = addr / goarch.PtrSize % ptrBits - - // round down to heap word that starts the bitmap word. - h.addr = addr - h.low*goarch.PtrSize - - // We don't have any bits yet. - h.mask = 0 - h.valid = h.low - - return -} - -// write appends the pointerness of the next valid pointer slots -// using the low valid bits of bits. 1=pointer, 0=scalar. -func (h writeHeapBits) write(bits, valid uintptr) writeHeapBits { - if h.valid+valid <= ptrBits { - // Fast path - just accumulate the bits. - h.mask |= bits << h.valid - h.valid += valid - return h - } - // Too many bits to fit in this word. Write the current word - // out and move on to the next word. - - data := h.mask | bits<> (ptrBits - h.valid) // leftover for next word - h.valid += valid - ptrBits // have h.valid+valid bits, writing ptrBits of them - - // Flush mask to the memory bitmap. - // TODO: figure out how to cache arena lookup. - ai := arenaIndex(h.addr) - ha := mheap_.arenas[ai.l1()][ai.l2()] - idx := h.addr / (ptrBits * goarch.PtrSize) % heapArenaBitmapWords - m := uintptr(1)< ptrBits { - h = h.write(0, ptrBits) - words -= ptrBits - } - return h.write(0, words) -} - -// Flush the bits that have been written, and add zeros as needed -// to cover the full object [addr, addr+size). -func (h writeHeapBits) flush(addr, size uintptr) { - // zeros counts the number of bits needed to represent the object minus the - // number of bits we've already written. This is the number of 0 bits - // that need to be added. - zeros := (addr+size-h.addr)/goarch.PtrSize - h.valid - - // Add zero bits up to the bitmap word boundary - if zeros > 0 { - z := ptrBits - h.valid - if z > zeros { - z = zeros - } - h.valid += z - zeros -= z - } - - // Find word in bitmap that we're going to write. - ai := arenaIndex(h.addr) - ha := mheap_.arenas[ai.l1()][ai.l2()] - idx := h.addr / (ptrBits * goarch.PtrSize) % heapArenaBitmapWords - - // Write remaining bits. 
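The masked store in this (now deleted) flush path splices freshly accumulated bits over a bitmap word while preserving the h.low bits already written by an earlier caller. A freestanding sketch of that read-modify-write:

	package main

	import "fmt"

	func main() {
		var word uintptr = 0b1011 // existing bitmap word
		low := uintptr(2)         // low-order bits that must survive
		data := uintptr(0b110100) // new bits, already shifted into place
		m := uintptr(1)<<low - 1
		word = word&m | data
		fmt.Printf("%06b\n", word) // 110111: low two bits kept
	}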
- if h.valid != h.low { - m := uintptr(1)< 8 { - h = h.write(uintptr(*p), 8) - p = add1(p) - j -= 8 + // 2-word objects only have 4 bitmap bits and 3-word objects only have 6 bitmap bits. + // Therefore, these objects share a heap bitmap byte with the objects next to them. + // These are called out as a special case primarily so the code below can assume all + // objects are at least 4 words long and that their bitmaps start either at the beginning + // of a bitmap byte, or half-way in (h.shift of 0 and 2 respectively). + + if size == 2*goarch.PtrSize { + if typ.size == goarch.PtrSize { + // We're allocating a block big enough to hold two pointers. + // On 64-bit, that means the actual object must be two pointers, + // or else we'd have used the one-pointer-sized block. + // On 32-bit, however, this is the 8-byte block, the smallest one. + // So it could be that we're allocating one pointer and this was + // just the smallest block available. Distinguish by checking dataSize. + // (In general the number of instances of typ being allocated is + // dataSize/typ.size.) + if goarch.PtrSize == 4 && dataSize == goarch.PtrSize { + // 1 pointer object. On 32-bit machines clear the bit for the + // unused second word. + *h.bitp &^= (bitPointer | bitScan | (bitPointer|bitScan)<> 1 - // Erase the expanded GC program. - memclrNoHeapPointers(unsafe.Pointer(obj), (n+7)/8) + // For h.shift > 1 heap bits cross a byte boundary and need to be written part + // to h.bitp and part to the next h.bitp. + switch h.shift { + case 0: + *h.bitp &^= mask3 << 0 + *h.bitp |= hb << 0 + case 1: + *h.bitp &^= mask3 << 1 + *h.bitp |= hb << 1 + case 2: + *h.bitp &^= mask2 << 2 + *h.bitp |= (hb & mask2) << 2 + // Two words written to the first byte. + // Advance two words to get to the next byte. + h = h.next().next() + *h.bitp &^= mask1 + *h.bitp |= (hb >> 2) & mask1 + case 3: + *h.bitp &^= mask1 << 3 + *h.bitp |= (hb & mask1) << 3 + // One word written to the first byte. + // Advance one word to get to the next byte. + h = h.next() + *h.bitp &^= mask2 + *h.bitp |= (hb >> 1) & mask2 + } return } + // Copy from 1-bit ptrmask into 2-bit bitmap. + // The basic approach is to use a single uintptr as a bit buffer, + // alternating between reloading the buffer and writing bitmap bytes. + // In general, one load can supply two bitmap byte writes. + // This is a lot of lines of code, but it compiles into relatively few + // machine instructions. + + outOfPlace := false + if arenaIndex(x+size-1) != arenaIdx(h.arena) || (doubleCheck && fastrandn(2) == 0) { + // This object spans heap arenas, so the bitmap may be + // discontiguous. Unroll it into the object instead + // and then copy it out. + // + // In doubleCheck mode, we randomly do this anyway to + // stress test the bitmap copying path. + outOfPlace = true + h.bitp = (*uint8)(unsafe.Pointer(x)) + h.last = nil + } + + var ( + // Ptrmask input. + p *byte // last ptrmask byte read + b uintptr // ptrmask bits already loaded + nb uintptr // number of bits in b at next read + endp *byte // final ptrmask byte to read (then repeat) + endnb uintptr // number of valid bits in *endp + pbits uintptr // alternate source of bits + + // Heap bitmap output. + w uintptr // words processed + nw uintptr // number of words to process + hbitp *byte // next heap bitmap byte to write + hb uintptr // bits being prepared for *hbitp + ) + + hbitp = h.bitp + + // Handle GC program. Delayed until this part of the code + // so that we can use the same double-checking mechanism + // as the 1-bit case. 
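The calls here pass addb(typ.gcdata, 4) because, for types with kindGCProg set, gcdata begins with a 4-byte program length rather than a 1-bit ptrmask. A sketch of that framing with a hypothetical program (the length is a native-endian uint32; little-endian shown here):

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	func main() {
		// Hypothetical gcdata: 4-byte length, then the program
		// (literal 1 bit "1", then stop).
		gcdata := []byte{3, 0, 0, 0, 0x01, 0x01, 0x00}
		n := binary.LittleEndian.Uint32(gcdata) // native-endian in reality
		prog := gcdata[4 : 4+n]
		fmt.Println(n, prog) // 3 [1 1 0]
	}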
Nothing above could have encountered + // GC programs: the cases were all too small. + if typ.kind&kindGCProg != 0 { + heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4)) + if doubleCheck { + // Double-check the heap bits written by GC program + // by running the GC program to create a 1-bit pointer mask + // and then jumping to the double-check code below. + // This doesn't catch bugs shared between the 1-bit and 4-bit + // GC program execution, but it does catch mistakes specific + // to just one of those and bugs in heapBitsSetTypeGCProg's + // implementation of arrays. + lock(&debugPtrmask.lock) + if debugPtrmask.data == nil { + debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys)) + } + ptrmask = debugPtrmask.data + runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1) + } + goto Phase4 + } + // Note about sizes: // // typ.size is the number of words in the object, @@ -987,52 +1061,424 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // to scan the buffer's heap bitmap at all. // The 1-bit ptrmasks are sized to contain only bits for // the typ.ptrdata prefix, zero padded out to a full byte - // of bitmap. If there is more room in the allocated object, - // that space is pointerless. The noMorePtrs bitmap will prevent - // scanning large pointerless tails of an object. + // of bitmap. This code sets nw (below) so that heap bitmap + // bits are only written for the typ.ptrdata prefix; if there is + // more room in the allocated object, the next heap bitmap + // entry is a 00, indicating that there are no more pointers + // to scan. So only the ptrmask for the ptrdata bytes is needed. // // Replicated copies are not as nice: if there is an array of // objects with scalar tails, all but the last tail does have to // be initialized, because there is no way to say "skip forward". + // However, because of the possibility of a repeated type with + // size not a multiple of 4 pointers (one heap bitmap byte), + // the code already must handle the last ptrmask byte specially + // by treating it as containing only the bits for endnb pointers, + // where endnb <= 4. We represent large scalar tails that must + // be expanded in the replication by setting endnb larger than 4. + // This will have the effect of reading many bits out of b, + // but once the real bits are shifted out, b will supply as many + // zero bits as we try to read, which is exactly what we need. - for i := uintptr(0); true; i += typ.size { - p := typ.gcdata - var j uintptr - for j = 0; j+8*goarch.PtrSize < typ.ptrdata; j += 8 * goarch.PtrSize { - h = h.write(uintptr(*p), 8) - p = add1(p) + p = ptrmask + if typ.size < dataSize { + // Filling in bits for an array of typ. + // Set up for repetition of ptrmask during main loop. + // Note that ptrmask describes only a prefix of + const maxBits = goarch.PtrSize*8 - 7 + if typ.ptrdata/goarch.PtrSize <= maxBits { + // Entire ptrmask fits in uintptr with room for a byte fragment. + // Load into pbits and never read from ptrmask again. + // This is especially important when the ptrmask has + // fewer than 8 bits in it; otherwise the reload in the middle + // of the Phase 2 loop would itself need to loop to gather + // at least 8 bits. + + // Accumulate ptrmask into b. + // ptrmask is sized to describe only typ.ptrdata, but we record + // it as describing typ.size bytes, since all the high bits are zero. 
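The replication step below can be tried in isolation. A sketch with a hypothetical 3-word element whose ptrmask is 101, on 64-bit; the guard that nb+nb fits in maxBits is omitted:

	package main

	import "fmt"

	func main() {
		const maxBits = 64 - 7 // goarch.PtrSize*8 - 7 on 64-bit
		b := uintptr(0b101)    // element: pointer, scalar, pointer
		nb := uintptr(3)

		// OR the buffer with itself shifted until the word is full,
		// then truncate to a whole number of pattern repetitions.
		pbits, endnb := b, nb
		for endnb <= 64 {
			pbits |= pbits << endnb
			endnb += endnb
		}
		endnb = uintptr(maxBits/byte(nb)) * nb // multiple of the pattern
		pbits &= 1<<endnb - 1
		fmt.Printf("endnb=%d pbits=%b\n", endnb, pbits) // 19 copies of 101
	}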
+ nb = typ.ptrdata / goarch.PtrSize + for i := uintptr(0); i < nb; i += 8 { + b |= uintptr(*p) << i + p = add1(p) + } + nb = typ.size / goarch.PtrSize + + // Replicate ptrmask to fill entire pbits uintptr. + // Doubling and truncating is fewer steps than + // iterating by nb each time. (nb could be 1.) + // Since we loaded typ.ptrdata/goarch.PtrSize bits + // but are pretending to have typ.size/goarch.PtrSize, + // there might be no replication necessary/possible. + pbits = b + endnb = nb + if nb+nb <= maxBits { + for endnb <= goarch.PtrSize*8 { + pbits |= pbits << endnb + endnb += endnb + } + // Truncate to a multiple of original ptrmask. + // Because nb+nb <= maxBits, nb fits in a byte. + // Byte division is cheaper than uintptr division. + endnb = uintptr(maxBits/byte(nb)) * nb + pbits &= 1<>(j%8)&1 != 0 - } + if typ.size == dataSize { + // Single entry: can stop once we reach the non-pointer data. + nw = typ.ptrdata / goarch.PtrSize + } else { + // Repeated instances of typ in an array. + // Have to process first N-1 entries in full, but can stop + // once we reach the non-pointer data in the final entry. + nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / goarch.PtrSize + } + if nw == 0 { + // No pointers! Caller was supposed to check. + println("runtime: invalid type ", typ.string()) + throw("heapBitsSetType: called with non-pointer type") + return + } + + // Phase 1: Special case for leading byte (shift==0) or half-byte (shift==2). + // The leading byte is special because it contains the bits for word 1, + // which does not have the scan bit set. + // The leading half-byte is special because it's a half a byte, + // so we have to be careful with the bits already there. + switch { + default: + throw("heapBitsSetType: unexpected shift") + + case h.shift == 0: + // Ptrmask and heap bitmap are aligned. + // + // This is a fast path for small objects. + // + // The first byte we write out covers the first four + // words of the object. The scan/dead bit on the first + // word must be set to scan since there are pointers + // somewhere in the object. + // In all following words, we set the scan/dead + // appropriately to indicate that the object continues + // to the next 2-bit entry in the bitmap. + // + // We set four bits at a time here, but if the object + // is fewer than four words, phase 3 will clear + // unnecessary bits. + hb = b & bitPointerAll + hb |= bitScanAll + if w += 4; w >= nw { + goto Phase3 + } + *hbitp = uint8(hb) + hbitp = add1(hbitp) + b >>= 4 + nb -= 4 + + case h.shift == 2: + // Ptrmask and heap bitmap are misaligned. + // + // On 32 bit architectures only the 6-word object that corresponds + // to a 24 bytes size class can start with h.shift of 2 here since + // all other non 16 byte aligned size classes have been handled by + // special code paths at the beginning of heapBitsSetType on 32 bit. + // + // Many size classes are only 16 byte aligned. On 64 bit architectures + // this results in a heap bitmap position starting with a h.shift of 2. + // + // The bits for the first two words are in a byte shared + // with another object, so we must be careful with the bits + // already there. + // + // We took care of 1-word, 2-word, and 3-word objects above, + // so this is at least a 6-word object. 
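A freestanding sketch of the shared-byte update performed in this case: entries 0 and 1 of the first byte belong to the neighboring object and must survive, so only entries 2 and 3 are cleared and rewritten.

	package main

	import "fmt"

	const (
		bitPointer = 1 << 0
		bitScan    = 1 << 4
	)

	func main() {
		old := uint8(0xff) // entries 0,1 owned by the neighbor
		// Clear entries 2 and 3 only: pointer bits 2,3 and scan bits 6,7.
		mask := uint8((bitPointer | bitScan | (bitPointer|bitScan)<<1) << 2)
		// New bits: entry 2 pointer+scan, entry 3 scalar/dead.
		hb := uint8((bitPointer | bitScan) << 2)
		fmt.Printf("%08b\n", old&^mask|hb) // 01110111
	}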
+ hb = (b & (bitPointer | bitPointer< 1 { + hb |= bitScan << (3 * heapBitsShift) + } + b >>= 2 + nb -= 2 + *hbitp &^= uint8((bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << (2 * heapBitsShift)) + *hbitp |= uint8(hb) + hbitp = add1(hbitp) + if w += 2; w >= nw { + // We know that there is more data, because we handled 2-word and 3-word objects above. + // This must be at least a 6-word object. If we're out of pointer words, + // mark no scan in next bitmap byte and finish. + hb = 0 + w += 4 + goto Phase3 + } + } + + // Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap. + // The loop computes the bits for that last write but does not execute the write; + // it leaves the bits in hb for processing by phase 3. + // To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to + // use in the first half of the loop right now, and then we only adjust nb explicitly + // if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop. + nb -= 4 + for { + // Emit bitmap byte. + // b has at least nb+4 bits, with one exception: + // if w+4 >= nw, then b has only nw-w bits, + // but we'll stop at the break and then truncate + // appropriately in Phase 3. + hb = b & bitPointerAll + hb |= bitScanAll + if w += 4; w >= nw { + break + } + *hbitp = uint8(hb) + hbitp = add1(hbitp) + b >>= 4 + + // Load more bits. b has nb right now. + if p != endp { + // Fast path: keep reading from ptrmask. + // nb unmodified: we just loaded 8 bits, + // and the next iteration will consume 8 bits, + // leaving us with the same nb the next time we're here. + if nb < 8 { + b |= uintptr(*p) << nb + p = add1(p) + } else { + // Reduce the number of bits in b. + // This is important if we skipped + // over a scalar tail, since nb could + // be larger than the bit width of b. + nb -= 8 } - if want { - var addr uintptr - h, addr = h.next() - if addr != x+i { - throw("heapBitsSetType: pointer entry not correct") - } + } else if p == nil { + // Almost as fast path: track bit count and refill from pbits. + // For short repetitions. + if nb < 8 { + b |= pbits << nb + nb += endnb + } + nb -= 8 // for next iteration + } else { + // Slow path: reached end of ptrmask. + // Process final partial byte and rewind to start. + b |= uintptr(*p) << nb + nb += endnb + if nb < 8 { + b |= uintptr(*ptrmask) << nb + p = add1(ptrmask) + } else { + nb -= 8 + p = ptrmask } } - if _, addr := h.next(); addr != 0 { - throw("heapBitsSetType: extra pointer") + + // Emit bitmap byte. + hb = b & bitPointerAll + hb |= bitScanAll + if w += 4; w >= nw { + break + } + *hbitp = uint8(hb) + hbitp = add1(hbitp) + b >>= 4 + } + +Phase3: + // Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries. + if w > nw { + // Counting the 4 entries in hb not yet written to memory, + // there are more entries than possible pointer slots. + // Discard the excess entries (can't be more than 3). + mask := uintptr(1)<<(4-(w-nw)) - 1 + hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits + } + + // Change nw from counting possibly-pointer words to total words in allocation. + nw = size / goarch.PtrSize + + // Write whole bitmap bytes. + // The first is hb, the rest are zero. + if w <= nw { + *hbitp = uint8(hb) + hbitp = add1(hbitp) + hb = 0 // for possible final half-byte below + for w += 4; w <= nw; w += 4 { + *hbitp = 0 + hbitp = add1(hbitp) + } + } + + // Write final partial bitmap byte if any. 
+ // We know w > nw, or else we'd still be in the loop above. + // It can be bigger only due to the 4 entries in hb that it counts. + // If w == nw+4 then there's nothing left to do: we wrote all nw entries + // and can discard the 4 sitting in hb. + // But if w == nw+2, we need to write first two in hb. + // The byte is shared with the next object, so be careful with + // existing bits. + if w == nw+2 { + *hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<= 4 { + // This loop processes four words at a time, + // so round cnw down accordingly. + hNext, words := h.forwardOrBoundary(cnw / 4 * 4) + + // n is the number of bitmap bytes to copy. + n := words / 4 + memmove(unsafe.Pointer(h.bitp), unsafe.Pointer(src), n) + cnw -= words + h = hNext + src = addb(src, n) + } + if doubleCheck && h.shift != 0 { + print("cnw=", cnw, " h.shift=", h.shift, "\n") + throw("bad shift after block copy") + } + // Handle the last byte if it's shared. + if cnw == 2 { + *h.bitp = *h.bitp&^(bitPointer|bitScan|(bitPointer|bitScan)< x+size { + throw("copy exceeded object size") + } + if !(cnw == 0 || cnw == 2) { + print("x=", x, " size=", size, " cnw=", cnw, "\n") + throw("bad number of remaining words") + } + // Set up hbitp so doubleCheck code below can check it. + hbitp = h.bitp + } + // Zero the object where we wrote the bitmap. + memclrNoHeapPointers(unsafe.Pointer(x), uintptr(unsafe.Pointer(src))-x) + } + + // Double check the whole bitmap. + if doubleCheck { + // x+size may not point to the heap, so back up one + // word and then advance it the way we do above. + end := heapBitsForAddr(x + size - goarch.PtrSize) + if outOfPlace { + // In out-of-place copying, we just advance + // using next. + end = end.next() + } else { + // Don't use next because that may advance to + // the next arena and the in-place logic + // doesn't do that. + end.shift += heapBitsShift + if end.shift == 4*heapBitsShift { + end.bitp, end.shift = add1(end.bitp), 0 + } + } + if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) { + println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size) + print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n") + print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n") + h0 := heapBitsForAddr(x) + print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n") + print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n") + throw("bad heapBitsSetType") + } + + // Double-check that bits to be written were written correctly. + // Does not check that other bits were not written, unfortunately. + h := heapBitsForAddr(x) + nptr := typ.ptrdata / goarch.PtrSize + ndata := typ.size / goarch.PtrSize + count := dataSize / typ.size + totalptr := ((count-1)*typ.size + typ.ptrdata) / goarch.PtrSize + for i := uintptr(0); i < size/goarch.PtrSize; i++ { + j := i % ndata + var have, want uint8 + have = (*h.bitp >> h.shift) & (bitPointer | bitScan) + if i >= totalptr { + if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 { + // heapBitsSetTypeGCProg always fills + // in full nibbles of bitScan. 
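The essence of this verification in runnable form, with hypothetical sizes (a 2-element array of 3-word elements with a 2-word pointer prefix): every word before totalptr gets bitScan, pointer bits follow the per-element ptrmask, and the final element's scalar tail is dead.

	package main

	import "fmt"

	func main() {
		ptrmask := []byte{0b011} // per-element: pointer, pointer, scalar
		const nptr, ndata, count = 2, 3, 2
		totalptr := (count-1)*ndata + nptr // words that may hold pointers

		for i := 0; i < ndata*count; i++ {
			j := i % ndata
			var want uint8
			if i < totalptr {
				want = 0x10 // bitScan: object continues
				if j < nptr && ptrmask[j/8]>>(j%8)&1 != 0 {
					want |= 0x01 // bitPointer
				}
			}
			fmt.Printf("word %d: want %#04x\n", i, want)
		}
	}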
+ want = bitScan + } + } else { + if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 { + want |= bitPointer + } + want |= bitScan + } + if have != want { + println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size) + print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n") + print("kindGCProg=", typ.kind&kindGCProg != 0, " outOfPlace=", outOfPlace, "\n") + print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n") + h0 := heapBitsForAddr(x) + print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n") + print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n") + print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n") + println("at word", i, "offset", i*goarch.PtrSize, "have", hex(have), "want", hex(want)) + if typ.kind&kindGCProg != 0 { + println("GC program:") + dumpGCProg(addb(typ.gcdata, 4)) + } + throw("bad heapBitsSetType") + } + h = h.next() + } + if ptrmask == debugPtrmask.data { + unlock(&debugPtrmask.lock) } } } @@ -1042,6 +1488,92 @@ var debugPtrmask struct { data *byte } +// heapBitsSetTypeGCProg implements heapBitsSetType using a GC program. +// progSize is the size of the memory described by the program. +// elemSize is the size of the element that the GC program describes (a prefix of). +// dataSize is the total size of the intended data, a multiple of elemSize. +// allocSize is the total size of the allocated memory. +// +// GC programs are only used for large allocations. +// heapBitsSetType requires that allocSize is a multiple of 4 words, +// so that the relevant bitmap bytes are not shared with surrounding +// objects. +func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) { + if goarch.PtrSize == 8 && allocSize%(4*goarch.PtrSize) != 0 { + // Alignment will be wrong. + throw("heapBitsSetTypeGCProg: small allocation") + } + var totalBits uintptr + if elemSize == dataSize { + totalBits = runGCProg(prog, nil, h.bitp, 2) + if totalBits*goarch.PtrSize != progSize { + println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize) + throw("heapBitsSetTypeGCProg: unexpected bit count") + } + } else { + count := dataSize / elemSize + + // Piece together program trailer to run after prog that does: + // literal(0) + // repeat(1, elemSize-progSize-1) // zeros to fill element size + // repeat(elemSize, count-1) // repeat that element for count + // This zero-pads the data remaining in the first element and then + // repeats that first element to fill the array. 
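A sketch of the trailer bytes for hypothetical sizes (512-word elements, a program covering the first 32 words, count 10), using the same varint encoding as the construction below:

	package main

	import "fmt"

	// putVarint appends n in the GC program's varint form: 7 bits per
	// byte, high bit set on all but the last byte.
	func putVarint(b []byte, n uintptr) []byte {
		for ; n >= 0x80; n >>= 7 {
			b = append(b, byte(n|0x80))
		}
		return append(b, byte(n))
	}

	func main() {
		var t []byte
		t = append(t, 0x01, 0x00)  // literal(0): one zero bit
		t = append(t, 0x81)        // repeat(1, ...) ...
		t = putVarint(t, 512-32-1) // ... 479 more zeros
		t = append(t, 0x80)        // repeat(n, c) with n, c varints:
		t = putVarint(t, 512)      // n = words per element
		t = putVarint(t, 10-1)     // c = count-1 repetitions
		t = append(t, 0)           // end of program
		fmt.Printf("% x\n", t)     // 01 00 81 df 03 80 80 04 09 00
	}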
+ var trailer [40]byte // 3 varints (max 10 each) + some bytes + i := 0 + if n := elemSize/goarch.PtrSize - progSize/goarch.PtrSize; n > 0 { + // literal(0) + trailer[i] = 0x01 + i++ + trailer[i] = 0 + i++ + if n > 1 { + // repeat(1, n-1) + trailer[i] = 0x81 + i++ + n-- + for ; n >= 0x80; n >>= 7 { + trailer[i] = byte(n | 0x80) + i++ + } + trailer[i] = byte(n) + i++ + } + } + // repeat(elemSize/ptrSize, count-1) + trailer[i] = 0x80 + i++ + n := elemSize / goarch.PtrSize + for ; n >= 0x80; n >>= 7 { + trailer[i] = byte(n | 0x80) + i++ + } + trailer[i] = byte(n) + i++ + n = count - 1 + for ; n >= 0x80; n >>= 7 { + trailer[i] = byte(n | 0x80) + i++ + } + trailer[i] = byte(n) + i++ + trailer[i] = 0 + i++ + + runGCProg(prog, &trailer[0], h.bitp, 2) + + // Even though we filled in the full array just now, + // record that we only filled in up to the ptrdata of the + // last element. This will cause the code below to + // memclr the dead section of the final array element, + // so that scanobject can stop early in the final element. + totalBits = (elemSize*(count-1) + progSize) / goarch.PtrSize + } + endProg := unsafe.Pointer(addb(h.bitp, (totalBits+3)/4)) + endAlloc := unsafe.Pointer(addb(h.bitp, allocSize/goarch.PtrSize/wordsPerBitmapByte)) + memclrNoHeapPointers(endProg, uintptr(endAlloc)-uintptr(endProg)) +} + // progToPointerMask returns the 1-bit pointer mask output by the GC program prog. // size the size of the region described by prog, in bytes. // The resulting bitvector will have no more than size/goarch.PtrSize bits. @@ -1049,7 +1581,7 @@ func progToPointerMask(prog *byte, size uintptr) bitvector { n := (size/goarch.PtrSize + 7) / 8 x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1] x[len(x)-1] = 0xa1 // overflow check sentinel - n = runGCProg(prog, &x[0]) + n = runGCProg(prog, nil, &x[0], 1) if x[len(x)-1] != 0xa1 { throw("progToPointerMask: overflow") } @@ -1070,8 +1602,15 @@ func progToPointerMask(prog *byte, size uintptr) bitvector { // 10000000 n c: repeat the previous n bits c times; n, c are varints // 1nnnnnnn c: repeat the previous n bits c times; c is a varint -// runGCProg returns the number of 1-bit entries written to memory. -func runGCProg(prog, dst *byte) uintptr { +// runGCProg executes the GC program prog, and then trailer if non-nil, +// writing to dst with entries of the given size. +// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst. +// If size == 2, dst is the 2-bit heap bitmap, and writes move backward +// starting at dst (because the heap bitmap does). In this case, the caller guarantees +// that only whole bytes in dst need to be written. +// +// runGCProg returns the number of 1- or 2-bit entries written to memory. +func runGCProg(prog, trailer, dst *byte, size int) uintptr { dstStart := dst // Bits waiting to be written to memory. @@ -1084,9 +1623,20 @@ Run: // Flush accumulated full bytes. // The rest of the loop assumes that nbits <= 7. for ; nbits >= 8; nbits -= 8 { - *dst = uint8(bits) - dst = add1(dst) - bits >>= 8 + if size == 1 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + } else { + v := bits&bitPointerAll | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + v = bits&bitPointerAll | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + } } // Process one instruction. @@ -1096,16 +1646,32 @@ Run: if inst&0x80 == 0 { // Literal bits; n == 0 means end of program. if n == 0 { - // Program is over. + // Program is over; continue in trailer if present. 
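For intuition, a toy interpreter for this bytecode over a []bool pointer mask. It implements only the forward 1-bit semantics; the 2-bit heap-bitmap output mode and the trailer hand-off are elided.

	package main

	import "fmt"

	func run(prog []byte) []bool {
		var out []bool
		pc := 0
		varint := func() int {
			n, shift := 0, 0
			for {
				b := prog[pc]
				pc++
				n |= int(b&0x7f) << shift
				if b&0x80 == 0 {
					return n
				}
				shift += 7
			}
		}
		for {
			inst := prog[pc]
			pc++
			n := int(inst & 0x7f)
			if inst&0x80 == 0 {
				if n == 0 {
					return out // 00000000: end of program
				}
				for i := 0; i < n; i++ { // n literal bits, LSB first
					out = append(out, prog[pc+i/8]>>(i%8)&1 != 0)
				}
				pc += (n + 7) / 8
				continue
			}
			if n == 0 { // 10000000: n is a varint
				n = varint()
			}
			c := varint() // append c more copies of the last n bits
			pat := append([]bool(nil), out[len(out)-n:]...)
			for ; c > 0; c-- {
				out = append(out, pat...)
			}
		}
	}

	func main() {
		// literal(3 bits: 1,1,0), repeat(3, 2), stop -> 110 110 110
		fmt.Println(run([]byte{0x03, 0b011, 0x83, 0x02, 0x00}))
	}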
+ if trailer != nil { + p = trailer + trailer = nil + continue + } break Run } nbyte := n / 8 for i := uintptr(0); i < nbyte; i++ { bits |= uintptr(*p) << nbits p = add1(p) - *dst = uint8(bits) - dst = add1(dst) - bits >>= 8 + if size == 1 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + } else { + v := bits&0xf | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + v = bits&0xf | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + } } if n %= 8; n > 0 { bits |= uintptr(*p) << nbits @@ -1154,12 +1720,22 @@ Run: npattern := nbits // If we need more bits, fetch them from memory. - src = subtract1(src) - for npattern < n { - pattern <<= 8 - pattern |= uintptr(*src) + if size == 1 { src = subtract1(src) - npattern += 8 + for npattern < n { + pattern <<= 8 + pattern |= uintptr(*src) + src = subtract1(src) + npattern += 8 + } + } else { + src = subtract1(src) + for npattern < n { + pattern <<= 4 + pattern |= uintptr(*src) & 0xf + src = subtract1(src) + npattern += 4 + } } // We started with the whole bit output buffer, @@ -1209,11 +1785,20 @@ Run: for ; c >= npattern; c -= npattern { bits |= pattern << nbits nbits += npattern - for nbits >= 8 { - *dst = uint8(bits) - dst = add1(dst) - bits >>= 8 - nbits -= 8 + if size == 1 { + for nbits >= 8 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + nbits -= 8 + } + } else { + for nbits >= 4 { + *dst = uint8(bits&0xf | bitScanAll) + dst = add1(dst) + bits >>= 4 + nbits -= 4 + } } } @@ -1230,37 +1815,74 @@ Run: // Since nbits <= 7, we know the first few bytes of repeated data // are already written to memory. off := n - nbits // n > nbits because n > maxBits and nbits <= 7 - // Leading src fragment. - src = subtractb(src, (off+7)/8) - if frag := off & 7; frag != 0 { - bits |= uintptr(*src) >> (8 - frag) << nbits - src = add1(src) - nbits += frag - c -= frag - } - // Main loop: load one byte, write another. - // The bits are rotating through the bit buffer. - for i := c / 8; i > 0; i-- { - bits |= uintptr(*src) << nbits - src = add1(src) - *dst = uint8(bits) - dst = add1(dst) - bits >>= 8 - } - // Final src fragment. - if c %= 8; c > 0 { - bits |= (uintptr(*src) & (1<> (8 - frag) << nbits + src = add1(src) + nbits += frag + c -= frag + } + // Main loop: load one byte, write another. + // The bits are rotating through the bit buffer. + for i := c / 8; i > 0; i-- { + bits |= uintptr(*src) << nbits + src = add1(src) + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + } + // Final src fragment. + if c %= 8; c > 0 { + bits |= (uintptr(*src) & (1<> (4 - frag) << nbits + src = add1(src) + nbits += frag + c -= frag + } + // Main loop: load one byte, write another. + // The bits are rotating through the bit buffer. + for i := c / 4; i > 0; i-- { + bits |= (uintptr(*src) & 0xf) << nbits + src = add1(src) + *dst = uint8(bits&0xf | bitScanAll) + dst = add1(dst) + bits >>= 4 + } + // Final src fragment. 
+ if c %= 4; c > 0 { + bits |= (uintptr(*src) & (1< 0; nbits -= 8 { - *dst = uint8(bits) - dst = add1(dst) - bits >>= 8 + var totalBits uintptr + if size == 1 { + totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits + nbits += -nbits & 7 + for ; nbits > 0; nbits -= 8 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + } + } else { + totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*4 + nbits + nbits += -nbits & 3 + for ; nbits > 0; nbits -= 4 { + v := bits&0xf | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + } } return totalBits } @@ -1276,7 +1898,7 @@ func materializeGCProg(ptrdata uintptr, prog *byte) *mspan { // Compute the number of pages needed for bitmapBytes. pages := divRoundUp(bitmapBytes, pageSize) s := mheap_.allocManual(pages, spanAllocPtrScalarBits) - runGCProg(addb(prog, 4), (*byte)(unsafe.Pointer(s.startAddr))) + runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1) return s } func dematerializeGCProg(s *mspan) { @@ -1344,7 +1966,13 @@ func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool { // //go:linkname reflect_gcbits reflect.gcbits func reflect_gcbits(x any) []byte { - return getgcmask(x) + ret := getgcmask(x) + typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem + nptr := typ.ptrdata / goarch.PtrSize + for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 { + ret = ret[:len(ret)-1] + } + return ret } // Returns GC type info for the pointer stored in ep for testing. @@ -1383,22 +2011,18 @@ func getgcmask(ep any) (mask []byte) { // heap if base, s, _ := findObject(uintptr(p), 0, 0); base != 0 { - if s.spanclass.noscan() { - return nil - } + hbits := heapBitsForAddr(base) n := s.elemsize - hbits := heapBitsForAddr(base, n) mask = make([]byte, n/goarch.PtrSize) - for { - var addr uintptr - if hbits, addr = hbits.next(); addr == 0 { + for i := uintptr(0); i < n; i += goarch.PtrSize { + if hbits.isPointer() { + mask[i/goarch.PtrSize] = 1 + } + if !hbits.morePointers() { + mask = mask[:i/goarch.PtrSize] break } - mask[(addr-base)/goarch.PtrSize] = 1 - } - // Callers expect this mask to end at the last pointer. - for len(mask) > 0 && mask[len(mask)-1] == 0 { - mask = mask[:len(mask)-1] + hbits = hbits.next() } return } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 40674d8939..1f484fb9b6 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -251,7 +251,7 @@ func (c *mcache) allocLarge(size uintptr, noscan bool) *mspan { // visible to the background sweeper. mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s) s.limit = s.base() + size - s.initHeapBits() + heapBitsForAddr(s.base()).initSpan(s) return s } diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index c7ce573da6..e4bdf35071 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -250,6 +250,6 @@ func (c *mcentral) grow() *mspan { // n := (npages << _PageShift) / size n := s.divideByElemSize(npages << _PageShift) s.limit = s.base() + size*n - s.initHeapBits() + heapBitsForAddr(s.base()).initSpan(s) return s } diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index a4fa448b54..68600be4e7 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -1265,6 +1265,7 @@ func scanobject(b uintptr, gcw *gcWork) { // b is either the beginning of an object, in which case this // is the size of the object to scan, or it points to an // oblet, in which case we compute the size to scan below. 
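The scan loop below in miniature, with hardcoded 2-bit entries: scalar words are skipped, and the first entry with the scan bit clear ends the object scan early.

	package main

	import "fmt"

	const (
		bitPointer = 1 << 0
		bitScan    = 1 << 4
	)

	func main() {
		entries := []uint8{0x11, 0x10, 0x11, 0x00, 0x11} // last never reached
		for i, e := range entries {
			if e&bitScan == 0 {
				fmt.Println("no more pointers at word", i)
				break
			}
			if e&bitPointer == 0 {
				continue // scalar word
			}
			fmt.Println("pointer at word", i)
		}
	}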
+ hbits := heapBitsForAddr(b) s := spanOfUnchecked(b) n := s.elemsize if n == 0 { @@ -1307,24 +1308,20 @@ func scanobject(b uintptr, gcw *gcWork) { } } - hbits := heapBitsForAddr(b, n) - var scanSize uintptr - for { - var addr uintptr - if hbits, addr = hbits.nextFast(); addr == 0 { - if hbits, addr = hbits.next(); addr == 0 { - break - } + var i uintptr + for i = 0; i < n; i, hbits = i+goarch.PtrSize, hbits.next() { + // Load bits once. See CL 22712 and issue 16973 for discussion. + bits := hbits.bits() + if bits&bitScan == 0 { + break // no more pointers in this object + } + if bits&bitPointer == 0 { + continue // not a pointer } - - // Keep track of farthest pointer we found, so we can - // update heapScanWork. TODO: is there a better metric, - // now that we can skip scalar portions pretty efficiently? - scanSize = addr - b + goarch.PtrSize // Work here is duplicated in scanblock and above. // If you make changes here, make changes there too. - obj := *(*uintptr)(unsafe.Pointer(addr)) + obj := *(*uintptr)(unsafe.Pointer(b + i)) // At this point we have extracted the next potential pointer. // Quickly filter out nil and pointers back to the current object. @@ -1338,13 +1335,13 @@ func scanobject(b uintptr, gcw *gcWork) { // heap. In this case, we know the object was // just allocated and hence will be marked by // allocation itself. - if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 { - greyobject(obj, b, addr-b, span, gcw, objIndex) + if obj, span, objIndex := findObject(obj, b, i); obj != 0 { + greyobject(obj, b, i, span, gcw, objIndex) } } } gcw.bytesMarked += uint64(n) - gcw.heapScanWork += int64(scanSize) + gcw.heapScanWork += int64(i) } // scanConservative scans block [b, b+n) conservatively, treating any diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index bd40508e46..b19a2ff408 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -221,22 +221,9 @@ var mheap_ mheap //go:notinheap type heapArena struct { // bitmap stores the pointer/scalar bitmap for the words in - // this arena. See mbitmap.go for a description. - // This array uses 1 bit per word of heap, or 1.6% of the heap size (for 64-bit). - bitmap [heapArenaBitmapWords]uintptr - - // If the ith bit of noMorePtrs is true, then there are no more - // pointers for the object containing the word described by the - // high bit of bitmap[i]. - // In that case, bitmap[i+1], ... must be zero until the start - // of the next object. - // We never operate on these entries using bit-parallel techniques, - // so it is ok if they are small. Also, they can't be bigger than - // uint16 because at that size a single noMorePtrs entry - // represents 8K of memory, the minimum size of a span. Any larger - // and we'd have to worry about concurrent updates. - // This array uses 1 bit per word of bitmap, or .024% of the heap size (for 64-bit). - noMorePtrs [heapArenaBitmapWords / 8]uint8 + // this arena. See mbitmap.go for a description. Use the + // heapBits type to access this. + bitmap [heapArenaBitmapBytes]byte // spans maps from virtual address page ID within this arena to *mspan. // For allocated spans, their pages map to the span itself. 
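A sketch of the page-to-slot arithmetic behind this table, assuming the 64-bit layout of 64 MiB arenas and 8 KiB pages:

	package main

	import "fmt"

	func main() {
		const (
			heapArenaBytes = 1 << 26 // 64 MiB
			pageSize       = 1 << 13 // 8 KiB
			pagesPerArena  = heapArenaBytes / pageSize
		)
		addr := uintptr(0x7f0004567890) // arbitrary example address
		arenaBase := addr &^ (heapArenaBytes - 1)
		pageIdx := (addr - arenaBase) / pageSize // slot in ha.spans
		fmt.Println(pagesPerArena, pageIdx)      // 8192 691
	}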
diff --git a/src/runtime/slice.go b/src/runtime/slice.go index 75f202fca0..11b1be48db 100644 --- a/src/runtime/slice.go +++ b/src/runtime/slice.go @@ -260,14 +260,12 @@ func growslice(et *_type, old slice, cap int) slice { capmem = roundupsize(uintptr(newcap) << shift) overflow = uintptr(newcap) > (maxAlloc >> shift) newcap = int(capmem >> shift) - capmem = uintptr(newcap) << shift default: lenmem = uintptr(old.len) * et.size newlenmem = uintptr(cap) * et.size capmem, overflow = math.MulUintptr(et.size, uintptr(newcap)) capmem = roundupsize(capmem) newcap = int(capmem / et.size) - capmem = uintptr(newcap) * et.size } // The check of overflow in addition to capmem > maxAlloc is needed