[dev.garbage] runtime: add bit and cache ctz64 (count trailing zero)
Add to each span a 64 bit cache (allocCache) of the allocBits at freeindex. allocCache is shifted such that its lowest bit corresponds to the bit at freeindex. allocBits uses a 0 to indicate that an object is free; allocCache instead uses a 1, which facilitates ctz64 (count trailing zeros). ctz64 counts the number of 0s below the least significant 1, which is also the index of that 1.

Each span maintains a freeindex indicating the boundary between allocated and unallocated objects. As freeindex is incremented, allocCache is shifted so that its low bit corresponds to the bit at freeindex in the allocBits array.

Currently ctz64 is written in Go using a for loop, so it is not very efficient; use of the hardware instruction will follow. With that in mind, the garbage benchmark compares as follows: 1.6 release, 2.8 seconds; dev.garbage branch, 3.1 seconds. Profiling shows the Go implementation of ctz64 takes up 1% of the total time.

Change-Id: If084ed9c3b1eda9f3c6ab2e794625cb870b8167f
Reviewed-on: https://go-review.googlesource.com/20200
Reviewed-by: Austin Clements <austin@google.com>
This commit is contained in:
parent 44fe90d0b3
commit 4093481523
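The mechanism described in the commit message can be seen in miniature in the following standalone sketch: a 64 bit cache holds the complement of the alloc bits, a count-trailing-zeros routine finds the first set bit (the first free object), and the cache is shifted so its low bit tracks freeindex. This is illustrative only, not the runtime code; the span type, its fields, and the nextFree method are hypothetical stand-ins for the mspan machinery in the diff below.

package main

import "fmt"

// span caches the complement of its alloc bits in a 64 bit word, shifted
// so that bit 0 corresponds to the object at freeindex.
type span struct {
	freeindex  uint   // boundary between allocated and unallocated objects
	allocCache uint64 // ^allocBits, shifted; a 1 means the object is free
	nelems     uint   // number of objects in the span
}

// ctz counts the zeros below the least significant 1 (64 if x is 0), which
// is also the index of that 1, matching the contract of the commit's ctz64.
func ctz(x uint64) uint {
	if x == 0 {
		return 64
	}
	n := uint(0)
	for x&1 == 0 {
		x >>= 1
		n++
	}
	return n
}

// nextFree returns the index of the next free object and advances the
// cache, shifting it so bit 0 again lines up with freeindex.
func (s *span) nextFree() (uint, bool) {
	bit := ctz(s.allocCache)
	if bit == 64 || s.freeindex+bit >= s.nelems {
		return 0, false // cache exhausted; the runtime refills from allocBits here
	}
	idx := s.freeindex + bit
	s.allocCache >>= bit + 1
	s.freeindex = idx + 1
	return idx, true
}

func main() {
	// Alloc bits 0b00101011: objects 0, 1, 3 and 5 are allocated.
	s := &span{allocCache: ^uint64(0b00101011), nelems: 16}
	for i := 0; i < 4; i++ {
		idx, ok := s.nextFree()
		fmt.Println(idx, ok) // prints 2, 4, 6, 7: the free slots in order
	}
}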
@@ -505,29 +505,30 @@ const (
 func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) {
 	s := c.alloc[sizeclass]
 	shouldhelpgc = false
-	freeIndex := s.nextFreeIndex(s.freeindex)
+	freeIndex := s.nextFreeIndex()
 	if freeIndex == s.nelems {
 		// The span is full.
-		if uintptr(s.allocCount) != s.nelems {
-			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
+		if uintptr(s.allocCount) > s.nelems {
+			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
+			throw("s.allocCount > s.nelems && freeIndex == s.nelems")
 		}
 		systemstack(func() {
 			c.refill(int32(sizeclass))
 		})
 		shouldhelpgc = true
 		s = c.alloc[sizeclass]
-		freeIndex = s.nextFreeIndex(s.freeindex)
+		freeIndex = s.nextFreeIndex()
 	}

 	if freeIndex >= s.nelems {
 		throw("freeIndex is not valid")
 	}

 	v = gclinkptr(freeIndex*s.elemsize + s.base())
 	// Advance the freeIndex.
 	s.freeindex = freeIndex + 1
 	s.allocCount++
 	if uintptr(s.allocCount) > s.nelems {
 		println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
 		throw("s.allocCount > s.nelems")
 	}
 	return
@@ -187,57 +187,84 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
 }

 // A temporary stand in for the count trailing zero ctz instruction.
-func ctz(markBits byte) uint8 {
-	tz := uint8(0) // trailing zero count.
+// IA bsf works on 64 bit non-zero word.
+func ctz64(markBits uint64) uint64 {
 	if markBits == 0 {
-		return 8 // 8
+		return 64 // bits in 64 bit word, ensures loop terminates
 	}
-	for mask := byte(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 {
+	// tz holds trailing zero count.
+	tz := uint64(0)
+	for mask := uint64(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 {
 	}
 	return tz
 }

-// nextFreeIndex returns the index of the next free object in s at or
-// after the index'th object.
+// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
+// and negates them so that ctz (count trailing zeros) instructions
+// can be used. It then places these 8 bytes into the cached 64 bit
+// s.allocCache.
+func (s *mspan) refillAllocCache(whichByte uintptr) {
+	bytes := s.allocBits[whichByte : whichByte+8]
+	aCache := uint64(0)
+	aCache |= uint64(bytes[0])
+	aCache |= uint64(bytes[1]) << (1 * 8)
+	aCache |= uint64(bytes[2]) << (2 * 8)
+	aCache |= uint64(bytes[3]) << (3 * 8)
+	aCache |= uint64(bytes[4]) << (4 * 8)
+	aCache |= uint64(bytes[5]) << (5 * 8)
+	aCache |= uint64(bytes[6]) << (6 * 8)
+	aCache |= uint64(bytes[7]) << (7 * 8)
+	s.allocCache = ^aCache
+}
+
+// nextFreeIndex returns the index of the next free object in s at
+// or after s.freeindex.
 // There are hardware instructions that can be used to make this
 // faster if profiling warrants it.
-func (s *mspan) nextFreeIndex(index uintptr) uintptr {
-	if index == s.nelems {
-		return index
+func (s *mspan) nextFreeIndex() uintptr {
+	if s.freeindex == s.nelems {
+		return s.freeindex
 	}
-	if index > s.nelems {
-		throw("index > s.nelems")
+	if s.freeindex > s.nelems {
+		throw("s.freeindex > s.nelems")
 	}
-	whichByte := index / 8
-	theByte := s.allocBits[whichByte]
-
-	theBitMask := uint8(1<<(index%8) - 1)
-	// theBitMask holds a 1 for every bit < index which has already been allocated.
-	// Flip the masked marked bits so 1 means a free bit.
-	theByte = ^(theByte | theBitMask)
-	tz := ctz(theByte)
-	if tz != 8 {
-		result := uintptr(tz) + whichByte*8
-		if result >= s.nelems {
-			return s.nelems
-		}
-		return result
-	}
-	whichByte++
-	index = (whichByte) * 8
-	for ; index < s.nelems; index += 8 {
-		theByte = ^s.allocBits[whichByte]
-		tz = ctz(theByte)
-		if tz != 8 {
-			result := uintptr(tz) + whichByte*8
-			if result >= s.nelems {
-				return s.nelems
-			}
-			return result
-		}
-		whichByte++
-	}
-	return s.nelems
+	aCache := s.allocCache
+	bitIndex := ctz64(aCache)
+	for bitIndex == 64 {
+		// Move index to start of next cached bits.
+		s.freeindex = (s.freeindex + 64) &^ (64 - 1)
+		if s.freeindex >= s.nelems {
+			s.freeindex = s.nelems
+			return s.freeindex
+		}
+		whichByte := s.freeindex / 8
+		// Refill s.allocCache with the next 64 alloc bits.
+		// Unlike in allocBits, a 1 in s.allocCache means
+		// the object is not marked.
+		s.refillAllocCache(whichByte)
+		aCache = s.allocCache
+		bitIndex = ctz64(aCache)
+		// Nothing was available; try again now that allocCache has been refilled.
+	}
+	result := s.freeindex + uintptr(bitIndex)
+	if result >= s.nelems {
+		s.freeindex = s.nelems
+		return s.freeindex
+	}
+	s.allocCache >>= bitIndex + 1
+	s.freeindex = result + 1
+
+	if s.freeindex%64 == 0 && s.freeindex != s.nelems {
+		// We just incremented s.freeindex so it isn't 0.
+		// As each 1 in s.allocCache was encountered and used for allocation
+		// it was shifted away. At this point s.allocCache contains all 0s.
+		// Refill s.allocCache so that it corresponds
+		// to the bits at s.allocBits starting at s.freeindex.
+		whichByte := s.freeindex / 8
+		s.refillAllocCache(whichByte)
+	}
+	return result
 }

 func (s *mspan) isFree(index uintptr) bool {
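The eight shift-and-OR statements in refillAllocCache above assemble the bytes in little-endian order before complementing them. A standalone check of that packing against the standard library (the sample bytes are arbitrary):

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	bits := []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}
	// Manual assembly, mirroring the commit's refillAllocCache.
	aCache := uint64(0)
	for i, b := range bits {
		aCache |= uint64(b) << (uint(i) * 8)
	}
	// The same packing expressed via the little-endian byte order.
	fmt.Println(aCache == binary.LittleEndian.Uint64(bits)) // true
	fmt.Printf("%#x\n", ^aCache) // a span would cache the complement
}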
@@ -667,6 +694,7 @@ func (h heapBits) initSpan(s *mspan) {
 	s.allocBits = &s.markbits1
 	s.gcmarkBits = &s.markbits2
 	s.freeindex = 0
+	s.allocCache = ^uint64(0) // all 1s indicating all free.
 	s.nelems = n
 	s.clearAllocBits()
 	s.clearGCMarkBits()
@@ -746,7 +774,6 @@ func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) {
 	n := s.nelems
 	cl := s.sizeclass
 	doCall := debug.allocfreetrace != 0 || msanenabled || cl == 0
-
 	h := heapBitsForSpan(base)
 	switch {
 	default:
@@ -763,69 +790,58 @@ func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) {

 func heapBitsSweep8BitPtrs(h heapBits, s *mspan, base, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) {
 	mbits := s.markBitsForBase()
-	for i := uintptr(0); i < n; i += 4 {
-		// Consider mark bits in all four 2-bit entries of each bitmap byte.
+	if cl == 0 {
+		throw("8BitPtrs are not in cl 0")
+	}
+	// Consider mark bits in all four 2-bit entries of each bitmap byte.
+	for i := uintptr(0); i < n; i++ {
 		// Note that unlike the other size cases, we leave the pointer bits set here.
 		// These are initialized during initSpan when the span is created and left
 		// in place the whole time the span is used for pointer-sized objects.
 		// That lets heapBitsSetType avoid an atomic update to set the pointer bit
 		// during allocation.
-		if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) {
-			if doCall {
+		if !mbits.isMarked() {
+			nfree++
+			if mbits.index < s.freeindex {
 				f(base + i*sys.PtrSize)
-			}
-			if cl != 0 {
-				nfree++
-			}
-		}
-		mbits.advance()
-		if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) {
-			if doCall {
-				f(base + (i+1)*sys.PtrSize)
-			}
-			if cl != 0 {
-				nfree++
-			}
-		}
-		mbits.advance()
-		if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) {
-			if doCall {
-				f(base + (i+2)*sys.PtrSize)
-			}
-			if cl != 0 {
-				nfree++
-			}
-		}
-		mbits.advance()
-		if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) {
-			if doCall {
-				f(base + (i+3)*sys.PtrSize)
-			}
-			if cl != 0 {
-				nfree++
+			} else if s.allocBits[mbits.index/8]&mbits.mask == 1 {
+				// It was marked in the previous cycle but not this cycle;
+				// if it wasn't marked in the previous cycle the call would be redundant.
+				f(base + i*sys.PtrSize)
 			}
 		}
 		mbits.advance()
 	}
-	return
+	return nfree
 }

-func (m *markBits) nextFreed(maxIndex uintptr, s *mspan) bool {
+// nextFreed returns the next object that is being freed during this GC cycle.
+// If the mark bit is set then the object is free. If it is < s.freeindex
+// then the object was freed by this GC cycle.
+// If it is >= freeindex then if the allocBit is set then it was
+// freed during this GC cycle. If the allocBit is 0 it was freed
+// during a previous cycle so is not considered freed.
+func (m *markBits) nextFreed(nelems uintptr, s *mspan, totalFree *int) bool {
 	mByte := *m.bytep
 	for {
 		for mByte == 0xff {
-			if m.index >= maxIndex {
+			if m.index >= nelems {
 				return false
 			}
 			m.index = (m.index + 8) &^ (8 - 1)
 			m.mask = 1
 			m.bytep = add1(m.bytep)
 			mByte = *m.bytep
+			// Nothing free found; totalFree remains the same.
 		}
-		if m.index >= maxIndex {
+		if m.index >= nelems {
 			return false
 		}
-		for m.index < maxIndex {
+		for m.index < nelems {
 			if m.mask&mByte == 0 {
+				// At this point we have a free object so update totalFree.
+				*totalFree++
 				if m.index < s.freeindex {
 					return true
 				}
@@ -848,18 +864,16 @@ func (m *markBits) nextFreed(maxIndex uintptr, s *mspan) bool {
 	return false
 }

-func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) {
+func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) int {
+	totalFree := 0
 	twobits := s.markBitsForBase()
-	for twobits.nextFreed(n, s) {
+	for twobits.nextFreed(n, s, &totalFree) {
 		if doCall {
 			f(base + twobits.index*size)
 		}
-		if cl != 0 {
-			nfree++
-		}
 		twobits.advance()
 	}
-	return
+	return totalFree
 }

 // heapBitsSetType records that the new allocation [x, x+size)
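The freed-object test that nextFreed documents above can be restated as a small decision table. This is a summary of the doc comment in the hunk, not new behavior:

// When is an object at index i counted as freed by this GC cycle?
//   mark bit set                               -> live, not freed
//   unmarked, i <  s.freeindex                 -> freed this cycle (it had been allocated)
//   unmarked, i >= s.freeindex, allocBit == 1  -> freed this cycle
//   unmarked, i >= s.freeindex, allocBit == 0  -> free since an earlier cycle, not counted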
@@ -67,7 +67,7 @@ retry:
 		c.empty.insertBack(s)
 		unlock(&c.lock)
 		s.sweep(true)
-		freeIndex := s.nextFreeIndex(0)
+		freeIndex := s.nextFreeIndex()
 		if freeIndex != s.nelems {
 			s.freeindex = freeIndex
 			goto havespan
@@ -101,7 +101,7 @@ retry:
 havespan:
 	cap := int32((s.npages << _PageShift) / s.elemsize)
 	n := cap - int32(s.allocCount)
-	if n == 0 {
+	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
 		throw("span has no free objects")
 	}
 	usedBytes := uintptr(s.allocCount) * s.elemsize
@@ -118,6 +118,15 @@ havespan:
 		gcController.revise()
 	}
 	s.incache = true
+	freeByteBase := s.freeindex &^ (64 - 1)
+	whichByte := freeByteBase / 8
+	// Init alloc bits cache.
+	s.refillAllocCache(whichByte)
+
+	// Adjust the allocCache so that s.freeindex corresponds to the low bit in
+	// s.allocCache.
+	s.allocCache >>= s.freeindex % 64
+
 	return s
 }
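The alignment arithmetic in the hunk above is easier to follow with concrete numbers. A hypothetical walk-through (the starting value 70 is invented for illustration):

// Suppose s.freeindex = 70 when the span enters the cache:
//   freeByteBase = 70 &^ (64 - 1) = 64  // start of the 64 bit window holding bit 70
//   whichByte    = 64 / 8         = 8   // byte offset into allocBits for that window
//   s.allocCache >>= 70 % 64            // shift by 6 so cache bit 0 is object 70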
@@ -143,19 +152,19 @@ func (c *mcentral) uncacheSpan(s *mspan) {
 	unlock(&c.lock)
 }

-// Free n objects from a span s back into the central free list c.
-// Called during sweep.
-// Returns true if the span was returned to heap. Sets sweepgen to
-// the latest generation.
-// If preserve=true, don't return the span to heap nor relink in MCentral lists;
-// caller takes care of it.
-func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, preserve bool, wasempty bool) bool {
+// freeSpan updates c and s after sweeping s.
+// It sets s's sweepgen to the latest generation,
+// and, based on the number of free objects in s,
+// moves s to the appropriate list of c or returns it
+// to the heap.
+// freeSpan returns true if s was returned to the heap.
+// If preserve=true, it does not move s (the caller
+// must take care of it).
+func (c *mcentral) freeSpan(s *mspan, start gclinkptr, end gclinkptr, preserve bool, wasempty bool) bool {
 	if s.incache {
 		throw("freeSpan given cached span")
 	}
-
-	s.allocCount -= uint16(n)

 	if preserve {
 		// preserve is set only when called from MCentral_CacheSpan above,
 		// the span must be in the empty list.
@@ -257,7 +257,7 @@ func (s *mspan) sweep(preserve bool) bool {
 	// the block bitmap without atomic operations.

 	nfree = heapBitsSweepSpan(s, func(p uintptr) {
-		// At this point we know that we are looking at garbage object
+		// At this point we know that we are looking at a garbage object
 		// that needs to be collected.
 		if debug.allocfreetrace != 0 {
 			tracefree(unsafe.Pointer(p), size)
@@ -286,8 +286,8 @@ func (s *mspan) sweep(preserve bool) bool {
 			}
 		}
 	})

-	wasempty := s.nextFreeIndex(s.freeindex) == s.nelems
+	s.allocCount = uint16(s.nelems) - uint16(nfree)
+	wasempty := s.nextFreeIndex() == s.nelems

 	s.freeindex = 0 // reset allocation index to start of span.
@@ -295,6 +295,8 @@ func (s *mspan) sweep(preserve bool) bool {
 	// Clear gcmarkBits in preparation for next GC
 	s.allocBits, s.gcmarkBits = s.gcmarkBits, s.allocBits
 	s.clearGCMarkBits() // prepare for next GC
+	// Initialize alloc bits cache.
+	s.refillAllocCache(0)

 	// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
 	// because of the potential for a concurrent free/SetFinalizer.
@@ -313,9 +315,10 @@ func (s *mspan) sweep(preserve bool) bool {
 		// to go so release the span.
 		atomic.Store(&s.sweepgen, sweepgen)
 	}
-	if nfree > 0 {
+
+	if nfree > 0 && cl != 0 {
 		c.local_nsmallfree[cl] += uintptr(nfree)
-		res = mheap_.central[cl].mcentral.freeSpan(s, int32(nfree), head, end, preserve, wasempty)
+		res = mheap_.central[cl].mcentral.freeSpan(s, head, end, preserve, wasempty)
 		// MCentral_FreeSpan updates sweepgen
 	} else if freeToHeap {
 		// Free large span to heap
@@ -136,7 +136,15 @@ type mspan struct {
 	// undefined and should never be referenced.
 	//
 	// Object n starts at address n*elemsize + (start << pageShift).
-	freeindex uintptr
+	freeindex uintptr
+
+	// Cache of the allocBits at freeindex. allocCache is shifted
+	// such that the lowest bit corresponds to the bit freeindex.
+	// allocCache holds the complement of allocBits, thus allowing
+	// ctz64 (count trailing zero) to use it directly.
+	// allocCache may contain bits beyond s.nelems; the caller must ignore
+	// these.
+	allocCache uint64
 	allocBits  *[maxObjsPerSpan / 8]uint8
 	gcmarkBits *[maxObjsPerSpan / 8]uint8
 	nelems     uintptr // number of object in the span.
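The caveat in the new field comments, that allocCache may contain bits beyond s.nelems, is worth a worked example; the numbers here are invented for illustration:

// Suppose nelems = 5 and all five objects are allocated. The low byte of
// allocBits is 0b00011111, so allocCache holds its complement ...11100000.
// ctz64 then returns 5, an index past nelems. This is why nextFreeIndex
// checks result >= s.nelems and reports the span as full rather than
// handing out a nonexistent object 5.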
@@ -947,7 +955,7 @@ func (list *mSpanList) init() {

 func (list *mSpanList) remove(span *mspan) {
 	if span.prev == nil || span.list != list {
-		println("failed MSpanList_Remove", span, span.prev, span.list, list)
+		println("runtime: failed MSpanList_Remove", span, span.prev, span.list, list)
 		throw("MSpanList_Remove")
 	}
 	if span.next != nil {
@@ -969,7 +977,7 @@ func (list *mSpanList) isEmpty() bool {

 func (list *mSpanList) insert(span *mspan) {
 	if span.next != nil || span.prev != nil || span.list != nil {
-		println("failed MSpanList_Insert", span, span.next, span.prev, span.list)
+		println("runtime: failed MSpanList_Insert", span, span.next, span.prev, span.list)
 		throw("MSpanList_Insert")
 	}
 	span.next = list.first