go/src/runtime/mgcsweepbuf.go


runtime: make sweep time proportional to in-use spans

Currently sweeping walks the list of all spans, which means the work in
sweeping is proportional to the maximum number of spans ever used. If the
heap was once large but is now small, this causes an amortization failure:
on a small heap, GCs happen frequently, but a full sweep still has to happen
in each GC cycle, which means we spend a lot of time in sweeping.

Fix this by creating a separate list consisting of just the in-use spans to
be swept, so sweeping is proportional to the number of in-use spans (which
is proportional to the live heap).

Specifically, we create two lists: a list of unswept in-use spans and a list
of swept in-use spans. At the start of the sweep cycle, the swept list
becomes the unswept list and the new swept list is empty. Allocating a new
in-use span adds it to the swept list. Sweeping moves spans from the unswept
list to the swept list. This fixes the amortization problem because a
shrinking heap moves spans off the unswept list without adding them to the
swept list, reducing the time required by the next sweep cycle.

Updates #9265.

This fix eliminates almost all of the time spent in sweepone; however,
markrootSpans has essentially the same bug, so now the test program from
this issue spends all of its time in markrootSpans. No significant effect
on other benchmarks.

Change-Id: Ib382e82790aad907da1c127e62b3ab45d7a4ac1e
Reviewed-on: https://go-review.googlesource.com/30535
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>

2016-10-05 15:50:39 -06:00
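The two-list scheme described in the commit message can be pictured with a small, self-contained sketch. This is not the runtime's code: the real buffers are instances of the gcSweepBuf type defined below, and the names span, sweeper, alloc, startCycle, and sweepOne here are invented for illustration. The point is only that each cycle's sweep work is bounded by the spans that were actually in use.

// Toy sketch of the swept/unswept role swap; not part of the runtime.
package main

import "fmt"

type span struct{ id int }

type sweeper struct {
	// bufs[swept] holds swept in-use spans; bufs[1-swept] holds the
	// unswept in-use spans left over from the previous cycle.
	bufs  [2][]*span
	swept int
}

// alloc records a newly allocated in-use span. It is already swept for the
// current cycle, so it goes on the swept buffer.
func (s *sweeper) alloc(sp *span) {
	s.bufs[s.swept] = append(s.bufs[s.swept], sp)
}

// startCycle begins a new sweep cycle: the old swept buffer becomes the
// unswept buffer and the new swept buffer starts empty.
func (s *sweeper) startCycle() {
	s.swept = 1 - s.swept
	s.bufs[s.swept] = s.bufs[s.swept][:0]
}

// sweepOne pops one unswept span, "sweeps" it, and, if it is still in use,
// pushes it on the swept buffer. It returns false when nothing is left.
func (s *sweeper) sweepOne(stillInUse func(*span) bool) bool {
	unswept := &s.bufs[1-s.swept]
	n := len(*unswept)
	if n == 0 {
		return false
	}
	sp := (*unswept)[n-1]
	*unswept = (*unswept)[:n-1]
	if stillInUse(sp) {
		s.bufs[s.swept] = append(s.bufs[s.swept], sp)
	}
	return true
}

func main() {
	var s sweeper
	for i := 0; i < 4; i++ {
		s.alloc(&span{id: i})
	}
	s.startCycle()
	// Pretend only even-numbered spans survive. The work done here is
	// proportional to the in-use spans, not to the heap's high-water mark.
	for s.sweepOne(func(sp *span) bool { return sp.id%2 == 0 }) {
	}
	fmt.Println("spans surviving into next cycle:", len(s.bufs[s.swept]))
}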
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime
import (
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
)
// A gcSweepBuf is a set of *mspans.
//
// gcSweepBuf is safe for concurrent push operations *or* concurrent
// pop operations, but not both simultaneously.
type gcSweepBuf struct {
// A gcSweepBuf is a two-level data structure consisting of a
// growable spine that points to fixed-sized blocks. The spine
// can be accessed without locks, but adding a block or
// growing it requires taking the spine lock.
//
// Because each mspan covers at least 8K of heap and takes at
// most 8 bytes in the gcSweepBuf, the growth of the spine is
// quite limited.
//
// The spine and all blocks are allocated off-heap, which
// allows this to be used in the memory manager and avoids the
// need for write barriers on all of these. We never release
// this memory because there could be concurrent lock-free
// access and we're likely to reuse it anyway. (In principle,
// we could do this during STW.)
spineLock mutex
spine unsafe.Pointer // *[N]*gcSweepBlock, accessed atomically
spineLen uintptr // Spine array length, accessed atomically
spineCap uintptr // Spine array cap, accessed under lock
// index is the first unused slot in the logical concatenation
// of all blocks. It is accessed atomically.
index uint32
}
const (
gcSweepBlockEntries = 512 // 4KB on 64-bit
gcSweepBufInitSpineCap = 256 // Enough for 1GB heap on 64-bit
)
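// Illustrative arithmetic (not part of the original file): each block holds
// 512 span pointers and each mspan covers at least 8 KiB of heap, so one
// block accounts for at least 512 * 8 KiB = 4 MiB of heap. An initial spine
// of 256 block pointers therefore covers 256 * 4 MiB = 1 GiB, which is where
// the "Enough for 1GB heap on 64-bit" figure above comes from.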
type gcSweepBlock struct {
spans [gcSweepBlockEntries]*mspan
}
// push adds span s to buffer b. push is safe to call concurrently
// with other push operations, but NOT to call concurrently with pop.
func (b *gcSweepBuf) push(s *mspan) {
// Obtain our slot.
cursor := uintptr(atomic.Xadd(&b.index, +1) - 1)
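// top selects the block on the spine that holds this slot and bottom is the
// slot's index within that block; e.g. with 512 entries per block, cursor
// 1000 maps to block 1, entry 488.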
top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
// Do we need to add a block?
spineLen := atomic.Loaduintptr(&b.spineLen)
var block *gcSweepBlock
retry:
if top < spineLen {
spine := atomic.Loadp(unsafe.Pointer(&b.spine))
blockp := add(spine, sys.PtrSize*top)
block = (*gcSweepBlock)(atomic.Loadp(blockp))
} else {
// Add a new block to the spine, potentially growing
// the spine.
lock(&b.spineLock)
// spineLen cannot change until we release the lock,
// but may have changed while we were waiting.
spineLen = atomic.Loaduintptr(&b.spineLen)
if top < spineLen {
unlock(&b.spineLock)
goto retry
}
if spineLen == b.spineCap {
// Grow the spine.
newCap := b.spineCap * 2
if newCap == 0 {
newCap = gcSweepBufInitSpineCap
}
newSpine := persistentalloc(newCap*sys.PtrSize, sys.CacheLineSize, &memstats.gc_sys)
if b.spineCap != 0 {
// Blocks are allocated off-heap, so
// no write barriers.
memmove(newSpine, b.spine, b.spineCap*sys.PtrSize)
}
// Spine is allocated off-heap, so no write barrier.
atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine)
b.spineCap = newCap
// We can't immediately free the old spine
// since a concurrent push with a lower index
// could still be reading from it. We let it
// leak because even a 1TB heap would waste
// less than 2MB of memory on old spines. If
// this is a problem, we could free old spines
// during STW.
}
// Allocate a new block and add it to the spine.
block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), sys.CacheLineSize, &memstats.gc_sys))
blockp := add(b.spine, sys.PtrSize*top)
// Blocks are allocated off-heap, so no write barrier.
atomic.StorepNoWB(blockp, unsafe.Pointer(block))
atomic.Storeuintptr(&b.spineLen, spineLen+1)
unlock(&b.spineLock)
}
// We have a block. Insert the span.
block.spans[bottom] = s
}
// pop removes and returns a span from buffer b, or nil if b is empty.
// pop is safe to call concurrently with other pop operations, but NOT
// to call concurrently with push.
func (b *gcSweepBuf) pop() *mspan {
cursor := atomic.Xadd(&b.index, -1)
if int32(cursor) < 0 {
atomic.Xadd(&b.index, +1)
return nil
}
// There are no concurrent spine or block modifications during
// pop, so we can omit the atomics.
top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
blockp := (**gcSweepBlock)(add(b.spine, sys.PtrSize*uintptr(top)))
block := *blockp
s := block.spans[bottom]
// Clear the pointer for block(i).
block.spans[bottom] = nil
return s
}
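// Illustrative usage (not part of the original file): once pushes to this
// buffer have stopped (it has become the unswept buffer for the cycle), one
// or more sweeper goroutines may drain it with pop; concurrent pops are safe
// because each pop claims a slot with an atomic decrement of b.index. A
// single consumer would simply loop:
//
//	for s := buf.pop(); s != nil; s = buf.pop() {
//		// sweep s, pushing it onto the other (swept) buffer if still in use
//	}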
// numBlocks returns the number of blocks in buffer b. numBlocks is
// safe to call concurrently with any other operation. Spans that have
// been pushed prior to the call to numBlocks are guaranteed to appear
// in some block in the range [0, numBlocks()), assuming there are no
// intervening pops. Spans that are pushed after the call may also
// appear in these blocks.
func (b *gcSweepBuf) numBlocks() int {
return int((atomic.Load(&b.index) + gcSweepBlockEntries - 1) / gcSweepBlockEntries)
}
// block returns the spans in the i'th block of buffer b. block is
// safe to call concurrently with push.
func (b *gcSweepBuf) block(i int) []*mspan {
// Perform bounds check before loading spine address since
// push ensures the allocated length is at least spineLen.
if i < 0 || uintptr(i) >= atomic.Loaduintptr(&b.spineLen) {
throw("block index out of range")
}
// Get block i.
spine := atomic.Loadp(unsafe.Pointer(&b.spine))
blockp := add(spine, sys.PtrSize*uintptr(i))
block := (*gcSweepBlock)(atomic.Loadp(blockp))
// Slice the block if necessary.
cursor := uintptr(atomic.Load(&b.index))
top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
var spans []*mspan
if uintptr(i) < top {
spans = block.spans[:]
} else {
spans = block.spans[:bottom]
}
// push may have reserved a slot but not filled it yet, so
// trim away unused entries.
for len(spans) > 0 && spans[len(spans)-1] == nil {
spans = spans[:len(spans)-1]
}
return spans
}
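For completeness, here is a sketch (not from the original file) of how a reader might combine numBlocks and block; the helper name walkSweepBuf is invented for illustration. It assumes the block count is taken at a moment when no push is in progress (for example, while the world is stopped), after which the first n blocks can be read even if further pushes happen concurrently, and that no pops run while it iterates.

// walkSweepBuf is an illustrative helper, not part of the runtime. It calls
// fn on every span that was pushed to b before the block count was taken.
func walkSweepBuf(b *gcSweepBuf, fn func(*mspan)) {
	// Taken while pushes are quiescent; spans pushed before this point are
	// guaranteed to land in blocks [0, n).
	n := b.numBlocks()
	for i := 0; i < n; i++ {
		for _, s := range b.block(i) {
			if s == nil {
				// A later, concurrent push may have reserved this slot
				// without storing its span yet; skip it.
				continue
			}
			fn(s)
		}
	}
}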