mirror of https://github.com/golang/go
runtime: buffered write barrier implementation
This implements runtime support for buffered write barriers on amd64. The buffered write barrier has a fast path that simply enqueues pointers in a per-P buffer. Unlike the current write barrier, this fast path is *not* a normal Go call and does not require the compiler to spill general-purpose registers or put arguments on the stack. When the buffer fills up, the write barrier takes the slow path, which spills all general purpose registers and flushes the buffer. We don't allow safe-points or stack splits while this frame is active, so it doesn't matter that we have no type information for the spilled registers in this frame.

One minor complication is cgocheck=2 mode, which uses the write barrier to detect Go pointers being written to non-Go memory. We obviously can't buffer this, so instead we set the buffer to its minimum size, forcing the write barrier into the slow path on every call. For this specific case, we pass additional information as arguments to the flush function. This also requires enabling the cgo write barrier slightly later during runtime initialization, after Ps (and the per-P write barrier buffers) have been initialized.

The code in this CL is not yet active. The next CL will modify the compiler to generate calls to the new write barrier.

This reduces the average cost of the write barrier by roughly a factor of 4, which will pay for the cost of having it enabled more of the time after we make the GC pacer less aggressive. (Benchmarks will be in the next CL.)

Updates #14951.
Updates #22460.

Change-Id: I396b5b0e2c5e5c4acfd761a3235fd15abadc6cb1
Reviewed-on: https://go-review.googlesource.com/73711
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
This commit is contained in:
parent 1e8ab99b37
commit e9079a69f3
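Before the diff itself, a self-contained toy model in ordinary Go may help orient readers to the scheme the commit message describes: a small fixed-size per-P buffer to which the barrier appends (new value, old slot contents) pairs, flushed when it fills. Everything in this sketch (toyWBBuf, record, the buffer size) is made up for illustration; the real fast path is the gcWriteBarrier assembly below and the real buffer is the wbBuf type added in src/runtime/mwbbuf.go.

package main

import "fmt"

// toyWBBuf models the per-P write barrier buffer: a flat array of
// pointer-sized slots plus a fill index, flushed when it is full.
// (The real buffer tracks a next/end pointer pair instead of an index.)
type toyWBBuf struct {
	buf [2 * 4]uintptr // 2 pointers per barrier, 4 entries: tiny so the demo flushes
	n   int
}

// record models the fast path: append (new value, old slot contents),
// flush on overflow, and only then perform the actual write.
func (b *toyWBBuf) record(slot *uintptr, val uintptr) {
	b.buf[b.n] = val
	b.buf[b.n+1] = *slot
	b.n += 2
	if b.n == len(b.buf) {
		b.flush() // slow path; in the runtime this is wbBufFlush
	}
	*slot = val // the write itself happens after the bookkeeping
}

// flush models the slow path: hand the buffered pointers to the GC
// (here we just print them) and reset the buffer.
func (b *toyWBBuf) flush() {
	fmt.Println("flush:", b.buf[:b.n])
	b.n = 0
}

func main() {
	var slots [8]uintptr
	var b toyWBBuf
	for i := range slots {
		b.record(&slots[i], uintptr(0x1000+i))
	}
}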
@@ -31,3 +31,4 @@ runtime/duff_amd64.s: [amd64] duffcopy: function duffcopy missing Go declaration
runtime/asm_amd64.s: [amd64] stackcheck: function stackcheck missing Go declaration
runtime/asm_amd64.s: [amd64] indexShortStr: function indexShortStr missing Go declaration
runtime/asm_amd64.s: [amd64] countByte: function countByte missing Go declaration
runtime/asm_amd64.s: [amd64] gcWriteBarrier: function gcWriteBarrier missing Go declaration
@@ -2371,3 +2371,92 @@ TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	MOVQ DI, runtime·lastmoduledatap(SB)
	POPQ R15
	RET

// gcWriteBarrier performs a heap pointer write and informs the GC.
//
// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
// - DI is the destination of the write
// - AX is the value being written at DI
// It clobbers FLAGS. It does not clobber any general-purpose registers,
// but may clobber others (e.g., SSE registers).
//
// TODO: AX may be a bad choice because regalloc likes to use it.
TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$120
	// Save the registers clobbered by the fast path.
	//
	// TODO: Teach the register allocator that this clobbers some registers
	// so we don't always have to save them? Use regs it's least likely to
	// care about.
	MOVQ R14, 104(SP)
	MOVQ R13, 112(SP)
	// TODO: Consider passing g.m.p in as an argument so they can be shared
	// across a sequence of write barriers.
	get_tls(R13)
	MOVQ g(R13), R13
	MOVQ g_m(R13), R13
	MOVQ m_p(R13), R13
	MOVQ (p_wbBuf+wbBuf_next)(R13), R14
	// Increment wbBuf.next position.
	LEAQ 16(R14), R14
	MOVQ R14, (p_wbBuf+wbBuf_next)(R13)
	CMPQ R14, (p_wbBuf+wbBuf_end)(R13)
	// Record the write.
	MOVQ AX, -16(R14) // Record value
	MOVQ (DI), R13    // TODO: This turns bad writes into bad reads.
	MOVQ R13, -8(R14) // Record *slot
	// Is the buffer full? (flags set in CMPQ above)
	JEQ flush
ret:
	MOVQ 104(SP), R14
	MOVQ 112(SP), R13
	// Do the write.
	MOVQ AX, (DI)
	RET

flush:
	// Save all general purpose registers since these could be
	// clobbered by wbBufFlush and were not saved by the caller.
	// It is possible for wbBufFlush to clobber other registers
	// (e.g., SSE registers), but the compiler takes care of saving
	// those in the caller if necessary. This strikes a balance
	// with registers that are likely to be used.
	//
	// We don't have type information for these, but all code under
	// here is NOSPLIT, so nothing will observe these.
	//
	// TODO: We could strike a different balance; e.g., saving X0
	// and not saving GP registers that are less likely to be used.
	MOVQ DI, 0(SP) // Also first argument to wbBufFlush
	MOVQ AX, 8(SP) // Also second argument to wbBufFlush
	MOVQ BX, 16(SP)
	MOVQ CX, 24(SP)
	MOVQ DX, 32(SP)
	// DI already saved
	MOVQ SI, 40(SP)
	MOVQ BP, 48(SP)
	MOVQ R8, 56(SP)
	MOVQ R9, 64(SP)
	MOVQ R10, 72(SP)
	MOVQ R11, 80(SP)
	MOVQ R12, 88(SP)
	// R13 already saved
	// R14 already saved
	MOVQ R15, 96(SP)

	// This takes arguments DI and AX
	CALL runtime·wbBufFlush(SB)

	MOVQ 0(SP), DI
	MOVQ 8(SP), AX
	MOVQ 16(SP), BX
	MOVQ 24(SP), CX
	MOVQ 32(SP), DX
	MOVQ 40(SP), SI
	MOVQ 48(SP), BP
	MOVQ 56(SP), R8
	MOVQ 64(SP), R9
	MOVQ 72(SP), R10
	MOVQ 80(SP), R11
	MOVQ 88(SP), R12
	MOVQ 96(SP), R15
	JMP ret
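For readers who do not follow amd64 assembly, the fast path above can be paraphrased as the Go-style pseudocode below. This is a sketch only: the register-to-variable mapping follows the comments in the assembly (DI = slot being written, AX = value), and the real barrier has to stay in assembly precisely because it bypasses the Go ABI and avoids spilling general-purpose registers.

	next := p.wbBuf.next + 16     // LEAQ 16(R14), R14: reserve one two-pointer entry
	p.wbBuf.next = next           // MOVQ R14, (p_wbBuf+wbBuf_next)(R13)
	buf[next-16] = val            // MOVQ AX, -16(R14): record the value being written
	buf[next-8] = *slot           // MOVQ (DI), R13; MOVQ R13, -8(R14): record the old *slot
	if next == p.wbBuf.end {      // CMPQ/JEQ: buffer full?
		wbBufFlush(slot, val) // slow path: spill all GPRs and drain the buffer
	}
	*slot = val                   // MOVQ AX, (DI): the heap write itself happens last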
@@ -16,6 +16,10 @@ const cgoWriteBarrierFail = "Go pointer stored into non-Go memory"

// cgoCheckWriteBarrier is called whenever a pointer is stored into memory.
// It throws if the program is storing a Go pointer into non-Go memory.
//
// This is called from the write barrier, so its entire call tree must
// be nosplit.
//
//go:nosplit
//go:nowritebarrier
func cgoCheckWriteBarrier(dst *uintptr, src uintptr) {
@@ -1394,6 +1394,7 @@ top:
			// workers have exited their loop so we can
			// start new mark 2 workers.
			forEachP(func(_p_ *p) {
				wbBufFlush1(_p_)
				_p_.gcw.dispose()
			})
		})
@@ -1234,6 +1234,9 @@ func shade(b uintptr) {
// obj is the start of an object with mark mbits.
// If it isn't already marked, mark it and enqueue into gcw.
// base and off are for debugging only and could be removed.
//
// See also wbBufFlush1, which partially duplicates this logic.
//
//go:nowritebarrierrec
func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr) {
	// obj should be start of allocation, and so must be at least pointer-aligned.
@@ -150,6 +150,39 @@ func (w *gcWork) putFast(obj uintptr) bool {
	return true
}

// putBatch performs a put on every pointer in obj. See put for
// constraints on these pointers.
//
//go:nowritebarrierrec
func (w *gcWork) putBatch(obj []uintptr) {
	if len(obj) == 0 {
		return
	}

	flushed := false
	wbuf := w.wbuf1
	if wbuf == nil {
		w.init()
		wbuf = w.wbuf1
	}

	for len(obj) > 0 {
		for wbuf.nobj == len(wbuf.obj) {
			putfull(wbuf)
			w.wbuf1, w.wbuf2 = w.wbuf2, getempty()
			wbuf = w.wbuf1
			flushed = true
		}
		n := copy(wbuf.obj[wbuf.nobj:], obj)
		wbuf.nobj += n
		obj = obj[n:]
	}

	if flushed && gcphase == _GCmark {
		gcController.enlistWorker()
	}
}

// tryGet dequeues a pointer for the garbage collector to trace.
//
// If there are no pointers remaining in this gcWork or in the global
src/runtime/mwbbuf.go (new file, 216 lines)
@@ -0,0 +1,216 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This implements the write barrier buffer. The write barrier itself
// is gcWriteBarrier and is implemented in assembly.
//
// The write barrier has a fast path and a slow path. The fast path
// simply enqueues to a per-P write barrier buffer. It's written in
// assembly and doesn't clobber any general purpose registers, so it
// doesn't have the usual overheads of a Go call.
//
// When the buffer fills up, the write barrier invokes the slow path
// (wbBufFlush) to flush the buffer to the GC work queues. In this
// path, since the compiler didn't spill registers, we spill *all*
// registers and disallow any GC safe points that could observe the
// stack frame (since we don't know the types of the spilled
// registers).

package runtime

import (
	"unsafe"
)

// testSmallBuf forces a small write barrier buffer to stress write
// barrier flushing.
const testSmallBuf = false

// wbBuf is a per-P buffer of pointers queued by the write barrier.
// This buffer is flushed to the GC workbufs when it fills up and on
// various GC transitions.
//
// This is closely related to a "sequential store buffer" (SSB),
// except that SSBs are usually used for maintaining remembered sets,
// while this is used for marking.
type wbBuf struct {
	// next points to the next slot in buf. It must not be a
	// pointer type because it can point past the end of buf and
	// must be updated without write barriers.
	//
	// This is a pointer rather than an index to optimize the
	// write barrier assembly.
	next uintptr

	// end points to just past the end of buf. It must not be a
	// pointer type because it points past the end of buf and must
	// be updated without write barriers.
	end uintptr

	// buf stores a series of pointers to execute write barriers
	// on. This must be a multiple of wbBufEntryPointers because
	// the write barrier only checks for overflow once per entry.
	buf [wbBufEntryPointers * wbBufEntries]uintptr
}

const (
	// wbBufEntries is the number of write barriers between
	// flushes of the write barrier buffer.
	//
	// This trades latency for throughput amortization. Higher
	// values amortize flushing overhead more, but increase the
	// latency of flushing. Higher values also increase the cache
	// footprint of the buffer.
	//
	// TODO: What is the latency cost of this? Tune this value.
	wbBufEntries = 256

	// wbBufEntryPointers is the number of pointers added to the
	// buffer by each write barrier.
	wbBufEntryPointers = 2
)

// reset empties b by resetting its next and end pointers.
func (b *wbBuf) reset() {
	start := uintptr(unsafe.Pointer(&b.buf[0]))
	b.next = start
	if gcBlackenPromptly || writeBarrier.cgo {
		// Effectively disable the buffer by forcing a flush
		// on every barrier.
		b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
	} else if testSmallBuf {
		// For testing, allow two barriers in the buffer. If
		// we only did one, then barriers of non-heap pointers
		// would be no-ops. This lets us combine a buffered
		// barrier with a flush at a later time.
		b.end = uintptr(unsafe.Pointer(&b.buf[2*wbBufEntryPointers]))
	} else {
		b.end = start + uintptr(len(b.buf))*unsafe.Sizeof(b.buf[0])
	}

	if (b.end-b.next)%(wbBufEntryPointers*unsafe.Sizeof(b.buf[0])) != 0 {
		throw("bad write barrier buffer bounds")
	}
}

// wbBufFlush flushes the current P's write barrier buffer to the GC
// workbufs. It is passed the slot and value of the write barrier that
// caused the flush so that it can implement cgocheck.
//
// This must not have write barriers because it is part of the write
// barrier implementation.
//
// This and everything it calls must be nosplit because 1) the stack
// contains untyped slots from gcWriteBarrier and 2) there must not be
// a GC safe point between the write barrier test in the caller and
// flushing the buffer.
//
// TODO: A "go:nosplitrec" annotation would be perfect for this.
//
//go:nowritebarrierrec
//go:nosplit
func wbBufFlush(dst *uintptr, src uintptr) {
	if getg().m.dying > 0 {
		// We're going down. Not much point in write barriers
		// and this way we can allow write barriers in the
		// panic path.
		return
	}

	if writeBarrier.cgo {
		// This must be called from the stack that did the
		// write. It's nosplit all the way down.
		cgoCheckWriteBarrier(dst, src)
		if !writeBarrier.needed {
			// We were only called for cgocheck.
			b := &getg().m.p.ptr().wbBuf
			b.next = uintptr(unsafe.Pointer(&b.buf[0]))
			return
		}
	}

	// Switch to the system stack so we don't have to worry about
	// the untyped stack slots or safe points.
	systemstack(func() {
		wbBufFlush1(getg().m.p.ptr())
	})
}

// wbBufFlush1 flushes p's write barrier buffer to the GC work queue.
//
// This must not have write barriers because it is part of the write
// barrier implementation, so this may lead to infinite loops or
// buffer corruption.
//
// This must be non-preemptible because it uses the P's workbuf.
//
//go:nowritebarrierrec
//go:systemstack
func wbBufFlush1(_p_ *p) {
	// Get the buffered pointers.
	start := uintptr(unsafe.Pointer(&_p_.wbBuf.buf[0]))
	n := (_p_.wbBuf.next - start) / unsafe.Sizeof(_p_.wbBuf.buf[0])
	ptrs := _p_.wbBuf.buf[:n]

	// Reset the buffer.
	_p_.wbBuf.reset()

	if useCheckmark {
		// Slow path for checkmark mode.
		for _, ptr := range ptrs {
			shade(ptr)
		}
		return
	}

	// Mark all of the pointers in the buffer and record only the
	// pointers we greyed. We use the buffer itself to temporarily
	// record greyed pointers.
	//
	// TODO: Should scanobject/scanblock just stuff pointers into
	// the wbBuf? Then this would become the sole greying path.
	gcw := &_p_.gcw
	pos := 0
	arenaStart := mheap_.arena_start
	for _, ptr := range ptrs {
		if ptr < arenaStart {
			// nil pointers are very common, especially
			// for the "old" values. Filter out these and
			// other "obvious" non-heap pointers ASAP.
			//
			// TODO: Should we filter out nils in the fast
			// path to reduce the rate of flushes?
			continue
		}
		// TODO: This doesn't use hbits, so calling
		// heapBitsForObject seems a little silly. We could
		// easily separate this out since heapBitsForObject
		// just calls heapBitsForAddr(obj) to get hbits.
		obj, _, span, objIndex := heapBitsForObject(ptr, 0, 0)
		if obj == 0 {
			continue
		}
		// TODO: Consider making two passes where the first
		// just prefetches the mark bits.
		mbits := span.markBitsForIndex(objIndex)
		if mbits.isMarked() {
			continue
		}
		mbits.setMarked()
		if span.spanclass.noscan() {
			gcw.bytesMarked += uint64(span.elemsize)
			continue
		}
		ptrs[pos] = obj
		pos++
	}

	// Enqueue the greyed objects.
	gcw.putBatch(ptrs[:pos])
	if gcphase == _GCmarktermination || gcBlackenPromptly {
		// Ps aren't allowed to cache work during mark
		// termination.
		gcw.dispose()
	}
}
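A quick sizing check for the buffer constants in mwbbuf.go above (assuming 8-byte uintptrs on amd64; the arithmetic is illustrative and not part of the CL):

	len(wbBuf.buf) = wbBufEntryPointers * wbBufEntries = 2 * 256 = 512 uintptr slots
	buffer footprint = 512 * 8 bytes = 4 KiB per P
	normal case: at most one flush per 256 buffered write barriers
	cgocheck=2 or gcBlackenPromptly: reset() places end after the first entry, so every barrier takes the slow path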
@@ -506,6 +506,17 @@ func schedinit() {
		throw("unknown runnable goroutine during bootstrap")
	}

	// For cgocheck > 1, we turn on the write barrier at all times
	// and check all pointer writes. We can't do this until after
	// procresize because the write barrier needs a P.
	if debug.cgocheck > 1 {
		writeBarrier.cgo = true
		writeBarrier.enabled = true
		for _, p := range allp {
			p.wbBuf.reset()
		}
	}

	if buildVersion == "" {
		// Condition should never trigger. This code just serves
		// to ensure runtime·buildVersion is kept in the resulting binary.
@@ -3862,6 +3873,7 @@ func procresize(nprocs int32) *p {
			for i := range pp.deferpool {
				pp.deferpool[i] = pp.deferpoolbuf[i][:0]
			}
			pp.wbBuf.reset()
			atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp))
		}
		if pp.mcache == nil {
@@ -3917,6 +3929,11 @@
			// world is stopped.
			p.gcBgMarkWorker.set(nil)
		}
		// Flush p's write barrier buffer.
		if gcphase != _GCoff {
			wbBufFlush1(p)
			p.gcw.dispose()
		}
		for i := range p.sudogbuf {
			p.sudogbuf[i] = nil
		}
@@ -386,13 +386,6 @@ func parsedebugvars() {

	setTraceback(gogetenv("GOTRACEBACK"))
	traceback_env = traceback_cache

	// For cgocheck > 1, we turn on the write barrier at all times
	// and check all pointer writes.
	if debug.cgocheck > 1 {
		writeBarrier.cgo = true
		writeBarrier.enabled = true
	}
}

//go:linkname setTraceback runtime/debug.SetTraceback
@@ -538,6 +538,11 @@ type p struct {
	// disposed on certain GC state transitions.
	gcw gcWork

	// wbBuf is this P's GC write barrier buffer.
	//
	// TODO: Consider caching this in the running G.
	wbBuf wbBuf

	runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point

	pad [sys.CacheLineSize]byte