runtime: cache two workbufs to reduce contention
Currently the gcWork abstraction caches a single work buffer. As a result, if a worker is putting and getting pointers right at the boundary of a work buffer, it can flap between work buffers and (potentially significantly) increase contention on the global work buffer lists.

This change modifies gcWork to instead cache two work buffers and switch off between them. This introduces one buffer's worth of hysteresis and eliminates the above performance worst case by amortizing the cost of getting or putting a work buffer over at least one buffer's worth of work.

In practice, it's difficult to trigger this worst case with reasonably large work buffers. On the garbage benchmark, this reduces the max writes/sec to the global work list from 32K to 25K and the median from 6K to 5K. However, if a workload were to trigger this worst case behavior, it could significantly drive up this contention.

This has negligible effects on the go1 benchmarks and slightly speeds up the garbage benchmark.

name                      old time/op    new time/op    delta
XBenchGarbage-12          5.90ms ± 3%    5.83ms ± 4%    -1.18%  (p=0.011 n=18+18)

name                      old time/op    new time/op    delta
BinaryTree17-12           3.22s ± 4%     3.17s ± 3%     -1.57%  (p=0.009 n=19+20)
Fannkuch11-12             2.44s ± 1%     2.53s ± 4%     +3.78%  (p=0.000 n=18+19)
FmtFprintfEmpty-12        50.2ns ± 2%    50.5ns ± 5%    ~       (p=0.631 n=19+20)
FmtFprintfString-12       167ns ± 1%     166ns ± 1%     ~       (p=0.141 n=20+20)
FmtFprintfInt-12          162ns ± 1%     159ns ± 1%     -1.80%  (p=0.000 n=20+20)
FmtFprintfIntInt-12       277ns ± 2%     263ns ± 1%     -4.78%  (p=0.000 n=20+18)
FmtFprintfPrefixedInt-12  240ns ± 1%     232ns ± 2%     -3.25%  (p=0.000 n=20+20)
FmtFprintfFloat-12        311ns ± 1%     315ns ± 2%     +1.17%  (p=0.000 n=20+20)
FmtManyArgs-12            1.05µs ± 2%    1.03µs ± 2%    -1.72%  (p=0.000 n=20+20)
GobDecode-12              8.65ms ± 1%    8.71ms ± 2%    +0.68%  (p=0.001 n=19+20)
GobEncode-12              6.51ms ± 1%    6.54ms ± 1%    +0.42%  (p=0.047 n=20+19)
Gzip-12                   318ms ± 2%     315ms ± 2%     -1.20%  (p=0.000 n=19+19)
Gunzip-12                 42.2ms ± 2%    42.1ms ± 1%    ~       (p=0.667 n=20+19)
HTTPClientServer-12       62.5µs ± 1%    62.4µs ± 1%    ~       (p=0.110 n=20+18)
JSONEncode-12             16.8ms ± 1%    16.8ms ± 2%    ~       (p=0.569 n=19+20)
JSONDecode-12             60.8ms ± 2%    59.8ms ± 1%    -1.69%  (p=0.000 n=19+19)
Mandelbrot200-12          3.87ms ± 1%    3.85ms ± 0%    -0.61%  (p=0.001 n=20+17)
GoParse-12                3.76ms ± 2%    3.76ms ± 1%    ~       (p=0.698 n=20+20)
RegexpMatchEasy0_32-12    100ns ± 2%     101ns ± 2%     ~       (p=0.065 n=19+20)
RegexpMatchEasy0_1K-12    342ns ± 2%     333ns ± 1%     -2.82%  (p=0.000 n=20+19)
RegexpMatchEasy1_32-12    83.3ns ± 2%    83.2ns ± 2%    ~       (p=0.692 n=20+19)
RegexpMatchEasy1_1K-12    498ns ± 2%     490ns ± 1%     -1.52%  (p=0.000 n=18+20)
RegexpMatchMedium_32-12   131ns ± 2%     131ns ± 2%     ~       (p=0.464 n=20+18)
RegexpMatchMedium_1K-12   39.3µs ± 2%    39.6µs ± 1%    +0.77%  (p=0.000 n=18+19)
RegexpMatchHard_32-12     2.04µs ± 2%    2.06µs ± 1%    +0.69%  (p=0.009 n=19+20)
RegexpMatchHard_1K-12     61.4µs ± 2%    62.1µs ± 1%    +1.21%  (p=0.000 n=19+20)
Revcomp-12                534ms ± 1%     529ms ± 1%     -0.97%  (p=0.000 n=19+16)
Template-12               70.4ms ± 2%    70.0ms ± 1%    ~       (p=0.070 n=19+19)
TimeParse-12              359ns ± 3%     344ns ± 1%     -4.15%  (p=0.000 n=19+19)
TimeFormat-12             357ns ± 1%     361ns ± 2%     +1.05%  (p=0.002 n=20+20)
[Geo mean]                62.4µs         62.0µs         -0.56%

name                      old speed      new speed      delta
GobDecode-12              88.7MB/s ± 1%  88.1MB/s ± 2%  -0.68%  (p=0.001 n=19+20)
GobEncode-12              118MB/s ± 1%   117MB/s ± 1%   -0.42%  (p=0.046 n=20+19)
Gzip-12                   60.9MB/s ± 2%  61.7MB/s ± 2%  +1.21%  (p=0.000 n=19+19)
Gunzip-12                 460MB/s ± 2%   461MB/s ± 1%   ~       (p=0.661 n=20+19)
JSONEncode-12             116MB/s ± 1%   115MB/s ± 2%   ~       (p=0.555 n=19+20)
JSONDecode-12             31.9MB/s ± 2%  32.5MB/s ± 1%  +1.72%  (p=0.000 n=19+19)
GoParse-12                15.4MB/s ± 2%  15.4MB/s ± 1%  ~       (p=0.653 n=20+20)
RegexpMatchEasy0_32-12    317MB/s ± 2%   315MB/s ± 2%   ~       (p=0.141 n=19+20)
RegexpMatchEasy0_1K-12    2.99GB/s ± 2%  3.07GB/s ± 1%  +2.86%  (p=0.000 n=20+19)
RegexpMatchEasy1_32-12    384MB/s ± 2%   385MB/s ± 2%   ~       (p=0.672 n=20+19)
RegexpMatchEasy1_1K-12    2.06GB/s ± 2%  2.09GB/s ± 1%  +1.54%  (p=0.000 n=18+20)
RegexpMatchMedium_32-12   7.62MB/s ± 2%  7.63MB/s ± 2%  ~       (p=0.800 n=20+18)
RegexpMatchMedium_1K-12   26.0MB/s ± 1%  25.8MB/s ± 1%  -0.77%  (p=0.000 n=18+19)
RegexpMatchHard_32-12     15.7MB/s ± 2%  15.6MB/s ± 1%  -0.69%  (p=0.010 n=19+20)
RegexpMatchHard_1K-12     16.7MB/s ± 2%  16.5MB/s ± 1%  -1.19%  (p=0.000 n=19+20)
Revcomp-12                476MB/s ± 1%   481MB/s ± 1%   +0.97%  (p=0.000 n=19+16)
Template-12               27.6MB/s ± 2%  27.7MB/s ± 1%  ~       (p=0.071 n=19+19)
[Geo mean]                99.1MB/s       99.3MB/s       +0.27%

Change-Id: I68bcbf74ccb716cd5e844a554f67b679135105e6
Reviewed-on: https://go-review.googlesource.com/16042
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent bf606094ee
commit b6c0934a9b
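The scheme described in the commit message can be sketched outside the runtime. Below is a simplified, hypothetical model; the names (workList, workBuf, gcWorkCache, bufSize) are invented for illustration, and the real runtime uses lock-free work buffer lists and the getempty/getfull/putempty/putfull helpers visible in the diff rather than a mutex-protected slice.

```go
// Illustrative sketch only: a simplified model of the two-buffer caching
// scheme. Names are hypothetical and do not match the runtime internals.
package gcworksketch

import "sync"

const bufSize = 4 // tiny buffers so the swap behavior is easy to trace

type workBuf struct {
	obj  [bufSize]uintptr
	nobj int
}

// workList stands in for the global full/empty work buffer lists.
type workList struct {
	mu    sync.Mutex
	full  []*workBuf
	empty []*workBuf
}

func (l *workList) getEmpty() *workBuf {
	l.mu.Lock()
	defer l.mu.Unlock()
	if n := len(l.empty); n > 0 {
		b := l.empty[n-1]
		l.empty = l.empty[:n-1]
		return b
	}
	return new(workBuf)
}

func (l *workList) tryGetFull() *workBuf {
	l.mu.Lock()
	defer l.mu.Unlock()
	if n := len(l.full); n > 0 {
		b := l.full[n-1]
		l.full = l.full[:n-1]
		return b
	}
	return nil
}

func (l *workList) putFull(b *workBuf)  { l.mu.Lock(); l.full = append(l.full, b); l.mu.Unlock() }
func (l *workList) putEmpty(b *workBuf) { l.mu.Lock(); l.empty = append(l.empty, b); l.mu.Unlock() }

// gcWorkCache caches two buffers. wbuf1 is always the buffer being pushed
// to and popped from; wbuf2 is the buffer that will be discarded next.
type gcWorkCache struct {
	global       *workList
	wbuf1, wbuf2 *workBuf
}

func (w *gcWorkCache) init() {
	w.wbuf1 = w.global.getEmpty()
	if b := w.global.tryGetFull(); b != nil {
		w.wbuf2 = b
	} else {
		w.wbuf2 = w.global.getEmpty()
	}
}

// put adds a pointer. A buffer goes back to the global list only when both
// cached buffers are full, so flushes happen at most once per bufSize puts.
func (w *gcWorkCache) put(p uintptr) {
	if w.wbuf1 == nil {
		w.init()
	} else if w.wbuf1.nobj == len(w.wbuf1.obj) {
		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1 // one buffer of hysteresis
		if w.wbuf1.nobj == len(w.wbuf1.obj) {
			w.global.putFull(w.wbuf1)
			w.wbuf1 = w.global.getEmpty()
		}
	}
	w.wbuf1.obj[w.wbuf1.nobj] = p
	w.wbuf1.nobj++
}

// tryGet removes a pointer. It falls back to the global full list only
// when both cached buffers are empty.
func (w *gcWorkCache) tryGet() (uintptr, bool) {
	if w.wbuf1 == nil {
		w.init()
	}
	if w.wbuf1.nobj == 0 {
		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1 // one buffer of hysteresis
		if w.wbuf1.nobj == 0 {
			full := w.global.tryGetFull()
			if full == nil {
				return 0, false
			}
			w.global.putEmpty(w.wbuf1)
			w.wbuf1 = full
		}
	}
	w.wbuf1.nobj--
	return w.wbuf1.obj[w.wbuf1.nobj], true
}
```

With this shape, a worker alternating put and tryGet right at a buffer boundary stays within its two cached buffers instead of bouncing a buffer to and from the global lists on every operation.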
src/runtime/mgc.go

@@ -1515,7 +1515,7 @@ func gcMark(start_time int64) {
 	work.finalizersDone = true
 
 	for i := 0; i < int(gomaxprocs); i++ {
-		if allp[i].gcw.wbuf != 0 {
+		if !allp[i].gcw.empty() {
 			throw("P has cached GC work at end of mark termination")
 		}
 	}
src/runtime/mgcwork.go

@@ -8,7 +8,7 @@ import "unsafe"
 
 const (
 	_Debugwbufs  = false // if true check wbufs consistency
-	_WorkbufSize = 4096  // in bytes; larger values result in less contention
+	_WorkbufSize = 2048  // in bytes; larger values result in less contention
 )
 
 // Garbage collector work pool abstraction.
@@ -60,8 +60,25 @@ func (wp wbufptr) ptr() *workbuf {
 // gcWork may locally hold GC work buffers. This can be done by
 // disabling preemption (systemstack or acquirem).
 type gcWork struct {
-	// Invariant: wbuf is never full or empty
-	wbuf wbufptr
+	// wbuf1 and wbuf2 are the primary and secondary work buffers.
+	//
+	// This can be thought of as a stack of both work buffers'
+	// pointers concatenated. When we pop the last pointer, we
+	// shift the stack up by one work buffer by bringing in a new
+	// full buffer and discarding an empty one. When we fill both
+	// buffers, we shift the stack down by one work buffer by
+	// bringing in a new empty buffer and discarding a full one.
+	// This way we have one buffer's worth of hysteresis, which
+	// amortizes the cost of getting or putting a work buffer over
+	// at least one buffer of work and reduces contention on the
+	// global work lists.
+	//
+	// wbuf1 is always the buffer we're currently pushing to and
+	// popping from and wbuf2 is the buffer that will be discarded
+	// next.
+	//
+	// Invariant: Both wbuf1 and wbuf2 are nil or neither are.
+	wbuf1, wbuf2 wbufptr
 
 	// Bytes marked (blackened) on this gcWork. This is aggregated
 	// into work.bytesMarked by dispose.
@@ -72,25 +89,38 @@ type gcWork struct {
 	scanWork int64
 }
 
+func (w *gcWork) init() {
+	w.wbuf1 = wbufptrOf(getempty(101))
+	wbuf2 := trygetfull(102)
+	if wbuf2 == nil {
+		wbuf2 = getempty(103)
+	}
+	w.wbuf2 = wbufptrOf(wbuf2)
+}
+
 // put enqueues a pointer for the garbage collector to trace.
 // obj must point to the beginning of a heap object.
 //go:nowritebarrier
 func (ww *gcWork) put(obj uintptr) {
 	w := (*gcWork)(noescape(unsafe.Pointer(ww))) // TODO: remove when escape analysis is fixed
 
-	wbuf := w.wbuf.ptr()
+	wbuf := w.wbuf1.ptr()
 	if wbuf == nil {
-		wbuf = getempty(42)
-		w.wbuf = wbufptrOf(wbuf)
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	} else if wbuf.nobj == len(wbuf.obj) {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == len(wbuf.obj) {
+			putfull(wbuf, 132)
+			wbuf = getempty(133)
+			w.wbuf1 = wbufptrOf(wbuf)
+		}
 	}
 
 	wbuf.obj[wbuf.nobj] = obj
 	wbuf.nobj++
-
-	if wbuf.nobj == len(wbuf.obj) {
-		putfull(wbuf, 50)
-		w.wbuf = 0
-	}
 }
 
 // tryGet dequeues a pointer for the garbage collector to trace.
@@ -102,24 +132,28 @@ func (ww *gcWork) put(obj uintptr) {
 func (ww *gcWork) tryGet() uintptr {
 	w := (*gcWork)(noescape(unsafe.Pointer(ww))) // TODO: remove when escape analysis is fixed
 
-	wbuf := w.wbuf.ptr()
+	wbuf := w.wbuf1.ptr()
 	if wbuf == nil {
-		wbuf = trygetfull(74)
-		if wbuf == nil {
-			return 0
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	}
+	if wbuf.nobj == 0 {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == 0 {
+			owbuf := wbuf
+			wbuf = trygetfull(167)
+			if wbuf == nil {
+				return 0
+			}
+			putempty(owbuf, 166)
+			w.wbuf1 = wbufptrOf(wbuf)
 		}
-		w.wbuf = wbufptrOf(wbuf)
 	}
 
 	wbuf.nobj--
-	obj := wbuf.obj[wbuf.nobj]
-
-	if wbuf.nobj == 0 {
-		putempty(wbuf, 86)
-		w.wbuf = 0
-	}
-
-	return obj
+	return wbuf.obj[wbuf.nobj]
 }
 
 // get dequeues a pointer for the garbage collector to trace, blocking
@@ -129,27 +163,30 @@ func (ww *gcWork) tryGet() uintptr {
 func (ww *gcWork) get() uintptr {
 	w := (*gcWork)(noescape(unsafe.Pointer(ww))) // TODO: remove when escape analysis is fixed
 
-	wbuf := w.wbuf.ptr()
+	wbuf := w.wbuf1.ptr()
 	if wbuf == nil {
-		wbuf = getfull(103)
-		if wbuf == nil {
-			return 0
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	}
+	if wbuf.nobj == 0 {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == 0 {
+			owbuf := wbuf
+			wbuf = getfull(185)
+			if wbuf == nil {
+				return 0
+			}
+			putempty(owbuf, 184)
+			w.wbuf1 = wbufptrOf(wbuf)
 		}
-		wbuf.checknonempty()
-		w.wbuf = wbufptrOf(wbuf)
 	}
 
 	// TODO: This might be a good place to add prefetch code
 
 	wbuf.nobj--
-	obj := wbuf.obj[wbuf.nobj]
-
-	if wbuf.nobj == 0 {
-		putempty(wbuf, 115)
-		w.wbuf = 0
-	}
-
-	return obj
+	return wbuf.obj[wbuf.nobj]
 }
 
 // dispose returns any cached pointers to the global queue.
@@ -160,12 +197,21 @@ func (ww *gcWork) get() uintptr {
 //
 //go:nowritebarrier
 func (w *gcWork) dispose() {
-	if wbuf := w.wbuf; wbuf != 0 {
-		if wbuf.ptr().nobj == 0 {
-			throw("dispose: workbuf is empty")
+	if wbuf := w.wbuf1.ptr(); wbuf != nil {
+		if wbuf.nobj == 0 {
+			putempty(wbuf, 212)
+		} else {
+			putfull(wbuf, 214)
 		}
-		putfull(wbuf.ptr(), 166)
-		w.wbuf = 0
+		w.wbuf1 = 0
+
+		wbuf = w.wbuf2.ptr()
+		if wbuf.nobj == 0 {
+			putempty(wbuf, 218)
+		} else {
+			putfull(wbuf, 220)
+		}
+		w.wbuf2 = 0
 	}
 	if w.bytesMarked != 0 {
 		// dispose happens relatively infrequently. If this
@@ -185,16 +231,21 @@ func (w *gcWork) dispose() {
 // global queue.
 //go:nowritebarrier
 func (w *gcWork) balance() {
-	if wbuf := w.wbuf; wbuf != 0 && wbuf.ptr().nobj > 4 {
-		w.wbuf = wbufptrOf(handoff(wbuf.ptr()))
+	if w.wbuf1 == 0 {
+		return
+	}
+	if wbuf := w.wbuf2.ptr(); wbuf.nobj != 0 {
+		putfull(wbuf, 246)
+		w.wbuf2 = wbufptrOf(getempty(247))
+	} else if wbuf := w.wbuf1.ptr(); wbuf.nobj > 4 {
+		w.wbuf1 = wbufptrOf(handoff(wbuf))
 	}
 }
 
 // empty returns true if w has no mark work available.
 //go:nowritebarrier
 func (w *gcWork) empty() bool {
-	wbuf := w.wbuf
-	return wbuf == 0 || wbuf.ptr().nobj == 0
+	return w.wbuf1 == 0 || (w.wbuf1.ptr().nobj == 0 && w.wbuf2.ptr().nobj == 0)
 }
 
 // Internally, the GC work pool is kept in arrays in work buffers.
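To make the worst case in the commit message concrete, here is a small, self-contained illustration (again with invented names, not runtime code) of how a single cached buffer behaves when a worker alternates put and get exactly at a buffer boundary: almost every operation becomes a trip to the global lists, which is precisely what the two-buffer cache sketched above amortizes away.

```go
// Hypothetical illustration of single-buffer boundary "flapping".
package main

import "fmt"

const size = 512 // pretend per-buffer capacity, in pointers

// oneBufCache models the old single-buffer behavior: the cached buffer is
// handed back to the global lists the moment it becomes full or empty.
type oneBufCache struct {
	nobj           int  // objects in the cached buffer
	haveBuf        bool // whether a buffer is currently cached
	globalAccesses int  // trips to the global full/empty lists
}

func (c *oneBufCache) put() {
	if !c.haveBuf {
		c.globalAccesses++ // fetch an empty buffer from the global list
		c.haveBuf, c.nobj = true, 0
	}
	c.nobj++
	if c.nobj == size {
		c.globalAccesses++ // buffer is now full: hand it to the global list
		c.haveBuf = false
	}
}

func (c *oneBufCache) get() {
	if !c.haveBuf {
		c.globalAccesses++ // fetch a full buffer from the global list
		c.haveBuf, c.nobj = true, size
	}
	c.nobj--
	if c.nobj == 0 {
		c.globalAccesses++ // buffer is now empty: hand it back
		c.haveBuf = false
	}
}

func main() {
	c := &oneBufCache{}
	for i := 0; i < size; i++ {
		c.put() // fill exactly one buffer so we sit at a boundary
	}
	for i := 0; i < 1000; i++ { // alternate put/get at the boundary
		c.put()
		c.get()
	}
	// Roughly one global-list access per operation (~2000 here); the
	// two-buffer cache would need only a handful for the same workload.
	fmt.Println("global list accesses:", c.globalAccesses)
}
```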