// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	_WorkbufSize = 2048 // in bytes; larger values result in less contention

	// workbufAlloc is the number of bytes to allocate at a time
	// for new workbufs. This must be a multiple of pageSize and
	// should be a multiple of _WorkbufSize.
	//
	// Larger values reduce workbuf allocation overhead. Smaller
	// values reduce heap fragmentation.
	workbufAlloc = 32 << 10
)

func init() {
	if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 {
		throw("bad workbufAlloc")
	}
}

// Garbage collector work pool abstraction.
//
// This implements a producer/consumer model for pointers to grey
// objects. A grey object is one that is marked and on a work
// queue. A black object is marked and not on a work queue.
//
// Write barriers, root discovery, stack scanning, and object scanning
// produce pointers to grey objects. Scanning consumes pointers to
// grey objects, thus blackening them, and then scans them,
// potentially producing new pointers to grey objects.

// A gcWork provides the interface to produce and consume work for the
// garbage collector.
//
// A gcWork can be used on the stack as follows:
//
//	(preemption must be disabled)
//	gcw := &getg().m.p.ptr().gcw
//	.. call gcw.put() to produce and gcw.get() to consume ..
//	if gcBlackenPromptly {
//		gcw.dispose()
//	}
//
// It's important that any use of gcWork during the mark phase prevent
// the garbage collector from transitioning to mark termination since
// gcWork may locally hold GC work buffers. This can be done by
// disabling preemption (systemstack or acquirem).
type gcWork struct {
	// wbuf1 and wbuf2 are the primary and secondary work buffers.
	//
	// This can be thought of as a stack of both work buffers'
	// pointers concatenated. When we pop the last pointer, we
	// shift the stack up by one work buffer by bringing in a new
	// full buffer and discarding an empty one. When we fill both
	// buffers, we shift the stack down by one work buffer by
	// bringing in a new empty buffer and discarding a full one.
	// This way we have one buffer's worth of hysteresis, which
	// amortizes the cost of getting or putting a work buffer over
	// at least one buffer of work and reduces contention on the
	// global work lists.
	//
	// wbuf1 is always the buffer we're currently pushing to and
	// popping from and wbuf2 is the buffer that will be discarded
	// next.
	//
	// Invariant: Both wbuf1 and wbuf2 are nil or neither are.
	wbuf1, wbuf2 *workbuf

	// Bytes marked (blackened) on this gcWork. This is aggregated
	// into work.bytesMarked by dispose.
	bytesMarked uint64

	// Scan work performed on this gcWork. This is aggregated into
	// gcController by dispose and may also be flushed by callers.
	scanWork int64
}

// Most of the methods of gcWork are go:nowritebarrierrec because the
// write barrier itself can invoke gcWork methods but the methods are
// not generally re-entrant. Hence, if a gcWork method invoked the
// write barrier while the gcWork was in an inconsistent state, and
// the write barrier in turn invoked a gcWork method, it could
// permanently corrupt the gcWork.

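// init fills w's buffer cache: wbuf1 gets an empty buffer, and wbuf2
// gets a full buffer from the global queue if one is available,
// otherwise a second empty buffer.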
func (w *gcWork) init() {
	w.wbuf1 = getempty()
	wbuf2 := trygetfull()
	if wbuf2 == nil {
		wbuf2 = getempty()
	}
	w.wbuf2 = wbuf2
}

// put enqueues a pointer for the garbage collector to trace.
// obj must point to the beginning of a heap object or an oblet.
//go:nowritebarrierrec
func (w *gcWork) put(obj uintptr) {
	flushed := false
	wbuf := w.wbuf1
	if wbuf == nil {
		w.init()
		wbuf = w.wbuf1
		// wbuf is empty at this point.
	} else if wbuf.nobj == len(wbuf.obj) {
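		// wbuf1 is full. Swap in the secondary buffer; if it is also
		// full, flush it to the global full list and replace it with
		// an empty buffer.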
		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
		wbuf = w.wbuf1
		if wbuf.nobj == len(wbuf.obj) {
			putfull(wbuf)
			wbuf = getempty()
			w.wbuf1 = wbuf
			flushed = true
		}
	}

	wbuf.obj[wbuf.nobj] = obj
	wbuf.nobj++

	// If we put a buffer on full, let the GC controller know so
	// it can encourage more workers to run. We delay this until
	// the end of put so that w is in a consistent state, since
	// enlistWorker may itself manipulate w.
	if flushed && gcphase == _GCmark {
		gcController.enlistWorker()
	}
}

// putFast does a put and returns true if it can be done quickly;
// otherwise it returns false and the caller needs to call put.
//go:nowritebarrierrec
func (w *gcWork) putFast(obj uintptr) bool {
	wbuf := w.wbuf1
	if wbuf == nil {
		return false
	} else if wbuf.nobj == len(wbuf.obj) {
		return false
	}

	wbuf.obj[wbuf.nobj] = obj
	wbuf.nobj++
	return true
}

// putBatch performs a put on every pointer in obj. See put for
// constraints on these pointers.
//
//go:nowritebarrierrec
func (w *gcWork) putBatch(obj []uintptr) {
	if len(obj) == 0 {
		return
	}

	flushed := false
	wbuf := w.wbuf1
	if wbuf == nil {
		w.init()
		wbuf = w.wbuf1
	}
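
	// Flush full buffers as needed and bulk-copy the pointers in
	// chunks of whatever space remains in wbuf1.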
	for len(obj) > 0 {
		for wbuf.nobj == len(wbuf.obj) {
			putfull(wbuf)
			w.wbuf1, w.wbuf2 = w.wbuf2, getempty()
			wbuf = w.wbuf1
			flushed = true
		}
		n := copy(wbuf.obj[wbuf.nobj:], obj)
		wbuf.nobj += n
		obj = obj[n:]
	}

	if flushed && gcphase == _GCmark {
		gcController.enlistWorker()
	}
}

// tryGet dequeues a pointer for the garbage collector to trace.
//
// If there are no pointers remaining in this gcWork or in the global
// queue, tryGet returns 0. Note that there may still be pointers in
// other gcWork instances or other caches.
//go:nowritebarrierrec
func (w *gcWork) tryGet() uintptr {
	wbuf := w.wbuf1
	if wbuf == nil {
		w.init()
		wbuf = w.wbuf1
		// wbuf is empty at this point.
	}
	if wbuf.nobj == 0 {
		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
		wbuf = w.wbuf1
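		// If this buffer is also empty, replace it with a full
		// buffer from the global list (if one is available) and
		// return the empty one to the global empty list.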
		if wbuf.nobj == 0 {
			owbuf := wbuf
			wbuf = trygetfull()
			if wbuf == nil {
				return 0
			}
			putempty(owbuf)
			w.wbuf1 = wbuf
		}
	}

	wbuf.nobj--
	return wbuf.obj[wbuf.nobj]
}

// tryGetFast dequeues a pointer for the garbage collector to trace
// if one is readily available. Otherwise it returns 0 and
// the caller is expected to call tryGet().
//go:nowritebarrierrec
func (w *gcWork) tryGetFast() uintptr {
	wbuf := w.wbuf1
	if wbuf == nil {
		return 0
	}
	if wbuf.nobj == 0 {
		return 0
	}

	wbuf.nobj--
	return wbuf.obj[wbuf.nobj]
}

// get dequeues a pointer for the garbage collector to trace, blocking
// if necessary to ensure all pointers from all queues and caches have
// been retrieved. get returns 0 if there are no pointers remaining.
//go:nowritebarrierrec
func (w *gcWork) get() uintptr {
	wbuf := w.wbuf1
	if wbuf == nil {
		w.init()
		wbuf = w.wbuf1
		// wbuf is empty at this point.
	}
	if wbuf.nobj == 0 {
		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
		wbuf = w.wbuf1
		if wbuf.nobj == 0 {
			owbuf := wbuf
			wbuf = getfull()
			if wbuf == nil {
				return 0
			}
			putempty(owbuf)
			w.wbuf1 = wbuf
		}
	}

	// TODO: This might be a good place to add prefetch code

	wbuf.nobj--
	return wbuf.obj[wbuf.nobj]
}

// dispose returns any cached pointers to the global queue.
// The buffers are being put on the full queue so that the
// write barriers will not simply reacquire them before the
// GC can inspect them. This helps reduce the mutator's
// ability to hide pointers during the concurrent mark phase.
//
//go:nowritebarrierrec
func (w *gcWork) dispose() {
	if wbuf := w.wbuf1; wbuf != nil {
		if wbuf.nobj == 0 {
			putempty(wbuf)
		} else {
			putfull(wbuf)
		}
		w.wbuf1 = nil

		wbuf = w.wbuf2
		if wbuf.nobj == 0 {
			putempty(wbuf)
		} else {
			putfull(wbuf)
		}
		w.wbuf2 = nil
	}
	if w.bytesMarked != 0 {
		// dispose happens relatively infrequently. If this
		// atomic becomes a problem, we should first try to
		// dispose less and if necessary aggregate in a per-P
		// counter.
		atomic.Xadd64(&work.bytesMarked, int64(w.bytesMarked))
		w.bytesMarked = 0
	}
	if w.scanWork != 0 {
		atomic.Xaddint64(&gcController.scanWork, w.scanWork)
		w.scanWork = 0
	}
}

// balance moves some work that's cached in this gcWork back on the
// global queue.
//go:nowritebarrierrec
func (w *gcWork) balance() {
	if w.wbuf1 == nil {
		return
	}
	if wbuf := w.wbuf2; wbuf.nobj != 0 {
		putfull(wbuf)
		w.wbuf2 = getempty()
	} else if wbuf := w.wbuf1; wbuf.nobj > 4 {
		w.wbuf1 = handoff(wbuf)
	} else {
		return
	}
	// We flushed a buffer to the full list, so wake a worker.
	if gcphase == _GCmark {
		gcController.enlistWorker()
	}
}

// empty returns true if w has no mark work available.
//go:nowritebarrierrec
func (w *gcWork) empty() bool {
	return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0)
}

// Internally, the GC work pool is kept in arrays in work buffers.
// The gcWork interface caches a work buffer until full (or empty) to
// avoid contending on the global work buffer lists.

type workbufhdr struct {
	node lfnode // must be first
	nobj int
}

//go:notinheap
type workbuf struct {
	workbufhdr
	// account for the above fields
	obj [(_WorkbufSize - unsafe.Sizeof(workbufhdr{})) / sys.PtrSize]uintptr
}

// workbuf factory routines. These funcs are used to manage the
// workbufs.
// If the GC asks for some work these are the only routines that
// make wbufs available to the GC.

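// checknonempty throws if b contains no objects.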
func (b *workbuf) checknonempty() {
	if b.nobj == 0 {
		throw("workbuf is empty")
	}
}

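// checkempty throws if b contains any objects.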
func (b *workbuf) checkempty() {
	if b.nobj != 0 {
		throw("workbuf is not empty")
	}
}

// getempty pops an empty work buffer off the work.empty list,
// allocating new buffers if none are available.
//go:nowritebarrier
func getempty() *workbuf {
	var b *workbuf
	if work.empty != 0 {
		b = (*workbuf)(work.empty.pop())
		if b != nil {
			b.checkempty()
		}
	}
	if b == nil {
		// Allocate more workbufs.
		var s *mspan
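		// Try to reuse a cached workbuf span. The unlocked check of
		// free.first is only a hint; we re-check under the lock.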
		if work.wbufSpans.free.first != nil {
			lock(&work.wbufSpans.lock)
			s = work.wbufSpans.free.first
			if s != nil {
				work.wbufSpans.free.remove(s)
				work.wbufSpans.busy.insert(s)
			}
			unlock(&work.wbufSpans.lock)
		}
		if s == nil {
			systemstack(func() {
				s = mheap_.allocManual(workbufAlloc/pageSize, &memstats.gc_sys)
			})
			if s == nil {
				throw("out of memory")
			}
			// Record the new span in the busy list.
			lock(&work.wbufSpans.lock)
			work.wbufSpans.busy.insert(s)
			unlock(&work.wbufSpans.lock)
		}
		// Slice up the span into new workbufs. Return one and
		// put the rest on the empty list.
		for i := uintptr(0); i+_WorkbufSize <= workbufAlloc; i += _WorkbufSize {
			newb := (*workbuf)(unsafe.Pointer(s.base() + i))
			newb.nobj = 0
			lfnodeValidate(&newb.node)
			if i == 0 {
				b = newb
			} else {
				putempty(newb)
			}
		}
	}
	return b
}

// putempty puts a workbuf onto the work.empty list.
// Upon entry this go routine owns b. The lfstack.push relinquishes ownership.
//go:nowritebarrier
func putempty(b *workbuf) {
	b.checkempty()
	work.empty.push(&b.node)
}

// putfull puts the workbuf on the work.full list for the GC.
// putfull accepts partially full buffers so the GC can avoid competing
// with the mutators for ownership of partially full buffers.
//go:nowritebarrier
func putfull(b *workbuf) {
	b.checknonempty()
	work.full.push(&b.node)
}

// trygetfull tries to get a full or partially empty workbuffer.
// If one is not immediately available return nil
//go:nowritebarrier
func trygetfull() *workbuf {
	b := (*workbuf)(work.full.pop())
	if b != nil {
		b.checknonempty()
		return b
	}
	return b
}

// Get a full work buffer off the work.full list.
// If nothing is available wait until all the other gc helpers have
// finished and then return nil.
// getfull acts as a barrier for work.nproc helpers. As long as one
// gchelper is actively marking objects it may create a workbuffer
// that the other helpers can work on.
// The for loop either exits when a work buffer is found
// or when _all_ of the work.nproc GC helpers are in the loop
// looking for work and thus not capable of creating new work.
// This is in fact the termination condition for the STW mark
// phase.
//go:nowritebarrier
func getfull() *workbuf {
	b := (*workbuf)(work.full.pop())
	if b != nil {
		b.checknonempty()
		return b
	}

	incnwait := atomic.Xadd(&work.nwait, +1)
	if incnwait > work.nproc {
		println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
		throw("work.nwait > work.nproc")
	}
	for i := 0; ; i++ {
		if work.full != 0 {
			decnwait := atomic.Xadd(&work.nwait, -1)
			if decnwait == work.nproc {
				println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
				throw("work.nwait > work.nproc")
			}
			b = (*workbuf)(work.full.pop())
			if b != nil {
				b.checknonempty()
				return b
			}
			incnwait := atomic.Xadd(&work.nwait, +1)
			if incnwait > work.nproc {
				println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
				throw("work.nwait > work.nproc")
			}
		}
		if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs {
			return nil
		}
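		// No work yet. Back off progressively: spin briefly, then
		// yield the OS thread, then sleep.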
		if i < 10 {
			procyield(20)
		} else if i < 20 {
			osyield()
		} else {
			usleep(100)
		}
	}
}
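
// handoff splits off half of b's pointers into a new buffer, puts b
// (with the remaining half) on the full list for other workers to
// steal, and returns the new buffer.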
//go:nowritebarrier
func handoff(b *workbuf) *workbuf {
	// Make new buffer with half of b's pointers.
	b1 := getempty()
	n := b.nobj / 2
	b.nobj -= n
	b1.nobj = n
	memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), uintptr(n)*unsafe.Sizeof(b1.obj[0]))

	// Put b on full list - let first half of b get stolen.
	putfull(b)
	return b1
}

// prepareFreeWorkbufs moves busy workbuf spans to free list so they
// can be freed to the heap. This must only be called when all
// workbufs are on the empty list.
func prepareFreeWorkbufs() {
	lock(&work.wbufSpans.lock)
	if work.full != 0 {
		throw("cannot free workbufs when work.full != 0")
	}
	// Since all workbufs are on the empty list, we don't care
	// which ones are in which spans. We can wipe the entire empty
	// list and move all workbuf spans to the free list.
	work.empty = 0
	work.wbufSpans.free.takeAll(&work.wbufSpans.busy)
	unlock(&work.wbufSpans.lock)
}

// freeSomeWbufs frees some workbufs back to the heap and returns
// true if it should be called again to free more.
func freeSomeWbufs(preemptible bool) bool {
	const batchSize = 64 // ~1–2 µs per span.
	lock(&work.wbufSpans.lock)
	if gcphase != _GCoff || work.wbufSpans.free.isEmpty() {
		unlock(&work.wbufSpans.lock)
		return false
	}
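	// Free up to batchSize spans, stopping early if the caller is
	// preemptible and preemption is requested.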
	systemstack(func() {
		gp := getg().m.curg
		for i := 0; i < batchSize && !(preemptible && gp.preempt); i++ {
			span := work.wbufSpans.free.first
			if span == nil {
				break
			}
			work.wbufSpans.free.remove(span)
			mheap_.freeManual(span, &memstats.gc_sys)
		}
	})
	more := !work.wbufSpans.free.isEmpty()
	unlock(&work.wbufSpans.lock)
	return more
}