// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	_WorkbufSize = 2048 // in bytes; larger values result in less contention

	// workbufAlloc is the number of bytes to allocate at a time
	// for new workbufs. This must be a multiple of pageSize and
	// should be a multiple of _WorkbufSize.
	//
	// Larger values reduce workbuf allocation overhead. Smaller
	// values reduce heap fragmentation.
	workbufAlloc = 32 << 10
)

func init() {
	if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 {
		throw("bad workbufAlloc")
	}
}

// Garbage collector work pool abstraction.
//
// This implements a producer/consumer model for pointers to grey
// objects. A grey object is one that is marked and on a work
// queue. A black object is marked and not on a work queue.
//
// Write barriers, root discovery, stack scanning, and object scanning
// produce pointers to grey objects. Scanning consumes pointers to
// grey objects, thus blackening them, and then scans them,
// potentially producing new pointers to grey objects.

// A gcWork provides the interface to produce and consume work for the
// garbage collector.
//
// A gcWork can be used on the stack as follows:
//
//	(preemption must be disabled)
//	gcw := &getg().m.p.ptr().gcw
//	.. call gcw.put() to produce and gcw.get() to consume ..
//	if gcBlackenPromptly {
//		gcw.dispose()
//	}
//
// It's important that any use of gcWork during the mark phase prevent
// the garbage collector from transitioning to mark termination since
// gcWork may locally hold GC work buffers. This can be done by
// disabling preemption (systemstack or acquirem).
type gcWork struct {
	// wbuf1 and wbuf2 are the primary and secondary work buffers.
	//
	// This can be thought of as a stack of both work buffers'
	// pointers concatenated. When we pop the last pointer, we
	// shift the stack up by one work buffer by bringing in a new
	// full buffer and discarding an empty one. When we fill both
	// buffers, we shift the stack down by one work buffer by
	// bringing in a new empty buffer and discarding a full one.
	// This way we have one buffer's worth of hysteresis, which
	// amortizes the cost of getting or putting a work buffer over
	// at least one buffer of work and reduces contention on the
	// global work lists.
	//
	// wbuf1 is always the buffer we're currently pushing to and
	// popping from and wbuf2 is the buffer that will be discarded
	// next.
	//
	// Invariant: Both wbuf1 and wbuf2 are nil or neither are.
	wbuf1, wbuf2 *workbuf

	// Bytes marked (blackened) on this gcWork. This is aggregated
	// into work.bytesMarked by dispose.
	bytesMarked uint64

	// Scan work performed on this gcWork. This is aggregated into
	// gcController by dispose and may also be flushed by callers.
	scanWork int64
}

// Most of the methods of gcWork are go:nowritebarrierrec because the
// write barrier itself can invoke gcWork methods but the methods are
// not generally re-entrant. Hence, if a gcWork method invoked the
// write barrier while the gcWork was in an inconsistent state, and
// the write barrier in turn invoked a gcWork method, it could
// permanently corrupt the gcWork.

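// init fills w's buffer cache: wbuf1 gets an empty buffer, and wbuf2
// gets a full buffer from the global queue if one is available,
// otherwise a second empty buffer.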
func (w *gcWork) init() {
	w.wbuf1 = getempty()
	wbuf2 := trygetfull()
	if wbuf2 == nil {
		wbuf2 = getempty()
	}
	w.wbuf2 = wbuf2
}

// put enqueues a pointer for the garbage collector to trace.
// obj must point to the beginning of a heap object or an oblet.
//go:nowritebarrierrec
func (w *gcWork) put(obj uintptr) {
	flushed := false
	wbuf := w.wbuf1
	if wbuf == nil {
		w.init()
		wbuf = w.wbuf1
		// wbuf is empty at this point.
	} else if wbuf.nobj == len(wbuf.obj) {
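		// wbuf1 is full. Swap in the secondary buffer; if it is also
		// full, flush it to the global full list and replace it with
		// an empty buffer.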
		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
		wbuf = w.wbuf1
		if wbuf.nobj == len(wbuf.obj) {
			putfull(wbuf)
			wbuf = getempty()
			w.wbuf1 = wbuf
			flushed = true
		}
	}

	wbuf.obj[wbuf.nobj] = obj
	wbuf.nobj++

	// If we put a buffer on full, let the GC controller know so
	// it can encourage more workers to run. We delay this until
	// the end of put so that w is in a consistent state, since
	// enlistWorker may itself manipulate w.
	if flushed && gcphase == _GCmark {
		gcController.enlistWorker()
	}
}

// putFast does a put and returns true if it can be done quickly;
// otherwise it returns false and the caller needs to call put.
//go:nowritebarrierrec
func (w *gcWork) putFast(obj uintptr) bool {
	wbuf := w.wbuf1
	if wbuf == nil {
		return false
	} else if wbuf.nobj == len(wbuf.obj) {
		return false
	}

	wbuf.obj[wbuf.nobj] = obj
	wbuf.nobj++
	return true
}

// putBatch performs a put on every pointer in obj. See put for
// constraints on these pointers.
//
//go:nowritebarrierrec
func (w *gcWork) putBatch(obj []uintptr) {
	if len(obj) == 0 {
		return
	}

	flushed := false
	wbuf := w.wbuf1
	if wbuf == nil {
		w.init()
		wbuf = w.wbuf1
	}
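
	// Flush full buffers as needed and bulk-copy the pointers in
	// chunks of whatever space remains in wbuf1.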
	for len(obj) > 0 {
		for wbuf.nobj == len(wbuf.obj) {
			putfull(wbuf)
			w.wbuf1, w.wbuf2 = w.wbuf2, getempty()
			wbuf = w.wbuf1
			flushed = true
		}
		n := copy(wbuf.obj[wbuf.nobj:], obj)
		wbuf.nobj += n
		obj = obj[n:]
	}

	if flushed && gcphase == _GCmark {
		gcController.enlistWorker()
	}
}

// tryGet dequeues a pointer for the garbage collector to trace.
//
// If there are no pointers remaining in this gcWork or in the global
// queue, tryGet returns 0. Note that there may still be pointers in
// other gcWork instances or other caches.
//go:nowritebarrierrec
func (w *gcWork) tryGet() uintptr {
	wbuf := w.wbuf1
	if wbuf == nil {
		w.init()
		wbuf = w.wbuf1
		// wbuf is empty at this point.
	}
	if wbuf.nobj == 0 {
		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
		wbuf = w.wbuf1
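		// If this buffer is also empty, replace it with a full
		// buffer from the global list (if one is available) and
		// return the empty one to the global empty list.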
		if wbuf.nobj == 0 {
			owbuf := wbuf
			wbuf = trygetfull()
			if wbuf == nil {
				return 0
			}
			putempty(owbuf)
			w.wbuf1 = wbuf
		}
	}

	wbuf.nobj--
	return wbuf.obj[wbuf.nobj]
}

// tryGetFast dequeues a pointer for the garbage collector to trace
// if one is readily available. Otherwise it returns 0 and
// the caller is expected to call tryGet().
//go:nowritebarrierrec
func (w *gcWork) tryGetFast() uintptr {
	wbuf := w.wbuf1
	if wbuf == nil {
		return 0
	}
	if wbuf.nobj == 0 {
		return 0
	}

	wbuf.nobj--
	return wbuf.obj[wbuf.nobj]
}

// get dequeues a pointer for the garbage collector to trace, blocking
// if necessary to ensure all pointers from all queues and caches have
// been retrieved. get returns 0 if there are no pointers remaining.
//go:nowritebarrierrec
func (w *gcWork) get() uintptr {
	wbuf := w.wbuf1
	if wbuf == nil {
		w.init()
		wbuf = w.wbuf1
		// wbuf is empty at this point.
	}
	if wbuf.nobj == 0 {
		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
		wbuf = w.wbuf1
		if wbuf.nobj == 0 {
			owbuf := wbuf
			wbuf = getfull()
			if wbuf == nil {
				return 0
			}
			putempty(owbuf)
			w.wbuf1 = wbuf
		}
	}

	// TODO: This might be a good place to add prefetch code

	wbuf.nobj--
	return wbuf.obj[wbuf.nobj]
}

// dispose returns any cached pointers to the global queue.
// The buffers are being put on the full queue so that the
// write barriers will not simply reacquire them before the
// GC can inspect them. This helps reduce the mutator's
// ability to hide pointers during the concurrent mark phase.
//
//go:nowritebarrierrec
func (w *gcWork) dispose() {
	if wbuf := w.wbuf1; wbuf != nil {
		if wbuf.nobj == 0 {
			putempty(wbuf)
		} else {
			putfull(wbuf)
		}
		w.wbuf1 = nil

		wbuf = w.wbuf2
		if wbuf.nobj == 0 {
			putempty(wbuf)
		} else {
			putfull(wbuf)
		}
		w.wbuf2 = nil
	}
	if w.bytesMarked != 0 {
		// dispose happens relatively infrequently. If this
		// atomic becomes a problem, we should first try to
		// dispose less and if necessary aggregate in a per-P
		// counter.
		atomic.Xadd64(&work.bytesMarked, int64(w.bytesMarked))
		w.bytesMarked = 0
	}
	if w.scanWork != 0 {
		atomic.Xaddint64(&gcController.scanWork, w.scanWork)
		w.scanWork = 0
	}
}

// balance moves some work that's cached in this gcWork back on the
// global queue.
//go:nowritebarrierrec
func (w *gcWork) balance() {
	if w.wbuf1 == nil {
		return
	}
	if wbuf := w.wbuf2; wbuf.nobj != 0 {
		putfull(wbuf)
		w.wbuf2 = getempty()
	} else if wbuf := w.wbuf1; wbuf.nobj > 4 {
		w.wbuf1 = handoff(wbuf)
	} else {
		return
	}
	// We flushed a buffer to the full list, so wake a worker.
	if gcphase == _GCmark {
		gcController.enlistWorker()
	}
}

// empty returns true if w has no mark work available.
//go:nowritebarrierrec
func (w *gcWork) empty() bool {
	return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0)
}

// Internally, the GC work pool is kept in arrays in work buffers.
// The gcWork interface caches a work buffer until full (or empty) to
// avoid contending on the global work buffer lists.

type workbufhdr struct {
	node lfnode // must be first
	nobj int
}

//go:notinheap
type workbuf struct {
	workbufhdr
	// account for the above fields
	obj [(_WorkbufSize - unsafe.Sizeof(workbufhdr{})) / sys.PtrSize]uintptr
}

// workbuf factory routines. These funcs are used to manage the
// workbufs.
// If the GC asks for some work these are the only routines that
// make wbufs available to the GC.

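// checknonempty throws if b contains no objects.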
func (b *workbuf) checknonempty() {
	if b.nobj == 0 {
		throw("workbuf is empty")
	}
}

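// checkempty throws if b contains any objects.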
func (b *workbuf) checkempty() {
	if b.nobj != 0 {
		throw("workbuf is not empty")
	}
}

// getempty pops an empty work buffer off the work.empty list,
// allocating new buffers if none are available.
//go:nowritebarrier
func getempty() *workbuf {
	var b *workbuf
	if work.empty != 0 {
		b = (*workbuf)(work.empty.pop())
		if b != nil {
			b.checkempty()
		}
	}
	if b == nil {
		// Allocate more workbufs.
		var s *mspan
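		// Try to reuse a cached workbuf span. The unlocked check of
		// free.first is only a hint; we re-check under the lock.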
		if work.wbufSpans.free.first != nil {
			lock(&work.wbufSpans.lock)
			s = work.wbufSpans.free.first
			if s != nil {
				work.wbufSpans.free.remove(s)
				work.wbufSpans.busy.insert(s)
			}
			unlock(&work.wbufSpans.lock)
		}
		if s == nil {
			systemstack(func() {
				s = mheap_.allocManual(workbufAlloc/pageSize, &memstats.gc_sys)
			})
			if s == nil {
				throw("out of memory")
			}
			// Record the new span in the busy list.
			lock(&work.wbufSpans.lock)
			work.wbufSpans.busy.insert(s)
			unlock(&work.wbufSpans.lock)
		}
		// Slice up the span into new workbufs. Return one and
		// put the rest on the empty list.
		for i := uintptr(0); i+_WorkbufSize <= workbufAlloc; i += _WorkbufSize {
			newb := (*workbuf)(unsafe.Pointer(s.base() + i))
			newb.nobj = 0
			lfnodeValidate(&newb.node)
			if i == 0 {
				b = newb
			} else {
				putempty(newb)
			}
		}
	}
	return b
}

// putempty puts a workbuf onto the work.empty list.
// Upon entry this go routine owns b. The lfstack.push relinquishes ownership.
//go:nowritebarrier
func putempty(b *workbuf) {
	b.checkempty()
	work.empty.push(&b.node)
}

// putfull puts the workbuf on the work.full list for the GC.
// putfull accepts partially full buffers so the GC can avoid competing
// with the mutators for ownership of partially full buffers.
//go:nowritebarrier
func putfull(b *workbuf) {
	b.checknonempty()
	work.full.push(&b.node)
}

// trygetfull tries to get a full or partially empty workbuffer.
// If one is not immediately available return nil
//go:nowritebarrier
func trygetfull() *workbuf {
	b := (*workbuf)(work.full.pop())
	if b != nil {
		b.checknonempty()
		return b
	}
	return b
}

// Get a full work buffer off the work.full list.
// If nothing is available wait until all the other gc helpers have
// finished and then return nil.
// getfull acts as a barrier for work.nproc helpers. As long as one
// gchelper is actively marking objects it may create a workbuffer
// that the other helpers can work on.
// The for loop either exits when a work buffer is found
// or when _all_ of the work.nproc GC helpers are in the loop
// looking for work and thus not capable of creating new work.
// This is in fact the termination condition for the STW mark
// phase.
//go:nowritebarrier
func getfull() *workbuf {
	b := (*workbuf)(work.full.pop())
	if b != nil {
		b.checknonempty()
		return b
	}

	incnwait := atomic.Xadd(&work.nwait, +1)
	if incnwait > work.nproc {
		println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
		throw("work.nwait > work.nproc")
	}
	for i := 0; ; i++ {
		if work.full != 0 {
			decnwait := atomic.Xadd(&work.nwait, -1)
			if decnwait == work.nproc {
				println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
				throw("work.nwait > work.nproc")
			}
			b = (*workbuf)(work.full.pop())
			if b != nil {
				b.checknonempty()
				return b
			}
			incnwait := atomic.Xadd(&work.nwait, +1)
			if incnwait > work.nproc {
				println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
				throw("work.nwait > work.nproc")
			}
		}
		if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs {
			return nil
		}
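		// No work yet. Back off progressively: spin briefly, then
		// yield the OS thread, then sleep.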
		if i < 10 {
			procyield(20)
		} else if i < 20 {
			osyield()
		} else {
			usleep(100)
		}
	}
}
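
// handoff splits off half of b's pointers into a new buffer, puts b
// (with the remaining half) on the full list for other workers to
// steal, and returns the new buffer.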
//go:nowritebarrier
func handoff(b *workbuf) *workbuf {
	// Make new buffer with half of b's pointers.
	b1 := getempty()
	n := b.nobj / 2
	b.nobj -= n
	b1.nobj = n
	memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), uintptr(n)*unsafe.Sizeof(b1.obj[0]))

	// Put b on full list - let first half of b get stolen.
	putfull(b)
	return b1
}

// prepareFreeWorkbufs moves busy workbuf spans to free list so they
// can be freed to the heap. This must only be called when all
// workbufs are on the empty list.
func prepareFreeWorkbufs() {
	lock(&work.wbufSpans.lock)
	if work.full != 0 {
		throw("cannot free workbufs when work.full != 0")
	}
	// Since all workbufs are on the empty list, we don't care
	// which ones are in which spans. We can wipe the entire empty
	// list and move all workbuf spans to the free list.
	work.empty = 0
	work.wbufSpans.free.takeAll(&work.wbufSpans.busy)
	unlock(&work.wbufSpans.lock)
}

// freeSomeWbufs frees some workbufs back to the heap and returns
// true if it should be called again to free more.
func freeSomeWbufs(preemptible bool) bool {
	const batchSize = 64 // ~1–2 µs per span.
	lock(&work.wbufSpans.lock)
	if gcphase != _GCoff || work.wbufSpans.free.isEmpty() {
		unlock(&work.wbufSpans.lock)
		return false
	}
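	// Free up to batchSize spans, stopping early if the caller is
	// preemptible and preemption is requested.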
	systemstack(func() {
		gp := getg().m.curg
		for i := 0; i < batchSize && !(preemptible && gp.preempt); i++ {
			span := work.wbufSpans.free.first
			if span == nil {
				break
			}
			work.wbufSpans.free.remove(span)
			mheap_.freeManual(span, &memstats.gc_sys)
		}
	})
	more := !work.wbufSpans.free.isEmpty()
	unlock(&work.wbufSpans.lock)
	return more
}