2015-02-19 11:38:46 -07:00
|
|
|
// Copyright 2009 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
// Garbage collector: marking and scanning
|
|
|
|
|
|
|
|
package runtime
|
|
|
|
|
2015-11-02 12:09:24 -07:00
|
|
|
import (
|
|
|
|
"runtime/internal/atomic"
|
2015-11-11 10:39:30 -07:00
|
|
|
"runtime/internal/sys"
|
2015-11-02 12:09:24 -07:00
|
|
|
"unsafe"
|
|
|
|
)
|
2015-02-19 11:38:46 -07:00
|
|
|
|
2015-10-16 14:52:26 -06:00
|
|
|
const (
|
|
|
|
fixedRootFinalizers = iota
|
|
|
|
fixedRootFlushCaches
|
|
|
|
fixedRootCount
|
|
|
|
|
|
|
|
// rootBlockBytes is the number of bytes to scan per data or
|
|
|
|
// BSS root.
|
|
|
|
rootBlockBytes = 256 << 10
|
|
|
|
|
|
|
|
// rootBlockSpans is the number of spans to scan per span
|
|
|
|
// root.
|
|
|
|
rootBlockSpans = 8 * 1024 // 64MB worth of spans
|
|
|
|
)
|
|
|
|
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
// gcMarkRootPrepare queues root scanning jobs (stacks, globals, and
|
|
|
|
// some miscellany) and initializes scanning-related state.
|
2015-10-16 14:52:26 -06:00
|
|
|
//
|
|
|
|
// The caller must have call gcCopySpans().
|
|
|
|
//
|
|
|
|
//go:nowritebarrier
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
func gcMarkRootPrepare() {
|
2015-10-16 14:52:26 -06:00
|
|
|
// Compute how many data and BSS root blocks there are.
|
|
|
|
nBlocks := func(bytes uintptr) int {
|
|
|
|
return int((bytes + rootBlockBytes - 1) / rootBlockBytes)
|
|
|
|
}
|
|
|
|
|
|
|
|
work.nDataRoots = 0
|
|
|
|
for datap := &firstmoduledata; datap != nil; datap = datap.next {
|
|
|
|
nDataRoots := nBlocks(datap.edata - datap.data)
|
|
|
|
if nDataRoots > work.nDataRoots {
|
|
|
|
work.nDataRoots = nDataRoots
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
work.nBSSRoots = 0
|
|
|
|
for datap := &firstmoduledata; datap != nil; datap = datap.next {
|
|
|
|
nBSSRoots := nBlocks(datap.ebss - datap.bss)
|
|
|
|
if nBSSRoots > work.nBSSRoots {
|
|
|
|
work.nBSSRoots = nBSSRoots
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compute number of span roots.
|
|
|
|
work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans
|
|
|
|
|
|
|
|
// Snapshot of allglen. During concurrent scan, we just need
|
|
|
|
// to be consistent about how many markroot jobs we create and
|
|
|
|
// how many Gs we check. Gs may be created after this point,
|
|
|
|
// but it's okay that we ignore them because they begin life
|
|
|
|
// without any roots, so there's nothing to scan, and any
|
|
|
|
// roots they create during the concurrent phase will be
|
|
|
|
// scanned during mark termination. During mark termination,
|
|
|
|
// allglen isn't changing, so we'll scan all Gs.
|
2015-11-02 12:09:24 -07:00
|
|
|
work.nStackRoots = int(atomic.Loaduintptr(&allglen))
|
2015-10-16 14:52:26 -06:00
|
|
|
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
work.markrootNext = 0
|
|
|
|
work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
|
2015-10-16 14:52:26 -06:00
|
|
|
}
|
|
|
|
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
// gcMarkRootCheck checks that all roots have been scanned. It is
|
|
|
|
// purely for debugging.
|
|
|
|
func gcMarkRootCheck() {
|
|
|
|
if work.markrootNext < work.markrootJobs {
|
|
|
|
print(work.markrootNext, " of ", work.markrootJobs, " markroot jobs done\n")
|
|
|
|
throw("left over markroot jobs")
|
|
|
|
}
|
2015-02-19 11:38:46 -07:00
|
|
|
|
|
|
|
lock(&allglock)
|
|
|
|
// Check that gc work is done.
|
2015-10-16 14:52:26 -06:00
|
|
|
for i := 0; i < work.nStackRoots; i++ {
|
2015-02-19 11:38:46 -07:00
|
|
|
gp := allgs[i]
|
2015-06-16 17:20:18 -06:00
|
|
|
if !gp.gcscandone {
|
2015-02-19 11:38:46 -07:00
|
|
|
throw("scan missed a g")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
unlock(&allglock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// ptrmask for an allocation containing a single pointer.
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
var oneptrmask = [...]uint8{1}
|
2015-02-19 11:38:46 -07:00
|
|
|
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
// markroot scans the i'th root.
|
|
|
|
//
|
|
|
|
// Preemption must be disabled (because this uses a gcWork).
|
|
|
|
//
|
2015-02-19 11:38:46 -07:00
|
|
|
//go:nowritebarrier
|
2015-11-23 16:44:03 -07:00
|
|
|
func markroot(gcw *gcWork, i uint32) {
|
2015-10-16 14:52:26 -06:00
|
|
|
baseData := uint32(fixedRootCount)
|
|
|
|
baseBSS := baseData + uint32(work.nDataRoots)
|
|
|
|
baseSpans := baseBSS + uint32(work.nBSSRoots)
|
|
|
|
baseStacks := baseSpans + uint32(work.nSpanRoots)
|
|
|
|
|
2015-03-11 13:58:47 -06:00
|
|
|
// Note: if you add a case here, please also update heapdump.go:dumproots.
|
2015-10-16 14:52:26 -06:00
|
|
|
switch {
|
|
|
|
case baseData <= i && i < baseBSS:
|
2015-04-06 18:55:02 -06:00
|
|
|
for datap := &firstmoduledata; datap != nil; datap = datap.next {
|
2015-11-23 16:44:03 -07:00
|
|
|
markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-baseData))
|
2015-03-29 15:59:00 -06:00
|
|
|
}
|
2015-02-19 11:38:46 -07:00
|
|
|
|
2015-10-16 14:52:26 -06:00
|
|
|
case baseBSS <= i && i < baseSpans:
|
2015-04-06 18:55:02 -06:00
|
|
|
for datap := &firstmoduledata; datap != nil; datap = datap.next {
|
2015-11-23 16:44:03 -07:00
|
|
|
markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-baseBSS))
|
2015-03-29 15:59:00 -06:00
|
|
|
}
|
2015-02-19 11:38:46 -07:00
|
|
|
|
2015-10-16 14:52:26 -06:00
|
|
|
case i == fixedRootFinalizers:
|
2015-02-19 11:38:46 -07:00
|
|
|
for fb := allfin; fb != nil; fb = fb.alllink {
|
2015-11-23 16:44:03 -07:00
|
|
|
scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw)
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
|
2015-10-16 14:52:26 -06:00
|
|
|
case i == fixedRootFlushCaches:
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
if gcphase == _GCmarktermination { // Do not flush mcaches during concurrent phase.
|
2015-02-19 11:38:46 -07:00
|
|
|
flushallmcaches()
|
|
|
|
}
|
|
|
|
|
2015-10-16 14:52:26 -06:00
|
|
|
case baseSpans <= i && i < baseStacks:
|
|
|
|
// mark MSpan.specials
|
2015-11-23 16:44:03 -07:00
|
|
|
markrootSpans(gcw, int(i-baseSpans))
|
2015-08-04 08:45:29 -06:00
|
|
|
|
2015-10-16 14:52:26 -06:00
|
|
|
default:
|
2015-02-19 11:38:46 -07:00
|
|
|
// the rest is scanning goroutine stacks
|
2015-10-16 14:52:26 -06:00
|
|
|
if uintptr(i-baseStacks) >= allglen {
|
2015-02-19 11:38:46 -07:00
|
|
|
throw("markroot: bad index")
|
|
|
|
}
|
2015-10-16 14:52:26 -06:00
|
|
|
gp := allgs[i-baseStacks]
|
2015-02-19 11:38:46 -07:00
|
|
|
|
|
|
|
// remember when we've first observed the G blocked
|
|
|
|
// needed only to output in traceback
|
|
|
|
status := readgstatus(gp) // We are not in a scan state
|
|
|
|
if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 {
|
|
|
|
gp.waitsince = work.tstart
|
|
|
|
}
|
|
|
|
|
2016-02-15 16:30:48 -07:00
|
|
|
if gcphase == _GCmarktermination && status == _Gdead {
|
|
|
|
// Free gp's stack if necessary. Only do this
|
|
|
|
// during mark termination because otherwise
|
|
|
|
// _Gdead may be transient.
|
2015-02-19 11:38:46 -07:00
|
|
|
shrinkstack(gp)
|
|
|
|
}
|
|
|
|
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC {
|
|
|
|
// GC background workers may be
|
|
|
|
// non-preemptible, so we may deadlock if we
|
|
|
|
// try to scan them during a concurrent phase.
|
|
|
|
// They also have tiny stacks, so just ignore
|
|
|
|
// them until mark termination.
|
|
|
|
gp.gcscandone = true
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
// scang must be done on the system stack in case
|
|
|
|
// we're trying to scan our own stack.
|
|
|
|
systemstack(func() {
|
|
|
|
// If this is a self-scan, put the user G in
|
|
|
|
// _Gwaiting to prevent self-deadlock. It may
|
|
|
|
// already be in _Gwaiting if this is mark
|
|
|
|
// termination.
|
|
|
|
userG := getg().m.curg
|
|
|
|
selfScan := gp == userG && readgstatus(userG) == _Grunning
|
|
|
|
if selfScan {
|
|
|
|
casgstatus(userG, _Grunning, _Gwaiting)
|
|
|
|
userG.waitreason = "garbage collection scan"
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: scang blocks until gp's stack has
|
|
|
|
// been scanned, which may take a while for
|
|
|
|
// running goroutines. Consider doing this in
|
|
|
|
// two phases where the first is non-blocking:
|
|
|
|
// we scan the stacks we can and ask running
|
|
|
|
// goroutines to scan themselves; and the
|
|
|
|
// second blocks.
|
|
|
|
scang(gp)
|
|
|
|
|
|
|
|
if selfScan {
|
|
|
|
casgstatus(userG, _Gwaiting, _Grunning)
|
|
|
|
}
|
|
|
|
})
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-16 14:52:26 -06:00
|
|
|
// markrootBlock scans the shard'th shard of the block of memory [b0,
|
|
|
|
// b0+n0), with the given pointer mask.
|
|
|
|
//
|
|
|
|
//go:nowritebarrier
|
|
|
|
func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
|
2015-11-11 10:39:30 -07:00
|
|
|
if rootBlockBytes%(8*sys.PtrSize) != 0 {
|
2015-10-16 14:52:26 -06:00
|
|
|
// This is necessary to pick byte offsets in ptrmask0.
|
|
|
|
throw("rootBlockBytes must be a multiple of 8*ptrSize")
|
|
|
|
}
|
|
|
|
|
|
|
|
b := b0 + uintptr(shard)*rootBlockBytes
|
|
|
|
if b >= b0+n0 {
|
|
|
|
return
|
|
|
|
}
|
2015-11-11 10:39:30 -07:00
|
|
|
ptrmask := (*uint8)(add(unsafe.Pointer(ptrmask0), uintptr(shard)*(rootBlockBytes/(8*sys.PtrSize))))
|
2015-10-16 14:52:26 -06:00
|
|
|
n := uintptr(rootBlockBytes)
|
|
|
|
if b+n > b0+n0 {
|
|
|
|
n = b0 + n0 - b
|
|
|
|
}
|
|
|
|
|
|
|
|
// Scan this shard.
|
|
|
|
scanblock(b, n, ptrmask, gcw)
|
|
|
|
}
|
|
|
|
|
|
|
|
// markrootSpans marks roots for one shard of work.spans.
|
2015-08-04 08:45:29 -06:00
|
|
|
//
|
|
|
|
//go:nowritebarrier
|
|
|
|
func markrootSpans(gcw *gcWork, shard int) {
|
runtime: scan objects with finalizers concurrently
This reduces pause time by ~25% relative to tip and by ~50% relative
to Go 1.5.1.
Currently one of the steps of STW mark termination is to loop (in
parallel) over all spans to find objects with finalizers in order to
mark all objects reachable from these objects and to treat the
finalizer special as a root. Unfortunately, even if there are no
finalizers at all, this loop takes roughly 1 ms/heap GB/core, so
multi-gigabyte heaps can quickly push our STW time past 10ms.
Fix this by moving this scan from mark termination to concurrent scan,
where it can run in parallel with mutators. The loop itself could also
be optimized, but this cost is small compared to concurrent marking.
Making this scan concurrent introduces two complications:
1) The scan currently walks the specials list of each span without
locking it, which is safe only with the world stopped. We fix this by
speculatively checking if a span has any specials (the vast majority
won't) and then locking the specials list only if there are specials
to check.
2) An object can have a finalizer set after concurrent scan, in which
case it won't have been marked appropriately by concurrent scan. If
the finalizer is a closure and is only reachable from the special, it
could be swept before it is run. Likewise, if the object is not marked
yet when the finalizer is set and then becomes unreachable before it
is marked, other objects reachable only from it may be swept before
the finalizer function is run. We fix this issue by making
addfinalizer ensure the same marking invariants as markroot does.
For multi-gigabyte heaps, this reduces max pause time by 20%–30%
relative to tip (depending on GOMAXPROCS) and by ~50% relative to Go
1.5.1 (where this loop was neither concurrent nor parallel). Here are
the results for the garbage benchmark:
---------------- max pause ----------------
Heap Procs Concurrent scan STW parallel scan 1.5.1
24GB 12 18ms 23ms 37ms
24GB 4 18ms 25ms 37ms
4GB 4 3.8ms 4.9ms 6.9ms
In all cases, 95%ile pause time is similar to the max pause time. This
also improves mean STW time by 10%–30%.
Fixes #11485.
Change-Id: I9359d8c3d120a51d23d924b52bf853a1299b1dfd
Reviewed-on: https://go-review.googlesource.com/14982
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-09-24 12:39:27 -06:00
|
|
|
// Objects with finalizers have two GC-related invariants:
|
|
|
|
//
|
|
|
|
// 1) Everything reachable from the object must be marked.
|
|
|
|
// This ensures that when we pass the object to its finalizer,
|
|
|
|
// everything the finalizer can reach will be retained.
|
|
|
|
//
|
|
|
|
// 2) Finalizer specials (which are not in the garbage
|
|
|
|
// collected heap) are roots. In practice, this means the fn
|
|
|
|
// field must be scanned.
|
|
|
|
//
|
|
|
|
// TODO(austin): There are several ideas for making this more
|
|
|
|
// efficient in issue #11485.
|
|
|
|
|
|
|
|
// We process objects with finalizers only during the first
|
|
|
|
// markroot pass. In concurrent GC, this happens during
|
2016-02-15 16:24:06 -07:00
|
|
|
// concurrent mark and we depend on addfinalizer to ensure the
|
runtime: scan objects with finalizers concurrently
This reduces pause time by ~25% relative to tip and by ~50% relative
to Go 1.5.1.
Currently one of the steps of STW mark termination is to loop (in
parallel) over all spans to find objects with finalizers in order to
mark all objects reachable from these objects and to treat the
finalizer special as a root. Unfortunately, even if there are no
finalizers at all, this loop takes roughly 1 ms/heap GB/core, so
multi-gigabyte heaps can quickly push our STW time past 10ms.
Fix this by moving this scan from mark termination to concurrent scan,
where it can run in parallel with mutators. The loop itself could also
be optimized, but this cost is small compared to concurrent marking.
Making this scan concurrent introduces two complications:
1) The scan currently walks the specials list of each span without
locking it, which is safe only with the world stopped. We fix this by
speculatively checking if a span has any specials (the vast majority
won't) and then locking the specials list only if there are specials
to check.
2) An object can have a finalizer set after concurrent scan, in which
case it won't have been marked appropriately by concurrent scan. If
the finalizer is a closure and is only reachable from the special, it
could be swept before it is run. Likewise, if the object is not marked
yet when the finalizer is set and then becomes unreachable before it
is marked, other objects reachable only from it may be swept before
the finalizer function is run. We fix this issue by making
addfinalizer ensure the same marking invariants as markroot does.
For multi-gigabyte heaps, this reduces max pause time by 20%–30%
relative to tip (depending on GOMAXPROCS) and by ~50% relative to Go
1.5.1 (where this loop was neither concurrent nor parallel). Here are
the results for the garbage benchmark:
---------------- max pause ----------------
Heap Procs Concurrent scan STW parallel scan 1.5.1
24GB 12 18ms 23ms 37ms
24GB 4 18ms 25ms 37ms
4GB 4 3.8ms 4.9ms 6.9ms
In all cases, 95%ile pause time is similar to the max pause time. This
also improves mean STW time by 10%–30%.
Fixes #11485.
Change-Id: I9359d8c3d120a51d23d924b52bf853a1299b1dfd
Reviewed-on: https://go-review.googlesource.com/14982
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-09-24 12:39:27 -06:00
|
|
|
// above invariants for objects that get finalizers after
|
2016-02-15 16:24:06 -07:00
|
|
|
// concurrent mark. In STW GC, this will happen during mark
|
runtime: scan objects with finalizers concurrently
This reduces pause time by ~25% relative to tip and by ~50% relative
to Go 1.5.1.
Currently one of the steps of STW mark termination is to loop (in
parallel) over all spans to find objects with finalizers in order to
mark all objects reachable from these objects and to treat the
finalizer special as a root. Unfortunately, even if there are no
finalizers at all, this loop takes roughly 1 ms/heap GB/core, so
multi-gigabyte heaps can quickly push our STW time past 10ms.
Fix this by moving this scan from mark termination to concurrent scan,
where it can run in parallel with mutators. The loop itself could also
be optimized, but this cost is small compared to concurrent marking.
Making this scan concurrent introduces two complications:
1) The scan currently walks the specials list of each span without
locking it, which is safe only with the world stopped. We fix this by
speculatively checking if a span has any specials (the vast majority
won't) and then locking the specials list only if there are specials
to check.
2) An object can have a finalizer set after concurrent scan, in which
case it won't have been marked appropriately by concurrent scan. If
the finalizer is a closure and is only reachable from the special, it
could be swept before it is run. Likewise, if the object is not marked
yet when the finalizer is set and then becomes unreachable before it
is marked, other objects reachable only from it may be swept before
the finalizer function is run. We fix this issue by making
addfinalizer ensure the same marking invariants as markroot does.
For multi-gigabyte heaps, this reduces max pause time by 20%–30%
relative to tip (depending on GOMAXPROCS) and by ~50% relative to Go
1.5.1 (where this loop was neither concurrent nor parallel). Here are
the results for the garbage benchmark:
---------------- max pause ----------------
Heap Procs Concurrent scan STW parallel scan 1.5.1
24GB 12 18ms 23ms 37ms
24GB 4 18ms 25ms 37ms
4GB 4 3.8ms 4.9ms 6.9ms
In all cases, 95%ile pause time is similar to the max pause time. This
also improves mean STW time by 10%–30%.
Fixes #11485.
Change-Id: I9359d8c3d120a51d23d924b52bf853a1299b1dfd
Reviewed-on: https://go-review.googlesource.com/14982
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-09-24 12:39:27 -06:00
|
|
|
// termination.
|
2016-02-15 16:24:06 -07:00
|
|
|
if work.markrootDone {
|
runtime: scan objects with finalizers concurrently
This reduces pause time by ~25% relative to tip and by ~50% relative
to Go 1.5.1.
Currently one of the steps of STW mark termination is to loop (in
parallel) over all spans to find objects with finalizers in order to
mark all objects reachable from these objects and to treat the
finalizer special as a root. Unfortunately, even if there are no
finalizers at all, this loop takes roughly 1 ms/heap GB/core, so
multi-gigabyte heaps can quickly push our STW time past 10ms.
Fix this by moving this scan from mark termination to concurrent scan,
where it can run in parallel with mutators. The loop itself could also
be optimized, but this cost is small compared to concurrent marking.
Making this scan concurrent introduces two complications:
1) The scan currently walks the specials list of each span without
locking it, which is safe only with the world stopped. We fix this by
speculatively checking if a span has any specials (the vast majority
won't) and then locking the specials list only if there are specials
to check.
2) An object can have a finalizer set after concurrent scan, in which
case it won't have been marked appropriately by concurrent scan. If
the finalizer is a closure and is only reachable from the special, it
could be swept before it is run. Likewise, if the object is not marked
yet when the finalizer is set and then becomes unreachable before it
is marked, other objects reachable only from it may be swept before
the finalizer function is run. We fix this issue by making
addfinalizer ensure the same marking invariants as markroot does.
For multi-gigabyte heaps, this reduces max pause time by 20%–30%
relative to tip (depending on GOMAXPROCS) and by ~50% relative to Go
1.5.1 (where this loop was neither concurrent nor parallel). Here are
the results for the garbage benchmark:
---------------- max pause ----------------
Heap Procs Concurrent scan STW parallel scan 1.5.1
24GB 12 18ms 23ms 37ms
24GB 4 18ms 25ms 37ms
4GB 4 3.8ms 4.9ms 6.9ms
In all cases, 95%ile pause time is similar to the max pause time. This
also improves mean STW time by 10%–30%.
Fixes #11485.
Change-Id: I9359d8c3d120a51d23d924b52bf853a1299b1dfd
Reviewed-on: https://go-review.googlesource.com/14982
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-09-24 12:39:27 -06:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2015-08-04 08:45:29 -06:00
|
|
|
sg := mheap_.sweepgen
|
2015-10-16 14:52:26 -06:00
|
|
|
startSpan := shard * rootBlockSpans
|
|
|
|
endSpan := (shard + 1) * rootBlockSpans
|
|
|
|
if endSpan > len(work.spans) {
|
|
|
|
endSpan = len(work.spans)
|
|
|
|
}
|
runtime: scan objects with finalizers concurrently
This reduces pause time by ~25% relative to tip and by ~50% relative
to Go 1.5.1.
Currently one of the steps of STW mark termination is to loop (in
parallel) over all spans to find objects with finalizers in order to
mark all objects reachable from these objects and to treat the
finalizer special as a root. Unfortunately, even if there are no
finalizers at all, this loop takes roughly 1 ms/heap GB/core, so
multi-gigabyte heaps can quickly push our STW time past 10ms.
Fix this by moving this scan from mark termination to concurrent scan,
where it can run in parallel with mutators. The loop itself could also
be optimized, but this cost is small compared to concurrent marking.
Making this scan concurrent introduces two complications:
1) The scan currently walks the specials list of each span without
locking it, which is safe only with the world stopped. We fix this by
speculatively checking if a span has any specials (the vast majority
won't) and then locking the specials list only if there are specials
to check.
2) An object can have a finalizer set after concurrent scan, in which
case it won't have been marked appropriately by concurrent scan. If
the finalizer is a closure and is only reachable from the special, it
could be swept before it is run. Likewise, if the object is not marked
yet when the finalizer is set and then becomes unreachable before it
is marked, other objects reachable only from it may be swept before
the finalizer function is run. We fix this issue by making
addfinalizer ensure the same marking invariants as markroot does.
For multi-gigabyte heaps, this reduces max pause time by 20%–30%
relative to tip (depending on GOMAXPROCS) and by ~50% relative to Go
1.5.1 (where this loop was neither concurrent nor parallel). Here are
the results for the garbage benchmark:
---------------- max pause ----------------
Heap Procs Concurrent scan STW parallel scan 1.5.1
24GB 12 18ms 23ms 37ms
24GB 4 18ms 25ms 37ms
4GB 4 3.8ms 4.9ms 6.9ms
In all cases, 95%ile pause time is similar to the max pause time. This
also improves mean STW time by 10%–30%.
Fixes #11485.
Change-Id: I9359d8c3d120a51d23d924b52bf853a1299b1dfd
Reviewed-on: https://go-review.googlesource.com/14982
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-09-24 12:39:27 -06:00
|
|
|
// Note that work.spans may not include spans that were
|
|
|
|
// allocated between entering the scan phase and now. This is
|
|
|
|
// okay because any objects with finalizers in those spans
|
|
|
|
// must have been allocated and given finalizers after we
|
|
|
|
// entered the scan phase, so addfinalizer will have ensured
|
|
|
|
// the above invariants for them.
|
2015-09-14 12:28:09 -06:00
|
|
|
for _, s := range work.spans[startSpan:endSpan] {
|
2015-08-04 08:45:29 -06:00
|
|
|
if s.state != mSpanInUse {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if !useCheckmark && s.sweepgen != sg {
|
|
|
|
// sweepgen was updated (+2) during non-checkmark GC pass
|
|
|
|
print("sweep ", s.sweepgen, " ", sg, "\n")
|
|
|
|
throw("gc: unswept span")
|
|
|
|
}
|
runtime: scan objects with finalizers concurrently
This reduces pause time by ~25% relative to tip and by ~50% relative
to Go 1.5.1.
Currently one of the steps of STW mark termination is to loop (in
parallel) over all spans to find objects with finalizers in order to
mark all objects reachable from these objects and to treat the
finalizer special as a root. Unfortunately, even if there are no
finalizers at all, this loop takes roughly 1 ms/heap GB/core, so
multi-gigabyte heaps can quickly push our STW time past 10ms.
Fix this by moving this scan from mark termination to concurrent scan,
where it can run in parallel with mutators. The loop itself could also
be optimized, but this cost is small compared to concurrent marking.
Making this scan concurrent introduces two complications:
1) The scan currently walks the specials list of each span without
locking it, which is safe only with the world stopped. We fix this by
speculatively checking if a span has any specials (the vast majority
won't) and then locking the specials list only if there are specials
to check.
2) An object can have a finalizer set after concurrent scan, in which
case it won't have been marked appropriately by concurrent scan. If
the finalizer is a closure and is only reachable from the special, it
could be swept before it is run. Likewise, if the object is not marked
yet when the finalizer is set and then becomes unreachable before it
is marked, other objects reachable only from it may be swept before
the finalizer function is run. We fix this issue by making
addfinalizer ensure the same marking invariants as markroot does.
For multi-gigabyte heaps, this reduces max pause time by 20%–30%
relative to tip (depending on GOMAXPROCS) and by ~50% relative to Go
1.5.1 (where this loop was neither concurrent nor parallel). Here are
the results for the garbage benchmark:
---------------- max pause ----------------
Heap Procs Concurrent scan STW parallel scan 1.5.1
24GB 12 18ms 23ms 37ms
24GB 4 18ms 25ms 37ms
4GB 4 3.8ms 4.9ms 6.9ms
In all cases, 95%ile pause time is similar to the max pause time. This
also improves mean STW time by 10%–30%.
Fixes #11485.
Change-Id: I9359d8c3d120a51d23d924b52bf853a1299b1dfd
Reviewed-on: https://go-review.googlesource.com/14982
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-09-24 12:39:27 -06:00
|
|
|
|
|
|
|
// Speculatively check if there are any specials
|
|
|
|
// without acquiring the span lock. This may race with
|
|
|
|
// adding the first special to a span, but in that
|
|
|
|
// case addfinalizer will observe that the GC is
|
|
|
|
// active (which is globally synchronized) and ensure
|
|
|
|
// the above invariants. We may also ensure the
|
|
|
|
// invariants, but it's okay to scan an object twice.
|
|
|
|
if s.specials == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Lock the specials to prevent a special from being
|
|
|
|
// removed from the list while we're traversing it.
|
|
|
|
lock(&s.speciallock)
|
|
|
|
|
2015-08-04 08:45:29 -06:00
|
|
|
for sp := s.specials; sp != nil; sp = sp.next {
|
|
|
|
if sp.kind != _KindSpecialFinalizer {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// don't mark finalized object, but scan it so we
|
|
|
|
// retain everything it points to.
|
|
|
|
spf := (*specialfinalizer)(unsafe.Pointer(sp))
|
|
|
|
// A finalizer can be set for an inner byte of an object, find object beginning.
|
|
|
|
p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize
|
runtime: scan objects with finalizers concurrently
This reduces pause time by ~25% relative to tip and by ~50% relative
to Go 1.5.1.
Currently one of the steps of STW mark termination is to loop (in
parallel) over all spans to find objects with finalizers in order to
mark all objects reachable from these objects and to treat the
finalizer special as a root. Unfortunately, even if there are no
finalizers at all, this loop takes roughly 1 ms/heap GB/core, so
multi-gigabyte heaps can quickly push our STW time past 10ms.
Fix this by moving this scan from mark termination to concurrent scan,
where it can run in parallel with mutators. The loop itself could also
be optimized, but this cost is small compared to concurrent marking.
Making this scan concurrent introduces two complications:
1) The scan currently walks the specials list of each span without
locking it, which is safe only with the world stopped. We fix this by
speculatively checking if a span has any specials (the vast majority
won't) and then locking the specials list only if there are specials
to check.
2) An object can have a finalizer set after concurrent scan, in which
case it won't have been marked appropriately by concurrent scan. If
the finalizer is a closure and is only reachable from the special, it
could be swept before it is run. Likewise, if the object is not marked
yet when the finalizer is set and then becomes unreachable before it
is marked, other objects reachable only from it may be swept before
the finalizer function is run. We fix this issue by making
addfinalizer ensure the same marking invariants as markroot does.
For multi-gigabyte heaps, this reduces max pause time by 20%–30%
relative to tip (depending on GOMAXPROCS) and by ~50% relative to Go
1.5.1 (where this loop was neither concurrent nor parallel). Here are
the results for the garbage benchmark:
---------------- max pause ----------------
Heap Procs Concurrent scan STW parallel scan 1.5.1
24GB 12 18ms 23ms 37ms
24GB 4 18ms 25ms 37ms
4GB 4 3.8ms 4.9ms 6.9ms
In all cases, 95%ile pause time is similar to the max pause time. This
also improves mean STW time by 10%–30%.
Fixes #11485.
Change-Id: I9359d8c3d120a51d23d924b52bf853a1299b1dfd
Reviewed-on: https://go-review.googlesource.com/14982
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-09-24 12:39:27 -06:00
|
|
|
|
|
|
|
// Mark everything that can be reached from
|
|
|
|
// the object (but *not* the object itself or
|
|
|
|
// we'll never collect it).
|
|
|
|
scanobject(p, gcw)
|
|
|
|
|
|
|
|
// The special itself is a root.
|
2015-11-11 10:39:30 -07:00
|
|
|
scanblock(uintptr(unsafe.Pointer(&spf.fn)), sys.PtrSize, &oneptrmask[0], gcw)
|
2015-08-04 08:45:29 -06:00
|
|
|
}
|
runtime: scan objects with finalizers concurrently
This reduces pause time by ~25% relative to tip and by ~50% relative
to Go 1.5.1.
Currently one of the steps of STW mark termination is to loop (in
parallel) over all spans to find objects with finalizers in order to
mark all objects reachable from these objects and to treat the
finalizer special as a root. Unfortunately, even if there are no
finalizers at all, this loop takes roughly 1 ms/heap GB/core, so
multi-gigabyte heaps can quickly push our STW time past 10ms.
Fix this by moving this scan from mark termination to concurrent scan,
where it can run in parallel with mutators. The loop itself could also
be optimized, but this cost is small compared to concurrent marking.
Making this scan concurrent introduces two complications:
1) The scan currently walks the specials list of each span without
locking it, which is safe only with the world stopped. We fix this by
speculatively checking if a span has any specials (the vast majority
won't) and then locking the specials list only if there are specials
to check.
2) An object can have a finalizer set after concurrent scan, in which
case it won't have been marked appropriately by concurrent scan. If
the finalizer is a closure and is only reachable from the special, it
could be swept before it is run. Likewise, if the object is not marked
yet when the finalizer is set and then becomes unreachable before it
is marked, other objects reachable only from it may be swept before
the finalizer function is run. We fix this issue by making
addfinalizer ensure the same marking invariants as markroot does.
For multi-gigabyte heaps, this reduces max pause time by 20%–30%
relative to tip (depending on GOMAXPROCS) and by ~50% relative to Go
1.5.1 (where this loop was neither concurrent nor parallel). Here are
the results for the garbage benchmark:
---------------- max pause ----------------
Heap Procs Concurrent scan STW parallel scan 1.5.1
24GB 12 18ms 23ms 37ms
24GB 4 18ms 25ms 37ms
4GB 4 3.8ms 4.9ms 6.9ms
In all cases, 95%ile pause time is similar to the max pause time. This
also improves mean STW time by 10%–30%.
Fixes #11485.
Change-Id: I9359d8c3d120a51d23d924b52bf853a1299b1dfd
Reviewed-on: https://go-review.googlesource.com/14982
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-09-24 12:39:27 -06:00
|
|
|
|
|
|
|
unlock(&s.speciallock)
|
2015-08-04 08:45:29 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-04 21:56:11 -06:00
|
|
|
// gcAssistAlloc performs GC work to make gp's assist debt positive.
|
|
|
|
// gp must be the calling user gorountine.
|
2015-03-16 12:22:00 -06:00
|
|
|
//
|
2015-10-04 21:56:11 -06:00
|
|
|
// This must be called with preemption enabled.
|
2015-02-19 11:38:46 -07:00
|
|
|
//go:nowritebarrier
|
2015-10-04 21:56:11 -06:00
|
|
|
func gcAssistAlloc(gp *g) {
|
2015-07-22 13:14:54 -06:00
|
|
|
// Don't assist in non-preemptible contexts. These are
|
|
|
|
// generally fragile and won't allow the assist to block.
|
|
|
|
if getg() == gp.m.g0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if mp := getg().m; mp.locks > 0 || mp.preemptoff != "" {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
runtime: directly track GC assist balance
Currently we track the per-G GC assist balance as two monotonically
increasing values: the bytes allocated by the G this cycle (gcalloc)
and the scan work performed by the G this cycle (gcscanwork). The
assist balance is hence assistRatio*gcalloc - gcscanwork.
This works, but has two important downsides:
1) It requires floating-point math to figure out if a G is in debt or
not. This makes it inappropriate to check for assist debt in the
hot path of mallocgc, so we only do this when a G allocates a new
span. As a result, Gs can operate "in the red", leading to
under-assist and extended GC cycle length.
2) Revising the assist ratio during a GC cycle can lead to an "assist
burst". If you think of plotting the scan work performed versus
heaps size, the assist ratio controls the slope of this line.
However, in the current system, the target line always passes
through 0 at the heap size that triggered GC, so if the runtime
increases the assist ratio, there has to be a potentially large
assist to jump from the current amount of scan work up to the new
target scan work for the current heap size.
This commit replaces this approach with directly tracking the GC
assist balance in terms of allocation credit bytes. Allocating N bytes
simply decreases this by N and assisting raises it by the amount of
scan work performed divided by the assist ratio (to get back to
bytes).
This will make it cheap to figure out if a G is in debt, which will
let us efficiently check if an assist is necessary *before* performing
an allocation and hence keep Gs "in the black".
This also fixes assist bursts because the assist ratio is now in terms
of *remaining* work, rather than work from the beginning of the GC
cycle. Hence, the plot of scan work versus heap size becomes
continuous: we can revise the slope, but this slope always starts from
where we are right now, rather than where we were at the beginning of
the cycle.
Change-Id: Ia821c5f07f8a433e8da7f195b52adfedd58bdf2c
Reviewed-on: https://go-review.googlesource.com/15408
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-04 21:16:57 -06:00
|
|
|
// Compute the amount of scan work we need to do to make the
|
2015-10-04 21:56:11 -06:00
|
|
|
// balance positive. We over-assist to build up credit for
|
|
|
|
// future allocations and amortize the cost of assisting.
|
|
|
|
debtBytes := -gp.gcAssistBytes + gcOverAssistBytes
|
runtime: directly track GC assist balance
Currently we track the per-G GC assist balance as two monotonically
increasing values: the bytes allocated by the G this cycle (gcalloc)
and the scan work performed by the G this cycle (gcscanwork). The
assist balance is hence assistRatio*gcalloc - gcscanwork.
This works, but has two important downsides:
1) It requires floating-point math to figure out if a G is in debt or
not. This makes it inappropriate to check for assist debt in the
hot path of mallocgc, so we only do this when a G allocates a new
span. As a result, Gs can operate "in the red", leading to
under-assist and extended GC cycle length.
2) Revising the assist ratio during a GC cycle can lead to an "assist
burst". If you think of plotting the scan work performed versus
heaps size, the assist ratio controls the slope of this line.
However, in the current system, the target line always passes
through 0 at the heap size that triggered GC, so if the runtime
increases the assist ratio, there has to be a potentially large
assist to jump from the current amount of scan work up to the new
target scan work for the current heap size.
This commit replaces this approach with directly tracking the GC
assist balance in terms of allocation credit bytes. Allocating N bytes
simply decreases this by N and assisting raises it by the amount of
scan work performed divided by the assist ratio (to get back to
bytes).
This will make it cheap to figure out if a G is in debt, which will
let us efficiently check if an assist is necessary *before* performing
an allocation and hence keep Gs "in the black".
This also fixes assist bursts because the assist ratio is now in terms
of *remaining* work, rather than work from the beginning of the GC
cycle. Hence, the plot of scan work versus heap size becomes
continuous: we can revise the slope, but this slope always starts from
where we are right now, rather than where we were at the beginning of
the cycle.
Change-Id: Ia821c5f07f8a433e8da7f195b52adfedd58bdf2c
Reviewed-on: https://go-review.googlesource.com/15408
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-04 21:16:57 -06:00
|
|
|
scanWork := int64(gcController.assistWorkPerByte * float64(debtBytes))
|
2015-03-16 12:22:00 -06:00
|
|
|
|
runtime: retry GC assist until debt is paid off
Currently, there are three ways to satisfy a GC assist: 1) the mutator
steals credit from background GC, 2) the mutator actually does GC
work, and 3) there is no more work available. 3 was never really
intended as a way to satisfy an assist, and it causes problems: there
are periods when it's expected that the GC won't have any work, such
as when transitioning from mark 1 to mark 2 and from mark 2 to mark
termination. During these periods, there's no back-pressure on rapidly
allocating mutators, which lets them race ahead of the heap goal.
For example, test/init1.go and the runtime/trace test both have small
reachable heaps and contain loops that rapidly allocate large garbage
byte slices. This bug lets these tests exceed the heap goal by several
orders of magnitude.
Fix this by forcing the assist (and hence the allocation) to block
until it can satisfy its debt via either 1 or 2, or the GC cycle
terminates.
This fixes one the causes of #11677. It's still possible to overshoot
the GC heap goal, but with this change the overshoot is almost exactly
by the amount of allocation that happens during the concurrent scan
phase, between when the heap passes the GC trigger and when the GC
enables assists.
Change-Id: I5ef4edcb0d2e13a1e432e66e8245f2bd9f8995be
Reviewed-on: https://go-review.googlesource.com/12671
Reviewed-by: Russ Cox <rsc@golang.org>
2015-07-22 14:55:04 -06:00
|
|
|
retry:
|
2015-03-16 12:22:00 -06:00
|
|
|
// Steal as much credit as we can from the background GC's
|
|
|
|
// scan credit. This is racy and may drop the background
|
|
|
|
// credit below 0 if two mutators steal at the same time. This
|
|
|
|
// will just cause steals to fail until credit is accumulated
|
|
|
|
// again, so in the long run it doesn't really matter, but we
|
|
|
|
// do have to handle the negative credit case.
|
2015-11-02 12:09:24 -07:00
|
|
|
bgScanCredit := atomic.Loadint64(&gcController.bgScanCredit)
|
2015-03-16 12:22:00 -06:00
|
|
|
stolen := int64(0)
|
|
|
|
if bgScanCredit > 0 {
|
|
|
|
if bgScanCredit < scanWork {
|
|
|
|
stolen = bgScanCredit
|
runtime: directly track GC assist balance
Currently we track the per-G GC assist balance as two monotonically
increasing values: the bytes allocated by the G this cycle (gcalloc)
and the scan work performed by the G this cycle (gcscanwork). The
assist balance is hence assistRatio*gcalloc - gcscanwork.
This works, but has two important downsides:
1) It requires floating-point math to figure out if a G is in debt or
not. This makes it inappropriate to check for assist debt in the
hot path of mallocgc, so we only do this when a G allocates a new
span. As a result, Gs can operate "in the red", leading to
under-assist and extended GC cycle length.
2) Revising the assist ratio during a GC cycle can lead to an "assist
burst". If you think of plotting the scan work performed versus
heaps size, the assist ratio controls the slope of this line.
However, in the current system, the target line always passes
through 0 at the heap size that triggered GC, so if the runtime
increases the assist ratio, there has to be a potentially large
assist to jump from the current amount of scan work up to the new
target scan work for the current heap size.
This commit replaces this approach with directly tracking the GC
assist balance in terms of allocation credit bytes. Allocating N bytes
simply decreases this by N and assisting raises it by the amount of
scan work performed divided by the assist ratio (to get back to
bytes).
This will make it cheap to figure out if a G is in debt, which will
let us efficiently check if an assist is necessary *before* performing
an allocation and hence keep Gs "in the black".
This also fixes assist bursts because the assist ratio is now in terms
of *remaining* work, rather than work from the beginning of the GC
cycle. Hence, the plot of scan work versus heap size becomes
continuous: we can revise the slope, but this slope always starts from
where we are right now, rather than where we were at the beginning of
the cycle.
Change-Id: Ia821c5f07f8a433e8da7f195b52adfedd58bdf2c
Reviewed-on: https://go-review.googlesource.com/15408
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-04 21:16:57 -06:00
|
|
|
gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(stolen))
|
2015-03-16 12:22:00 -06:00
|
|
|
} else {
|
|
|
|
stolen = scanWork
|
runtime: directly track GC assist balance
Currently we track the per-G GC assist balance as two monotonically
increasing values: the bytes allocated by the G this cycle (gcalloc)
and the scan work performed by the G this cycle (gcscanwork). The
assist balance is hence assistRatio*gcalloc - gcscanwork.
This works, but has two important downsides:
1) It requires floating-point math to figure out if a G is in debt or
not. This makes it inappropriate to check for assist debt in the
hot path of mallocgc, so we only do this when a G allocates a new
span. As a result, Gs can operate "in the red", leading to
under-assist and extended GC cycle length.
2) Revising the assist ratio during a GC cycle can lead to an "assist
burst". If you think of plotting the scan work performed versus
heaps size, the assist ratio controls the slope of this line.
However, in the current system, the target line always passes
through 0 at the heap size that triggered GC, so if the runtime
increases the assist ratio, there has to be a potentially large
assist to jump from the current amount of scan work up to the new
target scan work for the current heap size.
This commit replaces this approach with directly tracking the GC
assist balance in terms of allocation credit bytes. Allocating N bytes
simply decreases this by N and assisting raises it by the amount of
scan work performed divided by the assist ratio (to get back to
bytes).
This will make it cheap to figure out if a G is in debt, which will
let us efficiently check if an assist is necessary *before* performing
an allocation and hence keep Gs "in the black".
This also fixes assist bursts because the assist ratio is now in terms
of *remaining* work, rather than work from the beginning of the GC
cycle. Hence, the plot of scan work versus heap size becomes
continuous: we can revise the slope, but this slope always starts from
where we are right now, rather than where we were at the beginning of
the cycle.
Change-Id: Ia821c5f07f8a433e8da7f195b52adfedd58bdf2c
Reviewed-on: https://go-review.googlesource.com/15408
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-04 21:16:57 -06:00
|
|
|
gp.gcAssistBytes += debtBytes
|
2015-03-16 12:22:00 -06:00
|
|
|
}
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xaddint64(&gcController.bgScanCredit, -stolen)
|
2015-03-16 12:22:00 -06:00
|
|
|
|
|
|
|
scanWork -= stolen
|
|
|
|
|
|
|
|
if scanWork == 0 {
|
runtime: directly track GC assist balance
Currently we track the per-G GC assist balance as two monotonically
increasing values: the bytes allocated by the G this cycle (gcalloc)
and the scan work performed by the G this cycle (gcscanwork). The
assist balance is hence assistRatio*gcalloc - gcscanwork.
This works, but has two important downsides:
1) It requires floating-point math to figure out if a G is in debt or
not. This makes it inappropriate to check for assist debt in the
hot path of mallocgc, so we only do this when a G allocates a new
span. As a result, Gs can operate "in the red", leading to
under-assist and extended GC cycle length.
2) Revising the assist ratio during a GC cycle can lead to an "assist
burst". If you think of plotting the scan work performed versus
heaps size, the assist ratio controls the slope of this line.
However, in the current system, the target line always passes
through 0 at the heap size that triggered GC, so if the runtime
increases the assist ratio, there has to be a potentially large
assist to jump from the current amount of scan work up to the new
target scan work for the current heap size.
This commit replaces this approach with directly tracking the GC
assist balance in terms of allocation credit bytes. Allocating N bytes
simply decreases this by N and assisting raises it by the amount of
scan work performed divided by the assist ratio (to get back to
bytes).
This will make it cheap to figure out if a G is in debt, which will
let us efficiently check if an assist is necessary *before* performing
an allocation and hence keep Gs "in the black".
This also fixes assist bursts because the assist ratio is now in terms
of *remaining* work, rather than work from the beginning of the GC
cycle. Hence, the plot of scan work versus heap size becomes
continuous: we can revise the slope, but this slope always starts from
where we are right now, rather than where we were at the beginning of
the cycle.
Change-Id: Ia821c5f07f8a433e8da7f195b52adfedd58bdf2c
Reviewed-on: https://go-review.googlesource.com/15408
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-04 21:16:57 -06:00
|
|
|
// We were able to steal all of the credit we
|
|
|
|
// needed.
|
2015-03-16 12:22:00 -06:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Perform assist work
|
2015-07-23 15:55:01 -06:00
|
|
|
completed := false
|
2015-03-16 12:22:00 -06:00
|
|
|
systemstack(func() {
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Load(&gcBlackenEnabled) == 0 {
|
2015-06-01 16:16:03 -06:00
|
|
|
// The gcBlackenEnabled check in malloc races with the
|
|
|
|
// store that clears it but an atomic check in every malloc
|
|
|
|
// would be a performance hit.
|
|
|
|
// Instead we recheck it here on the non-preemptable system
|
|
|
|
// stack to determine if we should preform an assist.
|
runtime: retry GC assist until debt is paid off
Currently, there are three ways to satisfy a GC assist: 1) the mutator
steals credit from background GC, 2) the mutator actually does GC
work, and 3) there is no more work available. 3 was never really
intended as a way to satisfy an assist, and it causes problems: there
are periods when it's expected that the GC won't have any work, such
as when transitioning from mark 1 to mark 2 and from mark 2 to mark
termination. During these periods, there's no back-pressure on rapidly
allocating mutators, which lets them race ahead of the heap goal.
For example, test/init1.go and the runtime/trace test both have small
reachable heaps and contain loops that rapidly allocate large garbage
byte slices. This bug lets these tests exceed the heap goal by several
orders of magnitude.
Fix this by forcing the assist (and hence the allocation) to block
until it can satisfy its debt via either 1 or 2, or the GC cycle
terminates.
This fixes one the causes of #11677. It's still possible to overshoot
the GC heap goal, but with this change the overshoot is almost exactly
by the amount of allocation that happens during the concurrent scan
phase, between when the heap passes the GC trigger and when the GC
enables assists.
Change-Id: I5ef4edcb0d2e13a1e432e66e8245f2bd9f8995be
Reviewed-on: https://go-review.googlesource.com/12671
Reviewed-by: Russ Cox <rsc@golang.org>
2015-07-22 14:55:04 -06:00
|
|
|
|
|
|
|
// GC is done, so ignore any remaining debt.
|
2015-10-14 19:31:33 -06:00
|
|
|
gp.gcAssistBytes = 0
|
2015-06-01 16:16:03 -06:00
|
|
|
return
|
|
|
|
}
|
2015-03-17 10:17:47 -06:00
|
|
|
// Track time spent in this assist. Since we're on the
|
|
|
|
// system stack, this is non-preemptible, so we can
|
|
|
|
// just measure start and end time.
|
|
|
|
startTime := nanotime()
|
|
|
|
|
2015-11-02 12:09:24 -07:00
|
|
|
decnwait := atomic.Xadd(&work.nwait, -1)
|
2015-06-01 16:16:03 -06:00
|
|
|
if decnwait == work.nproc {
|
|
|
|
println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
|
|
|
|
throw("nwait > work.nprocs")
|
|
|
|
}
|
runtime: multi-threaded, utilization-scheduled background mark
Currently, the concurrent mark phase is performed by the main GC
goroutine. Prior to the previous commit enabling preemption, this
caused marking to always consume 1/GOMAXPROCS of the available CPU
time. If GOMAXPROCS=1, this meant background GC would consume 100% of
the CPU (effectively a STW). If GOMAXPROCS>4, background GC would use
less than the goal of 25%. If GOMAXPROCS=4, background GC would use
the goal 25%, but if the mutator wasn't using the remaining 75%,
background marking wouldn't take advantage of the idle time. Enabling
preemption in the previous commit made GC miss CPU targets in
completely different ways, but set us up to bring everything back in
line.
This change replaces the fixed GC goroutine with per-P background mark
goroutines. Once started, these goroutines don't go in the standard
run queues; instead, they are scheduled specially such that the time
spent in mutator assists and the background mark goroutines totals 25%
of the CPU time available to the program. Furthermore, this lets
background marking take advantage of idle Ps, which significantly
boosts GC performance for applications that under-utilize the CPU.
This requires also changing how time is reported for gctrace, so this
change splits the concurrent mark CPU time into assist/background/idle
scanning.
This also requires increasing the size of the StackRecord slice used
in a GoroutineProfile test.
Change-Id: I0936ff907d2cee6cb687a208f2df47e8988e3157
Reviewed-on: https://go-review.googlesource.com/8850
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-03-23 19:07:33 -06:00
|
|
|
|
runtime: replace per-M workbuf cache with per-P gcWork cache
Currently, each M has a cache of the most recently used *workbuf. This
is used primarily by the write barrier so it doesn't have to access
the global workbuf lists on every write barrier. It's also used by
stack scanning because it's convenient.
This cache is important for write barrier performance, but this
particular approach has several downsides. It's faster than no cache,
but far from optimal (as the benchmarks below show). It's complex:
access to the cache is sprinkled through most of the workbuf list
operations and it requires special care to transform into and back out
of the gcWork cache that's actually used for scanning and marking. It
requires atomic exchanges to take ownership of the cached workbuf and
to return it to the M's cache even though it's almost always used by
only the current M. Since it's per-M, flushing these caches is O(# of
Ms), which may be high. And it has some significant subtleties: for
example, in general the cache shouldn't be used after the
harvestwbufs() in mark termination because it could hide work from
mark termination, but stack scanning can happen after this and *will*
use the cache (but it turns out this is okay because it will always be
followed by a getfull(), which drains the cache).
This change replaces this cache with a per-P gcWork object. This
gcWork cache can be used directly by scanning and marking (as long as
preemption is disabled, which is a general requirement of gcWork).
Since it's per-P, it doesn't require synchronization, which simplifies
things and means the only atomic operations in the write barrier are
occasionally fetching new work buffers and setting a mark bit if the
object isn't already marked. This cache can be flushed in O(# of Ps),
which is generally small. It follows a simple flushing rule: the cache
can be used during any phase, but during mark termination it must be
flushed before allowing preemption. This also makes the dispose during
mutator assist no longer necessary, which eliminates the vast majority
of gcWork dispose calls and reduces contention on the global workbuf
lists. And it's a lot faster on some benchmarks:
benchmark old ns/op new ns/op delta
BenchmarkBinaryTree17 11963668673 11206112763 -6.33%
BenchmarkFannkuch11 2643217136 2649182499 +0.23%
BenchmarkFmtFprintfEmpty 70.4 70.2 -0.28%
BenchmarkFmtFprintfString 364 307 -15.66%
BenchmarkFmtFprintfInt 317 282 -11.04%
BenchmarkFmtFprintfIntInt 512 483 -5.66%
BenchmarkFmtFprintfPrefixedInt 404 380 -5.94%
BenchmarkFmtFprintfFloat 521 479 -8.06%
BenchmarkFmtManyArgs 2164 1894 -12.48%
BenchmarkGobDecode 30366146 22429593 -26.14%
BenchmarkGobEncode 29867472 26663152 -10.73%
BenchmarkGzip 391236616 396779490 +1.42%
BenchmarkGunzip 96639491 96297024 -0.35%
BenchmarkHTTPClientServer 100110 70763 -29.31%
BenchmarkJSONEncode 51866051 52511382 +1.24%
BenchmarkJSONDecode 103813138 86094963 -17.07%
BenchmarkMandelbrot200 4121834 4120886 -0.02%
BenchmarkGoParse 16472789 5879949 -64.31%
BenchmarkRegexpMatchEasy0_32 140 140 +0.00%
BenchmarkRegexpMatchEasy0_1K 394 394 +0.00%
BenchmarkRegexpMatchEasy1_32 120 120 +0.00%
BenchmarkRegexpMatchEasy1_1K 621 614 -1.13%
BenchmarkRegexpMatchMedium_32 209 202 -3.35%
BenchmarkRegexpMatchMedium_1K 54889 55175 +0.52%
BenchmarkRegexpMatchHard_32 2682 2675 -0.26%
BenchmarkRegexpMatchHard_1K 79383 79524 +0.18%
BenchmarkRevcomp 584116718 584595320 +0.08%
BenchmarkTemplate 125400565 109620196 -12.58%
BenchmarkTimeParse 386 387 +0.26%
BenchmarkTimeFormat 580 447 -22.93%
(Best out of 10 runs. The delta of averages is similar.)
This also puts us in a good position to flush these caches when
nearing the end of concurrent marking, which will let us increase the
size of the work buffers while still controlling mark termination
pause time.
Change-Id: I2dd94c8517a19297a98ec280203cccaa58792522
Reviewed-on: https://go-review.googlesource.com/9178
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2015-04-19 13:22:20 -06:00
|
|
|
// drain own cached work first in the hopes that it
|
2015-03-16 12:22:00 -06:00
|
|
|
// will be more cache friendly.
|
runtime: replace per-M workbuf cache with per-P gcWork cache
Currently, each M has a cache of the most recently used *workbuf. This
is used primarily by the write barrier so it doesn't have to access
the global workbuf lists on every write barrier. It's also used by
stack scanning because it's convenient.
This cache is important for write barrier performance, but this
particular approach has several downsides. It's faster than no cache,
but far from optimal (as the benchmarks below show). It's complex:
access to the cache is sprinkled through most of the workbuf list
operations and it requires special care to transform into and back out
of the gcWork cache that's actually used for scanning and marking. It
requires atomic exchanges to take ownership of the cached workbuf and
to return it to the M's cache even though it's almost always used by
only the current M. Since it's per-M, flushing these caches is O(# of
Ms), which may be high. And it has some significant subtleties: for
example, in general the cache shouldn't be used after the
harvestwbufs() in mark termination because it could hide work from
mark termination, but stack scanning can happen after this and *will*
use the cache (but it turns out this is okay because it will always be
followed by a getfull(), which drains the cache).
This change replaces this cache with a per-P gcWork object. This
gcWork cache can be used directly by scanning and marking (as long as
preemption is disabled, which is a general requirement of gcWork).
Since it's per-P, it doesn't require synchronization, which simplifies
things and means the only atomic operations in the write barrier are
occasionally fetching new work buffers and setting a mark bit if the
object isn't already marked. This cache can be flushed in O(# of Ps),
which is generally small. It follows a simple flushing rule: the cache
can be used during any phase, but during mark termination it must be
flushed before allowing preemption. This also makes the dispose during
mutator assist no longer necessary, which eliminates the vast majority
of gcWork dispose calls and reduces contention on the global workbuf
lists. And it's a lot faster on some benchmarks:
benchmark old ns/op new ns/op delta
BenchmarkBinaryTree17 11963668673 11206112763 -6.33%
BenchmarkFannkuch11 2643217136 2649182499 +0.23%
BenchmarkFmtFprintfEmpty 70.4 70.2 -0.28%
BenchmarkFmtFprintfString 364 307 -15.66%
BenchmarkFmtFprintfInt 317 282 -11.04%
BenchmarkFmtFprintfIntInt 512 483 -5.66%
BenchmarkFmtFprintfPrefixedInt 404 380 -5.94%
BenchmarkFmtFprintfFloat 521 479 -8.06%
BenchmarkFmtManyArgs 2164 1894 -12.48%
BenchmarkGobDecode 30366146 22429593 -26.14%
BenchmarkGobEncode 29867472 26663152 -10.73%
BenchmarkGzip 391236616 396779490 +1.42%
BenchmarkGunzip 96639491 96297024 -0.35%
BenchmarkHTTPClientServer 100110 70763 -29.31%
BenchmarkJSONEncode 51866051 52511382 +1.24%
BenchmarkJSONDecode 103813138 86094963 -17.07%
BenchmarkMandelbrot200 4121834 4120886 -0.02%
BenchmarkGoParse 16472789 5879949 -64.31%
BenchmarkRegexpMatchEasy0_32 140 140 +0.00%
BenchmarkRegexpMatchEasy0_1K 394 394 +0.00%
BenchmarkRegexpMatchEasy1_32 120 120 +0.00%
BenchmarkRegexpMatchEasy1_1K 621 614 -1.13%
BenchmarkRegexpMatchMedium_32 209 202 -3.35%
BenchmarkRegexpMatchMedium_1K 54889 55175 +0.52%
BenchmarkRegexpMatchHard_32 2682 2675 -0.26%
BenchmarkRegexpMatchHard_1K 79383 79524 +0.18%
BenchmarkRevcomp 584116718 584595320 +0.08%
BenchmarkTemplate 125400565 109620196 -12.58%
BenchmarkTimeParse 386 387 +0.26%
BenchmarkTimeFormat 580 447 -22.93%
(Best out of 10 runs. The delta of averages is similar.)
This also puts us in a good position to flush these caches when
nearing the end of concurrent marking, which will let us increase the
size of the work buffers while still controlling mark termination
pause time.
Change-Id: I2dd94c8517a19297a98ec280203cccaa58792522
Reviewed-on: https://go-review.googlesource.com/9178
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2015-04-19 13:22:20 -06:00
|
|
|
gcw := &getg().m.p.ptr().gcw
|
2015-10-04 21:00:01 -06:00
|
|
|
workDone := gcDrainN(gcw, scanWork)
|
2015-06-01 16:16:03 -06:00
|
|
|
// If we are near the end of the mark phase
|
|
|
|
// dispose of the gcw.
|
|
|
|
if gcBlackenPromptly {
|
|
|
|
gcw.dispose()
|
|
|
|
}
|
runtime: directly track GC assist balance
Currently we track the per-G GC assist balance as two monotonically
increasing values: the bytes allocated by the G this cycle (gcalloc)
and the scan work performed by the G this cycle (gcscanwork). The
assist balance is hence assistRatio*gcalloc - gcscanwork.
This works, but has two important downsides:
1) It requires floating-point math to figure out if a G is in debt or
not. This makes it inappropriate to check for assist debt in the
hot path of mallocgc, so we only do this when a G allocates a new
span. As a result, Gs can operate "in the red", leading to
under-assist and extended GC cycle length.
2) Revising the assist ratio during a GC cycle can lead to an "assist
burst". If you think of plotting the scan work performed versus
heaps size, the assist ratio controls the slope of this line.
However, in the current system, the target line always passes
through 0 at the heap size that triggered GC, so if the runtime
increases the assist ratio, there has to be a potentially large
assist to jump from the current amount of scan work up to the new
target scan work for the current heap size.
This commit replaces this approach with directly tracking the GC
assist balance in terms of allocation credit bytes. Allocating N bytes
simply decreases this by N and assisting raises it by the amount of
scan work performed divided by the assist ratio (to get back to
bytes).
This will make it cheap to figure out if a G is in debt, which will
let us efficiently check if an assist is necessary *before* performing
an allocation and hence keep Gs "in the black".
This also fixes assist bursts because the assist ratio is now in terms
of *remaining* work, rather than work from the beginning of the GC
cycle. Hence, the plot of scan work versus heap size becomes
continuous: we can revise the slope, but this slope always starts from
where we are right now, rather than where we were at the beginning of
the cycle.
Change-Id: Ia821c5f07f8a433e8da7f195b52adfedd58bdf2c
Reviewed-on: https://go-review.googlesource.com/15408
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-04 21:16:57 -06:00
|
|
|
|
|
|
|
// Record that we did this much scan work.
|
2015-10-14 19:31:33 -06:00
|
|
|
//
|
runtime: directly track GC assist balance
Currently we track the per-G GC assist balance as two monotonically
increasing values: the bytes allocated by the G this cycle (gcalloc)
and the scan work performed by the G this cycle (gcscanwork). The
assist balance is hence assistRatio*gcalloc - gcscanwork.
This works, but has two important downsides:
1) It requires floating-point math to figure out if a G is in debt or
not. This makes it inappropriate to check for assist debt in the
hot path of mallocgc, so we only do this when a G allocates a new
span. As a result, Gs can operate "in the red", leading to
under-assist and extended GC cycle length.
2) Revising the assist ratio during a GC cycle can lead to an "assist
burst". If you think of plotting the scan work performed versus
heaps size, the assist ratio controls the slope of this line.
However, in the current system, the target line always passes
through 0 at the heap size that triggered GC, so if the runtime
increases the assist ratio, there has to be a potentially large
assist to jump from the current amount of scan work up to the new
target scan work for the current heap size.
This commit replaces this approach with directly tracking the GC
assist balance in terms of allocation credit bytes. Allocating N bytes
simply decreases this by N and assisting raises it by the amount of
scan work performed divided by the assist ratio (to get back to
bytes).
This will make it cheap to figure out if a G is in debt, which will
let us efficiently check if an assist is necessary *before* performing
an allocation and hence keep Gs "in the black".
This also fixes assist bursts because the assist ratio is now in terms
of *remaining* work, rather than work from the beginning of the GC
cycle. Hence, the plot of scan work versus heap size becomes
continuous: we can revise the slope, but this slope always starts from
where we are right now, rather than where we were at the beginning of
the cycle.
Change-Id: Ia821c5f07f8a433e8da7f195b52adfedd58bdf2c
Reviewed-on: https://go-review.googlesource.com/15408
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-04 21:16:57 -06:00
|
|
|
// Back out the number of bytes of assist credit that
|
|
|
|
// this scan work counts for. The "1+" is a poor man's
|
|
|
|
// round-up, to ensure this adds credit even if
|
|
|
|
// assistBytesPerWork is very low.
|
|
|
|
gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone))
|
|
|
|
|
runtime: multi-threaded, utilization-scheduled background mark
Currently, the concurrent mark phase is performed by the main GC
goroutine. Prior to the previous commit enabling preemption, this
caused marking to always consume 1/GOMAXPROCS of the available CPU
time. If GOMAXPROCS=1, this meant background GC would consume 100% of
the CPU (effectively a STW). If GOMAXPROCS>4, background GC would use
less than the goal of 25%. If GOMAXPROCS=4, background GC would use
the goal 25%, but if the mutator wasn't using the remaining 75%,
background marking wouldn't take advantage of the idle time. Enabling
preemption in the previous commit made GC miss CPU targets in
completely different ways, but set us up to bring everything back in
line.
This change replaces the fixed GC goroutine with per-P background mark
goroutines. Once started, these goroutines don't go in the standard
run queues; instead, they are scheduled specially such that the time
spent in mutator assists and the background mark goroutines totals 25%
of the CPU time available to the program. Furthermore, this lets
background marking take advantage of idle Ps, which significantly
boosts GC performance for applications that under-utilize the CPU.
This requires also changing how time is reported for gctrace, so this
change splits the concurrent mark CPU time into assist/background/idle
scanning.
This also requires increasing the size of the StackRecord slice used
in a GoroutineProfile test.
Change-Id: I0936ff907d2cee6cb687a208f2df47e8988e3157
Reviewed-on: https://go-review.googlesource.com/8850
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-03-23 19:07:33 -06:00
|
|
|
// If this is the last worker and we ran out of work,
|
|
|
|
// signal a completion point.
|
2015-11-02 12:09:24 -07:00
|
|
|
incnwait := atomic.Xadd(&work.nwait, +1)
|
2015-06-01 16:16:03 -06:00
|
|
|
if incnwait > work.nproc {
|
|
|
|
println("runtime: work.nwait=", incnwait,
|
|
|
|
"work.nproc=", work.nproc,
|
|
|
|
"gcBlackenPromptly=", gcBlackenPromptly)
|
|
|
|
throw("work.nwait > work.nproc")
|
|
|
|
}
|
|
|
|
|
2015-10-19 11:35:25 -06:00
|
|
|
if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
|
runtime: multi-threaded, utilization-scheduled background mark
Currently, the concurrent mark phase is performed by the main GC
goroutine. Prior to the previous commit enabling preemption, this
caused marking to always consume 1/GOMAXPROCS of the available CPU
time. If GOMAXPROCS=1, this meant background GC would consume 100% of
the CPU (effectively a STW). If GOMAXPROCS>4, background GC would use
less than the goal of 25%. If GOMAXPROCS=4, background GC would use
the goal 25%, but if the mutator wasn't using the remaining 75%,
background marking wouldn't take advantage of the idle time. Enabling
preemption in the previous commit made GC miss CPU targets in
completely different ways, but set us up to bring everything back in
line.
This change replaces the fixed GC goroutine with per-P background mark
goroutines. Once started, these goroutines don't go in the standard
run queues; instead, they are scheduled specially such that the time
spent in mutator assists and the background mark goroutines totals 25%
of the CPU time available to the program. Furthermore, this lets
background marking take advantage of idle Ps, which significantly
boosts GC performance for applications that under-utilize the CPU.
This requires also changing how time is reported for gctrace, so this
change splits the concurrent mark CPU time into assist/background/idle
scanning.
This also requires increasing the size of the StackRecord slice used
in a GoroutineProfile test.
Change-Id: I0936ff907d2cee6cb687a208f2df47e8988e3157
Reviewed-on: https://go-review.googlesource.com/8850
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-03-23 19:07:33 -06:00
|
|
|
// This has reached a background completion
|
2015-04-22 15:44:36 -06:00
|
|
|
// point.
|
2015-07-23 15:55:01 -06:00
|
|
|
completed = true
|
runtime: multi-threaded, utilization-scheduled background mark
Currently, the concurrent mark phase is performed by the main GC
goroutine. Prior to the previous commit enabling preemption, this
caused marking to always consume 1/GOMAXPROCS of the available CPU
time. If GOMAXPROCS=1, this meant background GC would consume 100% of
the CPU (effectively a STW). If GOMAXPROCS>4, background GC would use
less than the goal of 25%. If GOMAXPROCS=4, background GC would use
the goal 25%, but if the mutator wasn't using the remaining 75%,
background marking wouldn't take advantage of the idle time. Enabling
preemption in the previous commit made GC miss CPU targets in
completely different ways, but set us up to bring everything back in
line.
This change replaces the fixed GC goroutine with per-P background mark
goroutines. Once started, these goroutines don't go in the standard
run queues; instead, they are scheduled specially such that the time
spent in mutator assists and the background mark goroutines totals 25%
of the CPU time available to the program. Furthermore, this lets
background marking take advantage of idle Ps, which significantly
boosts GC performance for applications that under-utilize the CPU.
This requires also changing how time is reported for gctrace, so this
change splits the concurrent mark CPU time into assist/background/idle
scanning.
This also requires increasing the size of the StackRecord slice used
in a GoroutineProfile test.
Change-Id: I0936ff907d2cee6cb687a208f2df47e8988e3157
Reviewed-on: https://go-review.googlesource.com/8850
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-03-23 19:07:33 -06:00
|
|
|
}
|
2015-03-17 10:17:47 -06:00
|
|
|
duration := nanotime() - startTime
|
|
|
|
_p_ := gp.m.p.ptr()
|
|
|
|
_p_.gcAssistTime += duration
|
|
|
|
if _p_.gcAssistTime > gcAssistTimeSlack {
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xaddint64(&gcController.assistTime, _p_.gcAssistTime)
|
2015-03-17 10:17:47 -06:00
|
|
|
_p_.gcAssistTime = 0
|
|
|
|
}
|
2015-03-16 12:22:00 -06:00
|
|
|
})
|
2015-07-23 15:55:01 -06:00
|
|
|
|
|
|
|
if completed {
|
2015-10-26 09:27:37 -06:00
|
|
|
gcMarkDone()
|
runtime: retry GC assist until debt is paid off
Currently, there are three ways to satisfy a GC assist: 1) the mutator
steals credit from background GC, 2) the mutator actually does GC
work, and 3) there is no more work available. 3 was never really
intended as a way to satisfy an assist, and it causes problems: there
are periods when it's expected that the GC won't have any work, such
as when transitioning from mark 1 to mark 2 and from mark 2 to mark
termination. During these periods, there's no back-pressure on rapidly
allocating mutators, which lets them race ahead of the heap goal.
For example, test/init1.go and the runtime/trace test both have small
reachable heaps and contain loops that rapidly allocate large garbage
byte slices. This bug lets these tests exceed the heap goal by several
orders of magnitude.
Fix this by forcing the assist (and hence the allocation) to block
until it can satisfy its debt via either 1 or 2, or the GC cycle
terminates.
This fixes one the causes of #11677. It's still possible to overshoot
the GC heap goal, but with this change the overshoot is almost exactly
by the amount of allocation that happens during the concurrent scan
phase, between when the heap passes the GC trigger and when the GC
enables assists.
Change-Id: I5ef4edcb0d2e13a1e432e66e8245f2bd9f8995be
Reviewed-on: https://go-review.googlesource.com/12671
Reviewed-by: Russ Cox <rsc@golang.org>
2015-07-22 14:55:04 -06:00
|
|
|
}
|
|
|
|
|
2015-10-14 19:31:33 -06:00
|
|
|
if gp.gcAssistBytes < 0 {
|
runtime: retry GC assist until debt is paid off
Currently, there are three ways to satisfy a GC assist: 1) the mutator
steals credit from background GC, 2) the mutator actually does GC
work, and 3) there is no more work available. 3 was never really
intended as a way to satisfy an assist, and it causes problems: there
are periods when it's expected that the GC won't have any work, such
as when transitioning from mark 1 to mark 2 and from mark 2 to mark
termination. During these periods, there's no back-pressure on rapidly
allocating mutators, which lets them race ahead of the heap goal.
For example, test/init1.go and the runtime/trace test both have small
reachable heaps and contain loops that rapidly allocate large garbage
byte slices. This bug lets these tests exceed the heap goal by several
orders of magnitude.
Fix this by forcing the assist (and hence the allocation) to block
until it can satisfy its debt via either 1 or 2, or the GC cycle
terminates.
This fixes one the causes of #11677. It's still possible to overshoot
the GC heap goal, but with this change the overshoot is almost exactly
by the amount of allocation that happens during the concurrent scan
phase, between when the heap passes the GC trigger and when the GC
enables assists.
Change-Id: I5ef4edcb0d2e13a1e432e66e8245f2bd9f8995be
Reviewed-on: https://go-review.googlesource.com/12671
Reviewed-by: Russ Cox <rsc@golang.org>
2015-07-22 14:55:04 -06:00
|
|
|
// We were unable steal enough credit or perform
|
|
|
|
// enough work to pay off the assist debt. We need to
|
|
|
|
// do one of these before letting the mutator allocate
|
2015-10-14 19:31:33 -06:00
|
|
|
// more to prevent over-allocation.
|
|
|
|
//
|
2015-10-15 15:58:17 -06:00
|
|
|
// If this is because we were preempted, reschedule
|
|
|
|
// and try some more.
|
|
|
|
if gp.preempt {
|
|
|
|
Gosched()
|
|
|
|
goto retry
|
|
|
|
}
|
|
|
|
|
2015-10-14 19:31:33 -06:00
|
|
|
// Add this G to an assist queue and park. When the GC
|
|
|
|
// has more background credit, it will satisfy queued
|
|
|
|
// assists before flushing to the global credit pool.
|
|
|
|
//
|
|
|
|
// Note that this does *not* get woken up when more
|
|
|
|
// work is added to the work list. The theory is that
|
|
|
|
// there wasn't enough work to do anyway, so we might
|
|
|
|
// as well let background marking take care of the
|
|
|
|
// work that is available.
|
|
|
|
lock(&work.assistQueue.lock)
|
|
|
|
|
|
|
|
// If the GC cycle is over, just return. This is the
|
2015-10-26 09:27:37 -06:00
|
|
|
// likely path if we completed above. We do this
|
2015-10-14 19:31:33 -06:00
|
|
|
// under the lock to prevent a GC cycle from ending
|
|
|
|
// between this check and queuing the assist.
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Load(&gcBlackenEnabled) == 0 {
|
2015-10-14 19:31:33 -06:00
|
|
|
unlock(&work.assistQueue.lock)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
oldHead, oldTail := work.assistQueue.head, work.assistQueue.tail
|
|
|
|
if oldHead == 0 {
|
|
|
|
work.assistQueue.head.set(gp)
|
|
|
|
} else {
|
|
|
|
oldTail.ptr().schedlink.set(gp)
|
|
|
|
}
|
|
|
|
work.assistQueue.tail.set(gp)
|
|
|
|
gp.schedlink.set(nil)
|
|
|
|
// Recheck for background credit now that this G is in
|
|
|
|
// the queue, but can still back out. This avoids a
|
|
|
|
// race in case background marking has flushed more
|
|
|
|
// credit since we checked above.
|
2015-11-02 12:09:24 -07:00
|
|
|
if atomic.Loadint64(&gcController.bgScanCredit) > 0 {
|
2015-10-14 19:31:33 -06:00
|
|
|
work.assistQueue.head = oldHead
|
|
|
|
work.assistQueue.tail = oldTail
|
|
|
|
if oldTail != 0 {
|
|
|
|
oldTail.ptr().schedlink.set(nil)
|
|
|
|
}
|
|
|
|
unlock(&work.assistQueue.lock)
|
|
|
|
goto retry
|
|
|
|
}
|
|
|
|
// Park for real.
|
2015-10-29 20:40:36 -06:00
|
|
|
goparkunlock(&work.assistQueue.lock, "GC assist wait", traceEvGoBlock, 2)
|
2015-10-14 19:31:33 -06:00
|
|
|
|
|
|
|
// At this point either background GC has satisfied
|
|
|
|
// this G's assist debt, or the GC cycle is over.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// gcWakeAllAssists wakes all currently blocked assists. This is used
|
2016-01-15 11:28:41 -07:00
|
|
|
// at the end of a GC cycle. gcBlackenEnabled must be false to prevent
|
|
|
|
// new assists from going to sleep after this point.
|
2015-10-14 19:31:33 -06:00
|
|
|
func gcWakeAllAssists() {
|
|
|
|
lock(&work.assistQueue.lock)
|
|
|
|
injectglist(work.assistQueue.head.ptr())
|
|
|
|
work.assistQueue.head.set(nil)
|
|
|
|
work.assistQueue.tail.set(nil)
|
|
|
|
unlock(&work.assistQueue.lock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// gcFlushBgCredit flushes scanWork units of background scan work
|
|
|
|
// credit. This first satisfies blocked assists on the
|
|
|
|
// work.assistQueue and then flushes any remaining credit to
|
|
|
|
// gcController.bgScanCredit.
|
2015-11-19 09:25:55 -07:00
|
|
|
//
|
|
|
|
// Write barriers are disallowed because this is used by gcDrain after
|
|
|
|
// it has ensured that all work is drained and this must preserve that
|
|
|
|
// condition.
|
|
|
|
//
|
|
|
|
//go:nowritebarrierrec
|
2015-10-14 19:31:33 -06:00
|
|
|
func gcFlushBgCredit(scanWork int64) {
|
|
|
|
if work.assistQueue.head == 0 {
|
|
|
|
// Fast path; there are no blocked assists. There's a
|
|
|
|
// small window here where an assist may add itself to
|
|
|
|
// the blocked queue and park. If that happens, we'll
|
|
|
|
// just get it on the next flush.
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
|
2015-10-14 19:31:33 -06:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork)
|
|
|
|
|
|
|
|
lock(&work.assistQueue.lock)
|
|
|
|
gp := work.assistQueue.head.ptr()
|
|
|
|
for gp != nil && scanBytes > 0 {
|
|
|
|
// Note that gp.gcAssistBytes is negative because gp
|
|
|
|
// is in debt. Think carefully about the signs below.
|
|
|
|
if scanBytes+gp.gcAssistBytes >= 0 {
|
|
|
|
// Satisfy this entire assist debt.
|
|
|
|
scanBytes += gp.gcAssistBytes
|
|
|
|
gp.gcAssistBytes = 0
|
|
|
|
xgp := gp
|
|
|
|
gp = gp.schedlink.ptr()
|
|
|
|
ready(xgp, 0)
|
|
|
|
} else {
|
|
|
|
// Partially satisfy this assist.
|
|
|
|
gp.gcAssistBytes += scanBytes
|
|
|
|
scanBytes = 0
|
|
|
|
// As a heuristic, we move this assist to the
|
|
|
|
// back of the queue so that large assists
|
|
|
|
// can't clog up the assist queue and
|
|
|
|
// substantially delay small assists.
|
|
|
|
xgp := gp
|
|
|
|
gp = gp.schedlink.ptr()
|
|
|
|
if gp == nil {
|
|
|
|
// gp is the only assist in the queue.
|
|
|
|
gp = xgp
|
|
|
|
} else {
|
|
|
|
xgp.schedlink = 0
|
|
|
|
work.assistQueue.tail.ptr().schedlink.set(xgp)
|
|
|
|
work.assistQueue.tail.set(xgp)
|
|
|
|
}
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
work.assistQueue.head.set(gp)
|
|
|
|
if gp == nil {
|
|
|
|
work.assistQueue.tail.set(nil)
|
|
|
|
}
|
2015-10-04 21:56:11 -06:00
|
|
|
|
2015-10-14 19:31:33 -06:00
|
|
|
if scanBytes > 0 {
|
|
|
|
// Convert from scan bytes back to work.
|
|
|
|
scanWork = int64(float64(scanBytes) * gcController.assistWorkPerByte)
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
|
2015-07-23 15:55:01 -06:00
|
|
|
}
|
2015-10-14 19:31:33 -06:00
|
|
|
unlock(&work.assistQueue.lock)
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
//go:nowritebarrier
|
|
|
|
func scanstack(gp *g) {
|
|
|
|
if gp.gcscanvalid {
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
if gcphase == _GCmarktermination {
|
|
|
|
gcRemoveStackBarriers(gp)
|
|
|
|
}
|
2015-02-19 11:38:46 -07:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
if readgstatus(gp)&_Gscan == 0 {
|
|
|
|
print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
|
|
|
|
throw("scanstack - bad status")
|
|
|
|
}
|
|
|
|
|
|
|
|
switch readgstatus(gp) &^ _Gscan {
|
|
|
|
default:
|
|
|
|
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
|
|
|
|
throw("mark - bad status")
|
|
|
|
case _Gdead:
|
|
|
|
return
|
|
|
|
case _Grunning:
|
|
|
|
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
|
|
|
|
throw("scanstack: goroutine not stopped")
|
|
|
|
case _Grunnable, _Gsyscall, _Gwaiting:
|
|
|
|
// ok
|
|
|
|
}
|
|
|
|
|
|
|
|
if gp == getg() {
|
|
|
|
throw("can't scan our own stack")
|
|
|
|
}
|
|
|
|
mp := gp.m
|
|
|
|
if mp != nil && mp.helpgc != 0 {
|
|
|
|
throw("can't scan gchelper stack")
|
|
|
|
}
|
|
|
|
|
2016-02-15 16:30:48 -07:00
|
|
|
// Shrink the stack if not much of it is being used. During
|
|
|
|
// concurrent GC, we can do this during concurrent mark.
|
|
|
|
if !work.markrootDone {
|
|
|
|
shrinkstack(gp)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Prepare for stack barrier insertion/removal.
|
2015-06-05 09:07:21 -06:00
|
|
|
var sp, barrierOffset, nextBarrier uintptr
|
|
|
|
if gp.syscallsp != 0 {
|
|
|
|
sp = gp.syscallsp
|
|
|
|
} else {
|
|
|
|
sp = gp.sched.sp
|
|
|
|
}
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
switch gcphase {
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
case _GCmark:
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
// Install stack barriers during stack scan.
|
2015-08-26 11:54:26 -06:00
|
|
|
barrierOffset = uintptr(firstStackBarrierOffset)
|
2015-06-05 09:07:21 -06:00
|
|
|
nextBarrier = sp + barrierOffset
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
|
2015-06-05 09:51:49 -06:00
|
|
|
if debug.gcstackbarrieroff > 0 {
|
|
|
|
nextBarrier = ^uintptr(0)
|
|
|
|
}
|
|
|
|
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
if gp.stkbarPos != 0 || len(gp.stkbar) != 0 {
|
|
|
|
// If this happens, it's probably because we
|
|
|
|
// scanned a stack twice in the same phase.
|
|
|
|
print("stkbarPos=", gp.stkbarPos, " len(stkbar)=", len(gp.stkbar), " goid=", gp.goid, " gcphase=", gcphase, "\n")
|
|
|
|
throw("g already has stack barriers")
|
|
|
|
}
|
|
|
|
|
2015-11-18 12:10:40 -07:00
|
|
|
gcLockStackBarriers(gp)
|
|
|
|
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
case _GCmarktermination:
|
|
|
|
if int(gp.stkbarPos) == len(gp.stkbar) {
|
|
|
|
// gp hit all of the stack barriers (or there
|
|
|
|
// were none). Re-scan the whole stack.
|
|
|
|
nextBarrier = ^uintptr(0)
|
|
|
|
} else {
|
|
|
|
// Only re-scan up to the lowest un-hit
|
|
|
|
// barrier. Any frames above this have not
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
// executed since the concurrent scan of gp and
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
// any writes through up-pointers to above
|
|
|
|
// this barrier had write barriers.
|
|
|
|
nextBarrier = gp.stkbar[gp.stkbarPos].savedLRPtr
|
|
|
|
if debugStackBarrier {
|
2015-06-05 09:07:21 -06:00
|
|
|
print("rescan below ", hex(nextBarrier), " in [", hex(sp), ",", hex(gp.stack.hi), ") goid=", gp.goid, "\n")
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
gcRemoveStackBarriers(gp)
|
|
|
|
|
|
|
|
default:
|
|
|
|
throw("scanstack in wrong phase")
|
|
|
|
}
|
|
|
|
|
2016-02-15 16:30:48 -07:00
|
|
|
// Scan the stack.
|
runtime: add pcvalue cache to improve stack scan speed
The cost of scanning large stacks is currently dominated by the time
spent looking up and decoding the pcvalue table. However, large stacks
are usually large not because they contain calls to many different
functions, but because they contain many calls to the same, small set
of recursive functions. Hence, walking large stacks tends to make the
same pcvalue queries many times.
Based on this observation, this commit adds a small, very simple, and
fast cache in front of pcvalue lookup. We thread this cache down from
operations that make many pcvalue calls, such as gentraceback, stack
scanning, and stack adjusting.
This simple cache works well because it has minimal overhead when it's
not effective. I also tried a hashed direct-map cache, CLOCK-based
replacement, round-robin replacement, and round-robin with lookups
disabled until there had been at least 16 probes, but none of these
approaches had obvious wins over the random replacement policy in this
commit.
This nearly doubles the overall performance of the deep stack test
program from issue #10898:
name old time/op new time/op delta
Issue10898 16.5s ±12% 9.2s ±12% -44.37% (p=0.008 n=5+5)
It's a very slight win on the garbage benchmark:
name old time/op new time/op delta
XBenchGarbage-12 4.92ms ± 1% 4.89ms ± 1% -0.75% (p=0.000 n=18+19)
It's a wash (but doesn't harm performance) on the go1 benchmarks,
which don't have particularly deep stacks:
name old time/op new time/op delta
BinaryTree17-12 3.11s ± 2% 3.20s ± 3% +2.83% (p=0.000 n=17+20)
Fannkuch11-12 2.51s ± 1% 2.51s ± 1% -0.22% (p=0.034 n=19+18)
FmtFprintfEmpty-12 50.8ns ± 3% 50.6ns ± 2% ~ (p=0.793 n=20+20)
FmtFprintfString-12 174ns ± 0% 174ns ± 1% +0.17% (p=0.048 n=15+20)
FmtFprintfInt-12 177ns ± 0% 165ns ± 1% -6.99% (p=0.000 n=17+19)
FmtFprintfIntInt-12 283ns ± 1% 284ns ± 0% +0.22% (p=0.000 n=18+15)
FmtFprintfPrefixedInt-12 243ns ± 1% 244ns ± 1% +0.40% (p=0.000 n=20+19)
FmtFprintfFloat-12 318ns ± 0% 319ns ± 0% +0.27% (p=0.001 n=19+20)
FmtManyArgs-12 1.12µs ± 0% 1.14µs ± 0% +1.74% (p=0.000 n=19+20)
GobDecode-12 8.69ms ± 0% 8.73ms ± 1% +0.46% (p=0.000 n=18+18)
GobEncode-12 6.64ms ± 1% 6.61ms ± 1% -0.46% (p=0.000 n=20+20)
Gzip-12 323ms ± 2% 319ms ± 1% -1.11% (p=0.000 n=20+20)
Gunzip-12 42.8ms ± 0% 42.9ms ± 0% ~ (p=0.158 n=18+20)
HTTPClientServer-12 63.3µs ± 1% 63.1µs ± 1% -0.35% (p=0.011 n=20+20)
JSONEncode-12 16.9ms ± 1% 17.3ms ± 1% +2.84% (p=0.000 n=19+20)
JSONDecode-12 59.7ms ± 0% 58.5ms ± 0% -2.05% (p=0.000 n=19+17)
Mandelbrot200-12 3.92ms ± 0% 3.91ms ± 0% -0.16% (p=0.003 n=19+19)
GoParse-12 3.79ms ± 2% 3.75ms ± 2% -0.91% (p=0.005 n=20+20)
RegexpMatchEasy0_32-12 102ns ± 1% 101ns ± 1% -0.80% (p=0.001 n=14+20)
RegexpMatchEasy0_1K-12 337ns ± 1% 346ns ± 1% +2.90% (p=0.000 n=20+19)
RegexpMatchEasy1_32-12 84.4ns ± 2% 84.3ns ± 2% ~ (p=0.743 n=20+20)
RegexpMatchEasy1_1K-12 502ns ± 1% 505ns ± 0% +0.64% (p=0.000 n=20+20)
RegexpMatchMedium_32-12 133ns ± 1% 132ns ± 1% -0.85% (p=0.000 n=20+19)
RegexpMatchMedium_1K-12 40.1µs ± 1% 39.8µs ± 1% -0.77% (p=0.000 n=18+18)
RegexpMatchHard_32-12 2.08µs ± 1% 2.07µs ± 1% -0.55% (p=0.001 n=18+19)
RegexpMatchHard_1K-12 62.4µs ± 1% 62.0µs ± 1% -0.74% (p=0.000 n=19+19)
Revcomp-12 545ms ± 2% 545ms ± 3% ~ (p=0.771 n=19+20)
Template-12 73.7ms ± 1% 72.0ms ± 0% -2.33% (p=0.000 n=20+18)
TimeParse-12 358ns ± 1% 351ns ± 1% -2.07% (p=0.000 n=20+20)
TimeFormat-12 369ns ± 1% 356ns ± 0% -3.53% (p=0.000 n=20+18)
[Geo mean] 63.5µs 63.2µs -0.41%
name old speed new speed delta
GobDecode-12 88.3MB/s ± 0% 87.9MB/s ± 0% -0.43% (p=0.000 n=18+17)
GobEncode-12 116MB/s ± 1% 116MB/s ± 1% +0.47% (p=0.000 n=20+20)
Gzip-12 60.2MB/s ± 2% 60.8MB/s ± 1% +1.13% (p=0.000 n=20+20)
Gunzip-12 453MB/s ± 0% 453MB/s ± 0% ~ (p=0.160 n=18+20)
JSONEncode-12 115MB/s ± 1% 112MB/s ± 1% -2.76% (p=0.000 n=19+20)
JSONDecode-12 32.5MB/s ± 0% 33.2MB/s ± 0% +2.09% (p=0.000 n=19+17)
GoParse-12 15.3MB/s ± 2% 15.4MB/s ± 2% +0.92% (p=0.004 n=20+20)
RegexpMatchEasy0_32-12 311MB/s ± 1% 314MB/s ± 1% +0.78% (p=0.000 n=15+19)
RegexpMatchEasy0_1K-12 3.04GB/s ± 1% 2.95GB/s ± 1% -2.90% (p=0.000 n=19+19)
RegexpMatchEasy1_32-12 379MB/s ± 2% 380MB/s ± 2% ~ (p=0.779 n=20+20)
RegexpMatchEasy1_1K-12 2.04GB/s ± 1% 2.02GB/s ± 0% -0.62% (p=0.000 n=20+20)
RegexpMatchMedium_32-12 7.46MB/s ± 1% 7.53MB/s ± 1% +0.86% (p=0.000 n=20+19)
RegexpMatchMedium_1K-12 25.5MB/s ± 1% 25.7MB/s ± 1% +0.78% (p=0.000 n=18+18)
RegexpMatchHard_32-12 15.4MB/s ± 1% 15.5MB/s ± 1% +0.62% (p=0.000 n=19+19)
RegexpMatchHard_1K-12 16.4MB/s ± 1% 16.5MB/s ± 1% +0.82% (p=0.000 n=20+19)
Revcomp-12 466MB/s ± 2% 466MB/s ± 3% ~ (p=0.765 n=19+20)
Template-12 26.3MB/s ± 1% 27.0MB/s ± 0% +2.38% (p=0.000 n=20+18)
[Geo mean] 97.8MB/s 98.0MB/s +0.23%
Change-Id: I281044ae0b24990ba46487cacbc1069493274bc4
Reviewed-on: https://go-review.googlesource.com/13614
Reviewed-by: Keith Randall <khr@golang.org>
2015-08-12 21:43:43 -06:00
|
|
|
var cache pcvalueCache
|
runtime: replace per-M workbuf cache with per-P gcWork cache
Currently, each M has a cache of the most recently used *workbuf. This
is used primarily by the write barrier so it doesn't have to access
the global workbuf lists on every write barrier. It's also used by
stack scanning because it's convenient.
This cache is important for write barrier performance, but this
particular approach has several downsides. It's faster than no cache,
but far from optimal (as the benchmarks below show). It's complex:
access to the cache is sprinkled through most of the workbuf list
operations and it requires special care to transform into and back out
of the gcWork cache that's actually used for scanning and marking. It
requires atomic exchanges to take ownership of the cached workbuf and
to return it to the M's cache even though it's almost always used by
only the current M. Since it's per-M, flushing these caches is O(# of
Ms), which may be high. And it has some significant subtleties: for
example, in general the cache shouldn't be used after the
harvestwbufs() in mark termination because it could hide work from
mark termination, but stack scanning can happen after this and *will*
use the cache (but it turns out this is okay because it will always be
followed by a getfull(), which drains the cache).
This change replaces this cache with a per-P gcWork object. This
gcWork cache can be used directly by scanning and marking (as long as
preemption is disabled, which is a general requirement of gcWork).
Since it's per-P, it doesn't require synchronization, which simplifies
things and means the only atomic operations in the write barrier are
occasionally fetching new work buffers and setting a mark bit if the
object isn't already marked. This cache can be flushed in O(# of Ps),
which is generally small. It follows a simple flushing rule: the cache
can be used during any phase, but during mark termination it must be
flushed before allowing preemption. This also makes the dispose during
mutator assist no longer necessary, which eliminates the vast majority
of gcWork dispose calls and reduces contention on the global workbuf
lists. And it's a lot faster on some benchmarks:
benchmark old ns/op new ns/op delta
BenchmarkBinaryTree17 11963668673 11206112763 -6.33%
BenchmarkFannkuch11 2643217136 2649182499 +0.23%
BenchmarkFmtFprintfEmpty 70.4 70.2 -0.28%
BenchmarkFmtFprintfString 364 307 -15.66%
BenchmarkFmtFprintfInt 317 282 -11.04%
BenchmarkFmtFprintfIntInt 512 483 -5.66%
BenchmarkFmtFprintfPrefixedInt 404 380 -5.94%
BenchmarkFmtFprintfFloat 521 479 -8.06%
BenchmarkFmtManyArgs 2164 1894 -12.48%
BenchmarkGobDecode 30366146 22429593 -26.14%
BenchmarkGobEncode 29867472 26663152 -10.73%
BenchmarkGzip 391236616 396779490 +1.42%
BenchmarkGunzip 96639491 96297024 -0.35%
BenchmarkHTTPClientServer 100110 70763 -29.31%
BenchmarkJSONEncode 51866051 52511382 +1.24%
BenchmarkJSONDecode 103813138 86094963 -17.07%
BenchmarkMandelbrot200 4121834 4120886 -0.02%
BenchmarkGoParse 16472789 5879949 -64.31%
BenchmarkRegexpMatchEasy0_32 140 140 +0.00%
BenchmarkRegexpMatchEasy0_1K 394 394 +0.00%
BenchmarkRegexpMatchEasy1_32 120 120 +0.00%
BenchmarkRegexpMatchEasy1_1K 621 614 -1.13%
BenchmarkRegexpMatchMedium_32 209 202 -3.35%
BenchmarkRegexpMatchMedium_1K 54889 55175 +0.52%
BenchmarkRegexpMatchHard_32 2682 2675 -0.26%
BenchmarkRegexpMatchHard_1K 79383 79524 +0.18%
BenchmarkRevcomp 584116718 584595320 +0.08%
BenchmarkTemplate 125400565 109620196 -12.58%
BenchmarkTimeParse 386 387 +0.26%
BenchmarkTimeFormat 580 447 -22.93%
(Best out of 10 runs. The delta of averages is similar.)
This also puts us in a good position to flush these caches when
nearing the end of concurrent marking, which will let us increase the
size of the work buffers while still controlling mark termination
pause time.
Change-Id: I2dd94c8517a19297a98ec280203cccaa58792522
Reviewed-on: https://go-review.googlesource.com/9178
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2015-04-19 13:22:20 -06:00
|
|
|
gcw := &getg().m.p.ptr().gcw
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
n := 0
|
2015-02-19 11:38:46 -07:00
|
|
|
scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
|
runtime: add pcvalue cache to improve stack scan speed
The cost of scanning large stacks is currently dominated by the time
spent looking up and decoding the pcvalue table. However, large stacks
are usually large not because they contain calls to many different
functions, but because they contain many calls to the same, small set
of recursive functions. Hence, walking large stacks tends to make the
same pcvalue queries many times.
Based on this observation, this commit adds a small, very simple, and
fast cache in front of pcvalue lookup. We thread this cache down from
operations that make many pcvalue calls, such as gentraceback, stack
scanning, and stack adjusting.
This simple cache works well because it has minimal overhead when it's
not effective. I also tried a hashed direct-map cache, CLOCK-based
replacement, round-robin replacement, and round-robin with lookups
disabled until there had been at least 16 probes, but none of these
approaches had obvious wins over the random replacement policy in this
commit.
This nearly doubles the overall performance of the deep stack test
program from issue #10898:
name old time/op new time/op delta
Issue10898 16.5s ±12% 9.2s ±12% -44.37% (p=0.008 n=5+5)
It's a very slight win on the garbage benchmark:
name old time/op new time/op delta
XBenchGarbage-12 4.92ms ± 1% 4.89ms ± 1% -0.75% (p=0.000 n=18+19)
It's a wash (but doesn't harm performance) on the go1 benchmarks,
which don't have particularly deep stacks:
name old time/op new time/op delta
BinaryTree17-12 3.11s ± 2% 3.20s ± 3% +2.83% (p=0.000 n=17+20)
Fannkuch11-12 2.51s ± 1% 2.51s ± 1% -0.22% (p=0.034 n=19+18)
FmtFprintfEmpty-12 50.8ns ± 3% 50.6ns ± 2% ~ (p=0.793 n=20+20)
FmtFprintfString-12 174ns ± 0% 174ns ± 1% +0.17% (p=0.048 n=15+20)
FmtFprintfInt-12 177ns ± 0% 165ns ± 1% -6.99% (p=0.000 n=17+19)
FmtFprintfIntInt-12 283ns ± 1% 284ns ± 0% +0.22% (p=0.000 n=18+15)
FmtFprintfPrefixedInt-12 243ns ± 1% 244ns ± 1% +0.40% (p=0.000 n=20+19)
FmtFprintfFloat-12 318ns ± 0% 319ns ± 0% +0.27% (p=0.001 n=19+20)
FmtManyArgs-12 1.12µs ± 0% 1.14µs ± 0% +1.74% (p=0.000 n=19+20)
GobDecode-12 8.69ms ± 0% 8.73ms ± 1% +0.46% (p=0.000 n=18+18)
GobEncode-12 6.64ms ± 1% 6.61ms ± 1% -0.46% (p=0.000 n=20+20)
Gzip-12 323ms ± 2% 319ms ± 1% -1.11% (p=0.000 n=20+20)
Gunzip-12 42.8ms ± 0% 42.9ms ± 0% ~ (p=0.158 n=18+20)
HTTPClientServer-12 63.3µs ± 1% 63.1µs ± 1% -0.35% (p=0.011 n=20+20)
JSONEncode-12 16.9ms ± 1% 17.3ms ± 1% +2.84% (p=0.000 n=19+20)
JSONDecode-12 59.7ms ± 0% 58.5ms ± 0% -2.05% (p=0.000 n=19+17)
Mandelbrot200-12 3.92ms ± 0% 3.91ms ± 0% -0.16% (p=0.003 n=19+19)
GoParse-12 3.79ms ± 2% 3.75ms ± 2% -0.91% (p=0.005 n=20+20)
RegexpMatchEasy0_32-12 102ns ± 1% 101ns ± 1% -0.80% (p=0.001 n=14+20)
RegexpMatchEasy0_1K-12 337ns ± 1% 346ns ± 1% +2.90% (p=0.000 n=20+19)
RegexpMatchEasy1_32-12 84.4ns ± 2% 84.3ns ± 2% ~ (p=0.743 n=20+20)
RegexpMatchEasy1_1K-12 502ns ± 1% 505ns ± 0% +0.64% (p=0.000 n=20+20)
RegexpMatchMedium_32-12 133ns ± 1% 132ns ± 1% -0.85% (p=0.000 n=20+19)
RegexpMatchMedium_1K-12 40.1µs ± 1% 39.8µs ± 1% -0.77% (p=0.000 n=18+18)
RegexpMatchHard_32-12 2.08µs ± 1% 2.07µs ± 1% -0.55% (p=0.001 n=18+19)
RegexpMatchHard_1K-12 62.4µs ± 1% 62.0µs ± 1% -0.74% (p=0.000 n=19+19)
Revcomp-12 545ms ± 2% 545ms ± 3% ~ (p=0.771 n=19+20)
Template-12 73.7ms ± 1% 72.0ms ± 0% -2.33% (p=0.000 n=20+18)
TimeParse-12 358ns ± 1% 351ns ± 1% -2.07% (p=0.000 n=20+20)
TimeFormat-12 369ns ± 1% 356ns ± 0% -3.53% (p=0.000 n=20+18)
[Geo mean] 63.5µs 63.2µs -0.41%
name old speed new speed delta
GobDecode-12 88.3MB/s ± 0% 87.9MB/s ± 0% -0.43% (p=0.000 n=18+17)
GobEncode-12 116MB/s ± 1% 116MB/s ± 1% +0.47% (p=0.000 n=20+20)
Gzip-12 60.2MB/s ± 2% 60.8MB/s ± 1% +1.13% (p=0.000 n=20+20)
Gunzip-12 453MB/s ± 0% 453MB/s ± 0% ~ (p=0.160 n=18+20)
JSONEncode-12 115MB/s ± 1% 112MB/s ± 1% -2.76% (p=0.000 n=19+20)
JSONDecode-12 32.5MB/s ± 0% 33.2MB/s ± 0% +2.09% (p=0.000 n=19+17)
GoParse-12 15.3MB/s ± 2% 15.4MB/s ± 2% +0.92% (p=0.004 n=20+20)
RegexpMatchEasy0_32-12 311MB/s ± 1% 314MB/s ± 1% +0.78% (p=0.000 n=15+19)
RegexpMatchEasy0_1K-12 3.04GB/s ± 1% 2.95GB/s ± 1% -2.90% (p=0.000 n=19+19)
RegexpMatchEasy1_32-12 379MB/s ± 2% 380MB/s ± 2% ~ (p=0.779 n=20+20)
RegexpMatchEasy1_1K-12 2.04GB/s ± 1% 2.02GB/s ± 0% -0.62% (p=0.000 n=20+20)
RegexpMatchMedium_32-12 7.46MB/s ± 1% 7.53MB/s ± 1% +0.86% (p=0.000 n=20+19)
RegexpMatchMedium_1K-12 25.5MB/s ± 1% 25.7MB/s ± 1% +0.78% (p=0.000 n=18+18)
RegexpMatchHard_32-12 15.4MB/s ± 1% 15.5MB/s ± 1% +0.62% (p=0.000 n=19+19)
RegexpMatchHard_1K-12 16.4MB/s ± 1% 16.5MB/s ± 1% +0.82% (p=0.000 n=20+19)
Revcomp-12 466MB/s ± 2% 466MB/s ± 3% ~ (p=0.765 n=19+20)
Template-12 26.3MB/s ± 1% 27.0MB/s ± 0% +2.38% (p=0.000 n=20+18)
[Geo mean] 97.8MB/s 98.0MB/s +0.23%
Change-Id: I281044ae0b24990ba46487cacbc1069493274bc4
Reviewed-on: https://go-review.googlesource.com/13614
Reviewed-by: Keith Randall <khr@golang.org>
2015-08-12 21:43:43 -06:00
|
|
|
scanframeworker(frame, &cache, gcw)
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
|
|
|
|
if frame.fp > nextBarrier {
|
|
|
|
// We skip installing a barrier on bottom-most
|
|
|
|
// frame because on LR machines this LR is not
|
|
|
|
// on the stack.
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
if gcphase == _GCmark && n != 0 {
|
runtime: don't install a stack barrier in cgocallback_gofunc's frame
Currently the runtime can install stack barriers in any frame.
However, the frame of cgocallback_gofunc is special: it's the one
function that switches from a regular G stack to the system stack on
return. Hence, the return PC slot in its frame on the G stack is
actually used to save getg().sched.pc (so tracebacks appear to unwind
to the last Go function running on that G), and not as an actual
return PC for cgocallback_gofunc.
Because of this, if we install a stack barrier in cgocallback_gofunc's
return PC slot, when cgocallback_gofunc does return, it will move the
stack barrier stub PC in to getg().sched.pc and switch back to the
system stack. The rest of the runtime doesn't know how to deal with a
stack barrier stub in sched.pc: nothing knows how to match it up with
the G's stack barrier array and, when the runtime removes stack
barriers, it doesn't know to undo the one in sched.pc. Hence, if the C
code later returns back in to Go code, it will attempt to return
through the stack barrier saved in sched.pc, which may no longer have
correct unwinding information.
Fix this by blacklisting cgocallback_gofunc's frame so the runtime
won't install a stack barrier in it's return PC slot.
Fixes #12238.
Change-Id: I46aa2155df2fd050dd50de3434b62987dc4947b8
Reviewed-on: https://go-review.googlesource.com/13944
Reviewed-by: Russ Cox <rsc@golang.org>
2015-08-26 10:16:51 -06:00
|
|
|
if gcInstallStackBarrier(gp, frame) {
|
|
|
|
barrierOffset *= 2
|
|
|
|
nextBarrier = sp + barrierOffset
|
|
|
|
}
|
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of
stack re-scanning that must be done during mark termination.
Currently the GC scans stacks of active goroutines twice during every
GC cycle: once at the beginning during root discovery and once at the
end during mark termination. The second scan happens while the world
is stopped and guarantees that we've seen all of the roots (since
there are no write barriers on writes to local stack
variables). However, this means pause time is proportional to stack
size. In particularly recursive programs, this can drive pause time up
past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).
Re-scanning the entire stack is rarely necessary, especially for large
stacks, because usually most of the frames on the stack were not
active between the first and second scans and hence any changes to
these frames (via non-escaping pointers passed down the stack) were
tracked by write barriers.
To efficiently track how far a stack has been unwound since the first
scan (and, hence, how much needs to be re-scanned), this commit
introduces stack barriers. During the first scan, at exponentially
spaced points in each stack, the scan overwrites return PCs with the
PC of the stack barrier function. When "returned" to, the stack
barrier function records how far the stack has unwound and jumps to
the original return PC for that point in the stack. Then the second
scan only needs to proceed as far as the lowest barrier that hasn't
been hit.
For deeply recursive programs, this substantially reduces mark
termination time (and hence pause time). For the goscheme example
linked in issue #10898, prior to this change, mark termination times
were typically between 100 and 500ms; with this change, mark
termination times are typically between 10 and 20ms. As a result of
the reduced stack scanning work, this reduces overall execution time
of the goscheme example by 20%.
Fixes #10898.
The effect of this on programs that are not deeply recursive is
minimal:
name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)
Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-05-20 14:30:49 -06:00
|
|
|
} else if gcphase == _GCmarktermination {
|
|
|
|
// We just scanned a frame containing
|
|
|
|
// a return to a stack barrier. Since
|
|
|
|
// this frame never returned, we can
|
|
|
|
// stop scanning.
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
n++
|
|
|
|
|
2015-02-19 11:38:46 -07:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0)
|
|
|
|
tracebackdefers(gp, scanframe, nil)
|
runtime: replace per-M workbuf cache with per-P gcWork cache
Currently, each M has a cache of the most recently used *workbuf. This
is used primarily by the write barrier so it doesn't have to access
the global workbuf lists on every write barrier. It's also used by
stack scanning because it's convenient.
This cache is important for write barrier performance, but this
particular approach has several downsides. It's faster than no cache,
but far from optimal (as the benchmarks below show). It's complex:
access to the cache is sprinkled through most of the workbuf list
operations and it requires special care to transform into and back out
of the gcWork cache that's actually used for scanning and marking. It
requires atomic exchanges to take ownership of the cached workbuf and
to return it to the M's cache even though it's almost always used by
only the current M. Since it's per-M, flushing these caches is O(# of
Ms), which may be high. And it has some significant subtleties: for
example, in general the cache shouldn't be used after the
harvestwbufs() in mark termination because it could hide work from
mark termination, but stack scanning can happen after this and *will*
use the cache (but it turns out this is okay because it will always be
followed by a getfull(), which drains the cache).
This change replaces this cache with a per-P gcWork object. This
gcWork cache can be used directly by scanning and marking (as long as
preemption is disabled, which is a general requirement of gcWork).
Since it's per-P, it doesn't require synchronization, which simplifies
things and means the only atomic operations in the write barrier are
occasionally fetching new work buffers and setting a mark bit if the
object isn't already marked. This cache can be flushed in O(# of Ps),
which is generally small. It follows a simple flushing rule: the cache
can be used during any phase, but during mark termination it must be
flushed before allowing preemption. This also makes the dispose during
mutator assist no longer necessary, which eliminates the vast majority
of gcWork dispose calls and reduces contention on the global workbuf
lists. And it's a lot faster on some benchmarks:
benchmark old ns/op new ns/op delta
BenchmarkBinaryTree17 11963668673 11206112763 -6.33%
BenchmarkFannkuch11 2643217136 2649182499 +0.23%
BenchmarkFmtFprintfEmpty 70.4 70.2 -0.28%
BenchmarkFmtFprintfString 364 307 -15.66%
BenchmarkFmtFprintfInt 317 282 -11.04%
BenchmarkFmtFprintfIntInt 512 483 -5.66%
BenchmarkFmtFprintfPrefixedInt 404 380 -5.94%
BenchmarkFmtFprintfFloat 521 479 -8.06%
BenchmarkFmtManyArgs 2164 1894 -12.48%
BenchmarkGobDecode 30366146 22429593 -26.14%
BenchmarkGobEncode 29867472 26663152 -10.73%
BenchmarkGzip 391236616 396779490 +1.42%
BenchmarkGunzip 96639491 96297024 -0.35%
BenchmarkHTTPClientServer 100110 70763 -29.31%
BenchmarkJSONEncode 51866051 52511382 +1.24%
BenchmarkJSONDecode 103813138 86094963 -17.07%
BenchmarkMandelbrot200 4121834 4120886 -0.02%
BenchmarkGoParse 16472789 5879949 -64.31%
BenchmarkRegexpMatchEasy0_32 140 140 +0.00%
BenchmarkRegexpMatchEasy0_1K 394 394 +0.00%
BenchmarkRegexpMatchEasy1_32 120 120 +0.00%
BenchmarkRegexpMatchEasy1_1K 621 614 -1.13%
BenchmarkRegexpMatchMedium_32 209 202 -3.35%
BenchmarkRegexpMatchMedium_1K 54889 55175 +0.52%
BenchmarkRegexpMatchHard_32 2682 2675 -0.26%
BenchmarkRegexpMatchHard_1K 79383 79524 +0.18%
BenchmarkRevcomp 584116718 584595320 +0.08%
BenchmarkTemplate 125400565 109620196 -12.58%
BenchmarkTimeParse 386 387 +0.26%
BenchmarkTimeFormat 580 447 -22.93%
(Best out of 10 runs. The delta of averages is similar.)
This also puts us in a good position to flush these caches when
nearing the end of concurrent marking, which will let us increase the
size of the work buffers while still controlling mark termination
pause time.
Change-Id: I2dd94c8517a19297a98ec280203cccaa58792522
Reviewed-on: https://go-review.googlesource.com/9178
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2015-04-19 13:22:20 -06:00
|
|
|
if gcphase == _GCmarktermination {
|
|
|
|
gcw.dispose()
|
|
|
|
}
|
2015-11-18 12:10:40 -07:00
|
|
|
if gcphase == _GCmark {
|
|
|
|
gcUnlockStackBarriers(gp)
|
|
|
|
}
|
2015-02-19 11:38:46 -07:00
|
|
|
gp.gcscanvalid = true
|
|
|
|
}
|
|
|
|
|
|
|
|
// Scan a stack frame: local variables and function arguments/results.
|
|
|
|
//go:nowritebarrier
|
runtime: add pcvalue cache to improve stack scan speed
The cost of scanning large stacks is currently dominated by the time
spent looking up and decoding the pcvalue table. However, large stacks
are usually large not because they contain calls to many different
functions, but because they contain many calls to the same, small set
of recursive functions. Hence, walking large stacks tends to make the
same pcvalue queries many times.
Based on this observation, this commit adds a small, very simple, and
fast cache in front of pcvalue lookup. We thread this cache down from
operations that make many pcvalue calls, such as gentraceback, stack
scanning, and stack adjusting.
This simple cache works well because it has minimal overhead when it's
not effective. I also tried a hashed direct-map cache, CLOCK-based
replacement, round-robin replacement, and round-robin with lookups
disabled until there had been at least 16 probes, but none of these
approaches had obvious wins over the random replacement policy in this
commit.
This nearly doubles the overall performance of the deep stack test
program from issue #10898:
name old time/op new time/op delta
Issue10898 16.5s ±12% 9.2s ±12% -44.37% (p=0.008 n=5+5)
It's a very slight win on the garbage benchmark:
name old time/op new time/op delta
XBenchGarbage-12 4.92ms ± 1% 4.89ms ± 1% -0.75% (p=0.000 n=18+19)
It's a wash (but doesn't harm performance) on the go1 benchmarks,
which don't have particularly deep stacks:
name old time/op new time/op delta
BinaryTree17-12 3.11s ± 2% 3.20s ± 3% +2.83% (p=0.000 n=17+20)
Fannkuch11-12 2.51s ± 1% 2.51s ± 1% -0.22% (p=0.034 n=19+18)
FmtFprintfEmpty-12 50.8ns ± 3% 50.6ns ± 2% ~ (p=0.793 n=20+20)
FmtFprintfString-12 174ns ± 0% 174ns ± 1% +0.17% (p=0.048 n=15+20)
FmtFprintfInt-12 177ns ± 0% 165ns ± 1% -6.99% (p=0.000 n=17+19)
FmtFprintfIntInt-12 283ns ± 1% 284ns ± 0% +0.22% (p=0.000 n=18+15)
FmtFprintfPrefixedInt-12 243ns ± 1% 244ns ± 1% +0.40% (p=0.000 n=20+19)
FmtFprintfFloat-12 318ns ± 0% 319ns ± 0% +0.27% (p=0.001 n=19+20)
FmtManyArgs-12 1.12µs ± 0% 1.14µs ± 0% +1.74% (p=0.000 n=19+20)
GobDecode-12 8.69ms ± 0% 8.73ms ± 1% +0.46% (p=0.000 n=18+18)
GobEncode-12 6.64ms ± 1% 6.61ms ± 1% -0.46% (p=0.000 n=20+20)
Gzip-12 323ms ± 2% 319ms ± 1% -1.11% (p=0.000 n=20+20)
Gunzip-12 42.8ms ± 0% 42.9ms ± 0% ~ (p=0.158 n=18+20)
HTTPClientServer-12 63.3µs ± 1% 63.1µs ± 1% -0.35% (p=0.011 n=20+20)
JSONEncode-12 16.9ms ± 1% 17.3ms ± 1% +2.84% (p=0.000 n=19+20)
JSONDecode-12 59.7ms ± 0% 58.5ms ± 0% -2.05% (p=0.000 n=19+17)
Mandelbrot200-12 3.92ms ± 0% 3.91ms ± 0% -0.16% (p=0.003 n=19+19)
GoParse-12 3.79ms ± 2% 3.75ms ± 2% -0.91% (p=0.005 n=20+20)
RegexpMatchEasy0_32-12 102ns ± 1% 101ns ± 1% -0.80% (p=0.001 n=14+20)
RegexpMatchEasy0_1K-12 337ns ± 1% 346ns ± 1% +2.90% (p=0.000 n=20+19)
RegexpMatchEasy1_32-12 84.4ns ± 2% 84.3ns ± 2% ~ (p=0.743 n=20+20)
RegexpMatchEasy1_1K-12 502ns ± 1% 505ns ± 0% +0.64% (p=0.000 n=20+20)
RegexpMatchMedium_32-12 133ns ± 1% 132ns ± 1% -0.85% (p=0.000 n=20+19)
RegexpMatchMedium_1K-12 40.1µs ± 1% 39.8µs ± 1% -0.77% (p=0.000 n=18+18)
RegexpMatchHard_32-12 2.08µs ± 1% 2.07µs ± 1% -0.55% (p=0.001 n=18+19)
RegexpMatchHard_1K-12 62.4µs ± 1% 62.0µs ± 1% -0.74% (p=0.000 n=19+19)
Revcomp-12 545ms ± 2% 545ms ± 3% ~ (p=0.771 n=19+20)
Template-12 73.7ms ± 1% 72.0ms ± 0% -2.33% (p=0.000 n=20+18)
TimeParse-12 358ns ± 1% 351ns ± 1% -2.07% (p=0.000 n=20+20)
TimeFormat-12 369ns ± 1% 356ns ± 0% -3.53% (p=0.000 n=20+18)
[Geo mean] 63.5µs 63.2µs -0.41%
name old speed new speed delta
GobDecode-12 88.3MB/s ± 0% 87.9MB/s ± 0% -0.43% (p=0.000 n=18+17)
GobEncode-12 116MB/s ± 1% 116MB/s ± 1% +0.47% (p=0.000 n=20+20)
Gzip-12 60.2MB/s ± 2% 60.8MB/s ± 1% +1.13% (p=0.000 n=20+20)
Gunzip-12 453MB/s ± 0% 453MB/s ± 0% ~ (p=0.160 n=18+20)
JSONEncode-12 115MB/s ± 1% 112MB/s ± 1% -2.76% (p=0.000 n=19+20)
JSONDecode-12 32.5MB/s ± 0% 33.2MB/s ± 0% +2.09% (p=0.000 n=19+17)
GoParse-12 15.3MB/s ± 2% 15.4MB/s ± 2% +0.92% (p=0.004 n=20+20)
RegexpMatchEasy0_32-12 311MB/s ± 1% 314MB/s ± 1% +0.78% (p=0.000 n=15+19)
RegexpMatchEasy0_1K-12 3.04GB/s ± 1% 2.95GB/s ± 1% -2.90% (p=0.000 n=19+19)
RegexpMatchEasy1_32-12 379MB/s ± 2% 380MB/s ± 2% ~ (p=0.779 n=20+20)
RegexpMatchEasy1_1K-12 2.04GB/s ± 1% 2.02GB/s ± 0% -0.62% (p=0.000 n=20+20)
RegexpMatchMedium_32-12 7.46MB/s ± 1% 7.53MB/s ± 1% +0.86% (p=0.000 n=20+19)
RegexpMatchMedium_1K-12 25.5MB/s ± 1% 25.7MB/s ± 1% +0.78% (p=0.000 n=18+18)
RegexpMatchHard_32-12 15.4MB/s ± 1% 15.5MB/s ± 1% +0.62% (p=0.000 n=19+19)
RegexpMatchHard_1K-12 16.4MB/s ± 1% 16.5MB/s ± 1% +0.82% (p=0.000 n=20+19)
Revcomp-12 466MB/s ± 2% 466MB/s ± 3% ~ (p=0.765 n=19+20)
Template-12 26.3MB/s ± 1% 27.0MB/s ± 0% +2.38% (p=0.000 n=20+18)
[Geo mean] 97.8MB/s 98.0MB/s +0.23%
Change-Id: I281044ae0b24990ba46487cacbc1069493274bc4
Reviewed-on: https://go-review.googlesource.com/13614
Reviewed-by: Keith Randall <khr@golang.org>
2015-08-12 21:43:43 -06:00
|
|
|
func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
|
2015-02-19 11:38:46 -07:00
|
|
|
|
|
|
|
f := frame.fn
|
|
|
|
targetpc := frame.continpc
|
|
|
|
if targetpc == 0 {
|
|
|
|
// Frame is dead.
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if _DebugGC > 1 {
|
|
|
|
print("scanframe ", funcname(f), "\n")
|
|
|
|
}
|
|
|
|
if targetpc != f.entry {
|
|
|
|
targetpc--
|
|
|
|
}
|
runtime: add pcvalue cache to improve stack scan speed
The cost of scanning large stacks is currently dominated by the time
spent looking up and decoding the pcvalue table. However, large stacks
are usually large not because they contain calls to many different
functions, but because they contain many calls to the same, small set
of recursive functions. Hence, walking large stacks tends to make the
same pcvalue queries many times.
Based on this observation, this commit adds a small, very simple, and
fast cache in front of pcvalue lookup. We thread this cache down from
operations that make many pcvalue calls, such as gentraceback, stack
scanning, and stack adjusting.
This simple cache works well because it has minimal overhead when it's
not effective. I also tried a hashed direct-map cache, CLOCK-based
replacement, round-robin replacement, and round-robin with lookups
disabled until there had been at least 16 probes, but none of these
approaches had obvious wins over the random replacement policy in this
commit.
This nearly doubles the overall performance of the deep stack test
program from issue #10898:
name old time/op new time/op delta
Issue10898 16.5s ±12% 9.2s ±12% -44.37% (p=0.008 n=5+5)
It's a very slight win on the garbage benchmark:
name old time/op new time/op delta
XBenchGarbage-12 4.92ms ± 1% 4.89ms ± 1% -0.75% (p=0.000 n=18+19)
It's a wash (but doesn't harm performance) on the go1 benchmarks,
which don't have particularly deep stacks:
name old time/op new time/op delta
BinaryTree17-12 3.11s ± 2% 3.20s ± 3% +2.83% (p=0.000 n=17+20)
Fannkuch11-12 2.51s ± 1% 2.51s ± 1% -0.22% (p=0.034 n=19+18)
FmtFprintfEmpty-12 50.8ns ± 3% 50.6ns ± 2% ~ (p=0.793 n=20+20)
FmtFprintfString-12 174ns ± 0% 174ns ± 1% +0.17% (p=0.048 n=15+20)
FmtFprintfInt-12 177ns ± 0% 165ns ± 1% -6.99% (p=0.000 n=17+19)
FmtFprintfIntInt-12 283ns ± 1% 284ns ± 0% +0.22% (p=0.000 n=18+15)
FmtFprintfPrefixedInt-12 243ns ± 1% 244ns ± 1% +0.40% (p=0.000 n=20+19)
FmtFprintfFloat-12 318ns ± 0% 319ns ± 0% +0.27% (p=0.001 n=19+20)
FmtManyArgs-12 1.12µs ± 0% 1.14µs ± 0% +1.74% (p=0.000 n=19+20)
GobDecode-12 8.69ms ± 0% 8.73ms ± 1% +0.46% (p=0.000 n=18+18)
GobEncode-12 6.64ms ± 1% 6.61ms ± 1% -0.46% (p=0.000 n=20+20)
Gzip-12 323ms ± 2% 319ms ± 1% -1.11% (p=0.000 n=20+20)
Gunzip-12 42.8ms ± 0% 42.9ms ± 0% ~ (p=0.158 n=18+20)
HTTPClientServer-12 63.3µs ± 1% 63.1µs ± 1% -0.35% (p=0.011 n=20+20)
JSONEncode-12 16.9ms ± 1% 17.3ms ± 1% +2.84% (p=0.000 n=19+20)
JSONDecode-12 59.7ms ± 0% 58.5ms ± 0% -2.05% (p=0.000 n=19+17)
Mandelbrot200-12 3.92ms ± 0% 3.91ms ± 0% -0.16% (p=0.003 n=19+19)
GoParse-12 3.79ms ± 2% 3.75ms ± 2% -0.91% (p=0.005 n=20+20)
RegexpMatchEasy0_32-12 102ns ± 1% 101ns ± 1% -0.80% (p=0.001 n=14+20)
RegexpMatchEasy0_1K-12 337ns ± 1% 346ns ± 1% +2.90% (p=0.000 n=20+19)
RegexpMatchEasy1_32-12 84.4ns ± 2% 84.3ns ± 2% ~ (p=0.743 n=20+20)
RegexpMatchEasy1_1K-12 502ns ± 1% 505ns ± 0% +0.64% (p=0.000 n=20+20)
RegexpMatchMedium_32-12 133ns ± 1% 132ns ± 1% -0.85% (p=0.000 n=20+19)
RegexpMatchMedium_1K-12 40.1µs ± 1% 39.8µs ± 1% -0.77% (p=0.000 n=18+18)
RegexpMatchHard_32-12 2.08µs ± 1% 2.07µs ± 1% -0.55% (p=0.001 n=18+19)
RegexpMatchHard_1K-12 62.4µs ± 1% 62.0µs ± 1% -0.74% (p=0.000 n=19+19)
Revcomp-12 545ms ± 2% 545ms ± 3% ~ (p=0.771 n=19+20)
Template-12 73.7ms ± 1% 72.0ms ± 0% -2.33% (p=0.000 n=20+18)
TimeParse-12 358ns ± 1% 351ns ± 1% -2.07% (p=0.000 n=20+20)
TimeFormat-12 369ns ± 1% 356ns ± 0% -3.53% (p=0.000 n=20+18)
[Geo mean] 63.5µs 63.2µs -0.41%
name old speed new speed delta
GobDecode-12 88.3MB/s ± 0% 87.9MB/s ± 0% -0.43% (p=0.000 n=18+17)
GobEncode-12 116MB/s ± 1% 116MB/s ± 1% +0.47% (p=0.000 n=20+20)
Gzip-12 60.2MB/s ± 2% 60.8MB/s ± 1% +1.13% (p=0.000 n=20+20)
Gunzip-12 453MB/s ± 0% 453MB/s ± 0% ~ (p=0.160 n=18+20)
JSONEncode-12 115MB/s ± 1% 112MB/s ± 1% -2.76% (p=0.000 n=19+20)
JSONDecode-12 32.5MB/s ± 0% 33.2MB/s ± 0% +2.09% (p=0.000 n=19+17)
GoParse-12 15.3MB/s ± 2% 15.4MB/s ± 2% +0.92% (p=0.004 n=20+20)
RegexpMatchEasy0_32-12 311MB/s ± 1% 314MB/s ± 1% +0.78% (p=0.000 n=15+19)
RegexpMatchEasy0_1K-12 3.04GB/s ± 1% 2.95GB/s ± 1% -2.90% (p=0.000 n=19+19)
RegexpMatchEasy1_32-12 379MB/s ± 2% 380MB/s ± 2% ~ (p=0.779 n=20+20)
RegexpMatchEasy1_1K-12 2.04GB/s ± 1% 2.02GB/s ± 0% -0.62% (p=0.000 n=20+20)
RegexpMatchMedium_32-12 7.46MB/s ± 1% 7.53MB/s ± 1% +0.86% (p=0.000 n=20+19)
RegexpMatchMedium_1K-12 25.5MB/s ± 1% 25.7MB/s ± 1% +0.78% (p=0.000 n=18+18)
RegexpMatchHard_32-12 15.4MB/s ± 1% 15.5MB/s ± 1% +0.62% (p=0.000 n=19+19)
RegexpMatchHard_1K-12 16.4MB/s ± 1% 16.5MB/s ± 1% +0.82% (p=0.000 n=20+19)
Revcomp-12 466MB/s ± 2% 466MB/s ± 3% ~ (p=0.765 n=19+20)
Template-12 26.3MB/s ± 1% 27.0MB/s ± 0% +2.38% (p=0.000 n=20+18)
[Geo mean] 97.8MB/s 98.0MB/s +0.23%
Change-Id: I281044ae0b24990ba46487cacbc1069493274bc4
Reviewed-on: https://go-review.googlesource.com/13614
Reviewed-by: Keith Randall <khr@golang.org>
2015-08-12 21:43:43 -06:00
|
|
|
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, cache)
|
2015-02-19 11:38:46 -07:00
|
|
|
if pcdata == -1 {
|
|
|
|
// We do not have a valid pcdata value but there might be a
|
2016-03-01 16:21:55 -07:00
|
|
|
// stackmap for this function. It is likely that we are looking
|
2015-02-19 11:38:46 -07:00
|
|
|
// at the function prologue, assume so and hope for the best.
|
|
|
|
pcdata = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// Scan local variables if stack frame has been allocated.
|
|
|
|
size := frame.varp - frame.sp
|
|
|
|
var minsize uintptr
|
2016-04-07 00:42:35 -06:00
|
|
|
switch sys.ArchFamily {
|
|
|
|
case sys.ARM64:
|
2015-11-11 10:39:30 -07:00
|
|
|
minsize = sys.SpAlign
|
2015-03-08 07:20:20 -06:00
|
|
|
default:
|
2015-11-11 10:39:30 -07:00
|
|
|
minsize = sys.MinFrameSize
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
if size > minsize {
|
|
|
|
stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
|
|
|
|
if stkmap == nil || stkmap.n <= 0 {
|
|
|
|
print("runtime: frame ", funcname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n")
|
|
|
|
throw("missing stackmap")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Locals bitmap information, scan just the pointers in locals.
|
|
|
|
if pcdata < 0 || pcdata >= stkmap.n {
|
|
|
|
// don't know where we are
|
|
|
|
print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " locals stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n")
|
|
|
|
throw("scanframe: bad symbol table")
|
|
|
|
}
|
|
|
|
bv := stackmapdata(stkmap, pcdata)
|
2015-11-11 10:39:30 -07:00
|
|
|
size = uintptr(bv.n) * sys.PtrSize
|
2015-02-19 11:38:46 -07:00
|
|
|
scanblock(frame.varp-size, size, bv.bytedata, gcw)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Scan arguments.
|
|
|
|
if frame.arglen > 0 {
|
|
|
|
var bv bitvector
|
|
|
|
if frame.argmap != nil {
|
|
|
|
bv = *frame.argmap
|
|
|
|
} else {
|
|
|
|
stkmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
|
|
|
|
if stkmap == nil || stkmap.n <= 0 {
|
|
|
|
print("runtime: frame ", funcname(f), " untyped args ", hex(frame.argp), "+", hex(frame.arglen), "\n")
|
|
|
|
throw("missing stackmap")
|
|
|
|
}
|
|
|
|
if pcdata < 0 || pcdata >= stkmap.n {
|
|
|
|
// don't know where we are
|
|
|
|
print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " args stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n")
|
|
|
|
throw("scanframe: bad symbol table")
|
|
|
|
}
|
|
|
|
bv = stackmapdata(stkmap, pcdata)
|
|
|
|
}
|
2015-11-11 10:39:30 -07:00
|
|
|
scanblock(frame.argp, uintptr(bv.n)*sys.PtrSize, bv.bytedata, gcw)
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-04 20:42:43 -06:00
|
|
|
type gcDrainFlags int
|
runtime: multi-threaded, utilization-scheduled background mark
Currently, the concurrent mark phase is performed by the main GC
goroutine. Prior to the previous commit enabling preemption, this
caused marking to always consume 1/GOMAXPROCS of the available CPU
time. If GOMAXPROCS=1, this meant background GC would consume 100% of
the CPU (effectively a STW). If GOMAXPROCS>4, background GC would use
less than the goal of 25%. If GOMAXPROCS=4, background GC would use
the goal 25%, but if the mutator wasn't using the remaining 75%,
background marking wouldn't take advantage of the idle time. Enabling
preemption in the previous commit made GC miss CPU targets in
completely different ways, but set us up to bring everything back in
line.
This change replaces the fixed GC goroutine with per-P background mark
goroutines. Once started, these goroutines don't go in the standard
run queues; instead, they are scheduled specially such that the time
spent in mutator assists and the background mark goroutines totals 25%
of the CPU time available to the program. Furthermore, this lets
background marking take advantage of idle Ps, which significantly
boosts GC performance for applications that under-utilize the CPU.
This requires also changing how time is reported for gctrace, so this
change splits the concurrent mark CPU time into assist/background/idle
scanning.
This also requires increasing the size of the StackRecord slice used
in a GoroutineProfile test.
Change-Id: I0936ff907d2cee6cb687a208f2df47e8988e3157
Reviewed-on: https://go-review.googlesource.com/8850
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-03-23 19:07:33 -06:00
|
|
|
|
2015-10-04 20:42:43 -06:00
|
|
|
const (
|
|
|
|
gcDrainUntilPreempt gcDrainFlags = 1 << iota
|
runtime: eliminate getfull barrier from concurrent mark
Currently dedicated mark workers participate in the getfull barrier
during concurrent mark. However, the getfull barrier wasn't designed
for concurrent work and this causes no end of headaches.
In the concurrent setting, participants come and go. This makes mark
completion susceptible to live-lock: since dedicated workers are only
periodically polling for completion, it's possible for the program to
be in some transient worker each time one of the dedicated workers
wakes up to check if it can exit the getfull barrier. It also
complicates reasoning about the system because dedicated workers
participate directly in the getfull barrier, but transient workers
must instead use trygetfull because they have exit conditions that
aren't captured by getfull (e.g., fractional workers exit when
preempted). The complexity of implementing these exit conditions
contributed to #11677. Furthermore, the getfull barrier is inefficient
because we could be running user code instead of spinning on a P. In
effect, we're dedicating 25% of the CPU to marking even if that means
we have to spin to make that 25%. It also causes issues on Windows
because we can't actually sleep for 100µs (#8687).
Fix this by making dedicated workers no longer participate in the
getfull barrier. Instead, dedicated workers simply return to the
scheduler when they fail to get more work, regardless of what others
workers are doing, and the scheduler only starts new dedicated workers
if there's work available. Everything that needs to be handled by this
barrier is already handled by detection of mark completion.
This makes the system much more symmetric because all workers and
assists now use trygetfull during concurrent mark. It also loosens the
25% CPU target so that we can give some of that 25% back to user code
if there isn't enough work to keep the mark worker busy. And it
eliminates the problematic 100µs sleep on Windows during concurrent
mark (though not during mark termination).
The downside of this is that if we hit a bottleneck in the heap graph
that then expands back out, the system may shut down dedicated workers
and take a while to start them back up. We'll address this in the next
commit.
Updates #12041 and #8687.
No effect on the go1 benchmarks. This slows down the garbage benchmark
by 9%, but we'll more than make it up in the next commit.
name old time/op new time/op delta
XBenchGarbage-12 5.80ms ± 2% 6.32ms ± 4% +9.03% (p=0.000 n=20+20)
Change-Id: I65100a9ba005a8b5cf97940798918672ea9dd09b
Reviewed-on: https://go-review.googlesource.com/16297
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-26 14:29:25 -06:00
|
|
|
gcDrainNoBlock
|
2015-10-04 20:47:27 -06:00
|
|
|
gcDrainFlushBgCredit
|
2015-10-04 20:42:43 -06:00
|
|
|
|
runtime: eliminate getfull barrier from concurrent mark
Currently dedicated mark workers participate in the getfull barrier
during concurrent mark. However, the getfull barrier wasn't designed
for concurrent work and this causes no end of headaches.
In the concurrent setting, participants come and go. This makes mark
completion susceptible to live-lock: since dedicated workers are only
periodically polling for completion, it's possible for the program to
be in some transient worker each time one of the dedicated workers
wakes up to check if it can exit the getfull barrier. It also
complicates reasoning about the system because dedicated workers
participate directly in the getfull barrier, but transient workers
must instead use trygetfull because they have exit conditions that
aren't captured by getfull (e.g., fractional workers exit when
preempted). The complexity of implementing these exit conditions
contributed to #11677. Furthermore, the getfull barrier is inefficient
because we could be running user code instead of spinning on a P. In
effect, we're dedicating 25% of the CPU to marking even if that means
we have to spin to make that 25%. It also causes issues on Windows
because we can't actually sleep for 100µs (#8687).
Fix this by making dedicated workers no longer participate in the
getfull barrier. Instead, dedicated workers simply return to the
scheduler when they fail to get more work, regardless of what others
workers are doing, and the scheduler only starts new dedicated workers
if there's work available. Everything that needs to be handled by this
barrier is already handled by detection of mark completion.
This makes the system much more symmetric because all workers and
assists now use trygetfull during concurrent mark. It also loosens the
25% CPU target so that we can give some of that 25% back to user code
if there isn't enough work to keep the mark worker busy. And it
eliminates the problematic 100µs sleep on Windows during concurrent
mark (though not during mark termination).
The downside of this is that if we hit a bottleneck in the heap graph
that then expands back out, the system may shut down dedicated workers
and take a while to start them back up. We'll address this in the next
commit.
Updates #12041 and #8687.
No effect on the go1 benchmarks. This slows down the garbage benchmark
by 9%, but we'll more than make it up in the next commit.
name old time/op new time/op delta
XBenchGarbage-12 5.80ms ± 2% 6.32ms ± 4% +9.03% (p=0.000 n=20+20)
Change-Id: I65100a9ba005a8b5cf97940798918672ea9dd09b
Reviewed-on: https://go-review.googlesource.com/16297
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-26 14:29:25 -06:00
|
|
|
// gcDrainBlock means neither gcDrainUntilPreempt or
|
|
|
|
// gcDrainNoBlock. It is the default, but callers should use
|
|
|
|
// the constant for documentation purposes.
|
2015-10-04 20:42:43 -06:00
|
|
|
gcDrainBlock gcDrainFlags = 0
|
|
|
|
)
|
|
|
|
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
// gcDrain scans roots and objects in work buffers, blackening grey
|
|
|
|
// objects until all roots and work buffers have been drained.
|
2015-10-04 20:42:43 -06:00
|
|
|
//
|
runtime: eliminate getfull barrier from concurrent mark
Currently dedicated mark workers participate in the getfull barrier
during concurrent mark. However, the getfull barrier wasn't designed
for concurrent work and this causes no end of headaches.
In the concurrent setting, participants come and go. This makes mark
completion susceptible to live-lock: since dedicated workers are only
periodically polling for completion, it's possible for the program to
be in some transient worker each time one of the dedicated workers
wakes up to check if it can exit the getfull barrier. It also
complicates reasoning about the system because dedicated workers
participate directly in the getfull barrier, but transient workers
must instead use trygetfull because they have exit conditions that
aren't captured by getfull (e.g., fractional workers exit when
preempted). The complexity of implementing these exit conditions
contributed to #11677. Furthermore, the getfull barrier is inefficient
because we could be running user code instead of spinning on a P. In
effect, we're dedicating 25% of the CPU to marking even if that means
we have to spin to make that 25%. It also causes issues on Windows
because we can't actually sleep for 100µs (#8687).
Fix this by making dedicated workers no longer participate in the
getfull barrier. Instead, dedicated workers simply return to the
scheduler when they fail to get more work, regardless of what others
workers are doing, and the scheduler only starts new dedicated workers
if there's work available. Everything that needs to be handled by this
barrier is already handled by detection of mark completion.
This makes the system much more symmetric because all workers and
assists now use trygetfull during concurrent mark. It also loosens the
25% CPU target so that we can give some of that 25% back to user code
if there isn't enough work to keep the mark worker busy. And it
eliminates the problematic 100µs sleep on Windows during concurrent
mark (though not during mark termination).
The downside of this is that if we hit a bottleneck in the heap graph
that then expands back out, the system may shut down dedicated workers
and take a while to start them back up. We'll address this in the next
commit.
Updates #12041 and #8687.
No effect on the go1 benchmarks. This slows down the garbage benchmark
by 9%, but we'll more than make it up in the next commit.
name old time/op new time/op delta
XBenchGarbage-12 5.80ms ± 2% 6.32ms ± 4% +9.03% (p=0.000 n=20+20)
Change-Id: I65100a9ba005a8b5cf97940798918672ea9dd09b
Reviewed-on: https://go-review.googlesource.com/16297
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-26 14:29:25 -06:00
|
|
|
// If flags&gcDrainUntilPreempt != 0, gcDrain returns when g.preempt
|
|
|
|
// is set. This implies gcDrainNoBlock.
|
|
|
|
//
|
|
|
|
// If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is
|
|
|
|
// unable to get more work. Otherwise, it will block until all
|
|
|
|
// blocking calls are blocked in gcDrain.
|
2015-10-04 20:42:43 -06:00
|
|
|
//
|
2015-10-04 20:47:27 -06:00
|
|
|
// If flags&gcDrainFlushBgCredit != 0, gcDrain flushes scan work
|
2015-10-04 21:00:01 -06:00
|
|
|
// credit to gcController.bgScanCredit every gcCreditSlack units of
|
2015-10-04 20:47:27 -06:00
|
|
|
// scan work.
|
runtime: eliminate getfull barrier from concurrent mark
Currently dedicated mark workers participate in the getfull barrier
during concurrent mark. However, the getfull barrier wasn't designed
for concurrent work and this causes no end of headaches.
In the concurrent setting, participants come and go. This makes mark
completion susceptible to live-lock: since dedicated workers are only
periodically polling for completion, it's possible for the program to
be in some transient worker each time one of the dedicated workers
wakes up to check if it can exit the getfull barrier. It also
complicates reasoning about the system because dedicated workers
participate directly in the getfull barrier, but transient workers
must instead use trygetfull because they have exit conditions that
aren't captured by getfull (e.g., fractional workers exit when
preempted). The complexity of implementing these exit conditions
contributed to #11677. Furthermore, the getfull barrier is inefficient
because we could be running user code instead of spinning on a P. In
effect, we're dedicating 25% of the CPU to marking even if that means
we have to spin to make that 25%. It also causes issues on Windows
because we can't actually sleep for 100µs (#8687).
Fix this by making dedicated workers no longer participate in the
getfull barrier. Instead, dedicated workers simply return to the
scheduler when they fail to get more work, regardless of what others
workers are doing, and the scheduler only starts new dedicated workers
if there's work available. Everything that needs to be handled by this
barrier is already handled by detection of mark completion.
This makes the system much more symmetric because all workers and
assists now use trygetfull during concurrent mark. It also loosens the
25% CPU target so that we can give some of that 25% back to user code
if there isn't enough work to keep the mark worker busy. And it
eliminates the problematic 100µs sleep on Windows during concurrent
mark (though not during mark termination).
The downside of this is that if we hit a bottleneck in the heap graph
that then expands back out, the system may shut down dedicated workers
and take a while to start them back up. We'll address this in the next
commit.
Updates #12041 and #8687.
No effect on the go1 benchmarks. This slows down the garbage benchmark
by 9%, but we'll more than make it up in the next commit.
name old time/op new time/op delta
XBenchGarbage-12 5.80ms ± 2% 6.32ms ± 4% +9.03% (p=0.000 n=20+20)
Change-Id: I65100a9ba005a8b5cf97940798918672ea9dd09b
Reviewed-on: https://go-review.googlesource.com/16297
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-26 14:29:25 -06:00
|
|
|
//
|
2015-02-19 11:38:46 -07:00
|
|
|
//go:nowritebarrier
|
2015-10-04 20:47:27 -06:00
|
|
|
func gcDrain(gcw *gcWork, flags gcDrainFlags) {
|
2015-11-13 18:45:22 -07:00
|
|
|
if !writeBarrier.needed {
|
2015-07-24 10:33:23 -06:00
|
|
|
throw("gcDrain phase incorrect")
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
gp := getg()
|
2016-01-06 15:07:58 -07:00
|
|
|
preemptible := flags&gcDrainUntilPreempt != 0
|
runtime: eliminate getfull barrier from concurrent mark
Currently dedicated mark workers participate in the getfull barrier
during concurrent mark. However, the getfull barrier wasn't designed
for concurrent work and this causes no end of headaches.
In the concurrent setting, participants come and go. This makes mark
completion susceptible to live-lock: since dedicated workers are only
periodically polling for completion, it's possible for the program to
be in some transient worker each time one of the dedicated workers
wakes up to check if it can exit the getfull barrier. It also
complicates reasoning about the system because dedicated workers
participate directly in the getfull barrier, but transient workers
must instead use trygetfull because they have exit conditions that
aren't captured by getfull (e.g., fractional workers exit when
preempted). The complexity of implementing these exit conditions
contributed to #11677. Furthermore, the getfull barrier is inefficient
because we could be running user code instead of spinning on a P. In
effect, we're dedicating 25% of the CPU to marking even if that means
we have to spin to make that 25%. It also causes issues on Windows
because we can't actually sleep for 100µs (#8687).
Fix this by making dedicated workers no longer participate in the
getfull barrier. Instead, dedicated workers simply return to the
scheduler when they fail to get more work, regardless of what others
workers are doing, and the scheduler only starts new dedicated workers
if there's work available. Everything that needs to be handled by this
barrier is already handled by detection of mark completion.
This makes the system much more symmetric because all workers and
assists now use trygetfull during concurrent mark. It also loosens the
25% CPU target so that we can give some of that 25% back to user code
if there isn't enough work to keep the mark worker busy. And it
eliminates the problematic 100µs sleep on Windows during concurrent
mark (though not during mark termination).
The downside of this is that if we hit a bottleneck in the heap graph
that then expands back out, the system may shut down dedicated workers
and take a while to start them back up. We'll address this in the next
commit.
Updates #12041 and #8687.
No effect on the go1 benchmarks. This slows down the garbage benchmark
by 9%, but we'll more than make it up in the next commit.
name old time/op new time/op delta
XBenchGarbage-12 5.80ms ± 2% 6.32ms ± 4% +9.03% (p=0.000 n=20+20)
Change-Id: I65100a9ba005a8b5cf97940798918672ea9dd09b
Reviewed-on: https://go-review.googlesource.com/16297
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-10-26 14:29:25 -06:00
|
|
|
blocking := flags&(gcDrainUntilPreempt|gcDrainNoBlock) == 0
|
2015-10-04 20:47:27 -06:00
|
|
|
flushBgCredit := flags&gcDrainFlushBgCredit != 0
|
2015-10-04 20:42:43 -06:00
|
|
|
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
// Drain root marking jobs.
|
|
|
|
if work.markrootNext < work.markrootJobs {
|
|
|
|
for blocking || !gp.preempt {
|
2015-11-02 12:09:24 -07:00
|
|
|
job := atomic.Xadd(&work.markrootNext, +1) - 1
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
if job >= work.markrootJobs {
|
|
|
|
break
|
|
|
|
}
|
2015-11-23 16:44:03 -07:00
|
|
|
markroot(gcw, job)
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-04 21:00:01 -06:00
|
|
|
initScanWork := gcw.scanWork
|
2015-03-13 11:29:23 -06:00
|
|
|
|
runtime: perform concurrent scan in GC workers
Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.
This change shifts the root scan work out of the GC coordinator and in
to the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs.
This parallelizes the root scan process, makes it obey the 25% CPU
goal, and effectively eliminates root scanning as an isolated phase,
allowing the system to smoothly transition from root scanning to heap
marking. This also eliminates a major non-STW responsibility of the GC
coordinator, which will make it easier to switch to a decentralized
state machine. Finally, it puts us in a good position to perform root
scanning in assists as well, which will help satisfy assists at the
beginning of the GC cycle.
This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: where two non-preemptible gorountines
are trying to preempt each other to perform a stack scan. Given the
context where this happens, the only instance of this is two
background workers trying to scan each other. We avoid this by simply
not scanning the stacks of background workers during the concurrent
phase; this is safe because we'll scan them during mark termination
(and their stacks are *very* small and should not contain any new
pointers).
This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.
This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
name old time/op new time/op delta
XBenchGarbage-12 5.10ms ± 1% 5.24ms ± 1% +2.87% (p=0.000 n=18+18)
name old time/op new time/op delta
BinaryTree17-12 3.25s ± 3% 3.20s ± 5% -1.57% (p=0.013 n=20+20)
Fannkuch11-12 2.45s ± 1% 2.46s ± 1% +0.38% (p=0.019 n=20+18)
FmtFprintfEmpty-12 49.7ns ± 3% 49.9ns ± 4% ~ (p=0.851 n=19+20)
FmtFprintfString-12 170ns ± 2% 170ns ± 1% ~ (p=0.775 n=20+19)
FmtFprintfInt-12 161ns ± 1% 160ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 267ns ± 1% 270ns ± 1% +1.04% (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12 238ns ± 2% 238ns ± 1% ~ (p=0.133 n=18+19)
FmtFprintfFloat-12 311ns ± 1% 310ns ± 2% -0.35% (p=0.023 n=20+19)
FmtManyArgs-12 1.08µs ± 1% 1.06µs ± 1% -2.31% (p=0.000 n=20+20)
GobDecode-12 8.65ms ± 1% 8.63ms ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 6.49ms ± 1% 6.52ms ± 1% +0.37% (p=0.015 n=20+20)
Gzip-12 319ms ± 3% 318ms ± 1% ~ (p=0.975 n=19+17)
Gunzip-12 41.9ms ± 1% 42.1ms ± 2% +0.65% (p=0.004 n=19+20)
HTTPClientServer-12 61.7µs ± 1% 62.6µs ± 1% +1.40% (p=0.000 n=18+20)
JSONEncode-12 16.8ms ± 1% 16.9ms ± 1% ~ (p=0.239 n=20+18)
JSONDecode-12 58.4ms ± 1% 60.7ms ± 1% +3.85% (p=0.000 n=19+20)
Mandelbrot200-12 3.86ms ± 0% 3.86ms ± 1% ~ (p=0.092 n=18+19)
GoParse-12 3.75ms ± 2% 3.75ms ± 2% ~ (p=0.708 n=19+20)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 2% +0.60% (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12 341ns ± 1% 342ns ± 2% ~ (p=0.203 n=20+19)
RegexpMatchEasy1_32-12 82.5ns ± 2% 83.2ns ± 2% +0.83% (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12 495ns ± 1% 495ns ± 2% ~ (p=0.970 n=19+18)
RegexpMatchMedium_32-12 130ns ± 2% 130ns ± 2% +0.59% (p=0.039 n=19+20)
RegexpMatchMedium_1K-12 39.2µs ± 1% 39.3µs ± 1% ~ (p=0.214 n=18+18)
RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 1% ~ (p=0.166 n=18+19)
RegexpMatchHard_1K-12 61.0µs ± 1% 60.9µs ± 1% ~ (p=0.169 n=20+18)
Revcomp-12 533ms ± 1% 535ms ± 1% ~ (p=0.071 n=19+17)
Template-12 68.1ms ± 2% 73.0ms ± 1% +7.26% (p=0.000 n=19+20)
TimeParse-12 355ns ± 2% 356ns ± 2% ~ (p=0.530 n=19+20)
TimeFormat-12 357ns ± 2% 347ns ± 1% -2.59% (p=0.000 n=20+19)
[Geo mean] 62.1µs 62.3µs +0.31%
name old speed new speed delta
GobDecode-12 88.7MB/s ± 1% 88.9MB/s ± 1% ~ (p=0.377 n=18+20)
GobEncode-12 118MB/s ± 1% 118MB/s ± 1% -0.37% (p=0.015 n=20+20)
Gzip-12 60.9MB/s ± 3% 60.9MB/s ± 1% ~ (p=0.944 n=19+17)
Gunzip-12 464MB/s ± 1% 461MB/s ± 2% -0.64% (p=0.004 n=19+20)
JSONEncode-12 115MB/s ± 1% 115MB/s ± 1% ~ (p=0.236 n=20+18)
JSONDecode-12 33.2MB/s ± 1% 32.0MB/s ± 1% -3.71% (p=0.000 n=19+20)
GoParse-12 15.5MB/s ± 2% 15.5MB/s ± 2% ~ (p=0.702 n=19+20)
RegexpMatchEasy0_32-12 320MB/s ± 1% 318MB/s ± 2% ~ (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12 3.00GB/s ± 1% 2.99GB/s ± 1% ~ (p=0.194 n=20+19)
RegexpMatchEasy1_32-12 388MB/s ± 2% 385MB/s ± 2% -0.83% (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12 2.07GB/s ± 1% 2.07GB/s ± 1% ~ (p=0.964 n=19+18)
RegexpMatchMedium_32-12 7.68MB/s ± 1% 7.64MB/s ± 2% -0.57% (p=0.020 n=19+20)
RegexpMatchMedium_1K-12 26.1MB/s ± 1% 26.1MB/s ± 1% ~ (p=0.211 n=18+18)
RegexpMatchHard_32-12 15.8MB/s ± 1% 15.8MB/s ± 1% ~ (p=0.180 n=18+19)
RegexpMatchHard_1K-12 16.8MB/s ± 1% 16.8MB/s ± 2% ~ (p=0.236 n=20+19)
Revcomp-12 477MB/s ± 1% 475MB/s ± 1% ~ (p=0.071 n=19+17)
Template-12 28.5MB/s ± 2% 26.6MB/s ± 1% -6.77% (p=0.000 n=19+20)
[Geo mean] 100MB/s 99.0MB/s -0.82%
Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2015-10-19 11:46:32 -06:00
|
|
|
// Drain heap marking jobs.
|
2016-01-06 15:07:58 -07:00
|
|
|
for !(preemptible && gp.preempt) {
|
|
|
|
// Try to keep work available on the global queue. We used to
|
|
|
|
// check if there were waiting workers, but it's better to
|
|
|
|
// just keep work available than to make workers wait. In the
|
|
|
|
// worst case, we'll do O(log(_WorkbufSize)) unnecessary
|
|
|
|
// balances.
|
|
|
|
if work.full == 0 {
|
2015-02-19 11:38:46 -07:00
|
|
|
gcw.balance()
|
|
|
|
}
|
|
|
|
|
2015-10-04 20:42:43 -06:00
|
|
|
var b uintptr
|
|
|
|
if blocking {
|
|
|
|
b = gcw.get()
|
|
|
|
} else {
|
|
|
|
b = gcw.tryGet()
|
|
|
|
}
|
2015-02-19 11:38:46 -07:00
|
|
|
if b == 0 {
|
2015-10-04 20:42:43 -06:00
|
|
|
// work barrier reached or tryGet failed.
|
2015-02-19 11:38:46 -07:00
|
|
|
break
|
|
|
|
}
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
scanobject(b, gcw)
|
2015-03-13 11:29:23 -06:00
|
|
|
|
|
|
|
// Flush background scan work credit to the global
|
|
|
|
// account if we've accumulated enough locally so
|
|
|
|
// mutator assists can draw on it.
|
2015-10-04 21:00:01 -06:00
|
|
|
if gcw.scanWork >= gcCreditSlack {
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
|
2015-10-04 21:00:01 -06:00
|
|
|
if flushBgCredit {
|
2015-10-14 19:31:33 -06:00
|
|
|
gcFlushBgCredit(gcw.scanWork - initScanWork)
|
2015-10-04 21:00:01 -06:00
|
|
|
initScanWork = 0
|
|
|
|
}
|
|
|
|
gcw.scanWork = 0
|
2015-03-13 11:29:23 -06:00
|
|
|
}
|
|
|
|
}
|
2015-10-04 21:00:01 -06:00
|
|
|
|
2015-11-19 09:25:55 -07:00
|
|
|
// In blocking mode, write barriers are not allowed after this
|
|
|
|
// point because we must preserve the condition that the work
|
|
|
|
// buffers are empty.
|
|
|
|
|
2015-10-04 21:00:01 -06:00
|
|
|
// Flush remaining scan work credit.
|
|
|
|
if gcw.scanWork > 0 {
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
|
2015-10-04 21:00:01 -06:00
|
|
|
if flushBgCredit {
|
2015-10-14 19:31:33 -06:00
|
|
|
gcFlushBgCredit(gcw.scanWork - initScanWork)
|
2015-10-04 21:00:01 -06:00
|
|
|
}
|
|
|
|
gcw.scanWork = 0
|
runtime: multi-threaded, utilization-scheduled background mark
Currently, the concurrent mark phase is performed by the main GC
goroutine. Prior to the previous commit enabling preemption, this
caused marking to always consume 1/GOMAXPROCS of the available CPU
time. If GOMAXPROCS=1, this meant background GC would consume 100% of
the CPU (effectively a STW). If GOMAXPROCS>4, background GC would use
less than the goal of 25%. If GOMAXPROCS=4, background GC would use
the goal 25%, but if the mutator wasn't using the remaining 75%,
background marking wouldn't take advantage of the idle time. Enabling
preemption in the previous commit made GC miss CPU targets in
completely different ways, but set us up to bring everything back in
line.
This change replaces the fixed GC goroutine with per-P background mark
goroutines. Once started, these goroutines don't go in the standard
run queues; instead, they are scheduled specially such that the time
spent in mutator assists and the background mark goroutines totals 25%
of the CPU time available to the program. Furthermore, this lets
background marking take advantage of idle Ps, which significantly
boosts GC performance for applications that under-utilize the CPU.
This requires also changing how time is reported for gctrace, so this
change splits the concurrent mark CPU time into assist/background/idle
scanning.
This also requires increasing the size of the StackRecord slice used
in a GoroutineProfile test.
Change-Id: I0936ff907d2cee6cb687a208f2df47e8988e3157
Reviewed-on: https://go-review.googlesource.com/8850
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-03-23 19:07:33 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-03-13 12:01:16 -06:00
|
|
|
// gcDrainN blackens grey objects until it has performed roughly
|
2015-10-15 15:58:17 -06:00
|
|
|
// scanWork units of scan work or the G is preempted. This is
|
|
|
|
// best-effort, so it may perform less work if it fails to get a work
|
|
|
|
// buffer. Otherwise, it will perform at least n units of work, but
|
|
|
|
// may perform more because scanning is always done in whole object
|
|
|
|
// increments. It returns the amount of scan work performed.
|
2015-02-19 11:38:46 -07:00
|
|
|
//go:nowritebarrier
|
2015-10-04 21:00:01 -06:00
|
|
|
func gcDrainN(gcw *gcWork, scanWork int64) int64 {
|
2015-11-13 18:45:22 -07:00
|
|
|
if !writeBarrier.needed {
|
2015-07-24 10:33:23 -06:00
|
|
|
throw("gcDrainN phase incorrect")
|
|
|
|
}
|
2015-10-04 21:00:01 -06:00
|
|
|
|
|
|
|
// There may already be scan work on the gcw, which we don't
|
|
|
|
// want to claim was done by this call.
|
|
|
|
workFlushed := -gcw.scanWork
|
|
|
|
|
2015-10-15 15:58:17 -06:00
|
|
|
gp := getg().m.curg
|
|
|
|
for !gp.preempt && workFlushed+gcw.scanWork < scanWork {
|
2016-01-06 15:07:58 -07:00
|
|
|
// See gcDrain comment.
|
|
|
|
if work.full == 0 {
|
|
|
|
gcw.balance()
|
|
|
|
}
|
|
|
|
|
2015-02-19 11:38:46 -07:00
|
|
|
// This might be a good place to add prefetch code...
|
|
|
|
// if(wbuf.nobj > 4) {
|
|
|
|
// PREFETCH(wbuf->obj[wbuf.nobj - 3];
|
|
|
|
// }
|
2016-01-06 15:07:58 -07:00
|
|
|
//
|
2015-02-19 11:38:46 -07:00
|
|
|
b := gcw.tryGet()
|
|
|
|
if b == 0 {
|
2015-10-04 21:00:01 -06:00
|
|
|
break
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
scanobject(b, gcw)
|
2015-10-04 21:00:01 -06:00
|
|
|
|
|
|
|
// Flush background scan work credit.
|
|
|
|
if gcw.scanWork >= gcCreditSlack {
|
2015-11-02 12:09:24 -07:00
|
|
|
atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
|
2015-10-04 21:00:01 -06:00
|
|
|
workFlushed += gcw.scanWork
|
|
|
|
gcw.scanWork = 0
|
|
|
|
}
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
2015-10-04 21:00:01 -06:00
|
|
|
|
|
|
|
// Unlike gcDrain, there's no need to flush remaining work
|
|
|
|
// here because this never flushes to bgScanCredit and
|
|
|
|
// gcw.dispose will flush any remaining work to scanWork.
|
|
|
|
|
|
|
|
return workFlushed + gcw.scanWork
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
// scanblock scans b as scanobject would, but using an explicit
|
|
|
|
// pointer bitmap instead of the heap bitmap.
|
2015-05-04 14:10:49 -06:00
|
|
|
//
|
|
|
|
// This is used to scan non-heap roots, so it does not update
|
|
|
|
// gcw.bytesMarked or gcw.scanWork.
|
|
|
|
//
|
2015-02-19 11:38:46 -07:00
|
|
|
//go:nowritebarrier
|
2015-03-12 11:09:30 -06:00
|
|
|
func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) {
|
2015-02-19 11:38:46 -07:00
|
|
|
// Use local copies of original parameters, so that a stack trace
|
|
|
|
// due to one of the throws below shows the original block
|
|
|
|
// base and extent.
|
|
|
|
b := b0
|
|
|
|
n := n0
|
|
|
|
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
arena_start := mheap_.arena_start
|
|
|
|
arena_used := mheap_.arena_used
|
2015-02-19 11:38:46 -07:00
|
|
|
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
for i := uintptr(0); i < n; {
|
|
|
|
// Find bits for the next word.
|
2015-11-11 10:39:30 -07:00
|
|
|
bits := uint32(*addb(ptrmask, i/(sys.PtrSize*8)))
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
if bits == 0 {
|
2015-11-11 10:39:30 -07:00
|
|
|
i += sys.PtrSize * 8
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
for j := 0; j < 8 && i < n; j++ {
|
|
|
|
if bits&1 != 0 {
|
|
|
|
// Same work as in scanobject; see comments there.
|
|
|
|
obj := *(*uintptr)(unsafe.Pointer(b + i))
|
|
|
|
if obj != 0 && arena_start <= obj && obj < arena_used {
|
2015-09-18 09:55:31 -06:00
|
|
|
if obj, hbits, span := heapBitsForObject(obj, b, i); obj != 0 {
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
greyobject(obj, b, i, hbits, span, gcw)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bits >>= 1
|
2015-11-11 10:39:30 -07:00
|
|
|
i += sys.PtrSize
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
// scanobject scans the object starting at b, adding pointers to gcw.
|
|
|
|
// b must point to the beginning of a heap object; scanobject consults
|
|
|
|
// the GC bitmap for the pointer mask and the spans for the size of the
|
|
|
|
// object (it ignores n).
|
2015-02-19 11:38:46 -07:00
|
|
|
//go:nowritebarrier
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
func scanobject(b uintptr, gcw *gcWork) {
|
2015-06-19 10:29:42 -06:00
|
|
|
// Note that arena_used may change concurrently during
|
|
|
|
// scanobject and hence scanobject may encounter a pointer to
|
|
|
|
// a newly allocated heap object that is *not* in
|
|
|
|
// [start,used). It will not mark this object; however, we
|
|
|
|
// know that it was just installed by a mutator, which means
|
|
|
|
// that mutator will execute a write barrier and take care of
|
|
|
|
// marking it. This is even more pronounced on relaxed memory
|
|
|
|
// architectures since we access arena_used without barriers
|
|
|
|
// or synchronization, but the same logic applies.
|
2015-02-19 11:38:46 -07:00
|
|
|
arena_start := mheap_.arena_start
|
|
|
|
arena_used := mheap_.arena_used
|
|
|
|
|
|
|
|
// Find bits of the beginning of the object.
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
// b must point to the beginning of a heap object, so
|
|
|
|
// we can get its bits and span directly.
|
|
|
|
hbits := heapBitsForAddr(b)
|
|
|
|
s := spanOfUnchecked(b)
|
|
|
|
n := s.elemsize
|
|
|
|
if n == 0 {
|
|
|
|
throw("scanobject n == 0")
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
|
2015-05-04 13:40:58 -06:00
|
|
|
var i uintptr
|
2015-11-11 10:39:30 -07:00
|
|
|
for i = 0; i < n; i += sys.PtrSize {
|
2015-02-19 11:38:46 -07:00
|
|
|
// Find bits for this word.
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
if i != 0 {
|
|
|
|
// Avoid needless hbits.next() on last iteration.
|
|
|
|
hbits = hbits.next()
|
|
|
|
}
|
2015-05-04 08:19:24 -06:00
|
|
|
// During checkmarking, 1-word objects store the checkmark
|
|
|
|
// in the type bit for the one word. The only one-word objects
|
|
|
|
// are pointers, or else they'd be merged with other non-pointer
|
|
|
|
// data into larger allocations.
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 09:30:10 -06:00
|
|
|
bits := hbits.bits()
|
2015-11-11 10:39:30 -07:00
|
|
|
if i >= 2*sys.PtrSize && bits&bitMarked == 0 {
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 09:30:10 -06:00
|
|
|
break // no more pointers in this object
|
|
|
|
}
|
|
|
|
if bits&bitPointer == 0 {
|
|
|
|
continue // not a pointer
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 09:30:10 -06:00
|
|
|
// Work here is duplicated in scanblock and above.
|
|
|
|
// If you make changes here, make changes there too.
|
2015-02-19 11:38:46 -07:00
|
|
|
obj := *(*uintptr)(unsafe.Pointer(b + i))
|
|
|
|
|
|
|
|
// At this point we have extracted the next potential pointer.
|
runtime: add fast check for self-loop pointer in scanobject
Addresses a problem reported on the mailing list.
This will come up mainly in programs custom allocators that batch allocations,
but it still helps in our programs, which mainly do not have such allocations.
name old mean new mean delta
BinaryTree17 5.95s × (0.97,1.03) 5.93s × (0.97,1.04) ~ (p=0.613)
Fannkuch11 4.46s × (0.98,1.04) 4.33s × (0.99,1.01) -2.93% (p=0.000)
FmtFprintfEmpty 86.6ns × (0.98,1.03) 86.8ns × (0.98,1.02) ~ (p=0.523)
FmtFprintfString 290ns × (0.98,1.05) 287ns × (0.98,1.03) ~ (p=0.061)
FmtFprintfInt 271ns × (0.98,1.04) 286ns × (0.99,1.01) +5.54% (p=0.000)
FmtFprintfIntInt 495ns × (0.98,1.04) 489ns × (0.99,1.01) -1.24% (p=0.015)
FmtFprintfPrefixedInt 391ns × (0.99,1.02) 407ns × (0.99,1.01) +4.00% (p=0.000)
FmtFprintfFloat 578ns × (0.99,1.01) 559ns × (0.99,1.01) -3.35% (p=0.000)
FmtManyArgs 1.96µs × (0.98,1.05) 1.94µs × (0.99,1.01) -1.33% (p=0.030)
GobDecode 15.9ms × (0.97,1.05) 15.7ms × (0.99,1.01) -1.35% (p=0.044)
GobEncode 11.4ms × (0.97,1.05) 11.3ms × (0.98,1.03) ~ (p=0.141)
Gzip 658ms × (0.98,1.05) 648ms × (0.99,1.01) -1.59% (p=0.009)
Gunzip 144ms × (0.99,1.03) 144ms × (0.99,1.01) ~ (p=0.867)
HTTPClientServer 92.1µs × (0.97,1.05) 90.3µs × (0.99,1.01) -1.89% (p=0.005)
JSONEncode 31.0ms × (0.96,1.07) 30.2ms × (0.98,1.03) -2.66% (p=0.001)
JSONDecode 110ms × (0.97,1.04) 107ms × (0.99,1.01) -2.59% (p=0.000)
Mandelbrot200 6.15ms × (0.98,1.04) 6.07ms × (0.99,1.02) -1.32% (p=0.045)
GoParse 6.79ms × (0.97,1.04) 6.74ms × (0.97,1.04) ~ (p=0.242)
RegexpMatchEasy0_32 158ns × (0.98,1.05) 155ns × (0.99,1.01) -1.64% (p=0.010)
RegexpMatchEasy0_1K 548ns × (0.97,1.04) 540ns × (0.99,1.01) -1.34% (p=0.042)
RegexpMatchEasy1_32 133ns × (0.97,1.04) 132ns × (0.97,1.05) ~ (p=0.466)
RegexpMatchEasy1_1K 899ns × (0.96,1.05) 878ns × (0.99,1.01) -2.32% (p=0.002)
RegexpMatchMedium_32 250ns × (0.96,1.03) 243ns × (0.99,1.01) -2.90% (p=0.000)
RegexpMatchMedium_1K 73.4µs × (0.98,1.04) 73.0µs × (0.98,1.04) ~ (p=0.411)
RegexpMatchHard_32 3.87µs × (0.97,1.07) 3.84µs × (0.98,1.04) ~ (p=0.273)
RegexpMatchHard_1K 120µs × (0.97,1.08) 117µs × (0.99,1.01) -2.06% (p=0.010)
Revcomp 940ms × (0.96,1.07) 924ms × (0.97,1.07) ~ (p=0.071)
Template 128ms × (0.96,1.05) 128ms × (0.99,1.01) ~ (p=0.502)
TimeParse 632ns × (0.96,1.07) 616ns × (0.99,1.01) -2.58% (p=0.001)
TimeFormat 671ns × (0.97,1.06) 657ns × (0.99,1.02) -2.10% (p=0.002)
In contrast to the one in test/bench/go1 (above), the binarytree program on the
shootout site uses more goroutines, batches allocations, and sets GOMAXPROCS
to runtime.NumCPU()*2.
Using that version, before vs after:
name old mean new mean delta
BinaryTree20 18.6s × (0.96,1.05) 11.3s × (0.98,1.02) -39.46% (p=0.000)
And Go 1.4 vs after:
name old mean new mean delta
BinaryTree20 13.0s × (0.97,1.02) 11.3s × (0.98,1.02) -13.21% (p=0.000)
There is still a scheduling problem - the raw run times are hiding the fact that
this chews up 2x the CPU - but we'll take care of that separately.
Change-Id: I3f5da879b24ae73a0d06745381ffb88c3744948b
Reviewed-on: https://go-review.googlesource.com/10220
Reviewed-by: Austin Clements <austin@google.com>
2015-05-18 09:40:29 -06:00
|
|
|
// Check if it points into heap and not back at the current object.
|
|
|
|
if obj != 0 && arena_start <= obj && obj < arena_used && obj-b >= n {
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
// Mark the object.
|
2015-09-18 09:55:31 -06:00
|
|
|
if obj, hbits, span := heapBitsForObject(obj, b, i); obj != 0 {
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 20:45:57 -06:00
|
|
|
greyobject(obj, b, i, hbits, span, gcw)
|
|
|
|
}
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
}
|
2015-03-12 14:53:57 -06:00
|
|
|
gcw.bytesMarked += uint64(n)
|
2015-05-04 13:40:58 -06:00
|
|
|
gcw.scanWork += int64(i)
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Shade the object if it isn't already.
|
|
|
|
// The object is not nil and known to be in the heap.
|
runtime: replace per-M workbuf cache with per-P gcWork cache
Currently, each M has a cache of the most recently used *workbuf. This
is used primarily by the write barrier so it doesn't have to access
the global workbuf lists on every write barrier. It's also used by
stack scanning because it's convenient.
This cache is important for write barrier performance, but this
particular approach has several downsides. It's faster than no cache,
but far from optimal (as the benchmarks below show). It's complex:
access to the cache is sprinkled through most of the workbuf list
operations and it requires special care to transform into and back out
of the gcWork cache that's actually used for scanning and marking. It
requires atomic exchanges to take ownership of the cached workbuf and
to return it to the M's cache even though it's almost always used by
only the current M. Since it's per-M, flushing these caches is O(# of
Ms), which may be high. And it has some significant subtleties: for
example, in general the cache shouldn't be used after the
harvestwbufs() in mark termination because it could hide work from
mark termination, but stack scanning can happen after this and *will*
use the cache (but it turns out this is okay because it will always be
followed by a getfull(), which drains the cache).
This change replaces this cache with a per-P gcWork object. This
gcWork cache can be used directly by scanning and marking (as long as
preemption is disabled, which is a general requirement of gcWork).
Since it's per-P, it doesn't require synchronization, which simplifies
things and means the only atomic operations in the write barrier are
occasionally fetching new work buffers and setting a mark bit if the
object isn't already marked. This cache can be flushed in O(# of Ps),
which is generally small. It follows a simple flushing rule: the cache
can be used during any phase, but during mark termination it must be
flushed before allowing preemption. This also makes the dispose during
mutator assist no longer necessary, which eliminates the vast majority
of gcWork dispose calls and reduces contention on the global workbuf
lists. And it's a lot faster on some benchmarks:
benchmark old ns/op new ns/op delta
BenchmarkBinaryTree17 11963668673 11206112763 -6.33%
BenchmarkFannkuch11 2643217136 2649182499 +0.23%
BenchmarkFmtFprintfEmpty 70.4 70.2 -0.28%
BenchmarkFmtFprintfString 364 307 -15.66%
BenchmarkFmtFprintfInt 317 282 -11.04%
BenchmarkFmtFprintfIntInt 512 483 -5.66%
BenchmarkFmtFprintfPrefixedInt 404 380 -5.94%
BenchmarkFmtFprintfFloat 521 479 -8.06%
BenchmarkFmtManyArgs 2164 1894 -12.48%
BenchmarkGobDecode 30366146 22429593 -26.14%
BenchmarkGobEncode 29867472 26663152 -10.73%
BenchmarkGzip 391236616 396779490 +1.42%
BenchmarkGunzip 96639491 96297024 -0.35%
BenchmarkHTTPClientServer 100110 70763 -29.31%
BenchmarkJSONEncode 51866051 52511382 +1.24%
BenchmarkJSONDecode 103813138 86094963 -17.07%
BenchmarkMandelbrot200 4121834 4120886 -0.02%
BenchmarkGoParse 16472789 5879949 -64.31%
BenchmarkRegexpMatchEasy0_32 140 140 +0.00%
BenchmarkRegexpMatchEasy0_1K 394 394 +0.00%
BenchmarkRegexpMatchEasy1_32 120 120 +0.00%
BenchmarkRegexpMatchEasy1_1K 621 614 -1.13%
BenchmarkRegexpMatchMedium_32 209 202 -3.35%
BenchmarkRegexpMatchMedium_1K 54889 55175 +0.52%
BenchmarkRegexpMatchHard_32 2682 2675 -0.26%
BenchmarkRegexpMatchHard_1K 79383 79524 +0.18%
BenchmarkRevcomp 584116718 584595320 +0.08%
BenchmarkTemplate 125400565 109620196 -12.58%
BenchmarkTimeParse 386 387 +0.26%
BenchmarkTimeFormat 580 447 -22.93%
(Best out of 10 runs. The delta of averages is similar.)
This also puts us in a good position to flush these caches when
nearing the end of concurrent marking, which will let us increase the
size of the work buffers while still controlling mark termination
pause time.
Change-Id: I2dd94c8517a19297a98ec280203cccaa58792522
Reviewed-on: https://go-review.googlesource.com/9178
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2015-04-19 13:22:20 -06:00
|
|
|
// Preemption must be disabled.
|
2015-02-19 11:38:46 -07:00
|
|
|
//go:nowritebarrier
|
|
|
|
func shade(b uintptr) {
|
2015-09-18 09:55:31 -06:00
|
|
|
if obj, hbits, span := heapBitsForObject(b, 0, 0); obj != 0 {
|
runtime: replace per-M workbuf cache with per-P gcWork cache
Currently, each M has a cache of the most recently used *workbuf. This
is used primarily by the write barrier so it doesn't have to access
the global workbuf lists on every write barrier. It's also used by
stack scanning because it's convenient.
This cache is important for write barrier performance, but this
particular approach has several downsides. It's faster than no cache,
but far from optimal (as the benchmarks below show). It's complex:
access to the cache is sprinkled through most of the workbuf list
operations and it requires special care to transform into and back out
of the gcWork cache that's actually used for scanning and marking. It
requires atomic exchanges to take ownership of the cached workbuf and
to return it to the M's cache even though it's almost always used by
only the current M. Since it's per-M, flushing these caches is O(# of
Ms), which may be high. And it has some significant subtleties: for
example, in general the cache shouldn't be used after the
harvestwbufs() in mark termination because it could hide work from
mark termination, but stack scanning can happen after this and *will*
use the cache (but it turns out this is okay because it will always be
followed by a getfull(), which drains the cache).
This change replaces this cache with a per-P gcWork object. This
gcWork cache can be used directly by scanning and marking (as long as
preemption is disabled, which is a general requirement of gcWork).
Since it's per-P, it doesn't require synchronization, which simplifies
things and means the only atomic operations in the write barrier are
occasionally fetching new work buffers and setting a mark bit if the
object isn't already marked. This cache can be flushed in O(# of Ps),
which is generally small. It follows a simple flushing rule: the cache
can be used during any phase, but during mark termination it must be
flushed before allowing preemption. This also makes the dispose during
mutator assist no longer necessary, which eliminates the vast majority
of gcWork dispose calls and reduces contention on the global workbuf
lists. And it's a lot faster on some benchmarks:
benchmark old ns/op new ns/op delta
BenchmarkBinaryTree17 11963668673 11206112763 -6.33%
BenchmarkFannkuch11 2643217136 2649182499 +0.23%
BenchmarkFmtFprintfEmpty 70.4 70.2 -0.28%
BenchmarkFmtFprintfString 364 307 -15.66%
BenchmarkFmtFprintfInt 317 282 -11.04%
BenchmarkFmtFprintfIntInt 512 483 -5.66%
BenchmarkFmtFprintfPrefixedInt 404 380 -5.94%
BenchmarkFmtFprintfFloat 521 479 -8.06%
BenchmarkFmtManyArgs 2164 1894 -12.48%
BenchmarkGobDecode 30366146 22429593 -26.14%
BenchmarkGobEncode 29867472 26663152 -10.73%
BenchmarkGzip 391236616 396779490 +1.42%
BenchmarkGunzip 96639491 96297024 -0.35%
BenchmarkHTTPClientServer 100110 70763 -29.31%
BenchmarkJSONEncode 51866051 52511382 +1.24%
BenchmarkJSONDecode 103813138 86094963 -17.07%
BenchmarkMandelbrot200 4121834 4120886 -0.02%
BenchmarkGoParse 16472789 5879949 -64.31%
BenchmarkRegexpMatchEasy0_32 140 140 +0.00%
BenchmarkRegexpMatchEasy0_1K 394 394 +0.00%
BenchmarkRegexpMatchEasy1_32 120 120 +0.00%
BenchmarkRegexpMatchEasy1_1K 621 614 -1.13%
BenchmarkRegexpMatchMedium_32 209 202 -3.35%
BenchmarkRegexpMatchMedium_1K 54889 55175 +0.52%
BenchmarkRegexpMatchHard_32 2682 2675 -0.26%
BenchmarkRegexpMatchHard_1K 79383 79524 +0.18%
BenchmarkRevcomp 584116718 584595320 +0.08%
BenchmarkTemplate 125400565 109620196 -12.58%
BenchmarkTimeParse 386 387 +0.26%
BenchmarkTimeFormat 580 447 -22.93%
(Best out of 10 runs. The delta of averages is similar.)
This also puts us in a good position to flush these caches when
nearing the end of concurrent marking, which will let us increase the
size of the work buffers while still controlling mark termination
pause time.
Change-Id: I2dd94c8517a19297a98ec280203cccaa58792522
Reviewed-on: https://go-review.googlesource.com/9178
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2015-04-19 13:22:20 -06:00
|
|
|
gcw := &getg().m.p.ptr().gcw
|
|
|
|
greyobject(obj, 0, 0, hbits, span, gcw)
|
2015-06-01 16:16:03 -06:00
|
|
|
if gcphase == _GCmarktermination || gcBlackenPromptly {
|
runtime: replace per-M workbuf cache with per-P gcWork cache
Currently, each M has a cache of the most recently used *workbuf. This
is used primarily by the write barrier so it doesn't have to access
the global workbuf lists on every write barrier. It's also used by
stack scanning because it's convenient.
This cache is important for write barrier performance, but this
particular approach has several downsides. It's faster than no cache,
but far from optimal (as the benchmarks below show). It's complex:
access to the cache is sprinkled through most of the workbuf list
operations and it requires special care to transform into and back out
of the gcWork cache that's actually used for scanning and marking. It
requires atomic exchanges to take ownership of the cached workbuf and
to return it to the M's cache even though it's almost always used by
only the current M. Since it's per-M, flushing these caches is O(# of
Ms), which may be high. And it has some significant subtleties: for
example, in general the cache shouldn't be used after the
harvestwbufs() in mark termination because it could hide work from
mark termination, but stack scanning can happen after this and *will*
use the cache (but it turns out this is okay because it will always be
followed by a getfull(), which drains the cache).
This change replaces this cache with a per-P gcWork object. This
gcWork cache can be used directly by scanning and marking (as long as
preemption is disabled, which is a general requirement of gcWork).
Since it's per-P, it doesn't require synchronization, which simplifies
things and means the only atomic operations in the write barrier are
occasionally fetching new work buffers and setting a mark bit if the
object isn't already marked. This cache can be flushed in O(# of Ps),
which is generally small. It follows a simple flushing rule: the cache
can be used during any phase, but during mark termination it must be
flushed before allowing preemption. This also makes the dispose during
mutator assist no longer necessary, which eliminates the vast majority
of gcWork dispose calls and reduces contention on the global workbuf
lists. And it's a lot faster on some benchmarks:
benchmark old ns/op new ns/op delta
BenchmarkBinaryTree17 11963668673 11206112763 -6.33%
BenchmarkFannkuch11 2643217136 2649182499 +0.23%
BenchmarkFmtFprintfEmpty 70.4 70.2 -0.28%
BenchmarkFmtFprintfString 364 307 -15.66%
BenchmarkFmtFprintfInt 317 282 -11.04%
BenchmarkFmtFprintfIntInt 512 483 -5.66%
BenchmarkFmtFprintfPrefixedInt 404 380 -5.94%
BenchmarkFmtFprintfFloat 521 479 -8.06%
BenchmarkFmtManyArgs 2164 1894 -12.48%
BenchmarkGobDecode 30366146 22429593 -26.14%
BenchmarkGobEncode 29867472 26663152 -10.73%
BenchmarkGzip 391236616 396779490 +1.42%
BenchmarkGunzip 96639491 96297024 -0.35%
BenchmarkHTTPClientServer 100110 70763 -29.31%
BenchmarkJSONEncode 51866051 52511382 +1.24%
BenchmarkJSONDecode 103813138 86094963 -17.07%
BenchmarkMandelbrot200 4121834 4120886 -0.02%
BenchmarkGoParse 16472789 5879949 -64.31%
BenchmarkRegexpMatchEasy0_32 140 140 +0.00%
BenchmarkRegexpMatchEasy0_1K 394 394 +0.00%
BenchmarkRegexpMatchEasy1_32 120 120 +0.00%
BenchmarkRegexpMatchEasy1_1K 621 614 -1.13%
BenchmarkRegexpMatchMedium_32 209 202 -3.35%
BenchmarkRegexpMatchMedium_1K 54889 55175 +0.52%
BenchmarkRegexpMatchHard_32 2682 2675 -0.26%
BenchmarkRegexpMatchHard_1K 79383 79524 +0.18%
BenchmarkRevcomp 584116718 584595320 +0.08%
BenchmarkTemplate 125400565 109620196 -12.58%
BenchmarkTimeParse 386 387 +0.26%
BenchmarkTimeFormat 580 447 -22.93%
(Best out of 10 runs. The delta of averages is similar.)
This also puts us in a good position to flush these caches when
nearing the end of concurrent marking, which will let us increase the
size of the work buffers while still controlling mark termination
pause time.
Change-Id: I2dd94c8517a19297a98ec280203cccaa58792522
Reviewed-on: https://go-review.googlesource.com/9178
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2015-04-19 13:22:20 -06:00
|
|
|
// Ps aren't allowed to cache work during mark
|
|
|
|
// termination.
|
2015-02-19 11:38:46 -07:00
|
|
|
gcw.dispose()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// obj is the start of an object with mark mbits.
|
2015-04-27 13:42:45 -06:00
|
|
|
// If it isn't already marked, mark it and enqueue into gcw.
|
2015-02-19 11:38:46 -07:00
|
|
|
// base and off are for debugging only and could be removed.
|
2015-11-24 12:03:58 -07:00
|
|
|
//go:nowritebarrierrec
|
2015-03-12 14:53:57 -06:00
|
|
|
func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork) {
|
2015-02-19 11:38:46 -07:00
|
|
|
// obj should be start of allocation, and so must be at least pointer-aligned.
|
2015-11-11 10:39:30 -07:00
|
|
|
if obj&(sys.PtrSize-1) != 0 {
|
2015-02-19 11:38:46 -07:00
|
|
|
throw("greyobject: obj not pointer-aligned")
|
|
|
|
}
|
|
|
|
|
2015-02-19 14:43:27 -07:00
|
|
|
if useCheckmark {
|
2015-02-19 11:38:46 -07:00
|
|
|
if !hbits.isMarked() {
|
2015-03-12 12:26:04 -06:00
|
|
|
printlock()
|
2015-02-19 11:38:46 -07:00
|
|
|
print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n")
|
|
|
|
print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
|
|
|
|
|
|
|
|
// Dump the source (base) object
|
2015-03-12 12:26:04 -06:00
|
|
|
gcDumpObject("base", base, off)
|
2015-02-19 11:38:46 -07:00
|
|
|
|
|
|
|
// Dump the object
|
2015-03-12 12:26:04 -06:00
|
|
|
gcDumpObject("obj", obj, ^uintptr(0))
|
2015-02-19 11:38:46 -07:00
|
|
|
|
|
|
|
throw("checkmark found unmarked object")
|
|
|
|
}
|
2015-05-04 08:19:24 -06:00
|
|
|
if hbits.isCheckmarked(span.elemsize) {
|
2015-02-19 11:38:46 -07:00
|
|
|
return
|
|
|
|
}
|
2015-05-04 08:19:24 -06:00
|
|
|
hbits.setCheckmarked(span.elemsize)
|
|
|
|
if !hbits.isCheckmarked(span.elemsize) {
|
2015-02-19 11:38:46 -07:00
|
|
|
throw("setCheckmarked and isCheckmarked disagree")
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// If marked we have nothing to do.
|
|
|
|
if hbits.isMarked() {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
hbits.setMarked()
|
|
|
|
|
2015-02-27 10:41:20 -07:00
|
|
|
// If this is a noscan object, fast-track it to black
|
|
|
|
// instead of greying it.
|
2015-05-04 08:19:24 -06:00
|
|
|
if !hbits.hasPointers(span.elemsize) {
|
2015-03-12 14:53:57 -06:00
|
|
|
gcw.bytesMarked += uint64(span.elemsize)
|
2015-02-27 10:41:20 -07:00
|
|
|
return
|
|
|
|
}
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
|
|
|
|
// seems like a nice optimization that can be added back in.
|
|
|
|
// There needs to be time between the PREFETCH and the use.
|
|
|
|
// Previously we put the obj in an 8 element buffer that is drained at a rate
|
|
|
|
// to give the PREFETCH time to do its work.
|
|
|
|
// Use of PREFETCHNTA might be more appropriate than PREFETCH
|
|
|
|
|
|
|
|
gcw.put(obj)
|
|
|
|
}
|
|
|
|
|
2015-03-12 12:26:04 -06:00
|
|
|
// gcDumpObject dumps the contents of obj for debugging and marks the
|
|
|
|
// field at byte offset off in obj.
|
|
|
|
func gcDumpObject(label string, obj, off uintptr) {
|
2015-04-29 13:15:43 -06:00
|
|
|
if obj < mheap_.arena_start || obj >= mheap_.arena_used {
|
2015-09-18 09:55:31 -06:00
|
|
|
print(label, "=", hex(obj), " is not in the Go heap\n")
|
2015-04-29 13:15:43 -06:00
|
|
|
return
|
|
|
|
}
|
2015-03-12 12:26:04 -06:00
|
|
|
k := obj >> _PageShift
|
|
|
|
x := k
|
|
|
|
x -= mheap_.arena_start >> _PageShift
|
|
|
|
s := h_spans[x]
|
|
|
|
print(label, "=", hex(obj), " k=", hex(k))
|
|
|
|
if s == nil {
|
|
|
|
print(" s=nil\n")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
print(" s.start*_PageSize=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n")
|
2015-09-18 10:06:24 -06:00
|
|
|
skipped := false
|
2015-11-11 10:39:30 -07:00
|
|
|
for i := uintptr(0); i < s.elemsize; i += sys.PtrSize {
|
2015-09-18 10:06:24 -06:00
|
|
|
// For big objects, just print the beginning (because
|
|
|
|
// that usually hints at the object's type) and the
|
|
|
|
// fields around off.
|
2015-11-11 10:39:30 -07:00
|
|
|
if !(i < 128*sys.PtrSize || off-16*sys.PtrSize < i && i < off+16*sys.PtrSize) {
|
2015-09-18 10:06:24 -06:00
|
|
|
skipped = true
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if skipped {
|
|
|
|
print(" ...\n")
|
|
|
|
skipped = false
|
|
|
|
}
|
2016-02-29 16:01:00 -07:00
|
|
|
print(" *(", label, "+", i, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + i))))
|
2015-03-12 12:26:04 -06:00
|
|
|
if i == off {
|
|
|
|
print(" <==")
|
|
|
|
}
|
|
|
|
print("\n")
|
|
|
|
}
|
2015-09-18 10:06:24 -06:00
|
|
|
if skipped {
|
|
|
|
print(" ...\n")
|
|
|
|
}
|
2015-03-12 12:26:04 -06:00
|
|
|
}
|
|
|
|
|
2016-04-17 09:42:37 -06:00
|
|
|
// gcmarknewobject marks a newly allocated object black. obj must
|
|
|
|
// not contain any non-nil pointers.
|
|
|
|
//
|
|
|
|
// This is nosplit so it can manipulate a gcWork without preemption.
|
|
|
|
//
|
2015-02-19 11:38:46 -07:00
|
|
|
//go:nowritebarrier
|
2016-04-17 09:42:37 -06:00
|
|
|
//go:nosplit
|
|
|
|
func gcmarknewobject(obj, size, scanSize uintptr) {
|
2015-06-01 16:16:03 -06:00
|
|
|
if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen.
|
2015-02-19 11:38:46 -07:00
|
|
|
throw("gcmarknewobject called while doing checkmark")
|
|
|
|
}
|
|
|
|
heapBitsForAddr(obj).setMarked()
|
2016-04-16 16:27:38 -06:00
|
|
|
gcw := &getg().m.p.ptr().gcw
|
2016-04-17 09:42:37 -06:00
|
|
|
gcw.bytesMarked += uint64(size)
|
2016-04-16 16:27:38 -06:00
|
|
|
gcw.scanWork += int64(scanSize)
|
2015-02-19 11:38:46 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Checkmarking
|
|
|
|
|
|
|
|
// To help debug the concurrent GC we remark with the world
|
|
|
|
// stopped ensuring that any object encountered has their normal
|
|
|
|
// mark bit set. To do this we use an orthogonal bit
|
|
|
|
// pattern to indicate the object is marked. The following pattern
|
2015-06-11 07:49:38 -06:00
|
|
|
// uses the upper two bits in the object's boundary nibble.
|
2015-02-19 11:38:46 -07:00
|
|
|
// 01: scalar not marked
|
|
|
|
// 10: pointer not marked
|
|
|
|
// 11: pointer marked
|
|
|
|
// 00: scalar marked
|
|
|
|
// Xoring with 01 will flip the pattern from marked to unmarked and vica versa.
|
|
|
|
// The higher bit is 1 for pointers and 0 for scalars, whether the object
|
|
|
|
// is marked or not.
|
|
|
|
// The first nibble no longer holds the typeDead pattern indicating that the
|
|
|
|
// there are no more pointers in the object. This information is held
|
|
|
|
// in the second nibble.
|
|
|
|
|
2015-02-19 14:43:27 -07:00
|
|
|
// If useCheckmark is true, marking of an object uses the
|
|
|
|
// checkmark bits (encoding above) instead of the standard
|
|
|
|
// mark bits.
|
|
|
|
var useCheckmark = false
|
2015-02-19 11:38:46 -07:00
|
|
|
|
|
|
|
//go:nowritebarrier
|
|
|
|
func initCheckmarks() {
|
2015-02-19 14:43:27 -07:00
|
|
|
useCheckmark = true
|
2015-02-19 11:38:46 -07:00
|
|
|
for _, s := range work.spans {
|
|
|
|
if s.state == _MSpanInUse {
|
|
|
|
heapBitsForSpan(s.base()).initCheckmarkSpan(s.layout())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func clearCheckmarks() {
|
2015-02-19 14:43:27 -07:00
|
|
|
useCheckmark = false
|
2015-02-19 11:38:46 -07:00
|
|
|
for _, s := range work.spans {
|
|
|
|
if s.state == _MSpanInUse {
|
|
|
|
heapBitsForSpan(s.base()).clearCheckmarkSpan(s.layout())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|