// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Central free lists.
//
// See malloc.go for an overview.
//
// The MCentral doesn't actually contain the list of free objects; the MSpan does.
// Each MCentral is two lists of MSpans: those with free objects (c->nonempty)
// and those that are completely allocated (c->empty).
package runtime

import "runtime/internal/atomic"

// Central list of free objects of a given size.
//
//go:notinheap
type mcentral struct {
	lock      mutex // protects the two span lists below
	sizeclass int32 // size class this mcentral serves
	nonempty  mSpanList // list of spans with a free object, ie a nonempty free list
	empty     mSpanList // list of spans with no free objects (or cached in an mcache)

	// nmalloc is the cumulative count of objects allocated from
	// this mcentral, assuming all spans in mcaches are
	// fully-allocated. Written atomically, read under STW.
	nmalloc uint64
}
2014-11-11 15:05:02 -07:00
|
|
|
// Initialize a single central free list.
|
2015-11-11 17:13:51 -07:00
|
|
|
func (c *mcentral) init(sizeclass int32) {
|
2014-11-11 15:05:02 -07:00
|
|
|
c.sizeclass = sizeclass
|
2015-11-11 17:13:51 -07:00
|
|
|
c.nonempty.init()
|
|
|
|
c.empty.init()
|
2014-11-11 15:05:02 -07:00
|
|
|
}
|
|
|
|
|
// Allocate a span to use in an MCache.
//
// cacheSpan first tries to reuse a span from c's lists, sweeping one if
// necessary, and only grows from the heap when both lists are exhausted.
// On success the returned span has been moved to the back of c.empty,
// s.incache is set, and s's allocCache is positioned at s.freeindex.
// Returns nil only if the heap cannot supply a new span.
func (c *mcentral) cacheSpan() *mspan {
	// Deduct credit for this span allocation and sweep if necessary.
	spanBytes := uintptr(class_to_allocnpages[c.sizeclass]) * _PageSize
	deductSweepCredit(spanBytes, 0)

	lock(&c.lock)
	sg := mheap_.sweepgen
retry:
	var s *mspan
	// First preference: a nonempty span. sweepgen == sg-2 means the span
	// needs sweeping for this cycle; the CAS to sg-1 claims the sweep.
	for s = c.nonempty.first; s != nil; s = s.next {
		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
			c.nonempty.remove(s)
			c.empty.insertBack(s)
			// Sweep outside the lock; we own the span via the CAS above.
			unlock(&c.lock)
			s.sweep(true)
			goto havespan
		}
		if s.sweepgen == sg-1 {
			// the span is being swept by background sweeper, skip
			continue
		}
		// we have a nonempty span that does not require sweeping, allocate from it
		c.nonempty.remove(s)
		c.empty.insertBack(s)
		unlock(&c.lock)
		goto havespan
	}

	// Second preference: sweep a span on the empty list in the hope that
	// sweeping frees some objects in it.
	for s = c.empty.first; s != nil; s = s.next {
		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
			// we have an empty span that requires sweeping,
			// sweep it and see if we can free some space in it
			c.empty.remove(s)
			// swept spans are at the end of the list
			c.empty.insertBack(s)
			unlock(&c.lock)
			s.sweep(true)
			freeIndex := s.nextFreeIndex()
			if freeIndex != s.nelems {
				// Sweeping freed at least one object; use this span.
				s.freeindex = freeIndex
				goto havespan
			}
			lock(&c.lock)
			// the span is still empty after sweep
			// it is already in the empty list, so just retry
			goto retry
		}
		if s.sweepgen == sg-1 {
			// the span is being swept by background sweeper, skip
			continue
		}
		// already swept empty span,
		// all subsequent ones must also be either swept or in process of sweeping
		break
	}
	unlock(&c.lock)

	// Replenish central list if empty.
	s = c.grow()
	if s == nil {
		return nil
	}
	lock(&c.lock)
	c.empty.insertBack(s)
	unlock(&c.lock)

	// At this point s is a non-empty span, queued at the end of the empty list,
	// c is unlocked.
havespan:
	cap := int32((s.npages << _PageShift) / s.elemsize)
	n := cap - int32(s.allocCount)
	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
		throw("span has no free objects")
	}
	// Assume all objects from this span will be allocated in the
	// mcache. If it gets uncached, we'll adjust this.
	atomic.Xadd64(&c.nmalloc, int64(n))
	usedBytes := uintptr(s.allocCount) * s.elemsize
	if usedBytes > 0 {
		// Some objects were already allocated; give back the sweep
		// credit that deductSweepCredit charged for them above.
		reimburseSweepCredit(usedBytes)
	}
	// Conservatively count all currently-free slots in this span as live
	// now; uncacheSpan undoes this for any slots that go unused.
	atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
	if trace.enabled {
		// heap_live changed.
		traceHeapAlloc()
	}
	if gcBlackenEnabled != 0 {
		// heap_live changed.
		gcController.revise()
	}
	s.incache = true
	freeByteBase := s.freeindex &^ (64 - 1)
	whichByte := freeByteBase / 8
	// Init alloc bits cache.
	s.refillAllocCache(whichByte)

	// Adjust the allocCache so that s.freeindex corresponds to the low bit in
	// s.allocCache.
	s.allocCache >>= s.freeindex % 64

	return s
}
|
|
|
// Return span from an MCache.
|
2015-11-11 17:13:51 -07:00
|
|
|
func (c *mcentral) uncacheSpan(s *mspan) {
|
2014-11-11 15:05:02 -07:00
|
|
|
lock(&c.lock)
|
|
|
|
|
|
|
|
s.incache = false
|
|
|
|
|
2016-02-16 15:16:43 -07:00
|
|
|
if s.allocCount == 0 {
|
|
|
|
throw("uncaching span but s.allocCount == 0")
|
2014-11-11 15:05:02 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
cap := int32((s.npages << _PageShift) / s.elemsize)
|
2016-02-16 15:16:43 -07:00
|
|
|
n := cap - int32(s.allocCount)
|
2014-11-11 15:05:02 -07:00
|
|
|
if n > 0 {
|
2015-11-11 17:13:51 -07:00
|
|
|
c.empty.remove(s)
|
|
|
|
c.nonempty.insert(s)
|
runtime: fix (sometimes major) underestimation of heap_live
Currently, we update memstats.heap_live from mcache.local_cachealloc
whenever we lock the heap (e.g., to obtain a fresh span or to release
an unused span). However, under the right circumstances,
local_cachealloc can accumulate allocations up to the size of
the *entire heap* without flushing them to heap_live. Specifically,
since span allocations from an mcentral don't lock the heap, if a
large number of pages are held in an mcentral and the application
continues to use and free objects of that size class (e.g., the
BinaryTree17 benchmark), local_cachealloc won't be flushed until the
mcentral runs out of spans.
This is a problem because, unlike many of the memory statistics that
are purely informative, heap_live is used to determine when the
garbage collector should start and how hard it should work.
This commit eliminates local_cachealloc, instead atomically updating
heap_live directly. To control contention, we do this only when
obtaining a span from an mcentral. Furthermore, we make heap_live
conservative: allocating a span assumes that all free slots in that
span will be used and accounts for these when the span is
allocated, *before* the objects themselves are. This is important
because 1) this triggers the GC earlier than necessary rather than
potentially too late and 2) this leads to a conservative GC rate
rather than a GC rate that is potentially too low.
Alternatively, we could have flushed local_cachealloc when it passed
some threshold, but this would require determining a threshold and
would cause heap_live to underestimate the true value rather than
overestimate.
Fixes #12199.
name old time/op new time/op delta
BinaryTree17-12 2.88s ± 4% 2.88s ± 1% ~ (p=0.470 n=19+19)
Fannkuch11-12 2.48s ± 1% 2.48s ± 1% ~ (p=0.243 n=16+19)
FmtFprintfEmpty-12 50.9ns ± 2% 50.7ns ± 1% ~ (p=0.238 n=15+14)
FmtFprintfString-12 175ns ± 1% 171ns ± 1% -2.48% (p=0.000 n=18+18)
FmtFprintfInt-12 159ns ± 1% 158ns ± 1% -0.78% (p=0.000 n=19+18)
FmtFprintfIntInt-12 270ns ± 1% 265ns ± 2% -1.67% (p=0.000 n=18+18)
FmtFprintfPrefixedInt-12 235ns ± 1% 234ns ± 0% ~ (p=0.362 n=18+19)
FmtFprintfFloat-12 309ns ± 1% 308ns ± 1% -0.41% (p=0.001 n=18+19)
FmtManyArgs-12 1.10µs ± 1% 1.08µs ± 0% -1.96% (p=0.000 n=19+18)
GobDecode-12 7.81ms ± 1% 7.80ms ± 1% ~ (p=0.425 n=18+19)
GobEncode-12 6.53ms ± 1% 6.53ms ± 1% ~ (p=0.817 n=19+19)
Gzip-12 312ms ± 1% 312ms ± 2% ~ (p=0.967 n=19+20)
Gunzip-12 42.0ms ± 1% 41.9ms ± 1% ~ (p=0.172 n=19+19)
HTTPClientServer-12 63.7µs ± 1% 63.8µs ± 1% ~ (p=0.639 n=19+19)
JSONEncode-12 16.4ms ± 1% 16.4ms ± 1% ~ (p=0.954 n=19+19)
JSONDecode-12 58.5ms ± 1% 57.8ms ± 1% -1.27% (p=0.000 n=18+19)
Mandelbrot200-12 3.86ms ± 1% 3.88ms ± 0% +0.44% (p=0.000 n=18+18)
GoParse-12 3.67ms ± 2% 3.66ms ± 1% -0.52% (p=0.001 n=18+19)
RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 0% ~ (p=0.257 n=19+18)
RegexpMatchEasy0_1K-12 347ns ± 1% 347ns ± 1% ~ (p=0.527 n=18+18)
RegexpMatchEasy1_32-12 83.7ns ± 2% 83.1ns ± 2% ~ (p=0.096 n=18+19)
RegexpMatchEasy1_1K-12 509ns ± 1% 505ns ± 1% -0.75% (p=0.000 n=18+19)
RegexpMatchMedium_32-12 130ns ± 2% 129ns ± 1% ~ (p=0.962 n=20+20)
RegexpMatchMedium_1K-12 39.5µs ± 2% 39.4µs ± 1% ~ (p=0.376 n=20+19)
RegexpMatchHard_32-12 2.04µs ± 0% 2.04µs ± 1% ~ (p=0.195 n=18+17)
RegexpMatchHard_1K-12 61.4µs ± 1% 61.4µs ± 1% ~ (p=0.885 n=19+19)
Revcomp-12 540ms ± 2% 542ms ± 4% ~ (p=0.552 n=19+17)
Template-12 69.6ms ± 1% 71.2ms ± 1% +2.39% (p=0.000 n=20+20)
TimeParse-12 357ns ± 1% 357ns ± 1% ~ (p=0.883 n=18+20)
TimeFormat-12 379ns ± 1% 362ns ± 1% -4.53% (p=0.000 n=18+19)
[Geo mean] 62.0µs 61.8µs -0.44%
name old time/op new time/op delta
XBenchGarbage-12 5.89ms ± 2% 5.81ms ± 2% -1.41% (p=0.000 n=19+18)
Change-Id: I96b31cca6ae77c30693a891cff3fe663fa2447a0
Reviewed-on: https://go-review.googlesource.com/17748
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2015-12-11 15:49:14 -07:00
|
|
|
// mCentral_CacheSpan conservatively counted
|
|
|
|
// unallocated slots in heap_live. Undo this.
|
|
|
|
atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
|
2017-01-03 10:15:55 -07:00
|
|
|
// cacheSpan updated alloc assuming all objects on s
|
|
|
|
// were going to be allocated. Adjust for any that
|
|
|
|
// weren't.
|
|
|
|
atomic.Xadd64(&c.nmalloc, -int64(n))
|
2014-11-11 15:05:02 -07:00
|
|
|
}
|
|
|
|
unlock(&c.lock)
|
|
|
|
}
|
|
// freeSpan updates c and s after sweeping s.
// It sets s's sweepgen to the latest generation,
// and, based on the number of free objects in s,
// moves s to the appropriate list of c or returns it
// to the heap.
// freeSpan returns true if s was returned to the heap.
// If preserve=true, it does not move s (the caller
// must take care of it).
func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool {
	if s.incache {
		throw("freeSpan given cached span")
	}
	// Sweeping may have left freed slots dirty; force zeroing on reuse.
	s.needzero = 1

	if preserve {
		// preserve is set only when called from MCentral_CacheSpan above,
		// the span must be in the empty list.
		if !s.inList() {
			throw("can't preserve unlinked span")
		}
		atomic.Store(&s.sweepgen, mheap_.sweepgen)
		return false
	}

	lock(&c.lock)

	// Move to nonempty if necessary.
	if wasempty {
		c.empty.remove(s)
		c.nonempty.insert(s)
	}

	// delay updating sweepgen until here. This is the signal that
	// the span may be used in an MCache, so it must come after the
	// linked list operations above (actually, just after the
	// lock of c above.)
	atomic.Store(&s.sweepgen, mheap_.sweepgen)

	if s.allocCount != 0 {
		// Span still has live objects; keep it in c.
		unlock(&c.lock)
		return false
	}

	// Span is completely free: return it to the heap.
	c.nonempty.remove(s)
	unlock(&c.lock)
	mheap_.freeSpan(s, 0)
	return true
}
|
2016-02-11 11:57:58 -07:00
|
|
|
// grow allocates a new empty span from the heap and initializes it for c's size class.
|
2015-11-11 17:13:51 -07:00
|
|
|
func (c *mcentral) grow() *mspan {
|
2014-11-11 15:05:02 -07:00
|
|
|
npages := uintptr(class_to_allocnpages[c.sizeclass])
|
|
|
|
size := uintptr(class_to_size[c.sizeclass])
|
|
|
|
n := (npages << _PageShift) / size
|
|
|
|
|
2015-11-11 17:13:51 -07:00
|
|
|
s := mheap_.alloc(npages, c.sizeclass, false, true)
|
2014-11-11 15:05:02 -07:00
|
|
|
if s == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-03-14 10:02:02 -06:00
|
|
|
p := s.base()
|
2014-11-11 15:05:02 -07:00
|
|
|
s.limit = p + size*n
|
2016-02-11 11:57:58 -07:00
|
|
|
|
2016-02-09 07:38:44 -07:00
|
|
|
heapBitsForSpan(s.base()).initSpan(s)
|
2014-11-11 15:05:02 -07:00
|
|
|
return s
|
|
|
|
}
|