
runtime: simplify mem profile checking in mallocgc

Checking whether the current allocation needs to be profiled is
currently branchy and involves more code than one might expect. The
branches are generally predictable, but it is still a surprising
number of instructions.
Part of the problem is that MemProfileRate is just a global that can be
set at any time, so we need to load it and check certain settings
explicitly. In an ideal world, we would just always subtract from
nextSample and have a single branch to take the slow path if we
subtract below zero.
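
For reference, a rough before/after of that fast path (names taken
from the diff below; the "ideal world" version is a sketch, not code
in this CL):

    // Check today: several loads and branches on every allocation.
    if rate := MemProfileRate; rate > 0 {
        if rate != 1 && fullSize < c.nextSample {
            c.nextSample -= fullSize
        } else {
            profilealloc(mp, x, fullSize)
        }
    }

    // Ideal world: one subtraction and one predictable branch.
    c.nextSample -= int64(fullSize)
    if c.nextSample < 0 {
        profilealloc(mp, x, fullSize)
    }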

If MemProfileRate were a function, we could intentionally trash all
the nextSample values in each mcache whenever the rate changes. This
would be slow, but MemProfileRate changes rarely while the malloc hot
path is, well, hot. Unfortunately, MemProfileRate is a plain exported
variable, so there is no hook that runs when it changes.
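
Purely to illustrate that hypothetical world (SetMemProfileRate and
forEachMCache are invented names here, not real runtime APIs):

    // Hypothetical sketch only; MemProfileRate is an ordinary exported
    // variable, so no such setter hook exists.
    func SetMemProfileRate(rate int) {
        memProfileRate = rate           // imagined internal storage
        forEachMCache(func(c *mcache) { // imagined helper over all Ps' mcaches
            c.nextSample = -1 // next malloc on this P takes the slow path
        })
    }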

Although this ideal world is, AFAICT, impossible, we can still get
close. If we cache the value of MemProfileRate in each mcache, then we
can force malloc to take the slow path whenever MemProfileRate changes.
This does require two additional loads, but crucially, these loads are
independent of everything else in mallocgc. Furthermore, the branch
dependent on those loads is incredibly predictable in practice.
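
Concretely, excerpting the malloc.go change below, the hot path
becomes:

    c.nextSample -= int64(fullSize)
    if c.nextSample < 0 || MemProfileRate != c.memProfRate {
        profilealloc(mp, x, fullSize)
    }

and profilealloc re-caches the rate and re-seeds the counter:

    c.memProfRate = MemProfileRate
    c.nextSample = nextSample()

so a stale c.memProfRate costs each P at most one extra trip through
the slow path after the rate changes.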

This CL on its own has little to no impact on mallocgc. But this
codepath is going to be duplicated in several places in the next CL,
so it pays to simplify it now. Also, we're very much trying to remedy
a death-by-a-thousand-cuts situation, and malloc is still kind of a
monster -- duplicating this code everywhere will not help if mallocgc
itself isn't streamlined first.

Lastly, there's a nice property now that all nextSample values get
immediately re-sampled when MemProfileRate changes.

Change-Id: I6443d0cf9bd7861595584442b675ac1be8ea3455
Reviewed-on: https://go-review.googlesource.com/c/go/+/615815
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Author: Michael Anthony Knyszek (committed by Gopher Robot)
Date: 2024-09-25 16:51:52 +00:00
parent e3d372aea8
commit 31437f25f2
3 changed files with 27 additions and 22 deletions

src/runtime/arena.go

@@ -813,8 +813,8 @@ func newUserArenaChunk() (unsafe.Pointer, *mspan) {
 		throw("newUserArenaChunk called without a P or outside bootstrapping")
 	}
 	// Note cache c only valid while m acquired; see #47302
-	if rate != 1 && userArenaChunkBytes < c.nextSample {
-		c.nextSample -= userArenaChunkBytes
+	if rate != 1 && int64(userArenaChunkBytes) < c.nextSample {
+		c.nextSample -= int64(userArenaChunkBytes)
 	} else {
 		profilealloc(mp, unsafe.Pointer(span.base()), userArenaChunkBytes)
 	}

src/runtime/malloc.go

@@ -1247,21 +1247,19 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		asanunpoison(x, userSize)
 	}
+	// Note cache c only valid while m acquired; see #47302
+	//
+	// N.B. Use the full size because that matches how the GC
+	// will update the mem profile on the "free" side.
+	//
 	// TODO(mknyszek): We should really count the header as part
 	// of gc_sys or something. The code below just pretends it is
 	// internal fragmentation and matches the GC's accounting by
 	// using the whole allocation slot.
 	fullSize := span.elemsize
-	if rate := MemProfileRate; rate > 0 {
-		// Note cache c only valid while m acquired; see #47302
-		//
-		// N.B. Use the full size because that matches how the GC
-		// will update the mem profile on the "free" side.
-		if rate != 1 && fullSize < c.nextSample {
-			c.nextSample -= fullSize
-		} else {
-			profilealloc(mp, x, fullSize)
-		}
-	}
+	c.nextSample -= int64(fullSize)
+	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
+		profilealloc(mp, x, fullSize)
+	}
 
 	mp.mallocing = 0
 	releasem(mp)
@@ -1465,11 +1463,16 @@ func maps_newarray(typ *_type, n int) unsafe.Pointer {
 	return newarray(typ, n)
 }
 
+// profilealloc resets the current mcache's nextSample counter and
+// records a memory profile sample.
+//
+// The caller must be non-preemptible and have a P.
 func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
 	c := getMCache(mp)
 	if c == nil {
 		throw("profilealloc called without a P or outside bootstrapping")
 	}
+	c.memProfRate = MemProfileRate
 	c.nextSample = nextSample()
 	mProf_Malloc(mp, x, size)
 }
@@ -1481,12 +1484,13 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
 // processes, the distance between two samples follows the exponential
 // distribution (exp(MemProfileRate)), so the best return value is a random
 // number taken from an exponential distribution whose mean is MemProfileRate.
-func nextSample() uintptr {
+func nextSample() int64 {
+	if MemProfileRate == 0 {
+		// Basically never sample.
+		return maxInt64
+	}
 	if MemProfileRate == 1 {
-		// Callers assign our return value to
-		// mcache.next_sample, but next_sample is not used
-		// when the rate is 1. So avoid the math below and
-		// just return something.
+		// Sample immediately.
 		return 0
 	}
 	if GOOS == "plan9" {
@@ -1496,7 +1500,7 @@ func nextSample() uintptr {
 		}
 	}
 
-	return uintptr(fastexprand(MemProfileRate))
+	return int64(fastexprand(MemProfileRate))
 }
 
 // fastexprand returns a random number from an exponential distribution with
@@ -1531,14 +1535,14 @@ func fastexprand(mean int) int32 {
 
 // nextSampleNoFP is similar to nextSample, but uses older,
 // simpler code to avoid floating point.
-func nextSampleNoFP() uintptr {
+func nextSampleNoFP() int64 {
 	// Set first allocation sample size.
 	rate := MemProfileRate
 	if rate > 0x3fffffff { // make 2*rate not overflow
 		rate = 0x3fffffff
 	}
 	if rate != 0 {
-		return uintptr(cheaprandn(uint32(2 * rate)))
+		return int64(cheaprandn(uint32(2 * rate)))
 	}
 	return 0
 }

src/runtime/mcache.go

@@ -21,8 +21,9 @@ type mcache struct {
 
 	// The following members are accessed on every malloc,
 	// so they are grouped here for better caching.
-	nextSample uintptr // trigger heap sample after allocating this many bytes
-	scanAlloc  uintptr // bytes of scannable heap allocated
+	nextSample  int64   // trigger heap sample after allocating this many bytes
+	memProfRate int     // cached mem profile rate, used to detect changes
+	scanAlloc   uintptr // bytes of scannable heap allocated
 
 	// Allocator cache for tiny objects w/o pointers.
 	// See "Tiny allocator" comment in malloc.go.