From ec25210564562571aeb39cdfd6e02270d7f3fb1d Mon Sep 17 00:00:00 2001
From: Austin Clements
Date: Thu, 22 Feb 2018 20:38:09 -0500
Subject: [PATCH] runtime: support a two-level arena map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.

However, there are two problems with this:

1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.

2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but
   with a single-level arena, these are inversely proportional.

This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index. At the moment, arenaL1Bits is always 0, so we effectively have
a single level map. We do a few things so that this has no cost
beyond the current single-level map:

1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.

2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.

3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.

Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:

name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)

(https://perf.golang.org/search?q=upload:20180223.2)

For #23900.

Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements
TryBot-Result: Gobot Gobot
Reviewed-by: Rick Hudson
---
 src/cmd/compile/internal/gc/inl.go | 15 +++++
 src/runtime/heapdump.go            | 12 +++-
 src/runtime/malloc.go              | 77 +++++++++++++++++++-------
 src/runtime/mbitmap.go             | 75 ++++++++++++++++---------
 src/runtime/mheap.go               | 88 +++++++++++++++++++++++-------
 5 files changed, 197 insertions(+), 70 deletions(-)
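
The standalone sketch below (not part of the change itself) mirrors the
index arithmetic described in the commit message: an address is
converted to a flat arena frame number, which is then split into an L1
index and an L2 index. The names follow the runtime's (arenaIndex,
arenaIdx, l1, l2), but the constant values are made up for illustration
and do not match any real GOOS/GOARCH configuration, where arenaL1Bits
is currently always 0. Two further sketches after the last hunk walk
through the two-level map lookup and the span-map registration loop.

package main

import "fmt"

const (
	logHeapArenaBytes = 26 // 64 MB arenas, as in the commit message
	heapArenaBytes    = 1 << logHeapArenaBytes
	arenaL1Bits       = 6  // illustrative only; currently 0 on all platforms
	arenaL2Bits       = 22 // illustrative only
	arenaL1Shift      = arenaL2Bits
	arenaBits         = arenaL1Bits + arenaL2Bits
	arenaBaseOffset   = 0 // illustrative only; non-zero on some platforms
)

// arenaIdx is a flat arena frame number; l1 and l2 split it into the
// two levels of the map.
type arenaIdx uint

func arenaIndex(p uintptr) arenaIdx {
	return arenaIdx((p + arenaBaseOffset) / heapArenaBytes)
}

func (i arenaIdx) l1() uint { return uint(i) >> arenaL1Shift }
func (i arenaIdx) l2() uint { return uint(i) & (1<<arenaL1Shift - 1) }

func main() {
	// A typical 64-bit Go heap address.
	p := uintptr(0xc000100000)
	ai := arenaIndex(p)
	fmt.Printf("p=%#x frame=%d l1=%d l2=%d (of %d possible frames)\n",
		p, uint(ai), ai.l1(), ai.l2(), 1<<arenaBits)
}
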
diff --git a/src/cmd/compile/internal/gc/inl.go b/src/cmd/compile/internal/gc/inl.go
index e2456eb96f..60df4d06fd 100644
--- a/src/cmd/compile/internal/gc/inl.go
+++ b/src/cmd/compile/internal/gc/inl.go
@@ -304,6 +304,21 @@ func (v *hairyVisitor) visit(n *Node) bool {
 		if t.Nname() == nil {
 			Fatalf("no function definition for [%p] %+v\n", t, t)
 		}
+		if isRuntimePkg(n.Left.Sym.Pkg) {
+			fn := n.Left.Sym.Name
+			if fn == "heapBits.nextArena" {
+				// Special case: explicitly allow
+				// mid-stack inlining of
+				// runtime.heapBits.next even though
+				// it calls slow-path
+				// runtime.heapBits.nextArena.
+				//
+				// TODO(austin): Once mid-stack
+				// inlining is the default, remove
+				// this special case.
+				break
+			}
+		}
 		if inlfn := asNode(t.FuncType().Nname).Func; inlfn.Inl.Len() != 0 {
 			v.budget -= inlfn.InlCost
 			break
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index dbeaed9277..b255cbbae3 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -489,9 +489,15 @@ func dumpparams() {
 	}
 	dumpint(sys.PtrSize)
 	var arenaStart, arenaEnd uintptr
-	for i, ha := range mheap_.arenas {
-		if ha != nil {
-			base := arenaBase(uint(i))
+	for i1 := range mheap_.arenas {
+		if mheap_.arenas[i1] == nil {
+			continue
+		}
+		for i, ha := range mheap_.arenas[i1] {
+			if ha == nil {
+				continue
+			}
+			base := arenaBase(arenaIdx(i1)<<arenaL1Shift | arenaIdx(i))
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
-		} else if arenaIndex(p+n-1) >= uint(len(mheap_.arenas)) {
+		} else if arenaIndex(p+n-1) >= 1<<arenaBits {
-		} else if arenaIndex(p) >= uint(len(mheap_.arenas)) {
+		} else if arenaIndex(p) >= 1<<arenaBits {
-		} else if arenaIndex(p+size-1) >= uint(len(mheap_.arenas)) {
+		} else if arenaIndex(p+size-1) >= 1<<arenaBits {
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
 	nlargefree  uint64                  // number of frees for large objects (>maxsmallsize)
 	nsmallfree  [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
 
-	// arenas is the heap arena map.
-	// arenas[(va+arenaBaseOffset)/heapArenaBytes] points to the
-	// metadata for the heap arena containing va.
+	// arenas is the heap arena map. It points to the metadata for
+	// the heap for every arena frame of the entire usable virtual
+	// address space.
 	//
 	// Use arenaIndex to compute indexes into this array.
 	//
@@ -110,9 +110,13 @@ type mheap struct {
 	// transition from nil to non-nil at any time when the lock
 	// isn't held. (Entries never transitions back to nil.)
 	//
-	// This structure is fully mapped by mallocinit, so it's safe
-	// to probe any index.
-	arenas *[(1 << heapAddrBits) / heapArenaBytes]*heapArena
+	// In general, this is a two-level mapping consisting of an L1
+	// map and possibly many L2 maps. This saves space when there
+	// are a huge number of arena frames. However, on many
+	// platforms (even 64-bit), arenaL1Bits is 0, making this
+	// effectively a single-level map. In this case, arenas[0]
+	// will never be nil.
+	arenas [1 << arenaL1Bits]*[1 << arenaL2Bits]*heapArena
 
 	// heapArenaAlloc is pre-reserved space for allocating heapArena
 	// objects. This is only used on 32-bit, where we pre-reserve
@@ -410,24 +414,48 @@ func (sc spanClass) noscan() bool {
 	return sc&1 != 0
 }
 
-// arenaIndex returns the mheap_.arenas index of the arena containing
-// metadata for p. If p is outside the range of valid heap addresses,
-// it returns an index larger than len(mheap_.arenas).
+// arenaIndex returns the index into mheap_.arenas of the arena
+// containing metadata for p. This index combines an index into the
+// L1 map and an index into the L2 map and should be used as
+// mheap_.arenas[ai.l1()][ai.l2()].
+//
+// If p is outside the range of valid heap addresses, either l1() or
+// l2() will be out of bounds.
 //
 // It is nosplit because it's called by spanOf and several other
 // nosplit functions.
 //
 //go:nosplit
-func arenaIndex(p uintptr) uint {
-	return uint((p + arenaBaseOffset) / heapArenaBytes)
+func arenaIndex(p uintptr) arenaIdx {
+	return arenaIdx((p + arenaBaseOffset) / heapArenaBytes)
 }
 
 // arenaBase returns the low address of the region covered by heap
 // arena i.
-func arenaBase(i uint) uintptr {
+func arenaBase(i arenaIdx) uintptr {
 	return uintptr(i)*heapArenaBytes - arenaBaseOffset
 }
 
+type arenaIdx uint
+
+func (i arenaIdx) l1() uint {
+	if arenaL1Bits == 0 {
+		// Let the compiler optimize this away if there's no
+		// L1 map.
+		return 0
+	} else {
+		return uint(i) >> arenaL1Shift
+	}
+}
+
+func (i arenaIdx) l2() uint {
+	if arenaL1Bits == 0 {
+		return uint(i)
+	} else {
+		return uint(i) & (1<<arenaL1Shift - 1)
+	}
+}
+
 	ri := arenaIndex(p)
-	if ri >= uint(len(mheap_.arenas)) {
+	if arenaL1Bits == 0 {
+		// If there's no L1, then ri.l1() can't be out of bounds but ri.l2() can.
+		if ri.l2() >= uint(len(mheap_.arenas[0])) {
+			return nil
+		}
+	} else {
+		// If there's an L1, then ri.l1() can be out of bounds but ri.l2() can't.
+		if ri.l1() >= uint(len(mheap_.arenas)) {
+			return nil
+		}
+	}
+	l2 := mheap_.arenas[ri.l1()]
+	if arenaL1Bits != 0 && l2 == nil { // Should never happen if there's no L1.
 		return nil
 	}
-	ha := mheap_.arenas[ri]
+	ha := l2[ri.l2()]
 	if ha == nil {
 		return nil
 	}
@@ -488,7 +530,8 @@ func spanOf(p uintptr) *mspan {
 //
 //go:nosplit
 func spanOfUnchecked(p uintptr) *mspan {
-	return mheap_.arenas[arenaIndex(p)].spans[(p/pageSize)%pagesPerArena]
+	ai := arenaIndex(p)
+	return mheap_.arenas[ai.l1()][ai.l2()].spans[(p/pageSize)%pagesPerArena]
 }
 
 // spanOfHeap is like spanOf, but returns nil if p does not point to a
@@ -763,18 +806,21 @@ func (h *mheap) allocManual(npage uintptr, stat *uint64) *mspan {
 
 // setSpan modifies the span map so spanOf(base) is s.
 func (h *mheap) setSpan(base uintptr, s *mspan) {
-	h.arenas[arenaIndex(base)].spans[(base/pageSize)%pagesPerArena] = s
+	ai := arenaIndex(base)
+	h.arenas[ai.l1()][ai.l2()].spans[(base/pageSize)%pagesPerArena] = s
 }
 
 // setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize))
 // is s.
 func (h *mheap) setSpans(base, npage uintptr, s *mspan) {
 	p := base / pageSize
-	ha := h.arenas[arenaIndex(base)]
+	ai := arenaIndex(base)
+	ha := h.arenas[ai.l1()][ai.l2()]
 	for n := uintptr(0); n < npage; n++ {
 		i := (p + n) % pagesPerArena
 		if i == 0 {
-			ha = h.arenas[arenaIndex(base+n*pageSize)]
+			ai = arenaIndex(base + n*pageSize)
+			ha = h.arenas[ai.l1()][ai.l2()]
 		}
 		ha.spans[i] = s
 	}
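
To make the new shape of mheap_.arenas concrete, here is a standalone
sketch (not runtime code) of the same two-level layout and of the
nil-safe lookup that the spanOf hunk above performs. The array sizes
are deliberately tiny so the structure is easy to trace by hand; in the
runtime they are 1<<arenaL1Bits and 1<<arenaL2Bits, and arenaL1Bits is
currently 0 everywhere, which collapses the L1 array to one entry. The
arenaOf helper and the heapArena stand-in are inventions of this sketch.

package main

import "fmt"

const (
	arenaL1Bits  = 2 // non-zero here only to exercise both levels
	arenaL2Bits  = 4
	arenaL1Shift = arenaL2Bits
)

// heapArena is a stand-in for the runtime's per-arena metadata.
type heapArena struct{ id int }

type arenaIdx uint

func (i arenaIdx) l1() uint { return uint(i) >> arenaL1Shift }
func (i arenaIdx) l2() uint { return uint(i) & (1<<arenaL1Shift - 1) }

// arenas mirrors mheap.arenas: an L1 array of pointers to L2 arrays of
// *heapArena. With arenaL1Bits == 0 the L1 array has a single element,
// so in practice there is no extra level of indirection.
var arenas [1 << arenaL1Bits]*[1 << arenaL2Bits]*heapArena

// arenaOf follows the same nil-safe path as spanOf: bounds-check the
// level that can actually be out of range, then walk L1, then L2.
func arenaOf(ai arenaIdx) *heapArena {
	if arenaL1Bits == 0 {
		if ai.l2() >= uint(len(arenas[0])) {
			return nil
		}
	} else {
		if ai.l1() >= uint(len(arenas)) {
			return nil
		}
	}
	l2 := arenas[ai.l1()]
	if l2 == nil {
		return nil
	}
	return l2[ai.l2()]
}

func main() {
	// Install an arena at frame 19 (l1=1, l2=3), allocating the L2
	// array for that L1 slot on demand before filling in the entry.
	ai := arenaIdx(19)
	if arenas[ai.l1()] == nil {
		arenas[ai.l1()] = new([1 << arenaL2Bits]*heapArena)
	}
	arenas[ai.l1()][ai.l2()] = &heapArena{id: 19}

	fmt.Println(arenaOf(19)) // &{19}: mapped frame
	fmt.Println(arenaOf(21)) // <nil>: same L2 array, unmapped slot
	fmt.Println(arenaOf(40)) // <nil>: L2 array never allocated
}
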
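
The setSpans hunk above fetches the arena once for the first page and
then re-fetches it only when the per-arena page index wraps to 0, that
is, when the run of pages crosses into the next arena. The sketch below
reproduces that pattern with made-up sizes: pagesPerArena is tiny, the
mspan and heapArena types are stand-ins, and the map-backed arenaFor
helper replaces the real two-level mheap_.arenas lookup.

package main

import "fmt"

const pagesPerArena = 4 // real value: heapArenaBytes / pageSize

type mspan struct{ id int }

// heapArena holds the per-page span map for one arena.
type heapArena struct{ spans [pagesPerArena]*mspan }

// arenas and arenaFor stand in for the two-level mheap_.arenas lookup;
// a map keyed by arena frame number keeps the sketch short.
var arenas = map[uint]*heapArena{}

func arenaFor(page uint) *heapArena {
	frame := page / pagesPerArena
	if arenas[frame] == nil {
		arenas[frame] = &heapArena{}
	}
	return arenas[frame]
}

// setSpans mirrors mheap.setSpans: look up the arena for the first
// page, then re-look it up only when the per-arena page index wraps.
func setSpans(basePage, npage uint, s *mspan) {
	ha := arenaFor(basePage)
	for n := uint(0); n < npage; n++ {
		i := (basePage + n) % pagesPerArena
		if i == 0 {
			ha = arenaFor(basePage + n)
		}
		ha.spans[i] = s
	}
}

func main() {
	s := &mspan{id: 7}
	setSpans(3, 6, s) // pages 3..8 cross two arena boundaries
	for page := uint(0); page < 12; page++ {
		ha := arenas[page/pagesPerArena]
		if ha != nil && ha.spans[page%pagesPerArena] == s {
			fmt.Println("page", page, "-> span", s.id)
		}
	}
}
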