runtime: adjust huge page flags only on huge page granularity
By reducing the number of memory mappings created by the memory
allocator, this fixes an issue where the runtime panics with "out of
memory" or "cannot allocate memory" even though ample memory is
available.
Commit 7e1b61c
worked around issue #8832 where Linux's transparent
huge page support could dramatically increase the RSS of a Go process
by setting the MADV_NOHUGEPAGE flag on any regions of pages released
to the OS with MADV_DONTNEED. This had the side effect of also
increasing the number of VMAs (memory mappings) in a Go address space
because a separate VMA is needed for every region of the virtual
address space with different flags. Unfortunately, by default, Linux
limits the number of VMAs in an address space to 65530, and a large
heap can quickly reach this limit when the runtime starts scavenging
memory.
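
For context (not part of this change), the mapping count and the limit can
be read from procfs on Linux. A minimal sketch, assuming only the standard
/proc/self/maps and /proc/sys/vm/max_map_count interfaces:

	// vmacount.go: report how close the current process is to the VMA limit.
	// Illustrative sketch only; not part of this commit.
	package main

	import (
		"bytes"
		"fmt"
		"io/ioutil"
	)

	func main() {
		maps, _ := ioutil.ReadFile("/proc/self/maps")             // one line per VMA
		limit, _ := ioutil.ReadFile("/proc/sys/vm/max_map_count") // default 65530
		fmt.Printf("VMAs in use: %d, limit: %s",
			bytes.Count(maps, []byte("\n")), limit)
	}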
This commit dramatically reduces the number of VMAs. It does this
primarily by only adjusting the huge page flag at huge page
granularity. With this change, on amd64, even a pessimal heap that
alternates between MADV_NOHUGEPAGE and MADV_HUGEPAGE must grow to
128GB before it hits the VMA limit. Because of this rounding to huge page
granularity, this change is also careful to leave large used and
unused regions huge page-enabled.
This change reduces the maximum number of VMAs during the runtime
benchmarks with GODEBUG=scavenge=1 from 692 to 49.
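
As a back-of-the-envelope check of the 128GB figure (assuming 2MB huge
pages and the default vm.max_map_count of 65530, so each VMA in the
pessimal alternating pattern spans at least one huge page):

	package main

	import "fmt"

	func main() {
		const hugePageSize uint64 = 2 << 20 // 2MB huge pages on amd64
		const maxMapCount uint64 = 65530    // default vm.max_map_count
		// Pessimal case: the NOHUGEPAGE flag alternates every huge page,
		// so each VMA covers at least one huge page.
		fmt.Printf("pessimal heap ~ %d GB\n", maxMapCount*hugePageSize/(1<<30)) // ~127 GB, just under 128GB
	}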
Fixes #12233.
Change-Id: Ic397776d042f20d53783a1cacf122e2e2db00584
Reviewed-on: https://go-review.googlesource.com/15191
Reviewed-by: Keith Randall <khr@golang.org>
parent 9a31d38f65
commit 44078a3228
@@ -69,29 +69,89 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
 }
 
 func sysUnused(v unsafe.Pointer, n uintptr) {
-	var s uintptr = hugePageSize // division by constant 0 is a compile-time error :(
-	if s != 0 && (uintptr(v)%s != 0 || n%s != 0) {
-		// See issue 8832
-		// Linux kernel bug: https://bugzilla.kernel.org/show_bug.cgi?id=93111
-		// Mark the region as NOHUGEPAGE so the kernel's khugepaged
-		// doesn't undo our DONTNEED request. khugepaged likes to migrate
-		// regions which are only partially mapped to huge pages, including
-		// regions with some DONTNEED marks. That needlessly allocates physical
-		// memory for our DONTNEED regions.
-		madvise(v, n, _MADV_NOHUGEPAGE)
-	}
+	// By default, Linux's "transparent huge page" support will
+	// merge pages into a huge page if there's even a single
+	// present regular page, undoing the effects of the DONTNEED
+	// below. On amd64, that means khugepaged can turn a single
+	// 4KB page to 2MB, bloating the process's RSS by as much as
+	// 512X. (See issue #8832 and Linux kernel bug
+	// https://bugzilla.kernel.org/show_bug.cgi?id=93111)
+	//
+	// To work around this, we explicitly disable transparent huge
+	// pages when we release pages of the heap. However, we have
+	// to do this carefully because changing this flag tends to
+	// split the VMA (memory mapping) containing v in to three
+	// VMAs in order to track the different values of the
+	// MADV_NOHUGEPAGE flag in the different regions. There's a
+	// default limit of 65530 VMAs per address space (sysctl
+	// vm.max_map_count), so we must be careful not to create too
+	// many VMAs (see issue #12233).
+	//
+	// Since huge pages are huge, there's little use in adjusting
+	// the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid
+	// exploding the number of VMAs by only adjusting the
+	// MADV_NOHUGEPAGE flag on a large granularity. This still
+	// gets most of the benefit of huge pages while keeping the
+	// number of VMAs under control. With hugePageSize = 2MB, even
+	// a pessimal heap can reach 128GB before running out of VMAs.
+	if hugePageSize != 0 {
+		var s uintptr = hugePageSize // division by constant 0 is a compile-time error :(
+
+		// If it's a large allocation, we want to leave huge
+		// pages enabled. Hence, we only adjust the huge page
+		// flag on the huge pages containing v and v+n-1, and
+		// only if those aren't aligned.
+		var head, tail uintptr
+		if uintptr(v)%s != 0 {
+			// Compute huge page containing v.
+			head = uintptr(v) &^ (s - 1)
+		}
+		if (uintptr(v)+n)%s != 0 {
+			// Compute huge page containing v+n-1.
+			tail = (uintptr(v) + n - 1) &^ (s - 1)
+		}
+
+		// Note that madvise will return EINVAL if the flag is
+		// already set, which is quite likely. We ignore
+		// errors.
+		if head != 0 && head+hugePageSize == tail {
+			// head and tail are different but adjacent,
+			// so do this in one call.
+			madvise(unsafe.Pointer(head), 2*hugePageSize, _MADV_NOHUGEPAGE)
+		} else {
+			// Advise the huge pages containing v and v+n-1.
+			if head != 0 {
+				madvise(unsafe.Pointer(head), hugePageSize, _MADV_NOHUGEPAGE)
+			}
+			if tail != 0 && tail != head {
+				madvise(unsafe.Pointer(tail), hugePageSize, _MADV_NOHUGEPAGE)
+			}
+		}
+	}
+
 	madvise(v, n, _MADV_DONTNEED)
 }
 
 func sysUsed(v unsafe.Pointer, n uintptr) {
 	if hugePageSize != 0 {
-		// Undo the NOHUGEPAGE marks from sysUnused. There is no alignment check
-		// around this call as spans may have been merged in the interim.
-		// Note that this might enable huge pages for regions which were
-		// previously disabled. Unfortunately there is no easy way to detect
-		// what the previous state was, and in any case we probably want huge
-		// pages to back our heap if the kernel can arrange that.
-		madvise(v, n, _MADV_HUGEPAGE)
+		// Partially undo the NOHUGEPAGE marks from sysUnused
+		// for whole huge pages between v and v+n. This may
+		// leave huge pages off at the end points v and v+n
+		// even though allocations may cover these entire huge
+		// pages. We could detect this and undo NOHUGEPAGE on
+		// the end points as well, but it's probably not worth
+		// the cost because when neighboring allocations are
+		// freed sysUnused will just set NOHUGEPAGE again.
+		var s uintptr = hugePageSize
+
+		// Round v up to a huge page boundary.
+		beg := (uintptr(v) + (s - 1)) &^ (s - 1)
+		// Round v+n down to a huge page boundary.
+		end := (uintptr(v) + n) &^ (s - 1)
+
+		if beg < end {
+			madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE)
+		}
 	}
 }
 
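
The alignment arithmetic used above can be exercised outside the runtime.
A standalone sketch of the same rounding (hugePageSize here is a local
stand-in for the runtime's value, 2MB on amd64; this is not runtime code):

	package main

	import "fmt"

	const hugePageSize uintptr = 2 << 20 // stand-in for the runtime's hugePageSize

	// hugePageBounds mirrors the rounding in sysUnused/sysUsed: head and tail
	// are the huge pages containing v and v+n-1 (set only when those ends are
	// unaligned), and [beg, end) covers the whole huge pages inside [v, v+n).
	func hugePageBounds(v, n uintptr) (head, tail, beg, end uintptr) {
		s := hugePageSize
		if v%s != 0 {
			head = v &^ (s - 1) // huge page containing v
		}
		if (v+n)%s != 0 {
			tail = (v + n - 1) &^ (s - 1) // huge page containing v+n-1
		}
		beg = (v + s - 1) &^ (s - 1) // round v up to a huge page boundary
		end = (v + n) &^ (s - 1)     // round v+n down to a huge page boundary
		return
	}

	func main() {
		head, tail, beg, end := hugePageBounds(0x300000, 0x500000) // unaligned start, aligned end
		fmt.Printf("head=%#x tail=%#x whole=[%#x,%#x)\n", head, tail, beg, end)
	}

With these inputs, sysUnused would mark only the head huge page NOHUGEPAGE
(tail stays 0 because v+n is aligned), and sysUsed would re-enable huge
pages on the whole-page span [beg, end).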