1
0
mirror of https://github.com/golang/go synced 2024-10-02 10:18:33 -06:00
go/src/runtime/malloc.go

951 lines
30 KiB
Go
Raw Normal View History

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Memory allocator, based on tcmalloc.
// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
// The main allocator works in runs of pages.
// Small allocation sizes (up to and including 32 kB) are
// rounded to one of about 100 size classes, each of which
// has its own free list of objects of exactly that size.
// Any free page of memory can be split into a set of objects
// of one size class, which are then managed using free list
// allocators.
//
// The allocator's data structures are:
//
// FixAlloc: a free-list allocator for fixed-size objects,
// used to manage storage used by the allocator.
// MHeap: the malloc heap, managed at page (4096-byte) granularity.
// MSpan: a run of pages managed by the MHeap.
// MCentral: a shared free list for a given size class.
// MCache: a per-thread (in Go, per-P) cache for small objects.
// MStats: allocation statistics.
//
// Allocating a small object proceeds up a hierarchy of caches:
//
// 1. Round the size up to one of the small size classes
// and look in the corresponding MCache free list.
// If the list is not empty, allocate an object from it.
// This can all be done without acquiring a lock.
//
// 2. If the MCache free list is empty, replenish it by
// taking a bunch of objects from the MCentral free list.
// Moving a bunch amortizes the cost of acquiring the MCentral lock.
//
// 3. If the MCentral free list is empty, replenish it by
// allocating a run of pages from the MHeap and then
// chopping that memory into objects of the given size.
// Allocating many objects amortizes the cost of locking
// the heap.
//
// 4. If the MHeap is empty or has no page runs large enough,
// allocate a new group of pages (at least 1MB) from the
// operating system. Allocating a large run of pages
// amortizes the cost of talking to the operating system.
//
// Freeing a small object proceeds up the same hierarchy:
//
// 1. Look up the size class for the object and add it to
// the MCache free list.
//
// 2. If the MCache free list is too long or the MCache has
// too much memory, return some to the MCentral free lists.
//
// 3. If all the objects in a given span have returned to
// the MCentral list, return that span to the page heap.
//
// 4. If the heap has too much memory, return some to the
// operating system.
//
// TODO(rsc): Step 4 is not implemented.
//
// Allocating and freeing a large object uses the page heap
// directly, bypassing the MCache and MCentral free lists.
//
// The small objects on the MCache and MCentral free lists
// may or may not be zeroed. They are zeroed if and only if
// the second word of the object is zero. A span in the
// page heap is zeroed unless s->needzero is set. When a span
// is allocated to break into small objects, it is zeroed if needed
// and s->needzero is set. There are two main benefits to delaying the
// zeroing this way:
//
// 1. stack frames allocated from the small object lists
// or the page heap can avoid zeroing altogether.
// 2. the cost of zeroing when reusing a small object is
// charged to the mutator, not the garbage collector.
package runtime
import (
"runtime/internal/sys"
"unsafe"
)
const (
debugMalloc = false
flagNoScan = _FlagNoScan
flagNoZero = _FlagNoZero
maxTinySize = _TinySize
tinySizeClass = _TinySizeClass
maxSmallSize = _MaxSmallSize
pageShift = _PageShift
pageSize = _PageSize
pageMask = _PageMask
mSpanInUse = _MSpanInUse
concurrentSweep = _ConcurrentSweep
)
const (
_PageShift = 13
_PageSize = 1 << _PageShift
_PageMask = _PageSize - 1
)
const (
// _64bit = 1 on 64-bit systems, 0 on 32-bit systems
_64bit = 1 << (^uintptr(0) >> 63) / 2
// Computed constant. The definition of MaxSmallSize and the
// algorithm in msize.go produces some number of different allocation
// size classes. NumSizeClasses is that number. It's needed here
// because there are static arrays of this length; when msize runs its
// size choosing algorithm it double-checks that NumSizeClasses agrees.
_NumSizeClasses = 67
// Tunable constants.
_MaxSmallSize = 32 << 10
// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go.
_TinySize = 16
_TinySizeClass = 2
_FixAllocChunk = 16 << 10 // Chunk size for FixAlloc
_MaxMHeapList = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
_HeapAllocChunk = 1 << 20 // Chunk size for heap growth
// Per-P, per order stack segment cache size.
_StackCacheSize = 32 * 1024
// Number of orders that get caching. Order 0 is FixedStack
// and each successive order is twice as large.
// We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks
// will be allocated directly.
// Since FixedStack is different on different systems, we
// must vary NumStackOrders to keep the same maximum cached size.
// OS | FixedStack | NumStackOrders
// -----------------+------------+---------------
// linux/darwin/bsd | 2KB | 4
// windows/32 | 4KB | 3
// windows/64 | 8KB | 2
// plan9 | 4KB | 3
_NumStackOrders = 4 - sys.PtrSize/4*sys.GoosWindows - 1*sys.GoosPlan9
// Number of bits in page to span calculations (4k pages).
// On Windows 64-bit we limit the arena to 32GB or 35 bits.
// Windows counts memory used by page table into committed memory
// of the process, so we can't reserve too much memory.
// See https://golang.org/issue/5402 and https://golang.org/issue/5236.
// On other 64-bit platforms, we limit the arena to 512GB, or 39 bits.
// On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
// On Darwin/arm64, we cannot reserve more than ~5GB of virtual memory,
// but as most devices have less than 4GB of physical memory anyway, we
// try to be conservative here, and only ask for a 2GB heap.
_MHeapMap_TotalBits = (_64bit*sys.GoosWindows)*35 + (_64bit*(1-sys.GoosWindows)*(1-sys.GoosDarwin*sys.GoarchArm64))*39 + sys.GoosDarwin*sys.GoarchArm64*31 + (1-_64bit)*32
_MHeapMap_Bits = _MHeapMap_TotalBits - _PageShift
_MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
// Max number of threads to run garbage collection.
// 2, 3, and 4 are all plausible maximums depending
// on the hardware details of the machine. The garbage
// collector scales well to 32 cpus.
_MaxGcproc = 32
)
// Page number (address>>pageShift)
type pageID uintptr
const _MaxArena32 = 2 << 30
// OS-defined helpers:
//
// sysAlloc obtains a large chunk of zeroed memory from the
// operating system, typically on the order of a hundred kilobytes
// or a megabyte.
// NOTE: sysAlloc returns OS-aligned memory, but the heap allocator
// may use larger alignment, so the caller must be careful to realign the
// memory obtained by sysAlloc.
//
// SysUnused notifies the operating system that the contents
// of the memory region are no longer needed and can be reused
// for other purposes.
// SysUsed notifies the operating system that the contents
// of the memory region are needed again.
//
// SysFree returns it unconditionally; this is only used if
// an out-of-memory error has been detected midway through
// an allocation. It is okay if SysFree is a no-op.
//
// SysReserve reserves address space without allocating memory.
// If the pointer passed to it is non-nil, the caller wants the
// reservation there, but SysReserve can still choose another
// location if that one is unavailable. On some systems and in some
// cases SysReserve will simply check that the address space is
// available and not actually reserve it. If SysReserve returns
// non-nil, it sets *reserved to true if the address space is
// reserved, false if it has merely been checked.
// NOTE: SysReserve returns OS-aligned memory, but the heap allocator
// may use larger alignment, so the caller must be careful to realign the
// memory obtained by sysAlloc.
//
// SysMap maps previously reserved address space for use.
// The reserved argument is true if the address space was really
// reserved, not merely checked.
//
// SysFault marks a (already sysAlloc'd) region to fault
// if accessed. Used only for debugging the runtime.
func mallocinit() {
initSizes()
if class_to_size[_TinySizeClass] != _TinySize {
throw("bad TinySizeClass")
}
var p, bitmapSize, spansSize, pSize, limit uintptr
var reserved bool
// limit = runtime.memlimit();
// See https://golang.org/issue/5049
// TODO(rsc): Fix after 1.1.
limit = 0
// Set up the allocation arena, a contiguous area of memory where
// allocated data will be found. The arena begins with a bitmap large
// enough to hold 4 bits per allocated word.
if sys.PtrSize == 8 && (limit == 0 || limit > 1<<30) {
// On a 64-bit machine, allocate from a single contiguous reservation.
// 512 GB (MaxMem) should be big enough for now.
//
// The code will work with the reservation at any address, but ask
// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
// Allocating a 512 GB region takes away 39 bits, and the amd64
// doesn't let us choose the top 17 bits, so that leaves the 9 bits
// in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means
// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
// UTF-8 sequences, and they are otherwise as far away from
// ff (likely a common byte) as possible. If that fails, we try other 0xXXc0
// addresses. An earlier attempt to use 0x11f8 caused out of memory errors
// on OS X during thread allocations. 0x00c0 causes conflicts with
// AddressSanitizer which reserves all memory up to 0x0100.
// These choices are both for debuggability and to reduce the
// odds of a conservative garbage collector (as is still used in gccgo)
// not collecting memory because some non-pointer block of memory
// had a bit pattern that matched a memory address.
//
// Actually we reserve 544 GB (because the bitmap ends up being 32 GB)
// but it hardly matters: e0 00 is not valid UTF-8 either.
//
// If this fails we fall back to the 32 bit memory mechanism
//
// However, on arm64, we ignore all this advice above and slam the
// allocation at 0x40 << 32 because when using 4k pages with 3-level
// translation buffers, the user address space is limited to 39 bits
// On darwin/arm64, the address space is even smaller.
arenaSize := round(_MaxMem, _PageSize)
bitmapSize = arenaSize / (sys.PtrSize * 8 / 4)
spansSize = arenaSize / _PageSize * sys.PtrSize
spansSize = round(spansSize, _PageSize)
for i := 0; i <= 0x7f; i++ {
switch {
case GOARCH == "arm64" && GOOS == "darwin":
p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
case GOARCH == "arm64":
p = uintptr(i)<<40 | uintptrMask&(0x0040<<32)
default:
p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
}
pSize = bitmapSize + spansSize + arenaSize + _PageSize
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 {
break
}
}
}
if p == 0 {
// On a 32-bit machine, we can't typically get away
// with a giant virtual address space reservation.
// Instead we map the memory information bitmap
// immediately after the data segment, large enough
// to handle another 2GB of mappings (256 MB),
// along with a reservation for an initial arena.
// When that gets used up, we'll start asking the kernel
// for any memory anywhere and hope it's in the 2GB
// following the bitmap (presumably the executable begins
// near the bottom of memory, so we'll have to use up
// most of memory before the kernel resorts to giving out
// memory before the beginning of the text segment).
//
// Alternatively we could reserve 512 MB bitmap, enough
// for 4GB of mappings, and then accept any memory the
// kernel threw at us, but normally that's a waste of 512 MB
// of address space, which is probably too much in a 32-bit world.
// If we fail to allocate, try again with a smaller arena.
// This is necessary on Android L where we share a process
// with ART, which reserves virtual memory aggressively.
arenaSizes := []uintptr{
512 << 20,
256 << 20,
128 << 20,
}
for _, arenaSize := range arenaSizes {
bitmapSize = _MaxArena32 / (sys.PtrSize * 8 / 4)
spansSize = _MaxArena32 / _PageSize * sys.PtrSize
if limit > 0 && arenaSize+bitmapSize+spansSize > limit {
bitmapSize = (limit / 9) &^ ((1 << _PageShift) - 1)
arenaSize = bitmapSize * 8
spansSize = arenaSize / _PageSize * sys.PtrSize
}
spansSize = round(spansSize, _PageSize)
// SysReserve treats the address we ask for, end, as a hint,
// not as an absolute requirement. If we ask for the end
// of the data segment but the operating system requires
// a little more space before we can start allocating, it will
// give out a slightly higher pointer. Except QEMU, which
// is buggy, as usual: it won't adjust the pointer upward.
// So adjust it upward a little bit ourselves: 1/4 MB to get
// away from the running binary image and then round up
// to a MB boundary.
p = round(firstmoduledata.end+(1<<18), 1<<20)
pSize = bitmapSize + spansSize + arenaSize + _PageSize
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 {
break
}
}
if p == 0 {
throw("runtime: cannot reserve arena virtual address space")
}
}
// PageSize can be larger than OS definition of page size,
// so SysReserve can give us a PageSize-unaligned pointer.
// To overcome this we ask for PageSize more and round up the pointer.
p1 := round(p, _PageSize)
mheap_.spans = (**mspan)(unsafe.Pointer(p1))
mheap_.bitmap = p1 + spansSize
mheap_.arena_start = p1 + (spansSize + bitmapSize)
mheap_.arena_used = mheap_.arena_start
mheap_.arena_end = p + pSize
mheap_.arena_reserved = reserved
if mheap_.arena_start&(_PageSize-1) != 0 {
println("bad pagesize", hex(p), hex(p1), hex(spansSize), hex(bitmapSize), hex(_PageSize), "start", hex(mheap_.arena_start))
throw("misrounded allocation in mallocinit")
}
// Initialize the rest of the allocator.
mheap_.init(spansSize)
_g_ := getg()
_g_.m.mcache = allocmcache()
}
// sysReserveHigh reserves space somewhere high in the address space.
// sysReserve doesn't actually reserve the full amount requested on
// 64-bit systems, because of problems with ulimit. Instead it checks
// that it can get the first 64 kB and assumes it can grab the rest as
// needed. This doesn't work well with the "let the kernel pick an address"
// mode, so don't do that. Pick a high address instead.
func sysReserveHigh(n uintptr, reserved *bool) unsafe.Pointer {
if sys.PtrSize == 4 {
return sysReserve(nil, n, reserved)
}
for i := 0; i <= 0x7f; i++ {
p := uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
*reserved = false
p = uintptr(sysReserve(unsafe.Pointer(p), n, reserved))
if p != 0 {
return unsafe.Pointer(p)
}
}
return sysReserve(nil, n, reserved)
}
func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
if n > h.arena_end-h.arena_used {
// We are in 32-bit mode, maybe we didn't use all possible address space yet.
// Reserve some more space.
p_size := round(n+_PageSize, 256<<20)
new_end := h.arena_end + p_size
if new_end <= h.arena_start+_MaxArena32 {
// TODO: It would be bad if part of the arena
// is reserved and part is not.
var reserved bool
p := uintptr(sysReserve(unsafe.Pointer(h.arena_end), p_size, &reserved))
if p == 0 {
return nil
}
if p == h.arena_end {
h.arena_end = new_end
h.arena_reserved = reserved
} else if h.arena_start <= p && p+p_size <= h.arena_start+_MaxArena32 {
// Keep everything page-aligned.
// Our pages are bigger than hardware pages.
h.arena_end = p + p_size
used := p + (-uintptr(p) & (_PageSize - 1))
h.mapBits(used)
h.mapSpans(used)
h.arena_used = used
h.arena_reserved = reserved
} else {
// We haven't added this allocation to
// the stats, so subtract it from a
// fake stat (but avoid underflow).
stat := uint64(p_size)
sysFree(unsafe.Pointer(p), p_size, &stat)
}
}
}
if n <= h.arena_end-h.arena_used {
// Keep taking from our reservation.
p := h.arena_used
sysMap(unsafe.Pointer(p), n, h.arena_reserved, &memstats.heap_sys)
h.mapBits(p + n)
h.mapSpans(p + n)
h.arena_used = p + n
if raceenabled {
racemapshadow(unsafe.Pointer(p), n)
}
if uintptr(p)&(_PageSize-1) != 0 {
throw("misrounded allocation in MHeap_SysAlloc")
}
return unsafe.Pointer(p)
}
// If using 64-bit, our reservation is all we have.
if h.arena_end-h.arena_start >= _MaxArena32 {
return nil
}
// On 32-bit, once the reservation is gone we can
// try to get memory at a location chosen by the OS
// and hope that it is in the range we allocated bitmap for.
p_size := round(n, _PageSize) + _PageSize
p := uintptr(sysAlloc(p_size, &memstats.heap_sys))
if p == 0 {
return nil
}
if p < h.arena_start || uintptr(p)+p_size-h.arena_start >= _MaxArena32 {
print("runtime: memory allocated by OS (", p, ") not in usable range [", hex(h.arena_start), ",", hex(h.arena_start+_MaxArena32), ")\n")
sysFree(unsafe.Pointer(p), p_size, &memstats.heap_sys)
return nil
}
p_end := p + p_size
p += -p & (_PageSize - 1)
if uintptr(p)+n > h.arena_used {
h.mapBits(p + n)
h.mapSpans(p + n)
h.arena_used = p + n
if p_end > h.arena_end {
h.arena_end = p_end
}
if raceenabled {
racemapshadow(unsafe.Pointer(p), n)
}
}
if uintptr(p)&(_PageSize-1) != 0 {
throw("misrounded allocation in MHeap_SysAlloc")
}
return unsafe.Pointer(p)
}
// base address for all 0-byte allocations
var zerobase uintptr
const (
// flags to malloc
_FlagNoScan = 1 << 0 // GC doesn't have to scan object
_FlagNoZero = 1 << 1 // don't zero memory
)
// Allocate an object of size bytes.
// Small objects are allocated from the per-P cache's free lists.
// Large objects (> 32 kB) are allocated straight from the heap.
func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
if gcphase == _GCmarktermination {
throw("mallocgc called with gcphase == _GCmarktermination")
}
if size == 0 {
return unsafe.Pointer(&zerobase)
}
if flags&flagNoScan == 0 && typ == nil {
throw("malloc missing type")
}
if debug.sbrk != 0 {
align := uintptr(16)
if typ != nil {
align = uintptr(typ.align)
}
return persistentalloc(size, align, &memstats.other_sys)
}
// assistG is the G to charge for this allocation, or nil if
// GC is not currently active.
var assistG *g
if gcBlackenEnabled != 0 {
// Charge the current user G for this allocation.
assistG = getg()
if assistG.m.curg != nil {
assistG = assistG.m.curg
}
// Charge the allocation against the G. We'll account
// for internal fragmentation at the end of mallocgc.
assistG.gcAssistBytes -= int64(size)
if assistG.gcAssistBytes < 0 {
// This G is in debt. Assist the GC to correct
// this before allocating. This must happen
// before disabling preemption.
gcAssistAlloc(assistG)
}
}
// Set mp.mallocing to keep from being preempted by GC.
mp := acquirem()
if mp.mallocing != 0 {
throw("malloc deadlock")
}
if mp.gsignal == getg() {
throw("malloc during signal")
}
mp.mallocing = 1
shouldhelpgc := false
dataSize := size
c := gomcache()
var s *mspan
var x unsafe.Pointer
if size <= maxSmallSize {
if flags&flagNoScan != 0 && size < maxTinySize {
// Tiny allocator.
//
// Tiny allocator combines several tiny allocation requests
// into a single memory block. The resulting memory block
// is freed when all subobjects are unreachable. The subobjects
// must be FlagNoScan (don't have pointers), this ensures that
// the amount of potentially wasted memory is bounded.
//
// Size of the memory block used for combining (maxTinySize) is tunable.
// Current setting is 16 bytes, which relates to 2x worst case memory
// wastage (when all but one subobjects are unreachable).
// 8 bytes would result in no wastage at all, but provides less
// opportunities for combining.
// 32 bytes provides more opportunities for combining,
// but can lead to 4x worst case wastage.
// The best case winning is 8x regardless of block size.
//
// Objects obtained from tiny allocator must not be freed explicitly.
// So when an object will be freed explicitly, we ensure that
// its size >= maxTinySize.
//
// SetFinalizer has a special case for objects potentially coming
// from tiny allocator, it such case it allows to set finalizers
// for an inner byte of a memory block.
//
// The main targets of tiny allocator are small strings and
// standalone escaping variables. On a json benchmark
// the allocator reduces number of allocations by ~12% and
// reduces heap size by ~20%.
off := c.tinyoffset
// Align tiny pointer for required (conservative) alignment.
if size&7 == 0 {
off = round(off, 8)
} else if size&3 == 0 {
off = round(off, 4)
} else if size&1 == 0 {
off = round(off, 2)
}
if off+size <= maxTinySize && c.tiny != 0 {
// The object fits into existing tiny block.
x = unsafe.Pointer(c.tiny + off)
c.tinyoffset = off + size
c.local_tinyallocs++
mp.mallocing = 0
releasem(mp)
return x
}
// Allocate a new maxTinySize block.
s = c.alloc[tinySizeClass]
v := s.freelist
if v.ptr() == nil {
[dev.cc] runtime: delete scalararg, ptrarg; rename onM to systemstack Scalararg and ptrarg are not "signal safe". Go code filling them out can be interrupted by a signal, and then the signal handler runs, and if it also ends up in Go code that uses scalararg or ptrarg, now the old values have been smashed. For the pieces of code that do need to run in a signal handler, we introduced onM_signalok, which is really just onM except that the _signalok is meant to convey that the caller asserts that scalarg and ptrarg will be restored to their old values after the call (instead of the usual behavior, zeroing them). Scalararg and ptrarg are also untyped and therefore error-prone. Go code can always pass a closure instead of using scalararg and ptrarg; they were only really necessary for C code. And there's no more C code. For all these reasons, delete scalararg and ptrarg, converting the few remaining references to use closures. Once those are gone, there is no need for a distinction between onM and onM_signalok, so replace both with a single function equivalent to the current onM_signalok (that is, it can be called on any of the curg, g0, and gsignal stacks). The name onM and the phrase 'm stack' are misnomers, because on most system an M has two system stacks: the main thread stack and the signal handling stack. Correct the misnomer by naming the replacement function systemstack. Fix a few references to "M stack" in code. The main motivation for this change is to eliminate scalararg/ptrarg. Rick and I have already seen them cause problems because the calling sequence m.ptrarg[0] = p is a heap pointer assignment, so it gets a write barrier. The write barrier also uses onM, so it has all the same problems as if it were being invoked by a signal handler. We worked around this by saving and restoring the old values and by calling onM_signalok, but there's no point in keeping this nice home for bugs around any longer. This CL also changes funcline to return the file name as a result instead of filling in a passed-in *string. (The *string signature is left over from when the code was written in and called from C.) That's arguably an unrelated change, except that once I had done the ptrarg/scalararg/onM cleanup I started getting false positives about the *string argument escaping (not allowed in package runtime). The compiler is wrong, but the easiest fix is to write the code like Go code instead of like C code. I am a bit worried that the compiler is wrong because of some use of uninitialized memory in the escape analysis. If that's the reason, it will go away when we convert the compiler to Go. (And if not, we'll debug it the next time.) LGTM=khr R=r, khr CC=austin, golang-codereviews, iant, rlh https://golang.org/cl/174950043
2014-11-12 12:54:31 -07:00
systemstack(func() {
c.refill(tinySizeClass)
})
shouldhelpgc = true
s = c.alloc[tinySizeClass]
v = s.freelist
}
s.freelist = v.ptr().next
s.ref++
// prefetchnta offers best performance, see change list message.
prefetchnta(uintptr(v.ptr().next))
x = unsafe.Pointer(v)
(*[2]uint64)(x)[0] = 0
(*[2]uint64)(x)[1] = 0
// See if we need to replace the existing tiny block with the new one
// based on amount of remaining free space.
if size < c.tinyoffset || c.tiny == 0 {
c.tiny = uintptr(x)
c.tinyoffset = size
}
size = maxTinySize
} else {
var sizeclass int8
if size <= 1024-8 {
sizeclass = size_to_class8[(size+7)>>3]
} else {
sizeclass = size_to_class128[(size-1024+127)>>7]
}
size = uintptr(class_to_size[sizeclass])
s = c.alloc[sizeclass]
v := s.freelist
if v.ptr() == nil {
[dev.cc] runtime: delete scalararg, ptrarg; rename onM to systemstack Scalararg and ptrarg are not "signal safe". Go code filling them out can be interrupted by a signal, and then the signal handler runs, and if it also ends up in Go code that uses scalararg or ptrarg, now the old values have been smashed. For the pieces of code that do need to run in a signal handler, we introduced onM_signalok, which is really just onM except that the _signalok is meant to convey that the caller asserts that scalarg and ptrarg will be restored to their old values after the call (instead of the usual behavior, zeroing them). Scalararg and ptrarg are also untyped and therefore error-prone. Go code can always pass a closure instead of using scalararg and ptrarg; they were only really necessary for C code. And there's no more C code. For all these reasons, delete scalararg and ptrarg, converting the few remaining references to use closures. Once those are gone, there is no need for a distinction between onM and onM_signalok, so replace both with a single function equivalent to the current onM_signalok (that is, it can be called on any of the curg, g0, and gsignal stacks). The name onM and the phrase 'm stack' are misnomers, because on most system an M has two system stacks: the main thread stack and the signal handling stack. Correct the misnomer by naming the replacement function systemstack. Fix a few references to "M stack" in code. The main motivation for this change is to eliminate scalararg/ptrarg. Rick and I have already seen them cause problems because the calling sequence m.ptrarg[0] = p is a heap pointer assignment, so it gets a write barrier. The write barrier also uses onM, so it has all the same problems as if it were being invoked by a signal handler. We worked around this by saving and restoring the old values and by calling onM_signalok, but there's no point in keeping this nice home for bugs around any longer. This CL also changes funcline to return the file name as a result instead of filling in a passed-in *string. (The *string signature is left over from when the code was written in and called from C.) That's arguably an unrelated change, except that once I had done the ptrarg/scalararg/onM cleanup I started getting false positives about the *string argument escaping (not allowed in package runtime). The compiler is wrong, but the easiest fix is to write the code like Go code instead of like C code. I am a bit worried that the compiler is wrong because of some use of uninitialized memory in the escape analysis. If that's the reason, it will go away when we convert the compiler to Go. (And if not, we'll debug it the next time.) LGTM=khr R=r, khr CC=austin, golang-codereviews, iant, rlh https://golang.org/cl/174950043
2014-11-12 12:54:31 -07:00
systemstack(func() {
c.refill(int32(sizeclass))
})
shouldhelpgc = true
s = c.alloc[sizeclass]
v = s.freelist
}
s.freelist = v.ptr().next
s.ref++
// prefetchnta offers best performance, see change list message.
prefetchnta(uintptr(v.ptr().next))
x = unsafe.Pointer(v)
if flags&flagNoZero == 0 {
v.ptr().next = 0
if size > 2*sys.PtrSize && ((*[2]uintptr)(x))[1] != 0 {
memclr(unsafe.Pointer(v), size)
}
}
}
c.local_cachealloc += size
} else {
var s *mspan
shouldhelpgc = true
[dev.cc] runtime: delete scalararg, ptrarg; rename onM to systemstack Scalararg and ptrarg are not "signal safe". Go code filling them out can be interrupted by a signal, and then the signal handler runs, and if it also ends up in Go code that uses scalararg or ptrarg, now the old values have been smashed. For the pieces of code that do need to run in a signal handler, we introduced onM_signalok, which is really just onM except that the _signalok is meant to convey that the caller asserts that scalarg and ptrarg will be restored to their old values after the call (instead of the usual behavior, zeroing them). Scalararg and ptrarg are also untyped and therefore error-prone. Go code can always pass a closure instead of using scalararg and ptrarg; they were only really necessary for C code. And there's no more C code. For all these reasons, delete scalararg and ptrarg, converting the few remaining references to use closures. Once those are gone, there is no need for a distinction between onM and onM_signalok, so replace both with a single function equivalent to the current onM_signalok (that is, it can be called on any of the curg, g0, and gsignal stacks). The name onM and the phrase 'm stack' are misnomers, because on most system an M has two system stacks: the main thread stack and the signal handling stack. Correct the misnomer by naming the replacement function systemstack. Fix a few references to "M stack" in code. The main motivation for this change is to eliminate scalararg/ptrarg. Rick and I have already seen them cause problems because the calling sequence m.ptrarg[0] = p is a heap pointer assignment, so it gets a write barrier. The write barrier also uses onM, so it has all the same problems as if it were being invoked by a signal handler. We worked around this by saving and restoring the old values and by calling onM_signalok, but there's no point in keeping this nice home for bugs around any longer. This CL also changes funcline to return the file name as a result instead of filling in a passed-in *string. (The *string signature is left over from when the code was written in and called from C.) That's arguably an unrelated change, except that once I had done the ptrarg/scalararg/onM cleanup I started getting false positives about the *string argument escaping (not allowed in package runtime). The compiler is wrong, but the easiest fix is to write the code like Go code instead of like C code. I am a bit worried that the compiler is wrong because of some use of uninitialized memory in the escape analysis. If that's the reason, it will go away when we convert the compiler to Go. (And if not, we'll debug it the next time.) LGTM=khr R=r, khr CC=austin, golang-codereviews, iant, rlh https://golang.org/cl/174950043
2014-11-12 12:54:31 -07:00
systemstack(func() {
s = largeAlloc(size, uint32(flags))
})
x = unsafe.Pointer(uintptr(s.start << pageShift))
size = uintptr(s.elemsize)
}
if flags&flagNoScan != 0 {
// All objects are pre-marked as noscan. Nothing to do.
} else {
// If allocating a defer+arg block, now that we've picked a malloc size
// large enough to hold everything, cut the "asked for" size down to
// just the defer header, so that the GC bitmap will record the arg block
// as containing nothing at all (as if it were unused space at the end of
// a malloc block caused by size rounding).
// The defer arg areas are scanned as part of scanstack.
if typ == deferType {
dataSize = unsafe.Sizeof(_defer{})
}
heapBitsSetType(uintptr(x), size, dataSize, typ)
if dataSize > typ.size {
// Array allocation. If there are any
// pointers, GC has to scan to the last
// element.
if typ.ptrdata != 0 {
c.local_scan += dataSize - typ.size + typ.ptrdata
}
} else {
c.local_scan += typ.ptrdata
}
// Ensure that the stores above that initialize x to
// type-safe memory and set the heap bits occur before
// the caller can make x observable to the garbage
// collector. Otherwise, on weakly ordered machines,
// the garbage collector could follow a pointer to x,
// but see uninitialized memory or stale heap bits.
publicationBarrier()
}
// GCmarkterminate allocates black
// All slots hold nil so no scanning is needed.
// This may be racing with GC so do it atomically if there can be
// a race marking the bit.
runtime: reduce latency by aggressively ending mark phase Some latency regressions have crept into our system over the past few weeks. This CL fixes those by having the mark phase more aggressively blacken objects so that the mark termination phase, a STW phase, has less work to do. Three approaches were taken when the mark phase believes it has no more work to do, ie all the work buffers are empty. If things have gone well the mark phase is correct and there is in fact little or no work. In that case the following items will take very little time. If the mark phase is wrong this CL will ferret that work out and give the mark phase a chance to deal with it concurrently before mark termination begins. When the mark phase first appears to be out of work, it does three things: 1) It switches from allocating white to allocating black to reduce the number of unmarked objects reachable only from stacks. 2) It flushes and disables per-P GC work caches so all work must be in globally visible work buffers. 3) It rescans the global roots---the BSS and data segments---so there are fewer objects to blacken during mark termination. We do not rescan stacks at this point, though that could be done in a later CL. After these steps, it again drains the global work buffers. On a lightly loaded machine the garbage benchmark has reduced the number of GC cycles with latency > 10 ms from 83 out of 4083 cycles down to 2 out of 3995 cycles. Maximum latency was reduced from 60+ msecs down to 20 ms. Change-Id: I152285b48a7e56c5083a02e8e4485dd39c990492 Reviewed-on: https://go-review.googlesource.com/10590 Reviewed-by: Austin Clements <austin@google.com>
2015-06-01 16:16:03 -06:00
if gcphase == _GCmarktermination || gcBlackenPromptly {
systemstack(func() {
gcmarknewobject_m(uintptr(x), size)
})
}
if raceenabled {
racemalloc(x, size)
}
if msanenabled {
msanmalloc(x, size)
}
mp.mallocing = 0
releasem(mp)
if debug.allocfreetrace != 0 {
tracealloc(x, size, typ)
}
if rate := MemProfileRate; rate > 0 {
if size < uintptr(rate) && int32(size) < c.next_sample {
c.next_sample -= int32(size)
} else {
mp := acquirem()
profilealloc(mp, x, size)
releasem(mp)
}
}
if assistG != nil {
// Account for internal fragmentation in the assist
// debt now that we know it.
assistG.gcAssistBytes -= int64(size - dataSize)
}
if shouldhelpgc && gcShouldStart(false) {
gcStart(gcBackgroundMode, false)
}
return x
}
func largeAlloc(size uintptr, flag uint32) *mspan {
// print("largeAlloc size=", size, "\n")
if size+_PageSize < size {
throw("out of memory")
}
npages := size >> _PageShift
if size&_PageMask != 0 {
npages++
}
runtime: make sweep proportional to spans bytes allocated Proportional concurrent sweep is currently based on a ratio of spans to be swept per bytes of object allocation. However, proportional sweeping is performed during span allocation, not object allocation, in order to minimize contention and overhead. Since objects are allocated from spans after those spans are allocated, the system tends to operate in debt, which means when the next GC cycle starts, there is often sweep debt remaining, so GC has to finish the sweep, which delays the start of the cycle and delays enabling mutator assists. For example, it's quite likely that many Ps will simultaneously refill their span caches immediately after a GC cycle (because GC flushes the span caches), but at this point, there has been very little object allocation since the end of GC, so very little sweeping is done. The Ps then allocate objects from these cached spans, which drives up the bytes of object allocation, but since these allocations are coming from cached spans, nothing considers whether more sweeping has to happen. If the sweep ratio is high enough (which can happen if the next GC trigger is very close to the retained heap size), this can easily represent a sweep debt of thousands of pages. Fix this by making proportional sweep proportional to the number of bytes of spans allocated, rather than the number of bytes of objects allocated. Prior to allocating a span, both the small object path and the large object path ensure credit for allocating that span, so the system operates in the black, rather than in the red. Combined with the previous commit, this should eliminate all sweeping from GC start up. On the stress test in issue #11911, this reduces the time spent sweeping during GC (and delaying start up) by several orders of magnitude: mean 99%ile max pre fix 1 ms 11 ms 144 ms post fix 270 ns 735 ns 916 ns Updates #11911. Change-Id: I89223712883954c9d6ec2a7a51ecb97172097df3 Reviewed-on: https://go-review.googlesource.com/13044 Reviewed-by: Rick Hudson <rlh@golang.org> Reviewed-by: Russ Cox <rsc@golang.org>
2015-08-03 07:46:50 -06:00
// Deduct credit for this span allocation and sweep if
// necessary. mHeap_Alloc will also sweep npages, so this only
// pays the debt down to npage pages.
deductSweepCredit(npages*_PageSize, npages)
s := mheap_.alloc(npages, 0, true, flag&_FlagNoZero == 0)
if s == nil {
throw("out of memory")
}
s.limit = uintptr(s.start)<<_PageShift + size
heapBitsForSpan(s.base()).initSpan(s.layout())
return s
}
// implementation of new builtin
func newobject(typ *_type) unsafe.Pointer {
flags := uint32(0)
if typ.kind&kindNoPointers != 0 {
flags |= flagNoScan
}
return mallocgc(uintptr(typ.size), typ, flags)
}
//go:linkname reflect_unsafe_New reflect.unsafe_New
func reflect_unsafe_New(typ *_type) unsafe.Pointer {
return newobject(typ)
}
// implementation of make builtin for slices
func newarray(typ *_type, n uintptr) unsafe.Pointer {
flags := uint32(0)
if typ.kind&kindNoPointers != 0 {
flags |= flagNoScan
}
if int(n) < 0 || (typ.size > 0 && n > _MaxMem/uintptr(typ.size)) {
panic("runtime: allocation size out of range")
}
return mallocgc(uintptr(typ.size)*n, typ, flags)
}
//go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray
func reflect_unsafe_NewArray(typ *_type, n uintptr) unsafe.Pointer {
return newarray(typ, n)
}
// rawmem returns a chunk of pointerless memory. It is
// not zeroed.
func rawmem(size uintptr) unsafe.Pointer {
return mallocgc(size, nil, flagNoScan|flagNoZero)
}
func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
mp.mcache.next_sample = nextSample()
mProf_Malloc(x, size)
}
// nextSample returns the next sampling point for heap profiling.
// It produces a random variable with a geometric distribution and
// mean MemProfileRate. This is done by generating a uniformly
// distributed random number and applying the cumulative distribution
// function for an exponential.
func nextSample() int32 {
if GOOS == "plan9" {
// Plan 9 doesn't support floating point in note handler.
if g := getg(); g == g.m.gsignal {
return nextSampleNoFP()
}
}
period := MemProfileRate
// make nextSample not overflow. Maximum possible step is
// -ln(1/(1<<kRandomBitCount)) * period, approximately 20 * period.
switch {
case period > 0x7000000:
period = 0x7000000
case period == 0:
return 0
}
// Let m be the sample rate,
// the probability distribution function is m*exp(-mx), so the CDF is
// p = 1 - exp(-mx), so
// q = 1 - p == exp(-mx)
// log_e(q) = -mx
// -log_e(q)/m = x
// x = -log_e(q) * period
// x = log_2(q) * (-log_e(2)) * period ; Using log_2 for efficiency
const randomBitCount = 26
q := uint32(fastrand1())%(1<<randomBitCount) + 1
qlog := fastlog2(float64(q)) - randomBitCount
if qlog > 0 {
qlog = 0
}
const minusLog2 = -0.6931471805599453 // -ln(2)
return int32(qlog*(minusLog2*float64(period))) + 1
}
// nextSampleNoFP is similar to nextSample, but uses older,
// simpler code to avoid floating point.
func nextSampleNoFP() int32 {
// Set first allocation sample size.
rate := MemProfileRate
if rate > 0x3fffffff { // make 2*rate not overflow
rate = 0x3fffffff
}
if rate != 0 {
return int32(int(fastrand1()) % (2 * rate))
}
return 0
}
type persistentAlloc struct {
base unsafe.Pointer
off uintptr
}
var globalAlloc struct {
mutex
persistentAlloc
}
// Wrapper around sysAlloc that can allocate small chunks.
// There is no associated free operation.
// Intended for things like function/type/debug-related persistent data.
// If align is 0, uses default align (currently 8).
func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
var p unsafe.Pointer
systemstack(func() {
p = persistentalloc1(size, align, sysStat)
})
return p
}
// Must run on system stack because stack growth can (re)invoke it.
// See issue 9174.
//go:systemstack
func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer {
const (
chunk = 256 << 10
maxBlock = 64 << 10 // VM reservation granularity is 64K on windows
)
if size == 0 {
throw("persistentalloc: size == 0")
}
if align != 0 {
if align&(align-1) != 0 {
throw("persistentalloc: align is not a power of 2")
}
if align > _PageSize {
throw("persistentalloc: align is too large")
}
} else {
align = 8
}
if size >= maxBlock {
return sysAlloc(size, sysStat)
}
mp := acquirem()
var persistent *persistentAlloc
if mp != nil && mp.p != 0 {
persistent = &mp.p.ptr().palloc
} else {
lock(&globalAlloc.mutex)
persistent = &globalAlloc.persistentAlloc
}
persistent.off = round(persistent.off, align)
if persistent.off+size > chunk || persistent.base == nil {
persistent.base = sysAlloc(chunk, &memstats.other_sys)
if persistent.base == nil {
if persistent == &globalAlloc.persistentAlloc {
unlock(&globalAlloc.mutex)
}
throw("runtime: cannot allocate memory")
}
persistent.off = 0
}
p := add(persistent.base, persistent.off)
persistent.off += size
releasem(mp)
if persistent == &globalAlloc.persistentAlloc {
unlock(&globalAlloc.mutex)
}
if sysStat != &memstats.other_sys {
mSysStatInc(sysStat, size)
mSysStatDec(&memstats.other_sys, size)
}
return p
}