mirror of
https://github.com/golang/go
synced 2024-09-25 15:10:11 -06:00
runtime: convert markallocated from C to Go
benchmark old ns/op new ns/op delta BenchmarkMalloc8 28.7 22.4 -21.95% BenchmarkMalloc16 44.8 33.8 -24.55% BenchmarkMallocTypeInfo8 49.0 32.9 -32.86% BenchmarkMallocTypeInfo16 46.7 35.8 -23.34% BenchmarkMallocLargeStruct 907 901 -0.66% BenchmarkGobDecode 13235542 12036851 -9.06% BenchmarkGobEncode 10639699 9539155 -10.34% BenchmarkJSONEncode 25193036 21898922 -13.08% BenchmarkJSONDecode 96104044 89464904 -6.91% Fixes #8452. LGTM=khr R=golang-codereviews, bradfitz, rsc, dave, khr CC=golang-codereviews https://golang.org/cl/122090043
This commit is contained in:
parent
c5b2c370c6
commit
aac7f1a0d6
@ -9,6 +9,8 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
debugMalloc = false
|
||||
|
||||
flagNoScan = 1 << 0 // GC doesn't have to scan object
|
||||
flagNoProfiling = 1 << 1 // must not profile
|
||||
flagNoZero = 1 << 2 // don't zero memory
|
||||
@ -29,6 +31,22 @@ const (
|
||||
pageShift = 13
|
||||
pageSize = 1 << pageShift
|
||||
pageMask = pageSize - 1
|
||||
|
||||
wordsPerBitmapWord = ptrSize * 8 / 4
|
||||
gcBits = 4
|
||||
bitsPerPointer = 2
|
||||
bitsMask = 1<<bitsPerPointer - 1
|
||||
pointersPerByte = 8 / bitsPerPointer
|
||||
bitPtrMask = bitsMask << 2
|
||||
maxGCMask = 0 // disabled because wastes several bytes of memory
|
||||
bitsDead = 0
|
||||
bitsPointer = 2
|
||||
|
||||
bitMiddle = 0
|
||||
bitBoundary = 1
|
||||
bitAllocated = 2
|
||||
bitMarked = 3
|
||||
bitMask = bitMiddle | bitBoundary | bitAllocated | bitMarked
|
||||
)
|
||||
|
||||
// All zero-sized allocations return a pointer to this byte.
|
||||
@ -168,14 +186,112 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
|
||||
size = uintptr(s.elemsize)
|
||||
}
|
||||
|
||||
// TODO: write markallocated in Go
|
||||
mp.ptrarg[0] = x
|
||||
mp.scalararg[0] = uint(size)
|
||||
mp.scalararg[1] = uint(size0)
|
||||
mp.ptrarg[1] = unsafe.Pointer(typ)
|
||||
mp.scalararg[2] = uint(flags & flagNoScan)
|
||||
onM(&markallocated_m)
|
||||
// From here till marked label marking the object as allocated
|
||||
// and storing type info in the GC bitmap.
|
||||
arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
|
||||
off := (uintptr(x) - arena_start) / ptrSize
|
||||
xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
|
||||
shift := (off % wordsPerBitmapWord) * gcBits
|
||||
if debugMalloc && (((*xbits)>>shift)&bitMask) != bitBoundary {
|
||||
gothrow("bad bits in markallocated")
|
||||
}
|
||||
|
||||
var ti, te uintptr
|
||||
var ptrmask *uint8
|
||||
if flags&flagNoScan != 0 {
|
||||
// bitsDead in the first quadruple means don't scan.
|
||||
if size == ptrSize {
|
||||
*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated + (bitsDead << 2)) << shift)
|
||||
} else {
|
||||
xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
|
||||
*xbitsb = bitAllocated + (bitsDead << 2)
|
||||
}
|
||||
goto marked
|
||||
}
|
||||
if size == ptrSize {
|
||||
// It's one word and it has pointers, it must be a pointer.
|
||||
*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated | (bitsPointer << 2)) << shift)
|
||||
goto marked
|
||||
}
|
||||
if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize {
|
||||
if typ.kind&kindGCProg != 0 {
|
||||
nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
|
||||
masksize := nptr
|
||||
if masksize%2 != 0 {
|
||||
masksize *= 2 // repeated
|
||||
}
|
||||
masksize = masksize * pointersPerByte / 8 // 4 bits per word
|
||||
masksize++ // unroll flag in the beginning
|
||||
if masksize > maxGCMask && typ.gc[1] != 0 {
|
||||
// If the mask is too large, unroll the program directly
|
||||
// into the GC bitmap. It's 7 times slower than copying
|
||||
// from the pre-unrolled mask, but saves 1/16 of type size
|
||||
// memory for the mask.
|
||||
mp.ptrarg[0] = x
|
||||
mp.ptrarg[1] = unsafe.Pointer(typ)
|
||||
mp.scalararg[0] = uint(size)
|
||||
mp.scalararg[1] = uint(size0)
|
||||
onM(&unrollgcproginplace_m)
|
||||
goto marked
|
||||
}
|
||||
ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
|
||||
// Check whether the program is already unrolled.
|
||||
if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
|
||||
mp.ptrarg[0] = unsafe.Pointer(typ)
|
||||
onM(&unrollgcprog_m)
|
||||
}
|
||||
ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
|
||||
} else {
|
||||
ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
|
||||
}
|
||||
if size == 2*ptrSize {
|
||||
xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
|
||||
*xbitsb = *ptrmask | bitAllocated
|
||||
goto marked
|
||||
}
|
||||
te = uintptr(typ.size) / ptrSize
|
||||
// If the type occupies odd number of words, its mask is repeated.
|
||||
if te%2 == 0 {
|
||||
te /= 2
|
||||
}
|
||||
}
|
||||
if size == 2*ptrSize {
|
||||
xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
|
||||
*xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitAllocated
|
||||
goto marked
|
||||
}
|
||||
// Copy pointer bitmask into the bitmap.
|
||||
for i := uintptr(0); i < size0; i += 2 * ptrSize {
|
||||
v := uint8((bitsPointer << 2) | (bitsPointer << 6))
|
||||
if ptrmask != nil {
|
||||
v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
|
||||
ti++
|
||||
if ti == te {
|
||||
ti = 0
|
||||
}
|
||||
}
|
||||
if i == 0 {
|
||||
v |= bitAllocated
|
||||
}
|
||||
if i+ptrSize == size0 {
|
||||
v &= ^uint8(bitPtrMask << 4)
|
||||
}
|
||||
|
||||
off := (uintptr(x) + i - arena_start) / ptrSize
|
||||
xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
|
||||
shift := (off % wordsPerBitmapWord) * gcBits
|
||||
xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
|
||||
*xbitsb = v
|
||||
}
|
||||
if size0%(2*ptrSize) == 0 && size0 < size {
|
||||
// Mark the word after last object's word as bitsDead.
|
||||
off := (uintptr(x) + size0 - arena_start) / ptrSize
|
||||
xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
|
||||
shift := (off % wordsPerBitmapWord) * gcBits
|
||||
xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
|
||||
*xbitsb = bitsDead << 2
|
||||
}
|
||||
marked:
|
||||
mp.mallocing = 0
|
||||
|
||||
if raceenabled {
|
||||
|
@ -517,7 +517,6 @@ void* runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat);
|
||||
int32 runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
|
||||
void runtime·gc(int32 force);
|
||||
uintptr runtime·sweepone(void);
|
||||
void runtime·markallocated(void *v, uintptr size, uintptr size0, Type* typ, bool scan);
|
||||
void runtime·markspan(void *v, uintptr size, uintptr n, bool leftover);
|
||||
void runtime·unmarkspan(void *v, uintptr size);
|
||||
void runtime·purgecachedstats(MCache*);
|
||||
|
@ -1741,7 +1741,7 @@ runtime·wakefing(void)
|
||||
return res;
|
||||
}
|
||||
|
||||
// Recursively GC program in prog.
|
||||
// Recursively unrolls GC program in prog.
|
||||
// mask is where to store the result.
|
||||
// ppos is a pointer to position in mask, in bits.
|
||||
// sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
|
||||
@ -1833,11 +1833,20 @@ unrollglobgcprog(byte *prog, uintptr size)
|
||||
return mask;
|
||||
}
|
||||
|
||||
static void
|
||||
unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
|
||||
void
|
||||
runtime·unrollgcproginplace_m(void)
|
||||
{
|
||||
uintptr *b, off, shift, pos;
|
||||
uintptr size, size0, *b, off, shift, pos;
|
||||
byte *arena_start, *prog;
|
||||
Type *typ;
|
||||
void *v;
|
||||
|
||||
v = g->m->ptrarg[0];
|
||||
typ = g->m->ptrarg[1];
|
||||
size = g->m->scalararg[0];
|
||||
size0 = g->m->scalararg[1];
|
||||
g->m->ptrarg[0] = nil;
|
||||
g->m->ptrarg[1] = nil;
|
||||
|
||||
pos = 0;
|
||||
prog = (byte*)typ->gc[1];
|
||||
@ -1859,14 +1868,18 @@ unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
|
||||
}
|
||||
|
||||
// Unrolls GC program in typ->gc[1] into typ->gc[0]
|
||||
static void
|
||||
unrollgcprog(Type *typ)
|
||||
void
|
||||
runtime·unrollgcprog_m(void)
|
||||
{
|
||||
static Lock lock;
|
||||
Type *typ;
|
||||
byte *mask, *prog;
|
||||
uintptr pos;
|
||||
uint32 x;
|
||||
|
||||
typ = g->m->ptrarg[0];
|
||||
g->m->ptrarg[0] = nil;
|
||||
|
||||
runtime·lock(&lock);
|
||||
mask = (byte*)typ->gc[0];
|
||||
if(mask[0] == 0) {
|
||||
@ -1887,110 +1900,6 @@ unrollgcprog(Type *typ)
|
||||
runtime·unlock(&lock);
|
||||
}
|
||||
|
||||
void
|
||||
runtime·markallocated(void *v, uintptr size, uintptr size0, Type *typ, bool scan)
|
||||
{
|
||||
uintptr *b, off, shift, i, ti, te, nptr, masksize;
|
||||
byte *arena_start, x;
|
||||
bool *ptrmask;
|
||||
|
||||
arena_start = runtime·mheap.arena_start;
|
||||
off = (uintptr*)v - (uintptr*)arena_start;
|
||||
b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
|
||||
shift = (off % wordsPerBitmapWord) * gcBits;
|
||||
if(Debug && (((*b)>>shift)&bitMask) != bitBoundary) {
|
||||
runtime·printf("runtime: bad bits in markallocated (%p) b=%p[%p]\n", v, b, *b);
|
||||
runtime·throw("bad bits in markallocated");
|
||||
}
|
||||
|
||||
if(!scan) {
|
||||
// BitsDead in the first quadruple means don't scan.
|
||||
if(size == PtrSize)
|
||||
*b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated+(BitsDead<<2))<<shift);
|
||||
else
|
||||
((byte*)b)[shift/8] = bitAllocated+(BitsDead<<2);
|
||||
return;
|
||||
}
|
||||
if(size == PtrSize) {
|
||||
// It's one word and it has pointers, it must be a pointer.
|
||||
*b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated | (BitsPointer<<2))<<shift);
|
||||
return;
|
||||
}
|
||||
ti = te = 0;
|
||||
ptrmask = nil;
|
||||
if(typ != nil && (typ->gc[0]|typ->gc[1]) != 0 && typ->size > PtrSize) {
|
||||
if(typ->kind&KindGCProg) {
|
||||
nptr = ROUND(typ->size, PtrSize)/PtrSize;
|
||||
masksize = nptr;
|
||||
if(masksize%2)
|
||||
masksize *= 2; // repeated twice
|
||||
masksize = masksize*PointersPerByte/8; // 4 bits per word
|
||||
masksize++; // unroll flag in the beginning
|
||||
if(masksize > MaxGCMask && typ->gc[1] != 0) {
|
||||
// If the mask is too large, unroll the program directly
|
||||
// into the GC bitmap. It's 7 times slower than copying
|
||||
// from the pre-unrolled mask, but saves 1/16 of type size
|
||||
// memory for the mask.
|
||||
unrollgcproginplace(v, size, size0, typ);
|
||||
return;
|
||||
}
|
||||
ptrmask = (byte*)typ->gc[0];
|
||||
// check whether the program is already unrolled
|
||||
if((runtime·atomicload((uint32*)ptrmask)&0xff) == 0)
|
||||
unrollgcprog(typ);
|
||||
ptrmask++; // skip the unroll flag byte
|
||||
} else
|
||||
ptrmask = (byte*)&typ->gc[0]; // embed mask
|
||||
if(size == 2*PtrSize) {
|
||||
((byte*)b)[shift/8] = ptrmask[0] | bitAllocated;
|
||||
return;
|
||||
}
|
||||
te = typ->size/PtrSize;
|
||||
// if the type occupies odd number of words, its mask is repeated twice
|
||||
if((te%2) == 0)
|
||||
te /= 2;
|
||||
}
|
||||
if(size == 2*PtrSize) {
|
||||
((byte*)b)[shift/8] = (BitsPointer<<2) | (BitsPointer<<6) | bitAllocated;
|
||||
return;
|
||||
}
|
||||
// Copy pointer bitmask into the bitmap.
|
||||
for(i=0; i<size0; i+=2*PtrSize) {
|
||||
x = (BitsPointer<<2) | (BitsPointer<<6);
|
||||
if(ptrmask != nil) {
|
||||
x = ptrmask[ti++];
|
||||
if(ti == te)
|
||||
ti = 0;
|
||||
}
|
||||
off = (uintptr*)((byte*)v + i) - (uintptr*)arena_start;
|
||||
b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
|
||||
shift = (off % wordsPerBitmapWord) * gcBits;
|
||||
if(i == 0)
|
||||
x |= bitAllocated;
|
||||
if(i+PtrSize == size0)
|
||||
x &= ~(bitPtrMask<<4);
|
||||
((byte*)b)[shift/8] = x;
|
||||
}
|
||||
if(size0 == i && size0 < size) {
|
||||
// mark the word after last object's word as BitsDead
|
||||
off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
|
||||
b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
|
||||
shift = (off % wordsPerBitmapWord) * gcBits;
|
||||
((byte*)b)[shift/8] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
runtime·markallocated_m(void)
|
||||
{
|
||||
M *mp;
|
||||
|
||||
mp = g->m;
|
||||
runtime·markallocated(mp->ptrarg[0], mp->scalararg[0], mp->scalararg[1], mp->ptrarg[1], mp->scalararg[2] == 0);
|
||||
mp->ptrarg[0] = nil;
|
||||
mp->ptrarg[1] = nil;
|
||||
}
|
||||
|
||||
// mark the span of memory at v as having n blocks of the given size.
|
||||
// if leftover is true, there is left over space at the end of the span.
|
||||
void
|
||||
|
@ -65,7 +65,9 @@ var (
|
||||
mprofMalloc_m,
|
||||
gc_m,
|
||||
setFinalizer_m,
|
||||
markallocated_m mFunction
|
||||
markallocated_m,
|
||||
unrollgcprog_m,
|
||||
unrollgcproginplace_m mFunction
|
||||
)
|
||||
|
||||
// memclr clears n bytes starting at ptr.
|
||||
@ -87,7 +89,12 @@ const (
|
||||
concurrentSweep = true
|
||||
)
|
||||
|
||||
// in asm_*.s
|
||||
// Atomic operations to read/write a pointer.
|
||||
// in stubs.goc
|
||||
func goatomicloadp(p unsafe.Pointer) unsafe.Pointer // return *p
|
||||
func goatomicstorep(p unsafe.Pointer, v unsafe.Pointer) // *p = v
|
||||
|
||||
// in stubs.goc
|
||||
// if *p == x { *p = y; return true } else { return false }, atomically
|
||||
//go:noescape
|
||||
func gocas(p *uint32, x uint32, y uint32) bool
|
||||
|
@ -48,6 +48,16 @@ func gonanotime() (r int64) {
|
||||
r = runtime·nanotime();
|
||||
}
|
||||
|
||||
#pragma textflag NOSPLIT
|
||||
func goatomicloadp(p **byte) (v *byte) {
|
||||
v = runtime·atomicloadp(p);
|
||||
}
|
||||
|
||||
#pragma textflag NOSPLIT
|
||||
func goatomicstorep(p **byte, v *byte) {
|
||||
runtime·atomicstorep(p, v);
|
||||
}
|
||||
|
||||
#pragma textflag NOSPLIT
|
||||
func runtime·gocas(p *uint32, x uint32, y uint32) (ret bool) {
|
||||
ret = runtime·cas(p, x, y);
|
||||
|
Loading…
Reference in New Issue
Block a user