mirror of
https://github.com/golang/go
synced 2024-11-20 01:54:41 -07:00
899a4ad47e
Optimized heapBitsForObject by special casing objects whose size is a power of two. When a span holding such objects is initialized I added a mask that when &ed with an interior pointer results in the base of the pointer. For the garbage benchmark this resulted in CPU_CLK_UNHALTED in heapBitsForObject going from 7.7% down to 5.9% of the total, INST_RETIRED went from 12.2 -> 8.7. Here are the benchmarks that were at least plus or minus 1%. benchmark old ns/op new ns/op delta BenchmarkFmtFprintfString 249 221 -11.24% BenchmarkFmtFprintfInt 247 223 -9.72% BenchmarkFmtFprintfEmpty 76.5 69.6 -9.02% BenchmarkBinaryTree17 4106631412 3744550160 -8.82% BenchmarkFmtFprintfFloat 424 399 -5.90% BenchmarkGoParse 4484421 4242115 -5.40% BenchmarkGobEncode 8803668 8449107 -4.03% BenchmarkFmtManyArgs 1494 1436 -3.88% BenchmarkGobDecode 10431051 10032606 -3.82% BenchmarkFannkuch11 2591306713 2517400464 -2.85% BenchmarkTimeParse 361 371 +2.77% BenchmarkJSONDecode 70620492 68830357 -2.53% BenchmarkRegexpMatchMedium_1K 54693 53343 -2.47% BenchmarkTemplate 90008879 91929940 +2.13% BenchmarkTimeFormat 380 387 +1.84% BenchmarkRegexpMatchEasy1_32 111 113 +1.80% BenchmarkJSONEncode 21359159 21007583 -1.65% BenchmarkRegexpMatchEasy1_1K 603 613 +1.66% BenchmarkRegexpMatchEasy0_32 127 129 +1.57% BenchmarkFmtFprintfIntInt 399 393 -1.50% BenchmarkRegexpMatchEasy0_1K 373 378 +1.34% Change-Id: I78e297161026f8b5cc7507c965fd3e486f81ed29 Reviewed-on: https://go-review.googlesource.com/8980 Reviewed-by: Austin Clements <austin@google.com>
255 lines
7.9 KiB
Go
255 lines
7.9 KiB
Go
// Copyright 2009 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Malloc small size classes.
|
|
//
|
|
// See malloc.go for overview.
|
|
//
|
|
// The size classes are chosen so that rounding an allocation
|
|
// request up to the next size class wastes at most 12.5% (1.125x).
|
|
//
|
|
// Each size class has its own page count that gets allocated
|
|
// and chopped up when new objects of the size class are needed.
|
|
// That page count is chosen so that chopping up the run of
|
|
// pages into objects of the given size wastes at most 12.5% (1.125x)
|
|
// of the memory. It is not necessary that the cutoff here be
|
|
// the same as above.
|
|
//
|
|
// The two sources of waste multiply, so the worst possible case
|
|
// for the above constraints would be that allocations of some
|
|
// size might have a 26.6% (1.266x) overhead.
|
|
// In practice, only one of the wastes comes into play for a
|
|
// given size (sizes < 512 waste mainly on the round-up,
|
|
// sizes > 512 waste mainly on the page chopping).
|
|
//
|
|
// TODO(rsc): Compute max waste for any given size.
|
|
|
|
package runtime
|
|
|
|
// Size classes. Computed and initialized by InitSizes.
|
|
//
|
|
// SizeToClass(0 <= n <= MaxSmallSize) returns the size class,
|
|
// 1 <= sizeclass < NumSizeClasses, for n.
|
|
// Size class 0 is reserved to mean "not small".
|
|
//
|
|
// class_to_size[i] = largest size in class i
|
|
// class_to_allocnpages[i] = number of pages to allocate when
|
|
// making new objects in class i
|
|
|
|
// The SizeToClass lookup is implemented using two arrays,
|
|
// one mapping sizes <= 1024 to their class and one mapping
|
|
// sizes >= 1024 and <= MaxSmallSize to their class.
|
|
// All objects are 8-aligned, so the first array is indexed by
|
|
// the size divided by 8 (rounded up). Objects >= 1024 bytes
|
|
// are 128-aligned, so the second array is indexed by the
|
|
// size divided by 128 (rounded up). The arrays are filled in
|
|
// by InitSizes.
|
|
|
|
var class_to_size [_NumSizeClasses]int32
|
|
var class_to_allocnpages [_NumSizeClasses]int32
|
|
var class_to_divmagic [_NumSizeClasses]divMagic
|
|
|
|
var size_to_class8 [1024/8 + 1]int8
|
|
var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
|
|
|
|
func sizeToClass(size int32) int32 {
|
|
if size > _MaxSmallSize {
|
|
throw("SizeToClass - invalid size")
|
|
}
|
|
if size > 1024-8 {
|
|
return int32(size_to_class128[(size-1024+127)>>7])
|
|
}
|
|
return int32(size_to_class8[(size+7)>>3])
|
|
}
|
|
|
|
func initSizes() {
|
|
// Initialize the runtime·class_to_size table (and choose class sizes in the process).
|
|
class_to_size[0] = 0
|
|
sizeclass := 1 // 0 means no class
|
|
align := 8
|
|
for size := align; size <= _MaxSmallSize; size += align {
|
|
if size&(size-1) == 0 { // bump alignment once in a while
|
|
if size >= 2048 {
|
|
align = 256
|
|
} else if size >= 128 {
|
|
align = size / 8
|
|
} else if size >= 16 {
|
|
align = 16 // required for x86 SSE instructions, if we want to use them
|
|
}
|
|
}
|
|
if align&(align-1) != 0 {
|
|
throw("InitSizes - bug")
|
|
}
|
|
|
|
// Make the allocnpages big enough that
|
|
// the leftover is less than 1/8 of the total,
|
|
// so wasted space is at most 12.5%.
|
|
allocsize := _PageSize
|
|
for allocsize%size > allocsize/8 {
|
|
allocsize += _PageSize
|
|
}
|
|
npages := allocsize >> _PageShift
|
|
|
|
// If the previous sizeclass chose the same
|
|
// allocation size and fit the same number of
|
|
// objects into the page, we might as well
|
|
// use just this size instead of having two
|
|
// different sizes.
|
|
if sizeclass > 1 && npages == int(class_to_allocnpages[sizeclass-1]) && allocsize/size == allocsize/int(class_to_size[sizeclass-1]) {
|
|
class_to_size[sizeclass-1] = int32(size)
|
|
continue
|
|
}
|
|
|
|
class_to_allocnpages[sizeclass] = int32(npages)
|
|
class_to_size[sizeclass] = int32(size)
|
|
sizeclass++
|
|
}
|
|
if sizeclass != _NumSizeClasses {
|
|
print("sizeclass=", sizeclass, " NumSizeClasses=", _NumSizeClasses, "\n")
|
|
throw("InitSizes - bad NumSizeClasses")
|
|
}
|
|
|
|
// Initialize the size_to_class tables.
|
|
nextsize := 0
|
|
for sizeclass = 1; sizeclass < _NumSizeClasses; sizeclass++ {
|
|
for ; nextsize < 1024 && nextsize <= int(class_to_size[sizeclass]); nextsize += 8 {
|
|
size_to_class8[nextsize/8] = int8(sizeclass)
|
|
}
|
|
if nextsize >= 1024 {
|
|
for ; nextsize <= int(class_to_size[sizeclass]); nextsize += 128 {
|
|
size_to_class128[(nextsize-1024)/128] = int8(sizeclass)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Double-check SizeToClass.
|
|
if false {
|
|
for n := int32(0); n < _MaxSmallSize; n++ {
|
|
sizeclass := sizeToClass(n)
|
|
if sizeclass < 1 || sizeclass >= _NumSizeClasses || class_to_size[sizeclass] < n {
|
|
print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
|
|
print("incorrect SizeToClass\n")
|
|
goto dump
|
|
}
|
|
if sizeclass > 1 && class_to_size[sizeclass-1] >= n {
|
|
print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
|
|
print("SizeToClass too big\n")
|
|
goto dump
|
|
}
|
|
}
|
|
}
|
|
|
|
testdefersizes()
|
|
|
|
// Copy out for statistics table.
|
|
for i := 0; i < len(class_to_size); i++ {
|
|
memstats.by_size[i].size = uint32(class_to_size[i])
|
|
}
|
|
|
|
for i := 1; i < len(class_to_size); i++ {
|
|
class_to_divmagic[i] = computeDivMagic(uint32(class_to_size[i]))
|
|
}
|
|
|
|
return
|
|
|
|
dump:
|
|
if true {
|
|
print("NumSizeClasses=", _NumSizeClasses, "\n")
|
|
print("runtime·class_to_size:")
|
|
for sizeclass = 0; sizeclass < _NumSizeClasses; sizeclass++ {
|
|
print(" ", class_to_size[sizeclass], "")
|
|
}
|
|
print("\n\n")
|
|
print("size_to_class8:")
|
|
for i := 0; i < len(size_to_class8); i++ {
|
|
print(" ", i*8, "=>", size_to_class8[i], "(", class_to_size[size_to_class8[i]], ")\n")
|
|
}
|
|
print("\n")
|
|
print("size_to_class128:")
|
|
for i := 0; i < len(size_to_class128); i++ {
|
|
print(" ", i*128, "=>", size_to_class128[i], "(", class_to_size[size_to_class128[i]], ")\n")
|
|
}
|
|
print("\n")
|
|
}
|
|
throw("InitSizes failed")
|
|
}
|
|
|
|
// Returns size of the memory block that mallocgc will allocate if you ask for the size.
|
|
func roundupsize(size uintptr) uintptr {
|
|
if size < _MaxSmallSize {
|
|
if size <= 1024-8 {
|
|
return uintptr(class_to_size[size_to_class8[(size+7)>>3]])
|
|
} else {
|
|
return uintptr(class_to_size[size_to_class128[(size-1024+127)>>7]])
|
|
}
|
|
}
|
|
if size+_PageSize < size {
|
|
return size
|
|
}
|
|
return round(size, _PageSize)
|
|
}
|
|
|
|
// divMagic holds magic constants to implement division
|
|
// by a particular constant as a shift, multiply, and shift.
|
|
// That is, given
|
|
// m = computeDivMagic(d)
|
|
// then
|
|
// n/d == ((n>>m.shift) * m.mul) >> m.shift2
|
|
//
|
|
// The magic computation picks m such that
|
|
// d = d₁*d₂
|
|
// d₂= 2^m.shift
|
|
// m.mul = ⌈2^m.shift2 / d₁⌉
|
|
//
|
|
// The magic computation here is tailored for malloc block sizes
|
|
// and does not handle arbitrary d correctly. Malloc block sizes d are
|
|
// always even, so the first shift implements the factors of 2 in d
|
|
// and then the mul and second shift implement the odd factor
|
|
// that remains. Because the first shift divides n by at least 2 (actually 8)
|
|
// before the multiply gets involved, the huge corner cases that
|
|
// require additional adjustment are impossible, so the usual
|
|
// fixup is not needed.
|
|
//
|
|
// For more details see Hacker's Delight, Chapter 10, and
|
|
// http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
|
|
// http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
|
|
type divMagic struct {
	// shift is the pre-shift: the number of factors of two
	// that computeDivMagic removed from the divisor.
	shift uint8

	// mul is ⌈2^shift2 / d₁⌉, where d₁ is the odd part of the divisor.
	mul uint32

	// shift2 is the post-shift applied after multiplying by mul.
	shift2 uint8

	// baseMask is ^(d-1) when the divisor d is a power of two,
	// and 0 otherwise.
	baseMask uintptr
}
|
|
|
|
func computeDivMagic(d uint32) divMagic {
|
|
var m divMagic
|
|
|
|
// If the size is a power of two, heapBitsForObject can divide even faster by masking.
|
|
// Compute this mask.
|
|
if d&(d-1) == 0 {
|
|
// It is a power of 2 (assuming dinptr != 1)
|
|
m.baseMask = ^(uintptr(d) - 1)
|
|
} else {
|
|
m.baseMask = 0
|
|
}
|
|
|
|
// Compute pre-shift by factoring power of 2 out of d.
|
|
for d&1 == 0 {
|
|
m.shift++
|
|
d >>= 1
|
|
}
|
|
|
|
// Compute largest k such that ⌈2^k / d⌉ fits in a 32-bit int.
|
|
// This is always a good enough approximation.
|
|
// We could use smaller k for some divisors but there's no point.
|
|
k := uint8(63)
|
|
d64 := uint64(d)
|
|
for ((1<<k)+d64-1)/d64 >= 1<<32 {
|
|
k--
|
|
}
|
|
m.mul = uint32(((1 << k) + d64 - 1) / d64) // ⌈2^k / d⌉
|
|
m.shift2 = k
|
|
|
|
return m
|
|
}
|