go/src/runtime/msize.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Malloc small size classes.
//
// See malloc.go for overview.
//
// The size classes are chosen so that rounding an allocation
// request up to the next size class wastes at most 12.5% (1.125x).
//
// Each size class has its own page count that gets allocated
// and chopped up when new objects of the size class are needed.
// That page count is chosen so that chopping up the run of
// pages into objects of the given size wastes at most 12.5% (1.125x)
// of the memory. It is not necessary that the cutoff here be
// the same as above.
//
// The two sources of waste multiply, so the worst possible case
// for the above constraints would be that allocations of some
// size might have a 26.6% (1.266x) overhead.
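// (The arithmetic: the two bounds compound, 1.125 × 1.125 = 1.265625 ≈ 1.266.)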
// In practice, only one of the wastes comes into play for a
// given size (sizes < 512 waste mainly on the round-up,
// sizes > 512 waste mainly on the page chopping).
//
// TODO(rsc): Compute max waste for any given size.

package runtime

// Size classes. Computed and initialized by InitSizes.
//
// SizeToClass(0 <= n <= MaxSmallSize) returns the size class,
// 1 <= sizeclass < NumSizeClasses, for n.
// Size class 0 is reserved to mean "not small".
//
// class_to_size[i] = largest size in class i
// class_to_allocnpages[i] = number of pages to allocate when
// making new objects in class i
//
// The SizeToClass lookup is implemented using two arrays,
// one mapping sizes <= 1024 to their class and one mapping
// sizes >= 1024 and <= MaxSmallSize to their class.
// All objects are 8-aligned, so the first array is indexed by
// the size divided by 8 (rounded up). Objects >= 1024 bytes
// are 128-aligned, so the second array is indexed by the
// size divided by 128 (rounded up). The arrays are filled in
// by InitSizes.
var class_to_size [_NumSizeClasses]int32
var class_to_allocnpages [_NumSizeClasses]int32
var class_to_divmagic [_NumSizeClasses]divMagic
var size_to_class8 [1024/8 + 1]int8
var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8

func sizeToClass(size int32) int32 {
	if size > _MaxSmallSize {
		throw("SizeToClass - invalid size")
	}
	if size > 1024-8 {
		return int32(size_to_class128[(size-1024+127)>>7])
	}
	return int32(size_to_class8[(size+7)>>3])
}
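
// For example (illustrative, given the tables initSizes computes below):
// sizeToClass(400) reads size_to_class8[(400+7)>>3] = size_to_class8[50],
// which resolves to the 416-byte class, while sizeToClass(1500) reads
// size_to_class128[(1500-1024+127)>>7] = size_to_class128[4], the class
// for 1536-byte objects.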

func initSizes() {
	// Initialize the runtime·class_to_size table (and choose class sizes in the process).
	class_to_size[0] = 0
	sizeclass := 1 // 0 means no class
	align := 8
	for size := align; size <= _MaxSmallSize; size += align {
		if size&(size-1) == 0 { // bump alignment once in a while
			if size >= 2048 {
				align = 256
			} else if size >= 128 {
				align = size / 8
			} else if size >= 16 {
				align = 16 // required for x86 SSE instructions, if we want to use them
			}
		}
		if align&(align-1) != 0 {
			throw("InitSizes - bug")
		}

		// Make the allocnpages big enough that
		// the leftover is less than 1/8 of the total,
		// so wasted space is at most 12.5%.
		allocsize := _PageSize
		for allocsize%size > allocsize/8 {
			allocsize += _PageSize
		}
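		// Illustrative (with the usual 8 KiB pages, _PageSize = 8192): for
		// size 704, one page leaves 8192%704 = 448 bytes over, well under
		// 8192/8 = 1024, so a single page is enough; a worse-fitting size
		// keeps gaining pages until the leftover falls below 1/8.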
		npages := allocsize >> _PageShift

		// If the previous sizeclass chose the same
		// allocation size and fit the same number of
		// objects into the page, we might as well
		// use just this size instead of having two
		// different sizes.
		if sizeclass > 1 && npages == int(class_to_allocnpages[sizeclass-1]) && allocsize/size == allocsize/int(class_to_size[sizeclass-1]) {
			class_to_size[sizeclass-1] = int32(size)
			continue
		}
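		// Illustrative: candidate sizes 1792, 1920, and 2048 each fit four
		// objects in one page, so the first two are folded into a single
		// 2048-byte class by the step above.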

		class_to_allocnpages[sizeclass] = int32(npages)
		class_to_size[sizeclass] = int32(size)
		sizeclass++
	}

	if sizeclass != _NumSizeClasses {
		print("sizeclass=", sizeclass, " NumSizeClasses=", _NumSizeClasses, "\n")
		throw("InitSizes - bad NumSizeClasses")
	}

	// Initialize the size_to_class tables.
	nextsize := 0
	for sizeclass = 1; sizeclass < _NumSizeClasses; sizeclass++ {
		for ; nextsize < 1024 && nextsize <= int(class_to_size[sizeclass]); nextsize += 8 {
			size_to_class8[nextsize/8] = int8(sizeclass)
		}
		if nextsize >= 1024 {
			for ; nextsize <= int(class_to_size[sizeclass]); nextsize += 128 {
				size_to_class128[(nextsize-1024)/128] = int8(sizeclass)
			}
		}
	}

	// Double-check SizeToClass.
	if false {
		for n := int32(0); n < _MaxSmallSize; n++ {
			sizeclass := sizeToClass(n)
			if sizeclass < 1 || sizeclass >= _NumSizeClasses || class_to_size[sizeclass] < n {
				print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
				print("incorrect SizeToClass\n")
				goto dump
			}
			if sizeclass > 1 && class_to_size[sizeclass-1] >= n {
				print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
				print("SizeToClass too big\n")
				goto dump
			}
		}
	}

	testdefersizes()

	// Copy out for statistics table.
	for i := 0; i < len(class_to_size); i++ {
		memstats.by_size[i].size = uint32(class_to_size[i])
	}

	for i := 1; i < len(class_to_size); i++ {
		class_to_divmagic[i] = computeDivMagic(uint32(class_to_size[i]))
	}

	return

dump:
	if true {
		print("NumSizeClasses=", _NumSizeClasses, "\n")
		print("runtime·class_to_size:")
		for sizeclass = 0; sizeclass < _NumSizeClasses; sizeclass++ {
			print(" ", class_to_size[sizeclass], "")
		}
		print("\n\n")
		print("size_to_class8:")
		for i := 0; i < len(size_to_class8); i++ {
			print(" ", i*8, "=>", size_to_class8[i], "(", class_to_size[size_to_class8[i]], ")\n")
		}
		print("\n")
		print("size_to_class128:")
		for i := 0; i < len(size_to_class128); i++ {
			print(" ", i*128, "=>", size_to_class128[i], "(", class_to_size[size_to_class128[i]], ")\n")
		}
		print("\n")
	}
	throw("InitSizes failed")
}

// Returns size of the memory block that mallocgc will allocate if you ask for the size.
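// For example (illustrative, given the class tables initSizes computes):
// roundupsize(10) returns 16, the smallest class size that fits 10 bytes,
// while requests of _MaxSmallSize or more skip the class tables and are
// rounded up to a multiple of _PageSize instead.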
func roundupsize(size uintptr) uintptr {
	if size < _MaxSmallSize {
		if size <= 1024-8 {
			return uintptr(class_to_size[size_to_class8[(size+7)>>3]])
		} else {
			return uintptr(class_to_size[size_to_class128[(size-1024+127)>>7]])
		}
	}
	// Rounding up to a page multiple could overflow; in that case
	// return the size unchanged.
	if size+_PageSize < size {
		return size
	}
	return round(size, _PageSize)
}

// divMagic holds magic constants to implement division
// by a particular constant as a shift, multiply, and shift.
// That is, given
//	m = computeDivMagic(d)
// then
//	n/d == ((n>>m.shift) * m.mul) >> m.shift2
//
// The magic computation picks m such that
//	d = d₁*d₂
//	d₂ = 2^m.shift
//	m.mul = ⌈2^m.shift2 / d₁⌉
//
// The magic computation here is tailored for malloc block sizes
// and does not handle arbitrary d correctly. Malloc block sizes d are
// always even, so the first shift implements the factors of 2 in d
// and then the mul and second shift implement the odd factor
// that remains. Because the first shift divides n by at least 2 (actually 8)
// before the multiply gets involved, the huge corner cases that
// require additional adjustment are impossible, so the usual
// fixup is not needed.
//
// For more details see Hacker's Delight, Chapter 10, and
// http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
// http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
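//
// Worked example (illustrative): for d = 48 = 3·2⁴ the pre-shift factors out
// 2⁴ (m.shift = 4), leaving the odd part d₁ = 3; the search below then picks
// m.shift2 = 33 and m.mul = ⌈2³³/3⌉ = 2863311531, so that, e.g.,
//	((480>>4) * 2863311531) >> 33 = 10 = 480/48.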
type divMagic struct {
	shift    uint8
	mul      uint32
	shift2   uint8
	baseMask uintptr
}

func computeDivMagic(d uint32) divMagic {
	var m divMagic

	// If the size is a power of two, heapBitsForObject can divide even faster by masking.
	// Compute this mask.
	if d&(d-1) == 0 {
		// It is a power of 2 (assuming d != 1).
		m.baseMask = ^(uintptr(d) - 1)
	} else {
		m.baseMask = 0
	}

	// Compute pre-shift by factoring power of 2 out of d.
	for d&1 == 0 {
		m.shift++
		d >>= 1
	}

	// Compute largest k such that ⌈2^k / d⌉ fits in a 32-bit int.
	// This is always a good enough approximation.
	// We could use smaller k for some divisors but there's no point.
	k := uint8(63)
	d64 := uint64(d)
	for ((1<<k)+d64-1)/d64 >= 1<<32 {
		k--
	}
	m.mul = uint32(((1 << k) + d64 - 1) / d64) // ⌈2^k / d⌉
	m.shift2 = k

	return m
}
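
// A minimal sanity check, sketched here for illustration only (checkDivMagic
// is not part of the original file; the name and bounds are assumptions).
// It exercises the documented identity n/d == ((n>>m.shift) * m.mul) >> m.shift2
// over a small-object-sized range of n, the regime divMagic is designed for.
func checkDivMagic(d uint32) {
	m := computeDivMagic(d)
	for n := uintptr(0); n < 4*_PageSize; n++ {
		got := ((n >> m.shift) * uintptr(m.mul)) >> m.shift2
		if got != n/uintptr(d) {
			print("divMagic wrong: d=", d, " n=", n, " got=", got, "\n")
			throw("checkDivMagic failed")
		}
	}
}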