mirror of
https://github.com/golang/go
synced 2024-11-05 17:26:11 -07:00
runtime: Speed up heapBitsForObject
Optimized heapBitsForObject by special-casing objects whose size is a power of two. When a span holding such objects is initialized, I added a mask that, when ANDed with an interior pointer, yields the base of the object the pointer points into.

For the garbage benchmark this reduced CPU_CLK_UNHALTED in heapBitsForObject from 7.7% to 5.9% of the total; INST_RETIRED went from 12.2 -> 8.7.

Here are the benchmarks that changed by at least plus or minus 1%.

benchmark                        old ns/op      new ns/op      delta
BenchmarkFmtFprintfString        249            221            -11.24%
BenchmarkFmtFprintfInt           247            223            -9.72%
BenchmarkFmtFprintfEmpty         76.5           69.6           -9.02%
BenchmarkBinaryTree17            4106631412     3744550160     -8.82%
BenchmarkFmtFprintfFloat         424            399            -5.90%
BenchmarkGoParse                 4484421        4242115        -5.40%
BenchmarkGobEncode               8803668        8449107        -4.03%
BenchmarkFmtManyArgs             1494           1436           -3.88%
BenchmarkGobDecode               10431051       10032606       -3.82%
BenchmarkFannkuch11              2591306713     2517400464     -2.85%
BenchmarkTimeParse               361            371            +2.77%
BenchmarkJSONDecode              70620492       68830357       -2.53%
BenchmarkRegexpMatchMedium_1K    54693          53343          -2.47%
BenchmarkTemplate                90008879       91929940       +2.13%
BenchmarkTimeFormat              380            387            +1.84%
BenchmarkRegexpMatchEasy1_32     111            113            +1.80%
BenchmarkJSONEncode              21359159       21007583       -1.65%
BenchmarkRegexpMatchEasy1_1K     603            613            +1.66%
BenchmarkRegexpMatchEasy0_32     127            129            +1.57%
BenchmarkFmtFprintfIntInt        399            393            -1.50%
BenchmarkRegexpMatchEasy0_1K     373            378            +1.34%

Change-Id: I78e297161026f8b5cc7507c965fd3e486f81ed29
Reviewed-on: https://go-review.googlesource.com/8980
Reviewed-by: Austin Clements <austin@google.com>
This commit is contained in:
parent
e7ffafdb6e
commit
899a4ad47e
@ -154,17 +154,16 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) {
|
||||
// return base == 0
|
||||
// otherwise return the base of the object.
|
||||
func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
|
||||
if p < mheap_.arena_start || p >= mheap_.arena_used {
|
||||
arenaStart := mheap_.arena_start
|
||||
if p < arenaStart || p >= mheap_.arena_used {
|
||||
return
|
||||
}
|
||||
|
||||
off := p - arenaStart
|
||||
idx := off >> _PageShift
|
||||
// p points into the heap, but possibly to the middle of an object.
|
||||
// Consult the span table to find the block beginning.
|
||||
// TODO(rsc): Factor this out.
|
||||
k := p >> _PageShift
|
||||
x := k
|
||||
x -= mheap_.arena_start >> _PageShift
|
||||
s = h_spans[x]
|
||||
s = h_spans[idx]
|
||||
if s == nil || pageID(k) < s.start || p >= s.limit || s.state != mSpanInUse {
|
||||
if s == nil || s.state == _MSpanStack {
|
||||
// If s is nil, the virtual address has never been part of the heap.
|
||||
@ -188,23 +187,23 @@ func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
|
||||
printunlock()
|
||||
throw("objectstart: bad pointer in unexpected span")
|
||||
}
|
||||
return
|
||||
}
|
||||
base = s.base()
|
||||
if p-base >= s.elemsize {
|
||||
// n := (p - base) / s.elemsize, using division by multiplication
|
||||
n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
|
||||
|
||||
const debugMagic = false
|
||||
if debugMagic {
|
||||
n2 := (p - base) / s.elemsize
|
||||
if n != n2 {
|
||||
println("runtime: bad div magic", (p - base), s.elemsize, s.divShift, s.divMul, s.divShift2)
|
||||
throw("bad div magic")
|
||||
}
|
||||
// If this span holds objects of a power of 2 size, just mask off the bits to
|
||||
// the interior of the object. Otherwise use the size to get the base.
|
||||
if s.baseMask != 0 {
|
||||
// optimize for power of 2 sized objects.
|
||||
base = s.base()
|
||||
base = base + (p-base)&s.baseMask
|
||||
// base = p & s.baseMask is faster for small spans,
|
||||
// but doesn't work for large spans.
|
||||
// Overall, it's faster to use the more general computation above.
|
||||
} else {
|
||||
base = s.base()
|
||||
if p-base >= s.elemsize {
|
||||
// n := (p - base) / s.elemsize, using division by multiplication
|
||||
n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
|
||||
base += n * s.elemsize
|
||||
}
|
||||
|
||||
base += n * s.elemsize
|
||||
}
|
||||
// Now that we know the actual base, compute heapBits to return to caller.
|
||||
hbits = heapBitsForAddr(base)
|
||||
|
@ -24,7 +24,6 @@ type mheap struct {
|
||||
nspan uint32
|
||||
sweepgen uint32 // sweep generation, see comment in mspan
|
||||
sweepdone uint32 // all spans are swept
|
||||
|
||||
// span lookup
|
||||
spans **mspan
|
||||
spans_mapped uintptr
|
||||
@ -99,6 +98,7 @@ type mspan struct {
|
||||
// if sweepgen == h->sweepgen - 1, the span is currently being swept
|
||||
// if sweepgen == h->sweepgen, the span is swept and ready to use
|
||||
// h->sweepgen is incremented by 2 after every GC
|
||||
|
||||
sweepgen uint32
|
||||
divMul uint32 // for divide by elemsize - divMagic.mul
|
||||
ref uint16 // capacity - number of objects in freelist
|
||||
@ -114,6 +114,7 @@ type mspan struct {
|
||||
limit uintptr // end of data in span
|
||||
speciallock mutex // guards specials list
|
||||
specials *special // linked list of special records sorted by offset.
|
||||
baseMask uintptr // if non-0, elemsize is a power of 2, & this will get object allocation base
|
||||
}
|
||||
|
||||
func (s *mspan) base() uintptr {
|
||||
@ -384,12 +385,14 @@ func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan
|
||||
s.divShift = 0
|
||||
s.divMul = 0
|
||||
s.divShift2 = 0
|
||||
s.baseMask = 0
|
||||
} else {
|
||||
s.elemsize = uintptr(class_to_size[sizeclass])
|
||||
m := &class_to_divmagic[sizeclass]
|
||||
s.divShift = m.shift
|
||||
s.divMul = m.mul
|
||||
s.divShift2 = m.shift2
|
||||
s.baseMask = m.baseMask
|
||||
}
|
||||
|
||||
// update stats, sweep lists
|
||||
|
@ -215,14 +215,24 @@ func roundupsize(size uintptr) uintptr {
|
||||
// http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
|
||||
// http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
|
||||
type divMagic struct {
|
||||
shift uint8
|
||||
mul uint32
|
||||
shift2 uint8
|
||||
shift uint8
|
||||
mul uint32
|
||||
shift2 uint8
|
||||
baseMask uintptr
|
||||
}
|
||||
|
||||
func computeDivMagic(d uint32) divMagic {
|
||||
var m divMagic
|
||||
|
||||
// If the size is a power of two, heapBitsForObject can divide even faster by masking.
|
||||
// Compute this mask.
|
||||
if d&(d-1) == 0 {
|
||||
// It is a power of 2 (assuming d != 0)
|
||||
m.baseMask = ^(uintptr(d) - 1)
|
||||
} else {
|
||||
m.baseMask = 0
|
||||
}
|
||||
|
||||
// Compute pre-shift by factoring power of 2 out of d.
|
||||
for d&1 == 0 {
|
||||
m.shift++
|
||||
@ -239,5 +249,6 @@ func computeDivMagic(d uint32) divMagic {
|
||||
}
|
||||
m.mul = uint32(((1 << k) + d64 - 1) / d64) // ⌈2^k / d⌉
|
||||
m.shift2 = k
|
||||
|
||||
return m
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user