mirror of
https://github.com/golang/go
synced 2024-11-18 10:14:45 -07:00
runtime: add pcvalue cache to improve stack scan speed
The cost of scanning large stacks is currently dominated by the time spent looking up and decoding the pcvalue table. However, large stacks are usually large not because they contain calls to many different functions, but because they contain many calls to the same, small set of recursive functions. Hence, walking large stacks tends to make the same pcvalue queries many times. Based on this observation, this commit adds a small, very simple, and fast cache in front of pcvalue lookup. We thread this cache down from operations that make many pcvalue calls, such as gentraceback, stack scanning, and stack adjusting. This simple cache works well because it has minimal overhead when it's not effective. I also tried a hashed direct-map cache, CLOCK-based replacement, round-robin replacement, and round-robin with lookups disabled until there had been at least 16 probes, but none of these approaches had obvious wins over the random replacement policy in this commit. This nearly doubles the overall performance of the deep stack test program from issue #10898: name old time/op new time/op delta Issue10898 16.5s ±12% 9.2s ±12% -44.37% (p=0.008 n=5+5) It's a very slight win on the garbage benchmark: name old time/op new time/op delta XBenchGarbage-12 4.92ms ± 1% 4.89ms ± 1% -0.75% (p=0.000 n=18+19) It's a wash (but doesn't harm performance) on the go1 benchmarks, which don't have particularly deep stacks: name old time/op new time/op delta BinaryTree17-12 3.11s ± 2% 3.20s ± 3% +2.83% (p=0.000 n=17+20) Fannkuch11-12 2.51s ± 1% 2.51s ± 1% -0.22% (p=0.034 n=19+18) FmtFprintfEmpty-12 50.8ns ± 3% 50.6ns ± 2% ~ (p=0.793 n=20+20) FmtFprintfString-12 174ns ± 0% 174ns ± 1% +0.17% (p=0.048 n=15+20) FmtFprintfInt-12 177ns ± 0% 165ns ± 1% -6.99% (p=0.000 n=17+19) FmtFprintfIntInt-12 283ns ± 1% 284ns ± 0% +0.22% (p=0.000 n=18+15) FmtFprintfPrefixedInt-12 243ns ± 1% 244ns ± 1% +0.40% (p=0.000 n=20+19) FmtFprintfFloat-12 318ns ± 0% 319ns ± 0% +0.27% (p=0.001 n=19+20) FmtManyArgs-12 1.12µs ± 0% 1.14µs ± 0% +1.74% (p=0.000 n=19+20) GobDecode-12 8.69ms ± 0% 8.73ms ± 1% +0.46% (p=0.000 n=18+18) GobEncode-12 6.64ms ± 1% 6.61ms ± 1% -0.46% (p=0.000 n=20+20) Gzip-12 323ms ± 2% 319ms ± 1% -1.11% (p=0.000 n=20+20) Gunzip-12 42.8ms ± 0% 42.9ms ± 0% ~ (p=0.158 n=18+20) HTTPClientServer-12 63.3µs ± 1% 63.1µs ± 1% -0.35% (p=0.011 n=20+20) JSONEncode-12 16.9ms ± 1% 17.3ms ± 1% +2.84% (p=0.000 n=19+20) JSONDecode-12 59.7ms ± 0% 58.5ms ± 0% -2.05% (p=0.000 n=19+17) Mandelbrot200-12 3.92ms ± 0% 3.91ms ± 0% -0.16% (p=0.003 n=19+19) GoParse-12 3.79ms ± 2% 3.75ms ± 2% -0.91% (p=0.005 n=20+20) RegexpMatchEasy0_32-12 102ns ± 1% 101ns ± 1% -0.80% (p=0.001 n=14+20) RegexpMatchEasy0_1K-12 337ns ± 1% 346ns ± 1% +2.90% (p=0.000 n=20+19) RegexpMatchEasy1_32-12 84.4ns ± 2% 84.3ns ± 2% ~ (p=0.743 n=20+20) RegexpMatchEasy1_1K-12 502ns ± 1% 505ns ± 0% +0.64% (p=0.000 n=20+20) RegexpMatchMedium_32-12 133ns ± 1% 132ns ± 1% -0.85% (p=0.000 n=20+19) RegexpMatchMedium_1K-12 40.1µs ± 1% 39.8µs ± 1% -0.77% (p=0.000 n=18+18) RegexpMatchHard_32-12 2.08µs ± 1% 2.07µs ± 1% -0.55% (p=0.001 n=18+19) RegexpMatchHard_1K-12 62.4µs ± 1% 62.0µs ± 1% -0.74% (p=0.000 n=19+19) Revcomp-12 545ms ± 2% 545ms ± 3% ~ (p=0.771 n=19+20) Template-12 73.7ms ± 1% 72.0ms ± 0% -2.33% (p=0.000 n=20+18) TimeParse-12 358ns ± 1% 351ns ± 1% -2.07% (p=0.000 n=20+20) TimeFormat-12 369ns ± 1% 356ns ± 0% -3.53% (p=0.000 n=20+18) [Geo mean] 63.5µs 63.2µs -0.41% name old speed new speed delta GobDecode-12 88.3MB/s ± 0% 87.9MB/s ± 0% -0.43% (p=0.000 n=18+17) GobEncode-12 116MB/s ± 1% 116MB/s ± 1% +0.47% (p=0.000 n=20+20) Gzip-12 60.2MB/s ± 2% 60.8MB/s ± 1% +1.13% (p=0.000 n=20+20) Gunzip-12 453MB/s ± 0% 453MB/s ± 0% ~ (p=0.160 n=18+20) JSONEncode-12 115MB/s ± 1% 112MB/s ± 1% -2.76% (p=0.000 n=19+20) JSONDecode-12 32.5MB/s ± 0% 33.2MB/s ± 0% +2.09% (p=0.000 n=19+17) GoParse-12 15.3MB/s ± 2% 15.4MB/s ± 2% +0.92% (p=0.004 n=20+20) RegexpMatchEasy0_32-12 311MB/s ± 1% 314MB/s ± 1% +0.78% (p=0.000 n=15+19) RegexpMatchEasy0_1K-12 3.04GB/s ± 1% 2.95GB/s ± 1% -2.90% (p=0.000 n=19+19) RegexpMatchEasy1_32-12 379MB/s ± 2% 380MB/s ± 2% ~ (p=0.779 n=20+20) RegexpMatchEasy1_1K-12 2.04GB/s ± 1% 2.02GB/s ± 0% -0.62% (p=0.000 n=20+20) RegexpMatchMedium_32-12 7.46MB/s ± 1% 7.53MB/s ± 1% +0.86% (p=0.000 n=20+19) RegexpMatchMedium_1K-12 25.5MB/s ± 1% 25.7MB/s ± 1% +0.78% (p=0.000 n=18+18) RegexpMatchHard_32-12 15.4MB/s ± 1% 15.5MB/s ± 1% +0.62% (p=0.000 n=19+19) RegexpMatchHard_1K-12 16.4MB/s ± 1% 16.5MB/s ± 1% +0.82% (p=0.000 n=20+19) Revcomp-12 466MB/s ± 2% 466MB/s ± 3% ~ (p=0.765 n=19+20) Template-12 26.3MB/s ± 1% 27.0MB/s ± 0% +2.38% (p=0.000 n=20+18) [Geo mean] 97.8MB/s 98.0MB/s +0.23% Change-Id: I281044ae0b24990ba46487cacbc1069493274bc4 Reviewed-on: https://go-review.googlesource.com/13614 Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
46630ecd65
commit
beedb1ec33
@ -247,7 +247,7 @@ func dumpframe(s *stkframe, arg unsafe.Pointer) bool {
|
||||
if pc != f.entry {
|
||||
pc--
|
||||
}
|
||||
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc)
|
||||
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc, nil)
|
||||
if pcdata == -1 {
|
||||
// We do not have a valid pcdata value but there might be a
|
||||
// stackmap for this function. It is likely that we are looking
|
||||
|
@ -1696,7 +1696,7 @@ func getgcmask(ep interface{}) (mask []byte) {
|
||||
if targetpc != f.entry {
|
||||
targetpc--
|
||||
}
|
||||
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
|
||||
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, nil)
|
||||
if pcdata == -1 {
|
||||
return
|
||||
}
|
||||
|
@ -438,10 +438,11 @@ func scanstack(gp *g) {
|
||||
throw("scanstack in wrong phase")
|
||||
}
|
||||
|
||||
var cache pcvalueCache
|
||||
gcw := &getg().m.p.ptr().gcw
|
||||
n := 0
|
||||
scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
|
||||
scanframeworker(frame, unused, gcw)
|
||||
scanframeworker(frame, &cache, gcw)
|
||||
|
||||
if frame.fp > nextBarrier {
|
||||
// We skip installing a barrier on bottom-most
|
||||
@ -474,7 +475,7 @@ func scanstack(gp *g) {
|
||||
|
||||
// Scan a stack frame: local variables and function arguments/results.
|
||||
//go:nowritebarrier
|
||||
func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWork) {
|
||||
func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
|
||||
|
||||
f := frame.fn
|
||||
targetpc := frame.continpc
|
||||
@ -488,7 +489,7 @@ func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWork) {
|
||||
if targetpc != f.entry {
|
||||
targetpc--
|
||||
}
|
||||
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
|
||||
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, cache)
|
||||
if pcdata == -1 {
|
||||
// We do not have a valid pcdata value but there might be a
|
||||
// stackmap for this function. It is likely that we are looking
|
||||
|
@ -482,6 +482,7 @@ var ptrnames = []string{
|
||||
type adjustinfo struct {
|
||||
old stack
|
||||
delta uintptr // ptr distance from old to new stack (newbase - oldbase)
|
||||
cache pcvalueCache
|
||||
}
|
||||
|
||||
// Adjustpointer checks whether *vpp is in the old stack described by adjinfo.
|
||||
@ -575,7 +576,7 @@ func adjustframe(frame *stkframe, arg unsafe.Pointer) bool {
|
||||
if targetpc != f.entry {
|
||||
targetpc--
|
||||
}
|
||||
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
|
||||
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, &adjinfo.cache)
|
||||
if pcdata == -1 {
|
||||
pcdata = 0 // in prologue
|
||||
}
|
||||
|
@ -112,6 +112,7 @@ func moduledataverify1(datap *moduledata) {
|
||||
|
||||
// ftab is lookup table for function by program counter.
|
||||
nftab := len(datap.ftab) - 1
|
||||
var pcCache pcvalueCache
|
||||
for i := 0; i < nftab; i++ {
|
||||
// NOTE: ftab[nftab].entry is legal; it is the address beyond the final function.
|
||||
if datap.ftab[i].entry > datap.ftab[i+1].entry {
|
||||
@ -147,9 +148,9 @@ func moduledataverify1(datap *moduledata) {
|
||||
}
|
||||
}
|
||||
}
|
||||
pcvalue(f, f.pcfile, end, true)
|
||||
pcvalue(f, f.pcln, end, true)
|
||||
pcvalue(f, f.pcsp, end, true)
|
||||
pcvalue(f, f.pcfile, end, &pcCache, true)
|
||||
pcvalue(f, f.pcln, end, &pcCache, true)
|
||||
pcvalue(f, f.pcsp, end, &pcCache, true)
|
||||
}
|
||||
}
|
||||
|
||||
@ -226,10 +227,42 @@ func findfunc(pc uintptr) *_func {
|
||||
return (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[idx].funcoff]))
|
||||
}
|
||||
|
||||
func pcvalue(f *_func, off int32, targetpc uintptr, strict bool) int32 {
|
||||
type pcvalueCache struct {
|
||||
entries [16]pcvalueCacheEnt
|
||||
}
|
||||
|
||||
type pcvalueCacheEnt struct {
|
||||
// targetpc and off together are the key of this cache entry.
|
||||
targetpc uintptr
|
||||
off int32
|
||||
// val is the value of this cached pcvalue entry.
|
||||
val int32
|
||||
}
|
||||
|
||||
func pcvalue(f *_func, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
|
||||
if off == 0 {
|
||||
return -1
|
||||
}
|
||||
|
||||
// Check the cache. This speeds up walks of deep stacks, which
|
||||
// tend to have the same recursive functions over and over.
|
||||
//
|
||||
// This cache is small enough that full associativity is
|
||||
// cheaper than doing the hashing for a less associative
|
||||
// cache.
|
||||
if cache != nil {
|
||||
for _, ent := range cache.entries {
|
||||
// We check off first because we're more
|
||||
// likely to have multiple entries with
|
||||
// different offsets for the same targetpc
|
||||
// than the other way around, so we'll usually
|
||||
// fail in the first clause.
|
||||
if ent.off == off && ent.targetpc == targetpc {
|
||||
return ent.val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
datap := findmoduledatap(f.entry) // inefficient
|
||||
if datap == nil {
|
||||
if strict && panicking == 0 {
|
||||
@ -248,6 +281,19 @@ func pcvalue(f *_func, off int32, targetpc uintptr, strict bool) int32 {
|
||||
break
|
||||
}
|
||||
if targetpc < pc {
|
||||
// Replace a random entry in the cache. Random
|
||||
// replacement prevents a performance cliff if
|
||||
// a recursive stack's cycle is slightly
|
||||
// larger than the cache.
|
||||
if cache != nil {
|
||||
ci := fastrand1() % uint32(len(cache.entries))
|
||||
cache.entries[ci] = pcvalueCacheEnt{
|
||||
targetpc: targetpc,
|
||||
off: off,
|
||||
val: val,
|
||||
}
|
||||
}
|
||||
|
||||
return val
|
||||
}
|
||||
}
|
||||
@ -296,8 +342,8 @@ func funcline1(f *_func, targetpc uintptr, strict bool) (file string, line int32
|
||||
if datap == nil {
|
||||
return "?", 0
|
||||
}
|
||||
fileno := int(pcvalue(f, f.pcfile, targetpc, strict))
|
||||
line = pcvalue(f, f.pcln, targetpc, strict)
|
||||
fileno := int(pcvalue(f, f.pcfile, targetpc, nil, strict))
|
||||
line = pcvalue(f, f.pcln, targetpc, nil, strict)
|
||||
if fileno == -1 || line == -1 || fileno >= len(datap.filetab) {
|
||||
// print("looking for ", hex(targetpc), " in ", funcname(f), " got file=", fileno, " line=", lineno, "\n")
|
||||
return "?", 0
|
||||
@ -310,20 +356,20 @@ func funcline(f *_func, targetpc uintptr) (file string, line int32) {
|
||||
return funcline1(f, targetpc, true)
|
||||
}
|
||||
|
||||
func funcspdelta(f *_func, targetpc uintptr) int32 {
|
||||
x := pcvalue(f, f.pcsp, targetpc, true)
|
||||
func funcspdelta(f *_func, targetpc uintptr, cache *pcvalueCache) int32 {
|
||||
x := pcvalue(f, f.pcsp, targetpc, cache, true)
|
||||
if x&(ptrSize-1) != 0 {
|
||||
print("invalid spdelta ", funcname(f), " ", hex(f.entry), " ", hex(targetpc), " ", hex(f.pcsp), " ", x, "\n")
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
func pcdatavalue(f *_func, table int32, targetpc uintptr) int32 {
|
||||
func pcdatavalue(f *_func, table int32, targetpc uintptr, cache *pcvalueCache) int32 {
|
||||
if table < 0 || table >= f.npcdata {
|
||||
return -1
|
||||
}
|
||||
off := *(*int32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4))
|
||||
return pcvalue(f, off, targetpc, true)
|
||||
return pcvalue(f, off, targetpc, cache, true)
|
||||
}
|
||||
|
||||
func funcdata(f *_func, i int32) unsafe.Pointer {
|
||||
|
@ -198,6 +198,8 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
|
||||
}
|
||||
frame.fn = f
|
||||
|
||||
var cache pcvalueCache
|
||||
|
||||
n := 0
|
||||
for n < max {
|
||||
// Typically:
|
||||
@ -219,7 +221,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
|
||||
sp = gp.m.curg.sched.sp
|
||||
stkbar = gp.m.curg.stkbar[gp.m.curg.stkbarPos:]
|
||||
}
|
||||
frame.fp = sp + uintptr(funcspdelta(f, frame.pc))
|
||||
frame.fp = sp + uintptr(funcspdelta(f, frame.pc, &cache))
|
||||
if !usesLR {
|
||||
// On x86, call instruction pushes return PC before entering new function.
|
||||
frame.fp += regSize
|
||||
@ -403,7 +405,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
|
||||
frame.fn = f
|
||||
if f == nil {
|
||||
frame.pc = x
|
||||
} else if funcspdelta(f, frame.pc) == 0 {
|
||||
} else if funcspdelta(f, frame.pc, &cache) == 0 {
|
||||
frame.lr = x
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user