From beedb1ec334a785a565684cf563f8354998212b4 Mon Sep 17 00:00:00 2001
From: Austin Clements
Date: Wed, 12 Aug 2015 23:43:43 -0400
Subject: [PATCH] runtime: add pcvalue cache to improve stack scan speed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cost of scanning large stacks is currently dominated by the time
spent looking up and decoding the pcvalue table. However, large
stacks are usually large not because they contain calls to many
different functions, but because they contain many calls to the same,
small set of recursive functions. Hence, walking large stacks tends
to make the same pcvalue queries many times.

Based on this observation, this commit adds a small, very simple, and
fast cache in front of pcvalue lookup. We thread this cache down from
operations that make many pcvalue calls, such as gentraceback, stack
scanning, and stack adjusting.

This simple cache works well because it has minimal overhead when
it's not effective. I also tried a hashed direct-map cache,
CLOCK-based replacement, round-robin replacement, and round-robin
with lookups disabled until there had been at least 16 probes, but
none of these approaches had obvious wins over the random replacement
policy in this commit.

This nearly doubles the overall performance of the deep stack test
program from issue #10898:

name        old time/op  new time/op  delta
Issue10898   16.5s ±12%    9.2s ±12%  -44.37%  (p=0.008 n=5+5)

It's a very slight win on the garbage benchmark:

name              old time/op  new time/op  delta
XBenchGarbage-12  4.92ms ± 1%  4.89ms ± 1%  -0.75%  (p=0.000 n=18+19)

It's a wash (but doesn't harm performance) on the go1 benchmarks,
which don't have particularly deep stacks:

name                      old time/op    new time/op    delta
BinaryTree17-12              3.11s ± 2%     3.20s ± 3%  +2.83%  (p=0.000 n=17+20)
Fannkuch11-12                2.51s ± 1%     2.51s ± 1%  -0.22%  (p=0.034 n=19+18)
FmtFprintfEmpty-12          50.8ns ± 3%    50.6ns ± 2%    ~     (p=0.793 n=20+20)
FmtFprintfString-12          174ns ± 0%     174ns ± 1%  +0.17%  (p=0.048 n=15+20)
FmtFprintfInt-12             177ns ± 0%     165ns ± 1%  -6.99%  (p=0.000 n=17+19)
FmtFprintfIntInt-12          283ns ± 1%     284ns ± 0%  +0.22%  (p=0.000 n=18+15)
FmtFprintfPrefixedInt-12     243ns ± 1%     244ns ± 1%  +0.40%  (p=0.000 n=20+19)
FmtFprintfFloat-12           318ns ± 0%     319ns ± 0%  +0.27%  (p=0.001 n=19+20)
FmtManyArgs-12              1.12µs ± 0%    1.14µs ± 0%  +1.74%  (p=0.000 n=19+20)
GobDecode-12                8.69ms ± 0%    8.73ms ± 1%  +0.46%  (p=0.000 n=18+18)
GobEncode-12                6.64ms ± 1%    6.61ms ± 1%  -0.46%  (p=0.000 n=20+20)
Gzip-12                      323ms ± 2%     319ms ± 1%  -1.11%  (p=0.000 n=20+20)
Gunzip-12                   42.8ms ± 0%    42.9ms ± 0%    ~     (p=0.158 n=18+20)
HTTPClientServer-12         63.3µs ± 1%    63.1µs ± 1%  -0.35%  (p=0.011 n=20+20)
JSONEncode-12               16.9ms ± 1%    17.3ms ± 1%  +2.84%  (p=0.000 n=19+20)
JSONDecode-12               59.7ms ± 0%    58.5ms ± 0%  -2.05%  (p=0.000 n=19+17)
Mandelbrot200-12            3.92ms ± 0%    3.91ms ± 0%  -0.16%  (p=0.003 n=19+19)
GoParse-12                  3.79ms ± 2%    3.75ms ± 2%  -0.91%  (p=0.005 n=20+20)
RegexpMatchEasy0_32-12       102ns ± 1%     101ns ± 1%  -0.80%  (p=0.001 n=14+20)
RegexpMatchEasy0_1K-12       337ns ± 1%     346ns ± 1%  +2.90%  (p=0.000 n=20+19)
RegexpMatchEasy1_32-12      84.4ns ± 2%    84.3ns ± 2%    ~     (p=0.743 n=20+20)
RegexpMatchEasy1_1K-12       502ns ± 1%     505ns ± 0%  +0.64%  (p=0.000 n=20+20)
RegexpMatchMedium_32-12      133ns ± 1%     132ns ± 1%  -0.85%  (p=0.000 n=20+19)
RegexpMatchMedium_1K-12     40.1µs ± 1%    39.8µs ± 1%  -0.77%  (p=0.000 n=18+18)
RegexpMatchHard_32-12       2.08µs ± 1%    2.07µs ± 1%  -0.55%  (p=0.001 n=18+19)
RegexpMatchHard_1K-12       62.4µs ± 1%    62.0µs ± 1%  -0.74%  (p=0.000 n=19+19)
Revcomp-12                   545ms ± 2%     545ms ± 3%    ~     (p=0.771 n=19+20)
Template-12                 73.7ms ± 1%    72.0ms ± 0%  -2.33%  (p=0.000 n=20+18)
TimeParse-12                 358ns ± 1%     351ns ± 1%  -2.07%  (p=0.000 n=20+20)
TimeFormat-12                369ns ± 1%     356ns ± 0%  -3.53%  (p=0.000 n=20+18)
[Geo mean]                  63.5µs         63.2µs       -0.41%

name                      old speed      new speed      delta
GobDecode-12              88.3MB/s ± 0%  87.9MB/s ± 0%  -0.43%  (p=0.000 n=18+17)
GobEncode-12               116MB/s ± 1%   116MB/s ± 1%  +0.47%  (p=0.000 n=20+20)
Gzip-12                   60.2MB/s ± 2%  60.8MB/s ± 1%  +1.13%  (p=0.000 n=20+20)
Gunzip-12                  453MB/s ± 0%   453MB/s ± 0%    ~     (p=0.160 n=18+20)
JSONEncode-12              115MB/s ± 1%   112MB/s ± 1%  -2.76%  (p=0.000 n=19+20)
JSONDecode-12             32.5MB/s ± 0%  33.2MB/s ± 0%  +2.09%  (p=0.000 n=19+17)
GoParse-12                15.3MB/s ± 2%  15.4MB/s ± 2%  +0.92%  (p=0.004 n=20+20)
RegexpMatchEasy0_32-12     311MB/s ± 1%   314MB/s ± 1%  +0.78%  (p=0.000 n=15+19)
RegexpMatchEasy0_1K-12    3.04GB/s ± 1%  2.95GB/s ± 1%  -2.90%  (p=0.000 n=19+19)
RegexpMatchEasy1_32-12     379MB/s ± 2%   380MB/s ± 2%    ~     (p=0.779 n=20+20)
RegexpMatchEasy1_1K-12    2.04GB/s ± 1%  2.02GB/s ± 0%  -0.62%  (p=0.000 n=20+20)
RegexpMatchMedium_32-12   7.46MB/s ± 1%  7.53MB/s ± 1%  +0.86%  (p=0.000 n=20+19)
RegexpMatchMedium_1K-12   25.5MB/s ± 1%  25.7MB/s ± 1%  +0.78%  (p=0.000 n=18+18)
RegexpMatchHard_32-12     15.4MB/s ± 1%  15.5MB/s ± 1%  +0.62%  (p=0.000 n=19+19)
RegexpMatchHard_1K-12     16.4MB/s ± 1%  16.5MB/s ± 1%  +0.82%  (p=0.000 n=20+19)
Revcomp-12                 466MB/s ± 2%   466MB/s ± 3%    ~     (p=0.765 n=19+20)
Template-12               26.3MB/s ± 1%  27.0MB/s ± 0%  +2.38%  (p=0.000 n=20+18)
[Geo mean]                97.8MB/s       98.0MB/s       +0.23%

Change-Id: I281044ae0b24990ba46487cacbc1069493274bc4
Reviewed-on: https://go-review.googlesource.com/13614
Reviewed-by: Keith Randall
---
 src/runtime/heapdump.go  |  2 +-
 src/runtime/mbitmap.go   |  2 +-
 src/runtime/mgcmark.go   |  7 +++--
 src/runtime/stack.go     |  3 +-
 src/runtime/symtab.go    | 66 ++++++++++++++++++++++++++++++++++------
 src/runtime/traceback.go |  6 ++--
 6 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 0344330e4d..f8f88c6515 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -247,7 +247,7 @@ func dumpframe(s *stkframe, arg unsafe.Pointer) bool {
 	if pc != f.entry {
 		pc--
 	}
-	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc)
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc, nil)
 	if pcdata == -1 {
 		// We do not have a valid pcdata value but there might be a
 		// stackmap for this function. It is likely that we are looking
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 33715f287b..42afdf4390 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -1696,7 +1696,7 @@ func getgcmask(ep interface{}) (mask []byte) {
 			if targetpc != f.entry {
 				targetpc--
 			}
-			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, nil)
 			if pcdata == -1 {
 				return
 			}
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 35bdda9789..93018207d6 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -438,10 +438,11 @@ func scanstack(gp *g) {
 		throw("scanstack in wrong phase")
 	}
 
+	var cache pcvalueCache
 	gcw := &getg().m.p.ptr().gcw
 	n := 0
 	scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
-		scanframeworker(frame, unused, gcw)
+		scanframeworker(frame, &cache, gcw)
 
 		if frame.fp > nextBarrier {
 			// We skip installing a barrier on bottom-most
@@ -474,7 +475,7 @@ func scanstack(gp *g) {
 
 // Scan a stack frame: local variables and function arguments/results.
 //go:nowritebarrier
-func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWork) {
+func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
 
 	f := frame.fn
 	targetpc := frame.continpc
@@ -488,7 +489,7 @@ func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWork) {
 	if targetpc != f.entry {
 		targetpc--
 	}
-	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, cache)
 	if pcdata == -1 {
 		// We do not have a valid pcdata value but there might be a
 		// stackmap for this function. It is likely that we are looking
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 56efc2eb4a..e3087af940 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -482,6 +482,7 @@ var ptrnames = []string{
 type adjustinfo struct {
 	old   stack
 	delta uintptr // ptr distance from old to new stack (newbase - oldbase)
+	cache pcvalueCache
 }
 
 // Adjustpointer checks whether *vpp is in the old stack described by adjinfo.
@@ -575,7 +576,7 @@ func adjustframe(frame *stkframe, arg unsafe.Pointer) bool {
 	if targetpc != f.entry {
 		targetpc--
 	}
-	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, &adjinfo.cache)
 	if pcdata == -1 {
 		pcdata = 0 // in prologue
 	}
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index 46686092f8..c3235fac03 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -112,6 +112,7 @@ func moduledataverify1(datap *moduledata) {
 
 	// ftab is lookup table for function by program counter.
 	nftab := len(datap.ftab) - 1
+	var pcCache pcvalueCache
 	for i := 0; i < nftab; i++ {
 		// NOTE: ftab[nftab].entry is legal; it is the address beyond the final function.
 		if datap.ftab[i].entry > datap.ftab[i+1].entry {
@@ -147,9 +148,9 @@
 					}
 				}
 			}
-			pcvalue(f, f.pcfile, end, true)
-			pcvalue(f, f.pcln, end, true)
-			pcvalue(f, f.pcsp, end, true)
+			pcvalue(f, f.pcfile, end, &pcCache, true)
+			pcvalue(f, f.pcln, end, &pcCache, true)
+			pcvalue(f, f.pcsp, end, &pcCache, true)
 		}
 	}
 
@@ -226,10 +227,42 @@ func findfunc(pc uintptr) *_func {
 	return (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[idx].funcoff]))
 }
 
-func pcvalue(f *_func, off int32, targetpc uintptr, strict bool) int32 {
+type pcvalueCache struct {
+	entries [16]pcvalueCacheEnt
+}
+
+type pcvalueCacheEnt struct {
+	// targetpc and off together are the key of this cache entry.
+	targetpc uintptr
+	off      int32
+	// val is the value of this cached pcvalue entry.
+	val int32
+}
+
+func pcvalue(f *_func, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
 	if off == 0 {
 		return -1
 	}
+
+	// Check the cache. This speeds up walks of deep stacks, which
+	// tend to have the same recursive functions over and over.
+	//
+	// This cache is small enough that full associativity is
+	// cheaper than doing the hashing for a less associative
+	// cache.
+	if cache != nil {
+		for _, ent := range cache.entries {
+			// We check off first because we're more
+			// likely to have multiple entries with
+			// different offsets for the same targetpc
+			// than the other way around, so we'll usually
+			// fail in the first clause.
+			if ent.off == off && ent.targetpc == targetpc {
+				return ent.val
+			}
+		}
+	}
+
 	datap := findmoduledatap(f.entry) // inefficient
 	if datap == nil {
 		if strict && panicking == 0 {
@@ -248,6 +281,19 @@
 			break
 		}
 		if targetpc < pc {
+			// Replace a random entry in the cache. Random
+			// replacement prevents a performance cliff if
+			// a recursive stack's cycle is slightly
+			// larger than the cache.
+			if cache != nil {
+				ci := fastrand1() % uint32(len(cache.entries))
+				cache.entries[ci] = pcvalueCacheEnt{
+					targetpc: targetpc,
+					off:      off,
+					val:      val,
+				}
+			}
+
 			return val
 		}
 	}
@@ -296,8 +342,8 @@ func funcline1(f *_func, targetpc uintptr, strict bool) (file string, line int32
 	if datap == nil {
 		return "?", 0
 	}
-	fileno := int(pcvalue(f, f.pcfile, targetpc, strict))
-	line = pcvalue(f, f.pcln, targetpc, strict)
+	fileno := int(pcvalue(f, f.pcfile, targetpc, nil, strict))
+	line = pcvalue(f, f.pcln, targetpc, nil, strict)
 	if fileno == -1 || line == -1 || fileno >= len(datap.filetab) {
 		// print("looking for ", hex(targetpc), " in ", funcname(f), " got file=", fileno, " line=", lineno, "\n")
 		return "?", 0
@@ -310,20 +356,20 @@ func funcline(f *_func, targetpc uintptr) (file string, line int32) {
 	return funcline1(f, targetpc, true)
 }
 
-func funcspdelta(f *_func, targetpc uintptr) int32 {
-	x := pcvalue(f, f.pcsp, targetpc, true)
+func funcspdelta(f *_func, targetpc uintptr, cache *pcvalueCache) int32 {
+	x := pcvalue(f, f.pcsp, targetpc, cache, true)
 	if x&(ptrSize-1) != 0 {
 		print("invalid spdelta ", funcname(f), " ", hex(f.entry), " ", hex(targetpc), " ", hex(f.pcsp), " ", x, "\n")
 	}
 	return x
 }
 
-func pcdatavalue(f *_func, table int32, targetpc uintptr) int32 {
+func pcdatavalue(f *_func, table int32, targetpc uintptr, cache *pcvalueCache) int32 {
 	if table < 0 || table >= f.npcdata {
 		return -1
 	}
 	off := *(*int32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4))
-	return pcvalue(f, off, targetpc, true)
+	return pcvalue(f, off, targetpc, cache, true)
 }
 
 func funcdata(f *_func, i int32) unsafe.Pointer {
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 2d223ced62..b99920ab4f 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -198,6 +198,8 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
 	}
 	frame.fn = f
 
+	var cache pcvalueCache
+
 	n := 0
 	for n < max {
 		// Typically:
@@ -219,7 +221,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
 				sp = gp.m.curg.sched.sp
 				stkbar = gp.m.curg.stkbar[gp.m.curg.stkbarPos:]
 			}
-			frame.fp = sp + uintptr(funcspdelta(f, frame.pc))
+			frame.fp = sp + uintptr(funcspdelta(f, frame.pc, &cache))
 			if !usesLR {
 				// On x86, call instruction pushes return PC before entering new function.
 				frame.fp += regSize
@@ -403,7 +405,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
 			frame.fn = f
 			if f == nil {
 				frame.pc = x
-			} else if funcspdelta(f, frame.pc) == 0 {
+			} else if funcspdelta(f, frame.pc, &cache) == 0 {
 				frame.lr = x
 			}
 		}
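
The caching pattern above is easiest to see in isolation. The standalone sketch
below mirrors the idea: a small fully associative cache probed linearly, with a
random slot overwritten on a miss so that a recursive cycle slightly larger than
the cache degrades gracefully instead of thrashing. The names (cacheEnt, lookup,
slowLookup) and the use of math/rand are assumptions made for the sketch only;
the runtime's real code is the pcvalueCache added to symtab.go above, which is
threaded explicitly through each stack walk and uses the runtime-internal
fastrand1.

package main

import (
	"fmt"
	"math/rand"
)

// cacheEnt mirrors pcvalueCacheEnt: (targetpc, off) is the key, val the result.
type cacheEnt struct {
	targetpc uintptr
	off      int32
	val      int32
}

// cache mirrors pcvalueCache: small enough that probing every entry
// (full associativity) is cheaper than hashing into fewer ways.
type cache struct {
	entries [16]cacheEnt
}

// slowLookup stands in for the real pcvalue table walk; it is just a
// deterministic placeholder computation so the example runs.
func slowLookup(targetpc uintptr, off int32) int32 {
	return int32(targetpc)&0xff + off
}

// lookup probes every cache entry; on a miss it computes the value and
// overwrites a randomly chosen entry.
func lookup(c *cache, targetpc uintptr, off int32) int32 {
	if c != nil {
		for _, ent := range c.entries {
			// Check off first: entries more often share a targetpc
			// with different offsets, so this clause usually fails fast.
			if ent.off == off && ent.targetpc == targetpc {
				return ent.val // hit
			}
		}
	}
	val := slowLookup(targetpc, off)
	if c != nil {
		// Random replacement, as in the patch, avoids a performance
		// cliff when the working set barely exceeds the cache size.
		ci := rand.Intn(len(c.entries))
		c.entries[ci] = cacheEnt{targetpc: targetpc, off: off, val: val}
	}
	return val
}

func main() {
	var c cache
	// Simulate walking a deep stack that repeats the same few frames:
	// after the first few misses, almost every lookup is a cache hit.
	for i := 0; i < 1000; i++ {
		pc := uintptr(0x1000 + 16*(i%4)) // only 4 distinct PCs
		_ = lookup(&c, pc, 1)
	}
	fmt.Println("done:", lookup(&c, 0x1000, 1))
}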