mirror of
https://github.com/golang/go
synced 2024-11-11 22:20:22 -07:00
cmd/6g: treat vardef-initialized fat variables as live at calls
This CL forces the optimizer to preserve some memory stores
that would be redundant except that a stack scan due to garbage
collection or stack copying might look at them during a function call.
As such, it forces additional memory writes and therefore slows
down the execution of some programs, especially garbage-heavy
programs that are already limited by memory bandwidth.
The slowdown can be as much as 7% for end-to-end benchmarks.
These numbers are from running go1.test -test.benchtime=5s three times,
taking the best (lowest) ns/op for each benchmark. I am excluding
benchmarks with time/op < 10us to focus on macro effects.
All benchmarks are on amd64.
Comparing tip (a27f34c771cb) against this CL on an Intel Core i5 MacBook Pro:
benchmark old ns/op new ns/op delta
BenchmarkBinaryTree17 3876500413 3856337341 -0.52%
BenchmarkFannkuch11 2965104777 2991182127 +0.88%
BenchmarkGobDecode 8563026 8788340 +2.63%
BenchmarkGobEncode 5050608 5267394 +4.29%
BenchmarkGzip 431191816 434168065 +0.69%
BenchmarkGunzip 107873523 110563792 +2.49%
BenchmarkHTTPClientServer 85036 86131 +1.29%
BenchmarkJSONEncode 22143764 22501647 +1.62%
BenchmarkJSONDecode 79646916 85658808 +7.55%
BenchmarkMandelbrot200 4720421 4700108 -0.43%
BenchmarkGoParse 4651575 4712247 +1.30%
BenchmarkRegexpMatchMedium_1K 71986 73490 +2.09%
BenchmarkRegexpMatchHard_1K 111018 117495 +5.83%
BenchmarkRevcomp 648798723 659352759 +1.63%
BenchmarkTemplate 112673009 112819078 +0.13%
Comparing tip (a27f34c771cb) against this CL on an Intel Xeon E5520:
BenchmarkBinaryTree17 5461110720 5393104469 -1.25%
BenchmarkFannkuch11 4314677151 4327177615 +0.29%
BenchmarkGobDecode 11065853 11235272 +1.53%
BenchmarkGobEncode 6500065
6959837 +7.07%
BenchmarkGzip 647478596 671769097 +3.75%
BenchmarkGunzip 139348579 141096376 +1.25%
BenchmarkHTTPClientServer 69376 73610 +6.10%
BenchmarkJSONEncode 30172320 31796106 +5.38%
BenchmarkJSONDecode 113704905 114239137 +0.47%
BenchmarkMandelbrot200 6032730 6003077 -0.49%
BenchmarkGoParse 6775251 6405995 -5.45%
BenchmarkRegexpMatchMedium_1K 111832 113895 +1.84%
BenchmarkRegexpMatchHard_1K 161112 168420 +4.54%
BenchmarkRevcomp 876363406 892319935 +1.82%
BenchmarkTemplate 146273096 148998339 +1.86%
Just to get a sense of where we are compared to the previous release,
here are the same benchmarks comparing Go 1.2 to this CL.
Comparing Go 1.2 against this CL on an Intel Core i5 MacBook Pro:
BenchmarkBinaryTree17 4370077662 3856337341 -11.76%
BenchmarkFannkuch11 3347052657 2991182127 -10.63%
BenchmarkGobDecode 8791384 8788340 -0.03%
BenchmarkGobEncode 4968759 5267394 +6.01%
BenchmarkGzip 437815669 434168065 -0.83%
BenchmarkGunzip 94604099 110563792 +16.87%
BenchmarkHTTPClientServer 87798 86131 -1.90%
BenchmarkJSONEncode 22818243 22501647 -1.39%
BenchmarkJSONDecode 97182444 85658808 -11.86%
BenchmarkMandelbrot200 4733516 4700108 -0.71%
BenchmarkGoParse 5054384 4712247 -6.77%
BenchmarkRegexpMatchMedium_1K 67612 73490 +8.69%
BenchmarkRegexpMatchHard_1K 107321 117495 +9.48%
BenchmarkRevcomp 733270055 659352759 -10.08%
BenchmarkTemplate 109304977 112819078 +3.21%
Comparing Go 1.2 against this CL on an Intel Xeon E5520:
BenchmarkBinaryTree17 5986953594 5393104469 -9.92%
BenchmarkFannkuch11 4861139174 4327177615 -10.98%
BenchmarkGobDecode 11830997 11235272 -5.04%
BenchmarkGobEncode 6608722 6959837 +5.31%
BenchmarkGzip 661875826 671769097 +1.49%
BenchmarkGunzip 138630019 141096376 +1.78%
BenchmarkHTTPClientServer 71534 73610 +2.90%
BenchmarkJSONEncode 30393609 31796106 +4.61%
BenchmarkJSONDecode 139645860 114239137 -18.19%
BenchmarkMandelbrot200 5988660 6003077 +0.24%
BenchmarkGoParse 6974092 6405995 -8.15%
BenchmarkRegexpMatchMedium_1K 111331 113895 +2.30%
BenchmarkRegexpMatchHard_1K 165961 168420 +1.48%
BenchmarkRevcomp 995049292 892319935 -10.32%
BenchmarkTemplate 145623363 148998339 +2.32%
Fixes #8036.
LGTM=khr
R=golang-codereviews, josharian, khr
CC=golang-codereviews, iant, r
https://golang.org/cl/99660044
This commit is contained in:
parent
aad4609c08
commit
1afbceb599
@ -129,13 +129,15 @@ static char* regname[] = {
|
||||
|
||||
static Node* regnodes[NREGVAR];
|
||||
|
||||
static void walkvardef(Node *n, Reg *r, int active);
|
||||
|
||||
void
|
||||
regopt(Prog *firstp)
|
||||
{
|
||||
Reg *r, *r1;
|
||||
Prog *p;
|
||||
Graph *g;
|
||||
int i, z;
|
||||
int i, z, active;
|
||||
uint32 vreg;
|
||||
Bits bit;
|
||||
ProgInfo info;
|
||||
@ -249,6 +251,26 @@ regopt(Prog *firstp)
|
||||
if(debug['R'] && debug['v'])
|
||||
dumpit("pass2", &firstr->f, 1);
|
||||
|
||||
/*
|
||||
* pass 2.5
|
||||
* iterate propagating fat vardef covering forward
|
||||
* r->act records vars with a VARDEF since the last CALL.
|
||||
* (r->act will be reused in pass 5 for something else,
|
||||
* but we'll be done with it by then.)
|
||||
*/
|
||||
active = 0;
|
||||
for(r = firstr; r != R; r = (Reg*)r->f.link) {
|
||||
r->f.active = 0;
|
||||
r->act = zbits;
|
||||
}
|
||||
for(r = firstr; r != R; r = (Reg*)r->f.link) {
|
||||
p = r->f.prog;
|
||||
if(p->as == AVARDEF && isfat(p->to.node->type) && p->to.node->opt != nil) {
|
||||
active++;
|
||||
walkvardef(p->to.node, r, active);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* pass 3
|
||||
* iterate propagating usage
|
||||
@ -524,6 +546,32 @@ brk:
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
walkvardef(Node *n, Reg *r, int active)
|
||||
{
|
||||
Reg *r1, *r2;
|
||||
int bn;
|
||||
Var *v;
|
||||
|
||||
for(r1=r; r1!=R; r1=(Reg*)r1->f.s1) {
|
||||
if(r1->f.active == active)
|
||||
break;
|
||||
r1->f.active = active;
|
||||
if(r1->f.prog->as == AVARKILL && r1->f.prog->to.node == n)
|
||||
break;
|
||||
for(v=n->opt; v!=nil; v=v->nextinnode) {
|
||||
bn = v - var;
|
||||
r1->act.b[bn/32] |= 1L << (bn%32);
|
||||
}
|
||||
if(r1->f.prog->as == ABL)
|
||||
break;
|
||||
}
|
||||
|
||||
for(r2=r; r2!=r1; r2=(Reg*)r2->f.s1)
|
||||
if(r2->f.s2 != nil)
|
||||
walkvardef(n, (Reg*)r2->f.s2, active);
|
||||
}
|
||||
|
||||
void
|
||||
addsplits(void)
|
||||
{
|
||||
@ -891,8 +939,13 @@ prop(Reg *r, Bits ref, Bits cal)
|
||||
// Mark all input variables (ivar) as used, because that's what the
|
||||
// liveness bitmaps say. The liveness bitmaps say that so that a
|
||||
// panic will not show stale values in the parameter dump.
|
||||
// Mark variables with a recent VARDEF (r1->act) as used,
|
||||
// so that the optimizer flushes initializations to memory,
|
||||
// so that if a garbage collection happens during this CALL,
|
||||
// the collector will see initialized memory. Again this is to
|
||||
// match what the liveness bitmaps say.
|
||||
for(z=0; z<BITS; z++) {
|
||||
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z];
|
||||
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z] | r1->act.b[z];
|
||||
ref.b[z] = 0;
|
||||
}
|
||||
|
||||
|
@ -114,6 +114,8 @@ static char* regname[] = {
|
||||
|
||||
static Node* regnodes[NREGVAR];
|
||||
|
||||
static void walkvardef(Node *n, Reg *r, int active);
|
||||
|
||||
void
|
||||
regopt(Prog *firstp)
|
||||
{
|
||||
@ -121,7 +123,7 @@ regopt(Prog *firstp)
|
||||
Prog *p;
|
||||
Graph *g;
|
||||
ProgInfo info;
|
||||
int i, z;
|
||||
int i, z, active;
|
||||
uint32 vreg;
|
||||
Bits bit;
|
||||
|
||||
@ -234,6 +236,26 @@ regopt(Prog *firstp)
|
||||
if(debug['R'] && debug['v'])
|
||||
dumpit("pass2", &firstr->f, 1);
|
||||
|
||||
/*
|
||||
* pass 2.5
|
||||
* iterate propagating fat vardef covering forward
|
||||
* r->act records vars with a VARDEF since the last CALL.
|
||||
* (r->act will be reused in pass 5 for something else,
|
||||
* but we'll be done with it by then.)
|
||||
*/
|
||||
active = 0;
|
||||
for(r = firstr; r != R; r = (Reg*)r->f.link) {
|
||||
r->f.active = 0;
|
||||
r->act = zbits;
|
||||
}
|
||||
for(r = firstr; r != R; r = (Reg*)r->f.link) {
|
||||
p = r->f.prog;
|
||||
if(p->as == AVARDEF && isfat(p->to.node->type) && p->to.node->opt != nil) {
|
||||
active++;
|
||||
walkvardef(p->to.node, r, active);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* pass 3
|
||||
* iterate propagating usage
|
||||
@ -435,6 +457,32 @@ brk:
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
walkvardef(Node *n, Reg *r, int active)
|
||||
{
|
||||
Reg *r1, *r2;
|
||||
int bn;
|
||||
Var *v;
|
||||
|
||||
for(r1=r; r1!=R; r1=(Reg*)r1->f.s1) {
|
||||
if(r1->f.active == active)
|
||||
break;
|
||||
r1->f.active = active;
|
||||
if(r1->f.prog->as == AVARKILL && r1->f.prog->to.node == n)
|
||||
break;
|
||||
for(v=n->opt; v!=nil; v=v->nextinnode) {
|
||||
bn = v - var;
|
||||
r1->act.b[bn/32] |= 1L << (bn%32);
|
||||
}
|
||||
if(r1->f.prog->as == ACALL)
|
||||
break;
|
||||
}
|
||||
|
||||
for(r2=r; r2!=r1; r2=(Reg*)r2->f.s1)
|
||||
if(r2->f.s2 != nil)
|
||||
walkvardef(n, (Reg*)r2->f.s2, active);
|
||||
}
|
||||
|
||||
/*
|
||||
* add mov b,rn
|
||||
* just after r
|
||||
@ -745,8 +793,13 @@ prop(Reg *r, Bits ref, Bits cal)
|
||||
// Mark all input variables (ivar) as used, because that's what the
|
||||
// liveness bitmaps say. The liveness bitmaps say that so that a
|
||||
// panic will not show stale values in the parameter dump.
|
||||
// Mark variables with a recent VARDEF (r1->act) as used,
|
||||
// so that the optimizer flushes initializations to memory,
|
||||
// so that if a garbage collection happens during this CALL,
|
||||
// the collector will see initialized memory. Again this is to
|
||||
// match what the liveness bitmaps say.
|
||||
for(z=0; z<BITS; z++) {
|
||||
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z];
|
||||
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z] | r1->act.b[z];
|
||||
ref.b[z] = 0;
|
||||
}
|
||||
|
||||
|
@ -84,6 +84,8 @@ static char* regname[] = {
|
||||
|
||||
static Node* regnodes[NREGVAR];
|
||||
|
||||
static void walkvardef(Node *n, Reg *r, int active);
|
||||
|
||||
void
|
||||
regopt(Prog *firstp)
|
||||
{
|
||||
@ -91,7 +93,7 @@ regopt(Prog *firstp)
|
||||
Prog *p;
|
||||
Graph *g;
|
||||
ProgInfo info;
|
||||
int i, z;
|
||||
int i, z, active;
|
||||
uint32 vreg;
|
||||
Bits bit;
|
||||
|
||||
@ -206,6 +208,26 @@ regopt(Prog *firstp)
|
||||
if(debug['R'] && debug['v'])
|
||||
dumpit("pass2", &firstr->f, 1);
|
||||
|
||||
/*
|
||||
* pass 2.5
|
||||
* iterate propagating fat vardef covering forward
|
||||
* r->act records vars with a VARDEF since the last CALL.
|
||||
* (r->act will be reused in pass 5 for something else,
|
||||
* but we'll be done with it by then.)
|
||||
*/
|
||||
active = 0;
|
||||
for(r = firstr; r != R; r = (Reg*)r->f.link) {
|
||||
r->f.active = 0;
|
||||
r->act = zbits;
|
||||
}
|
||||
for(r = firstr; r != R; r = (Reg*)r->f.link) {
|
||||
p = r->f.prog;
|
||||
if(p->as == AVARDEF && isfat(p->to.node->type) && p->to.node->opt != nil) {
|
||||
active++;
|
||||
walkvardef(p->to.node, r, active);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* pass 3
|
||||
* iterate propagating usage
|
||||
@ -404,6 +426,32 @@ brk:
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
walkvardef(Node *n, Reg *r, int active)
|
||||
{
|
||||
Reg *r1, *r2;
|
||||
int bn;
|
||||
Var *v;
|
||||
|
||||
for(r1=r; r1!=R; r1=(Reg*)r1->f.s1) {
|
||||
if(r1->f.active == active)
|
||||
break;
|
||||
r1->f.active = active;
|
||||
if(r1->f.prog->as == AVARKILL && r1->f.prog->to.node == n)
|
||||
break;
|
||||
for(v=n->opt; v!=nil; v=v->nextinnode) {
|
||||
bn = v - var;
|
||||
r1->act.b[bn/32] |= 1L << (bn%32);
|
||||
}
|
||||
if(r1->f.prog->as == ACALL)
|
||||
break;
|
||||
}
|
||||
|
||||
for(r2=r; r2!=r1; r2=(Reg*)r2->f.s1)
|
||||
if(r2->f.s2 != nil)
|
||||
walkvardef(n, (Reg*)r2->f.s2, active);
|
||||
}
|
||||
|
||||
/*
|
||||
* add mov b,rn
|
||||
* just after r
|
||||
@ -711,8 +759,13 @@ prop(Reg *r, Bits ref, Bits cal)
|
||||
// Mark all input variables (ivar) as used, because that's what the
|
||||
// liveness bitmaps say. The liveness bitmaps say that so that a
|
||||
// panic will not show stale values in the parameter dump.
|
||||
// Mark variables with a recent VARDEF (r1->act) as used,
|
||||
// so that the optimizer flushes initializations to memory,
|
||||
// so that if a garbage collection happens during this CALL,
|
||||
// the collector will see initialized memory. Again this is to
|
||||
// match what the liveness bitmaps say.
|
||||
for(z=0; z<BITS; z++) {
|
||||
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z];
|
||||
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z] | r1->act.b[z];
|
||||
ref.b[z] = 0;
|
||||
}
|
||||
|
||||
|
45
test/fixedbugs/issue8036.go
Normal file
45
test/fixedbugs/issue8036.go
Normal file
@ -0,0 +1,45 @@
|
||||
// run
|
||||
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Issue 8036. Stores necessary for stack scan being eliminated as redundant by optimizer.
|
||||
|
||||
package main
|
||||
|
||||
import "runtime"
|
||||
|
||||
type T struct {
|
||||
X *int
|
||||
Y *int
|
||||
Z *int
|
||||
}
|
||||
|
||||
type TI [3]uintptr
|
||||
|
||||
func G() (t TI) {
|
||||
t[0] = 1
|
||||
t[1] = 2
|
||||
t[2] = 3
|
||||
runtime.GC() // prevent inlining
|
||||
return
|
||||
}
|
||||
|
||||
func F() (t T) {
|
||||
t.X = newint()
|
||||
t.Y = t.X
|
||||
t.Z = t.Y
|
||||
runtime.GC() // prevent inlining
|
||||
return
|
||||
}
|
||||
|
||||
func newint() *int {
|
||||
runtime.GC()
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
G() // leave non-pointers where F's return values go
|
||||
F()
|
||||
}
|
Loading…
Reference in New Issue
Block a user