cmd/6g, cmd/8g: make 2/3 word sgen more efficient
When compiling the stdlib, most of the calls to sgen are for exactly
2 or 3 words: 85% for 6g and 70% for 8g. Special case them for
performance. This optimization is not relevant to 5g and 9g.

6g

benchmark           old ns/op  new ns/op  delta
BenchmarkCopyFat16  3.25       0.82       -74.77%
BenchmarkCopyFat24  5.47       0.95       -82.63%

8g

benchmark           old ns/op  new ns/op  delta
BenchmarkCopyFat8   3.84       2.42       -36.98%
BenchmarkCopyFat12  4.94       2.15       -56.48%

Change-Id: I8bc60b453f12597dfd916df2d072a7d5fc33ab85
Reviewed-on: https://go-review.googlesource.com/2607
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
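The CopyFat benchmarks cited above time plain by-value copies of small fixed-size values. As a rough sketch of the shape of such a benchmark (the package, type name, and layout here are illustrative assumptions, not the repository's actual test code):

package copyfat_test

import "testing"

// fat16 is assumed to occupy exactly 2 machine words (16 bytes on amd64);
// assigning it by value is the kind of small copy that sgen now special-cases.
type fat16 struct{ a, b uint64 }

var sink fat16

func BenchmarkCopyFat16(b *testing.B) {
	var src fat16
	for i := 0; i < b.N; i++ {
		sink = src // a 2-word fat copy, compiled by sgen
	}
}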
commit 747c849833
parent c1c3ce6b36
@@ -1457,6 +1457,18 @@ sgen(Node *n, Node *ns, int64 w)
 		p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
 		// 14 and 128 = magic constants: see ../../runtime/asm_amd64.s
 		p->to.offset = 14*(128-q);
+	} else if(!nacl && c == 0) {
+		// We don't need the MOVSQ side-effect of updating SI and DI,
+		// and issuing a sequence of MOVQs directly is faster.
+		nodsi.op = OINDREG;
+		noddi.op = OINDREG;
+		while(q > 0) {
+			gmove(&nodsi, &cx);	// MOVQ x+(SI),CX
+			gmove(&cx, &noddi);	// MOVQ CX,x+(DI)
+			nodsi.xoffset += 8;
+			noddi.xoffset += 8;
+			q--;
+		}
 	} else
 	while(q > 0) {
 		gins(AMOVSQ, N, N);	// MOVQ *(SI)+,*(DI)+
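Concretely, for a 2-word copy on amd64 the new !nacl && c == 0 branch makes 6g load and store each word through CX instead of issuing MOVSQ (which also advances SI and DI) or jumping into duffcopy. A minimal sketch of the source-level operation and the instruction sequence implied by the comments in the diff (the exact generated code is an assumption, not captured compiler output):

package copydemo

// pair occupies 2 machine words on amd64.
type pair struct{ x, y uint64 }

func copyPair(dst, src *pair) {
	// With this change, 6g lowers the assignment below to roughly:
	//	MOVQ 0(SI), CX
	//	MOVQ CX, 0(DI)
	//	MOVQ 8(SI), CX
	//	MOVQ CX, 8(DI)
	// rather than two MOVSQ instructions whose SI/DI updates are not needed.
	*dst = *src
}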
@@ -1213,7 +1213,7 @@ stkof(Node *n)
 void
 sgen(Node *n, Node *res, int64 w)
 {
-	Node dst, src, tdst, tsrc;
+	Node dst, src, tdst, tsrc, cx;
 	int32 c, q, odst, osrc;
 	NodeList *l;
 	Prog *p;
@@ -1329,6 +1329,19 @@ sgen(Node *n, Node *res, int64 w)
 		p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
 		// 10 and 128 = magic constants: see ../../runtime/asm_386.s
 		p->to.offset = 10*(128-q);
+	} else if(!nacl && c == 0) {
+		nodreg(&cx, types[TINT32], REG_CX);
+		// We don't need the MOVSL side-effect of updating SI and DI,
+		// and issuing a sequence of MOVLs directly is faster.
+		src.op = OINDREG;
+		dst.op = OINDREG;
+		while(q > 0) {
+			gmove(&src, &cx);	// MOVL x+(SI),CX
+			gmove(&cx, &dst);	// MOVL CX,x+(DI)
+			src.xoffset += 4;
+			dst.xoffset += 4;
+			q--;
+		}
 	} else
 	while(q > 0) {
 		gins(AMOVSL, N, N);	// MOVL *(SI)+,*(DI)+
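The 8g change is the same idea with 4-byte words: when there are no trailing bytes (c == 0), an 8- or 12-byte copy becomes a short run of MOVLs through CX. A corresponding sketch (again, the instruction sequence in the comment is inferred from the diff rather than taken from real 8g output):

package copydemo386

// fat8 occupies 2 machine words on 386.
type fat8 struct{ a, b uint32 }

func copyFat8(dst, src *fat8) {
	// On 386, 8g now lowers this assignment to roughly:
	//	MOVL 0(SI), CX
	//	MOVL CX, 0(DI)
	//	MOVL 4(SI), CX
	//	MOVL CX, 4(DI)
	*dst = *src
}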