From 747c8498339b799eb613db1701a927a3549d389a Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Fri, 9 Jan 2015 12:41:47 -0800 Subject: [PATCH] cmd/6g, cmd/8g: make 2/3 word sgen more efficient When compiling the stdlib most of the calls to sgen are for exactly 2 or 3 words: 85% for 6g and 70% for 8g. Special case them for performance. This optimization is not relevant to 5g and 9g. 6g benchmark old ns/op new ns/op delta BenchmarkCopyFat16 3.25 0.82 -74.77% BenchmarkCopyFat24 5.47 0.95 -82.63% 8g benchmark old ns/op new ns/op delta BenchmarkCopyFat8 3.84 2.42 -36.98% BenchmarkCopyFat12 4.94 2.15 -56.48% Change-Id: I8bc60b453f12597dfd916df2d072a7d5fc33ab85 Reviewed-on: https://go-review.googlesource.com/2607 Reviewed-by: Russ Cox Run-TryBot: Josh Bleecher Snyder --- src/cmd/6g/cgen.c | 12 ++++++++++++ src/cmd/8g/cgen.c | 15 ++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/cmd/6g/cgen.c b/src/cmd/6g/cgen.c index 627bffe27e..5e98934da6 100644 --- a/src/cmd/6g/cgen.c +++ b/src/cmd/6g/cgen.c @@ -1457,6 +1457,18 @@ sgen(Node *n, Node *ns, int64 w) p->to.sym = linksym(pkglookup("duffcopy", runtimepkg)); // 14 and 128 = magic constants: see ../../runtime/asm_amd64.s p->to.offset = 14*(128-q); + } else if(!nacl && c == 0) { + // We don't need the MOVSQ side-effect of updating SI and DI, + // and issuing a sequence of MOVQs directly is faster. + nodsi.op = OINDREG; + noddi.op = OINDREG; + while(q > 0) { + gmove(&nodsi, &cx); // MOVQ x+(SI),CX + gmove(&cx, &noddi); // MOVQ CX,x+(DI) + nodsi.xoffset += 8; + noddi.xoffset += 8; + q--; + } } else while(q > 0) { gins(AMOVSQ, N, N); // MOVQ *(SI)+,*(DI)+ diff --git a/src/cmd/8g/cgen.c b/src/cmd/8g/cgen.c index eabf52ae81..c65a437857 100644 --- a/src/cmd/8g/cgen.c +++ b/src/cmd/8g/cgen.c @@ -1213,7 +1213,7 @@ stkof(Node *n) void sgen(Node *n, Node *res, int64 w) { - Node dst, src, tdst, tsrc; + Node dst, src, tdst, tsrc, cx; int32 c, q, odst, osrc; NodeList *l; Prog *p; @@ -1329,6 +1329,19 @@ sgen(Node *n, Node *res, int64 w) p->to.sym = linksym(pkglookup("duffcopy", runtimepkg)); // 10 and 128 = magic constants: see ../../runtime/asm_386.s p->to.offset = 10*(128-q); + } else if(!nacl && c == 0) { + nodreg(&cx, types[TINT32], REG_CX); + // We don't need the MOVSL side-effect of updating SI and DI, + // and issuing a sequence of MOVLs directly is faster. + src.op = OINDREG; + dst.op = OINDREG; + while(q > 0) { + gmove(&src, &cx); // MOVL x+(SI),CX + gmove(&cx, &dst); // MOVL CX,x+(DI) + src.xoffset += 4; + dst.xoffset += 4; + q--; + } } else while(q > 0) { gins(AMOVSL, N, N); // MOVL *(SI)+,*(DI)+