1
0
mirror of https://github.com/golang/go synced 2024-10-04 10:11:21 -06:00
go/src/cmd/8g/ggen.c

789 lines
14 KiB
C
Raw Normal View History

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#undef EXTERN
#define EXTERN
#include <u.h>
#include <libc.h>
#include "gg.h"
#include "opt.h"
void
defframe(Prog *ptxt)
{
// fill in argument size
ptxt->to.offset2 = rnd(curfn->type->argwid, widthptr);
// fill in final stack size
if(stksize > maxstksize)
maxstksize = stksize;
ptxt->to.offset = rnd(maxstksize+maxarg, widthptr);
maxstksize = 0;
}
// Sweep the prog list to mark any used nodes.
void
markautoused(Prog* p)
{
for (; p; p = p->link) {
if (p->from.type == D_AUTO && p->from.node)
p->from.node->used = 1;
if (p->to.type == D_AUTO && p->to.node)
p->to.node->used = 1;
}
}
// Fixup instructions after compactframe has moved all autos around.
void
fixautoused(Prog* p)
{
for (; p; p = p->link) {
if (p->from.type == D_AUTO && p->from.node)
p->from.offset += p->from.node->stkdelta;
if (p->to.type == D_AUTO && p->to.node)
p->to.offset += p->to.node->stkdelta;
}
}
void
clearfat(Node *nl)
{
uint32 w, c, q;
Node n1;
/* clear a fat object */
if(debug['g'])
dump("\nclearfat", nl);
w = nl->type->width;
if(w == 8 || w == 12)
if(componentgen(N, nl))
return;
c = w % 4; // bytes
q = w / 4; // quads
gconreg(AMOVL, 0, D_AX);
nodreg(&n1, types[tptr], D_DI);
agen(nl, &n1);
if(q >= 4) {
gconreg(AMOVL, q, D_CX);
gins(AREP, N, N); // repeat
gins(ASTOSL, N, N); // STOL AL,*(DI)+
} else
while(q > 0) {
gins(ASTOSL, N, N); // STOL AL,*(DI)+
q--;
}
if(c >= 4) {
gconreg(AMOVL, c, D_CX);
gins(AREP, N, N); // repeat
gins(ASTOSB, N, N); // STOB AL,*(DI)+
} else
while(c > 0) {
gins(ASTOSB, N, N); // STOB AL,*(DI)+
c--;
}
}
/*
* generate:
* call f
* proc=0 normal call
* proc=1 goroutine run in new proc
* proc=2 defer call save away stack
*/
void
ginscall(Node *f, int proc)
{
Prog *p;
Node reg, con;
switch(proc) {
default:
fatal("ginscall: bad proc %d", proc);
break;
case 0: // normal call
cmd/gc: contiguous loop layout Drop expecttaken function in favor of extra argument to gbranch and bgen. Mark loop condition as likely to be true, so that loops are generated inline. The main benefit here is contiguous code when trying to read the generated assembly. It has only minor effects on the timing, and they mostly cancel the minor effects that aligning function entry points had. One exception: both changes made Fannkuch faster. Compared to before CL 6244066 (before aligned functions) benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4222117400 4201958800 -0.48% BenchmarkFannkuch11 3462631800 3215908600 -7.13% BenchmarkGobDecode 20887622 20899164 +0.06% BenchmarkGobEncode 9548772 9439083 -1.15% BenchmarkGzip 151687 152060 +0.25% BenchmarkGunzip 8742 8711 -0.35% BenchmarkJSONEncode 62730560 62686700 -0.07% BenchmarkJSONDecode 252569180 252368960 -0.08% BenchmarkMandelbrot200 5267599 5252531 -0.29% BenchmarkRevcomp25M 980813500 985248400 +0.45% BenchmarkTemplate 361259100 357414680 -1.06% Compared to tip (aligned functions): benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4140739800 4201958800 +1.48% BenchmarkFannkuch11 3259914400 3215908600 -1.35% BenchmarkGobDecode 20620222 20899164 +1.35% BenchmarkGobEncode 9384886 9439083 +0.58% BenchmarkGzip 150333 152060 +1.15% BenchmarkGunzip 8741 8711 -0.34% BenchmarkJSONEncode 65210990 62686700 -3.87% BenchmarkJSONDecode 249394860 252368960 +1.19% BenchmarkMandelbrot200 5273394 5252531 -0.40% BenchmarkRevcomp25M 996013800 985248400 -1.08% BenchmarkTemplate 360620840 357414680 -0.89% R=ken2 CC=golang-dev https://golang.org/cl/6245069
2012-05-30 16:07:39 -06:00
case -1: // normal call but no return
p = gins(ACALL, N, f);
afunclit(&p->to);
if(proc == -1 || noreturn(p))
cmd/gc: contiguous loop layout Drop expecttaken function in favor of extra argument to gbranch and bgen. Mark loop condition as likely to be true, so that loops are generated inline. The main benefit here is contiguous code when trying to read the generated assembly. It has only minor effects on the timing, and they mostly cancel the minor effects that aligning function entry points had. One exception: both changes made Fannkuch faster. Compared to before CL 6244066 (before aligned functions) benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4222117400 4201958800 -0.48% BenchmarkFannkuch11 3462631800 3215908600 -7.13% BenchmarkGobDecode 20887622 20899164 +0.06% BenchmarkGobEncode 9548772 9439083 -1.15% BenchmarkGzip 151687 152060 +0.25% BenchmarkGunzip 8742 8711 -0.35% BenchmarkJSONEncode 62730560 62686700 -0.07% BenchmarkJSONDecode 252569180 252368960 -0.08% BenchmarkMandelbrot200 5267599 5252531 -0.29% BenchmarkRevcomp25M 980813500 985248400 +0.45% BenchmarkTemplate 361259100 357414680 -1.06% Compared to tip (aligned functions): benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4140739800 4201958800 +1.48% BenchmarkFannkuch11 3259914400 3215908600 -1.35% BenchmarkGobDecode 20620222 20899164 +1.35% BenchmarkGobEncode 9384886 9439083 +0.58% BenchmarkGzip 150333 152060 +1.15% BenchmarkGunzip 8741 8711 -0.34% BenchmarkJSONEncode 65210990 62686700 -3.87% BenchmarkJSONDecode 249394860 252368960 +1.19% BenchmarkMandelbrot200 5273394 5252531 -0.40% BenchmarkRevcomp25M 996013800 985248400 -1.08% BenchmarkTemplate 360620840 357414680 -0.89% R=ken2 CC=golang-dev https://golang.org/cl/6245069
2012-05-30 16:07:39 -06:00
gins(AUNDEF, N, N);
break;
case 1: // call in new proc (go)
case 2: // deferred call (defer)
nodreg(&reg, types[TINT32], D_CX);
gins(APUSHL, f, N);
nodconst(&con, types[TINT32], argsize(f->type));
gins(APUSHL, &con, N);
if(proc == 1)
ginscall(newproc, 0);
else
ginscall(deferproc, 0);
gins(APOPL, N, &reg);
gins(APOPL, N, &reg);
if(proc == 2) {
nodreg(&reg, types[TINT64], D_AX);
gins(ATESTL, &reg, &reg);
cmd/gc: contiguous loop layout Drop expecttaken function in favor of extra argument to gbranch and bgen. Mark loop condition as likely to be true, so that loops are generated inline. The main benefit here is contiguous code when trying to read the generated assembly. It has only minor effects on the timing, and they mostly cancel the minor effects that aligning function entry points had. One exception: both changes made Fannkuch faster. Compared to before CL 6244066 (before aligned functions) benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4222117400 4201958800 -0.48% BenchmarkFannkuch11 3462631800 3215908600 -7.13% BenchmarkGobDecode 20887622 20899164 +0.06% BenchmarkGobEncode 9548772 9439083 -1.15% BenchmarkGzip 151687 152060 +0.25% BenchmarkGunzip 8742 8711 -0.35% BenchmarkJSONEncode 62730560 62686700 -0.07% BenchmarkJSONDecode 252569180 252368960 -0.08% BenchmarkMandelbrot200 5267599 5252531 -0.29% BenchmarkRevcomp25M 980813500 985248400 +0.45% BenchmarkTemplate 361259100 357414680 -1.06% Compared to tip (aligned functions): benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4140739800 4201958800 +1.48% BenchmarkFannkuch11 3259914400 3215908600 -1.35% BenchmarkGobDecode 20620222 20899164 +1.35% BenchmarkGobEncode 9384886 9439083 +0.58% BenchmarkGzip 150333 152060 +1.15% BenchmarkGunzip 8741 8711 -0.34% BenchmarkJSONEncode 65210990 62686700 -3.87% BenchmarkJSONDecode 249394860 252368960 +1.19% BenchmarkMandelbrot200 5273394 5252531 -0.40% BenchmarkRevcomp25M 996013800 985248400 -1.08% BenchmarkTemplate 360620840 357414680 -0.89% R=ken2 CC=golang-dev https://golang.org/cl/6245069
2012-05-30 16:07:39 -06:00
patch(gbranch(AJNE, T, -1), retpc);
}
break;
}
}
/*
* n is call to interface method.
* generate res = n.
*/
void
cgen_callinter(Node *n, Node *res, int proc)
{
Node *i, *f;
Node tmpi, nodi, nodo, nodr, nodsp;
i = n->left;
if(i->op != ODOTINTER)
fatal("cgen_callinter: not ODOTINTER %O", i->op);
f = i->right; // field
if(f->op != ONAME)
fatal("cgen_callinter: not ONAME %O", f->op);
i = i->left; // interface
if(!i->addable) {
tempname(&tmpi, i->type);
cgen(i, &tmpi);
i = &tmpi;
}
genlist(n->list); // assign the args
// i is now addable, prepare an indirected
// register to hold its address.
igen(i, &nodi, res); // REG = &inter
nodindreg(&nodsp, types[tptr], D_SP);
nodi.type = types[tptr];
nodi.xoffset += widthptr;
cgen(&nodi, &nodsp); // 0(SP) = 4(REG) -- i.data
regalloc(&nodo, types[tptr], res);
nodi.type = types[tptr];
nodi.xoffset -= widthptr;
cgen(&nodi, &nodo); // REG = 0(REG) -- i.tab
regfree(&nodi);
regalloc(&nodr, types[tptr], &nodo);
if(n->left->xoffset == BADWIDTH)
fatal("cgen_callinter: badwidth");
nodo.op = OINDREG;
nodo.xoffset = n->left->xoffset + 3*widthptr + 8;
cgen(&nodo, &nodr); // REG = 20+offset(REG) -- i.tab->fun[f]
// BOTCH nodr.type = fntype;
nodr.type = n->left->type;
ginscall(&nodr, proc);
regfree(&nodr);
regfree(&nodo);
setmaxarg(n->left->type);
}
/*
* generate function call;
* proc=0 normal call
* proc=1 goroutine run in new proc
* proc=2 defer call save away stack
*/
void
cgen_call(Node *n, int proc)
{
Type *t;
Node nod, afun;
if(n == N)
return;
if(n->left->ullman >= UINF) {
// if name involves a fn call
// precompute the address of the fn
tempname(&afun, types[tptr]);
cgen(n->left, &afun);
}
genlist(n->list); // assign the args
t = n->left->type;
setmaxarg(t);
// call tempname pointer
if(n->left->ullman >= UINF) {
regalloc(&nod, types[tptr], N);
cgen_as(&nod, &afun);
nod.type = t;
ginscall(&nod, proc);
regfree(&nod);
return;
}
// call pointer
if(n->left->op != ONAME || n->left->class != PFUNC) {
regalloc(&nod, types[tptr], N);
cgen_as(&nod, n->left);
nod.type = t;
ginscall(&nod, proc);
regfree(&nod);
return;
}
// call direct
n->left->method = 1;
ginscall(n->left, proc);
}
/*
* call to n has already been generated.
* generate:
* res = return value from call.
*/
void
cgen_callret(Node *n, Node *res)
{
Node nod;
Type *fp, *t;
Iter flist;
t = n->left->type;
if(t->etype == TPTR32 || t->etype == TPTR64)
t = t->type;
fp = structfirst(&flist, getoutarg(t));
if(fp == T)
fatal("cgen_callret: nil");
memset(&nod, 0, sizeof(nod));
nod.op = OINDREG;
nod.val.u.reg = D_SP;
nod.addable = 1;
nod.xoffset = fp->width;
nod.type = fp->type;
cgen_as(res, &nod);
}
/*
* call to n has already been generated.
* generate:
* res = &return value from call.
*/
void
cgen_aret(Node *n, Node *res)
{
Node nod1, nod2;
Type *fp, *t;
Iter flist;
t = n->left->type;
if(isptr[t->etype])
t = t->type;
fp = structfirst(&flist, getoutarg(t));
if(fp == T)
fatal("cgen_aret: nil");
memset(&nod1, 0, sizeof(nod1));
nod1.op = OINDREG;
nod1.val.u.reg = D_SP;
nod1.addable = 1;
nod1.xoffset = fp->width;
nod1.type = fp->type;
if(res->op != OREGISTER) {
regalloc(&nod2, types[tptr], res);
gins(ALEAL, &nod1, &nod2);
gins(AMOVL, &nod2, res);
regfree(&nod2);
} else
gins(ALEAL, &nod1, res);
}
/*
* generate return.
* n->left is assignments to return values.
*/
void
cgen_ret(Node *n)
{
genlist(n->list); // copy out args
if(retpc)
gjmp(retpc);
else
gins(ARET, N, N);
}
/*
* generate += *= etc.
*/
void
cgen_asop(Node *n)
{
Node n1, n2, n3, n4;
Node *nl, *nr;
Prog *p1;
Addr addr;
int a;
nl = n->left;
nr = n->right;
if(nr->ullman >= UINF && nl->ullman >= UINF) {
tempname(&n1, nr->type);
cgen(nr, &n1);
n2 = *n;
n2.right = &n1;
cgen_asop(&n2);
goto ret;
}
if(!isint[nl->type->etype])
goto hard;
if(!isint[nr->type->etype])
goto hard;
if(is64(nl->type) || is64(nr->type))
goto hard;
switch(n->etype) {
case OADD:
if(smallintconst(nr))
if(mpgetfix(nr->val.u.xval) == 1) {
a = optoas(OINC, nl->type);
if(nl->addable) {
gins(a, N, nl);
goto ret;
}
if(sudoaddable(a, nl, &addr)) {
p1 = gins(a, N, N);
p1->to = addr;
sudoclean();
goto ret;
}
}
break;
case OSUB:
if(smallintconst(nr))
if(mpgetfix(nr->val.u.xval) == 1) {
a = optoas(ODEC, nl->type);
if(nl->addable) {
gins(a, N, nl);
goto ret;
}
if(sudoaddable(a, nl, &addr)) {
p1 = gins(a, N, N);
p1->to = addr;
sudoclean();
goto ret;
}
}
break;
}
switch(n->etype) {
case OADD:
case OSUB:
case OXOR:
case OAND:
case OOR:
a = optoas(n->etype, nl->type);
if(nl->addable) {
if(smallintconst(nr)) {
gins(a, nr, nl);
goto ret;
}
regalloc(&n2, nr->type, N);
cgen(nr, &n2);
gins(a, &n2, nl);
regfree(&n2);
goto ret;
}
if(nr->ullman < UINF)
if(sudoaddable(a, nl, &addr)) {
if(smallintconst(nr)) {
p1 = gins(a, nr, N);
p1->to = addr;
sudoclean();
goto ret;
}
regalloc(&n2, nr->type, N);
cgen(nr, &n2);
p1 = gins(a, &n2, N);
p1->to = addr;
regfree(&n2);
sudoclean();
goto ret;
}
}
hard:
n2.op = 0;
n1.op = 0;
if(nr->ullman >= nl->ullman || nl->addable) {
mgen(nr, &n2, N);
nr = &n2;
} else {
tempname(&n2, nr->type);
cgen(nr, &n2);
nr = &n2;
}
if(!nl->addable) {
igen(nl, &n1, N);
nl = &n1;
}
n3 = *n;
n3.left = nl;
n3.right = nr;
n3.op = n->etype;
mgen(&n3, &n4, N);
gmove(&n4, nl);
if(n1.op)
regfree(&n1);
mfree(&n2);
mfree(&n4);
ret:
;
}
2009-08-07 13:57:44 -06:00
int
samereg(Node *a, Node *b)
{
if(a->op != OREGISTER)
return 0;
if(b->op != OREGISTER)
return 0;
if(a->val.u.reg != b->val.u.reg)
return 0;
return 1;
}
/*
* generate division.
* caller must set:
* ax = allocated AX register
* dx = allocated DX register
* generates one of:
* res = nl / nr
* res = nl % nr
* according to op.
*/
void
dodiv(int op, Node *nl, Node *nr, Node *res, Node *ax, Node *dx)
{
int check;
Node n1, t1, t2, t3, t4, n4, nz;
Type *t, *t0;
Prog *p1, *p2, *p3;
// Have to be careful about handling
// most negative int divided by -1 correctly.
// The hardware will trap.
// Also the byte divide instruction needs AH,
// which we otherwise don't have to deal with.
// Easiest way to avoid for int8, int16: use int32.
// For int32 and int64, use explicit test.
// Could use int64 hw for int32.
t = nl->type;
t0 = t;
check = 0;
if(issigned[t->etype]) {
check = 1;
if(isconst(nl, CTINT) && mpgetfix(nl->val.u.xval) != -1LL<<(t->width*8-1))
check = 0;
else if(isconst(nr, CTINT) && mpgetfix(nr->val.u.xval) != -1)
check = 0;
}
if(t->width < 4) {
if(issigned[t->etype])
t = types[TINT32];
else
t = types[TUINT32];
check = 0;
}
tempname(&t1, t);
tempname(&t2, t);
if(t0 != t) {
tempname(&t3, t0);
tempname(&t4, t0);
cgen(nl, &t3);
cgen(nr, &t4);
// Convert.
gmove(&t3, &t1);
gmove(&t4, &t2);
} else {
cgen(nl, &t1);
cgen(nr, &t2);
}
if(!samereg(ax, res) && !samereg(dx, res))
regalloc(&n1, t, res);
else
regalloc(&n1, t, N);
gmove(&t2, &n1);
gmove(&t1, ax);
p3 = P;
if(check) {
nodconst(&n4, t, -1);
gins(optoas(OCMP, t), &n1, &n4);
cmd/gc: contiguous loop layout Drop expecttaken function in favor of extra argument to gbranch and bgen. Mark loop condition as likely to be true, so that loops are generated inline. The main benefit here is contiguous code when trying to read the generated assembly. It has only minor effects on the timing, and they mostly cancel the minor effects that aligning function entry points had. One exception: both changes made Fannkuch faster. Compared to before CL 6244066 (before aligned functions) benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4222117400 4201958800 -0.48% BenchmarkFannkuch11 3462631800 3215908600 -7.13% BenchmarkGobDecode 20887622 20899164 +0.06% BenchmarkGobEncode 9548772 9439083 -1.15% BenchmarkGzip 151687 152060 +0.25% BenchmarkGunzip 8742 8711 -0.35% BenchmarkJSONEncode 62730560 62686700 -0.07% BenchmarkJSONDecode 252569180 252368960 -0.08% BenchmarkMandelbrot200 5267599 5252531 -0.29% BenchmarkRevcomp25M 980813500 985248400 +0.45% BenchmarkTemplate 361259100 357414680 -1.06% Compared to tip (aligned functions): benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4140739800 4201958800 +1.48% BenchmarkFannkuch11 3259914400 3215908600 -1.35% BenchmarkGobDecode 20620222 20899164 +1.35% BenchmarkGobEncode 9384886 9439083 +0.58% BenchmarkGzip 150333 152060 +1.15% BenchmarkGunzip 8741 8711 -0.34% BenchmarkJSONEncode 65210990 62686700 -3.87% BenchmarkJSONDecode 249394860 252368960 +1.19% BenchmarkMandelbrot200 5273394 5252531 -0.40% BenchmarkRevcomp25M 996013800 985248400 -1.08% BenchmarkTemplate 360620840 357414680 -0.89% R=ken2 CC=golang-dev https://golang.org/cl/6245069
2012-05-30 16:07:39 -06:00
p1 = gbranch(optoas(ONE, t), T, +1);
nodconst(&n4, t, -1LL<<(t->width*8-1));
gins(optoas(OCMP, t), ax, &n4);
cmd/gc: contiguous loop layout Drop expecttaken function in favor of extra argument to gbranch and bgen. Mark loop condition as likely to be true, so that loops are generated inline. The main benefit here is contiguous code when trying to read the generated assembly. It has only minor effects on the timing, and they mostly cancel the minor effects that aligning function entry points had. One exception: both changes made Fannkuch faster. Compared to before CL 6244066 (before aligned functions) benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4222117400 4201958800 -0.48% BenchmarkFannkuch11 3462631800 3215908600 -7.13% BenchmarkGobDecode 20887622 20899164 +0.06% BenchmarkGobEncode 9548772 9439083 -1.15% BenchmarkGzip 151687 152060 +0.25% BenchmarkGunzip 8742 8711 -0.35% BenchmarkJSONEncode 62730560 62686700 -0.07% BenchmarkJSONDecode 252569180 252368960 -0.08% BenchmarkMandelbrot200 5267599 5252531 -0.29% BenchmarkRevcomp25M 980813500 985248400 +0.45% BenchmarkTemplate 361259100 357414680 -1.06% Compared to tip (aligned functions): benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4140739800 4201958800 +1.48% BenchmarkFannkuch11 3259914400 3215908600 -1.35% BenchmarkGobDecode 20620222 20899164 +1.35% BenchmarkGobEncode 9384886 9439083 +0.58% BenchmarkGzip 150333 152060 +1.15% BenchmarkGunzip 8741 8711 -0.34% BenchmarkJSONEncode 65210990 62686700 -3.87% BenchmarkJSONDecode 249394860 252368960 +1.19% BenchmarkMandelbrot200 5273394 5252531 -0.40% BenchmarkRevcomp25M 996013800 985248400 -1.08% BenchmarkTemplate 360620840 357414680 -0.89% R=ken2 CC=golang-dev https://golang.org/cl/6245069
2012-05-30 16:07:39 -06:00
p2 = gbranch(optoas(ONE, t), T, +1);
if(op == ODIV)
gmove(&n4, res);
if(op == OMOD) {
nodconst(&n4, t, 0);
gmove(&n4, res);
}
cmd/gc: contiguous loop layout Drop expecttaken function in favor of extra argument to gbranch and bgen. Mark loop condition as likely to be true, so that loops are generated inline. The main benefit here is contiguous code when trying to read the generated assembly. It has only minor effects on the timing, and they mostly cancel the minor effects that aligning function entry points had. One exception: both changes made Fannkuch faster. Compared to before CL 6244066 (before aligned functions) benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4222117400 4201958800 -0.48% BenchmarkFannkuch11 3462631800 3215908600 -7.13% BenchmarkGobDecode 20887622 20899164 +0.06% BenchmarkGobEncode 9548772 9439083 -1.15% BenchmarkGzip 151687 152060 +0.25% BenchmarkGunzip 8742 8711 -0.35% BenchmarkJSONEncode 62730560 62686700 -0.07% BenchmarkJSONDecode 252569180 252368960 -0.08% BenchmarkMandelbrot200 5267599 5252531 -0.29% BenchmarkRevcomp25M 980813500 985248400 +0.45% BenchmarkTemplate 361259100 357414680 -1.06% Compared to tip (aligned functions): benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4140739800 4201958800 +1.48% BenchmarkFannkuch11 3259914400 3215908600 -1.35% BenchmarkGobDecode 20620222 20899164 +1.35% BenchmarkGobEncode 9384886 9439083 +0.58% BenchmarkGzip 150333 152060 +1.15% BenchmarkGunzip 8741 8711 -0.34% BenchmarkJSONEncode 65210990 62686700 -3.87% BenchmarkJSONDecode 249394860 252368960 +1.19% BenchmarkMandelbrot200 5273394 5252531 -0.40% BenchmarkRevcomp25M 996013800 985248400 -1.08% BenchmarkTemplate 360620840 357414680 -0.89% R=ken2 CC=golang-dev https://golang.org/cl/6245069
2012-05-30 16:07:39 -06:00
p3 = gbranch(AJMP, T, 0);
patch(p1, pc);
patch(p2, pc);
}
if(!issigned[t->etype]) {
nodconst(&nz, t, 0);
gmove(&nz, dx);
} else
gins(optoas(OEXTEND, t), N, N);
gins(optoas(op, t), &n1, N);
regfree(&n1);
if(op == ODIV)
gmove(ax, res);
else
gmove(dx, res);
if(check)
patch(p3, pc);
}
static void
savex(int dr, Node *x, Node *oldx, Node *res, Type *t)
{
int r;
r = reg[dr];
nodreg(x, types[TINT32], dr);
// save current ax and dx if they are live
// and not the destination
memset(oldx, 0, sizeof *oldx);
if(r > 0 && !samereg(x, res)) {
tempname(oldx, types[TINT32]);
gmove(x, oldx);
}
regalloc(x, t, x);
}
static void
restx(Node *x, Node *oldx)
{
regfree(x);
if(oldx->op != 0) {
x->type = types[TINT32];
gmove(oldx, x);
}
}
/*
* generate division according to op, one of:
* res = nl / nr
* res = nl % nr
*/
void
cgen_div(int op, Node *nl, Node *nr, Node *res)
{
Node ax, dx, oldax, olddx;
Type *t;
if(is64(nl->type))
fatal("cgen_div %T", nl->type);
if(issigned[nl->type->etype])
t = types[TINT32];
else
t = types[TUINT32];
savex(D_AX, &ax, &oldax, res, t);
savex(D_DX, &dx, &olddx, res, t);
dodiv(op, nl, nr, res, &ax, &dx);
restx(&dx, &olddx);
restx(&ax, &oldax);
}
/*
* generate shift according to op, one of:
* res = nl << nr
* res = nl >> nr
*/
void
cgen_shift(int op, int bounded, Node *nl, Node *nr, Node *res)
{
Node n1, n2, nt, cx, oldcx, hi, lo;
int a, w;
Prog *p1, *p2;
uvlong sc;
if(nl->type->width > 4)
fatal("cgen_shift %T", nl->type);
w = nl->type->width * 8;
a = optoas(op, nl->type);
if(nr->op == OLITERAL) {
tempname(&n2, nl->type);
cgen(nl, &n2);
regalloc(&n1, nl->type, res);
gmove(&n2, &n1);
sc = mpgetfix(nr->val.u.xval);
if(sc >= nl->type->width*8) {
// large shift gets 2 shifts by width-1
gins(a, ncon(w-1), &n1);
gins(a, ncon(w-1), &n1);
} else
gins(a, nr, &n1);
gmove(&n1, res);
regfree(&n1);
return;
}
2009-08-07 13:57:44 -06:00
memset(&oldcx, 0, sizeof oldcx);
nodreg(&cx, types[TUINT32], D_CX);
if(reg[D_CX] > 1 && !samereg(&cx, res)) {
tempname(&oldcx, types[TUINT32]);
2009-08-07 13:57:44 -06:00
gmove(&cx, &oldcx);
}
if(nr->type->width > 4) {
tempname(&nt, nr->type);
n1 = nt;
} else {
nodreg(&n1, types[TUINT32], D_CX);
regalloc(&n1, nr->type, &n1); // to hold the shift type in CX
}
2009-08-07 13:57:44 -06:00
if(samereg(&cx, res))
regalloc(&n2, nl->type, N);
else
regalloc(&n2, nl->type, res);
if(nl->ullman >= nr->ullman) {
cgen(nl, &n2);
cgen(nr, &n1);
} else {
cgen(nr, &n1);
cgen(nl, &n2);
}
// test and fix up large shifts
if(bounded) {
if(nr->type->width > 4) {
// delayed reg alloc
nodreg(&n1, types[TUINT32], D_CX);
regalloc(&n1, types[TUINT32], &n1); // to hold the shift type in CX
split64(&nt, &lo, &hi);
gmove(&lo, &n1);
}
} else {
if(nr->type->width > 4) {
// delayed reg alloc
nodreg(&n1, types[TUINT32], D_CX);
regalloc(&n1, types[TUINT32], &n1); // to hold the shift type in CX
split64(&nt, &lo, &hi);
gmove(&lo, &n1);
gins(optoas(OCMP, types[TUINT32]), &hi, ncon(0));
cmd/gc: contiguous loop layout Drop expecttaken function in favor of extra argument to gbranch and bgen. Mark loop condition as likely to be true, so that loops are generated inline. The main benefit here is contiguous code when trying to read the generated assembly. It has only minor effects on the timing, and they mostly cancel the minor effects that aligning function entry points had. One exception: both changes made Fannkuch faster. Compared to before CL 6244066 (before aligned functions) benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4222117400 4201958800 -0.48% BenchmarkFannkuch11 3462631800 3215908600 -7.13% BenchmarkGobDecode 20887622 20899164 +0.06% BenchmarkGobEncode 9548772 9439083 -1.15% BenchmarkGzip 151687 152060 +0.25% BenchmarkGunzip 8742 8711 -0.35% BenchmarkJSONEncode 62730560 62686700 -0.07% BenchmarkJSONDecode 252569180 252368960 -0.08% BenchmarkMandelbrot200 5267599 5252531 -0.29% BenchmarkRevcomp25M 980813500 985248400 +0.45% BenchmarkTemplate 361259100 357414680 -1.06% Compared to tip (aligned functions): benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4140739800 4201958800 +1.48% BenchmarkFannkuch11 3259914400 3215908600 -1.35% BenchmarkGobDecode 20620222 20899164 +1.35% BenchmarkGobEncode 9384886 9439083 +0.58% BenchmarkGzip 150333 152060 +1.15% BenchmarkGunzip 8741 8711 -0.34% BenchmarkJSONEncode 65210990 62686700 -3.87% BenchmarkJSONDecode 249394860 252368960 +1.19% BenchmarkMandelbrot200 5273394 5252531 -0.40% BenchmarkRevcomp25M 996013800 985248400 -1.08% BenchmarkTemplate 360620840 357414680 -0.89% R=ken2 CC=golang-dev https://golang.org/cl/6245069
2012-05-30 16:07:39 -06:00
p2 = gbranch(optoas(ONE, types[TUINT32]), T, +1);
gins(optoas(OCMP, types[TUINT32]), &n1, ncon(w));
cmd/gc: contiguous loop layout Drop expecttaken function in favor of extra argument to gbranch and bgen. Mark loop condition as likely to be true, so that loops are generated inline. The main benefit here is contiguous code when trying to read the generated assembly. It has only minor effects on the timing, and they mostly cancel the minor effects that aligning function entry points had. One exception: both changes made Fannkuch faster. Compared to before CL 6244066 (before aligned functions) benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4222117400 4201958800 -0.48% BenchmarkFannkuch11 3462631800 3215908600 -7.13% BenchmarkGobDecode 20887622 20899164 +0.06% BenchmarkGobEncode 9548772 9439083 -1.15% BenchmarkGzip 151687 152060 +0.25% BenchmarkGunzip 8742 8711 -0.35% BenchmarkJSONEncode 62730560 62686700 -0.07% BenchmarkJSONDecode 252569180 252368960 -0.08% BenchmarkMandelbrot200 5267599 5252531 -0.29% BenchmarkRevcomp25M 980813500 985248400 +0.45% BenchmarkTemplate 361259100 357414680 -1.06% Compared to tip (aligned functions): benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4140739800 4201958800 +1.48% BenchmarkFannkuch11 3259914400 3215908600 -1.35% BenchmarkGobDecode 20620222 20899164 +1.35% BenchmarkGobEncode 9384886 9439083 +0.58% BenchmarkGzip 150333 152060 +1.15% BenchmarkGunzip 8741 8711 -0.34% BenchmarkJSONEncode 65210990 62686700 -3.87% BenchmarkJSONDecode 249394860 252368960 +1.19% BenchmarkMandelbrot200 5273394 5252531 -0.40% BenchmarkRevcomp25M 996013800 985248400 -1.08% BenchmarkTemplate 360620840 357414680 -0.89% R=ken2 CC=golang-dev https://golang.org/cl/6245069
2012-05-30 16:07:39 -06:00
p1 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
patch(p2, pc);
} else {
gins(optoas(OCMP, nr->type), &n1, ncon(w));
cmd/gc: contiguous loop layout Drop expecttaken function in favor of extra argument to gbranch and bgen. Mark loop condition as likely to be true, so that loops are generated inline. The main benefit here is contiguous code when trying to read the generated assembly. It has only minor effects on the timing, and they mostly cancel the minor effects that aligning function entry points had. One exception: both changes made Fannkuch faster. Compared to before CL 6244066 (before aligned functions) benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4222117400 4201958800 -0.48% BenchmarkFannkuch11 3462631800 3215908600 -7.13% BenchmarkGobDecode 20887622 20899164 +0.06% BenchmarkGobEncode 9548772 9439083 -1.15% BenchmarkGzip 151687 152060 +0.25% BenchmarkGunzip 8742 8711 -0.35% BenchmarkJSONEncode 62730560 62686700 -0.07% BenchmarkJSONDecode 252569180 252368960 -0.08% BenchmarkMandelbrot200 5267599 5252531 -0.29% BenchmarkRevcomp25M 980813500 985248400 +0.45% BenchmarkTemplate 361259100 357414680 -1.06% Compared to tip (aligned functions): benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 4140739800 4201958800 +1.48% BenchmarkFannkuch11 3259914400 3215908600 -1.35% BenchmarkGobDecode 20620222 20899164 +1.35% BenchmarkGobEncode 9384886 9439083 +0.58% BenchmarkGzip 150333 152060 +1.15% BenchmarkGunzip 8741 8711 -0.34% BenchmarkJSONEncode 65210990 62686700 -3.87% BenchmarkJSONDecode 249394860 252368960 +1.19% BenchmarkMandelbrot200 5273394 5252531 -0.40% BenchmarkRevcomp25M 996013800 985248400 -1.08% BenchmarkTemplate 360620840 357414680 -0.89% R=ken2 CC=golang-dev https://golang.org/cl/6245069
2012-05-30 16:07:39 -06:00
p1 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
}
if(op == ORSH && issigned[nl->type->etype]) {
gins(a, ncon(w-1), &n2);
} else {
gmove(ncon(0), &n2);
}
patch(p1, pc);
}
gins(a, &n1, &n2);
if(oldcx.op != 0)
2009-08-07 13:57:44 -06:00
gmove(&oldcx, &cx);
gmove(&n2, res);
regfree(&n1);
regfree(&n2);
}
/*
* generate byte multiply:
* res = nl * nr
* no byte multiply instruction so have to do
* 16-bit multiply and take bottom half.
*/
void
cgen_bmul(int op, Node *nl, Node *nr, Node *res)
{
Node n1b, n2b, n1w, n2w;
Type *t;
int a;
if(nl->ullman >= nr->ullman) {
regalloc(&n1b, nl->type, res);
cgen(nl, &n1b);
regalloc(&n2b, nr->type, N);
cgen(nr, &n2b);
} else {
regalloc(&n2b, nr->type, N);
cgen(nr, &n2b);
regalloc(&n1b, nl->type, res);
cgen(nl, &n1b);
}
// copy from byte to short registers
t = types[TUINT16];
if(issigned[nl->type->etype])
t = types[TINT16];
regalloc(&n2w, t, &n2b);
cgen(&n2b, &n2w);
regalloc(&n1w, t, &n1b);
cgen(&n1b, &n1w);
a = optoas(op, t);
gins(a, &n2w, &n1w);
cgen(&n1w, &n1b);
cgen(&n1b, res);
regfree(&n1w);
regfree(&n2w);
regfree(&n1b);
regfree(&n2b);
}