1
0
mirror of https://github.com/golang/go synced 2024-10-05 19:11:22 -06:00
go/src/cmd/8g/cgen64.c
Russ Cox 001b75c942 cmd/gc: contiguous loop layout
Drop expecttaken function in favor of extra argument
to gbranch and bgen. Mark loop condition as likely to
be true, so that loops are generated inline.

The main benefit here is contiguous code when trying
to read the generated assembly. It has only minor effects
on the timing, and they mostly cancel the minor effects
that aligning function entry points had.  One exception:
both changes made Fannkuch faster.

Compared to before CL 6244066 (before aligned functions)
benchmark                 old ns/op    new ns/op    delta
BenchmarkBinaryTree17    4222117400   4201958800   -0.48%
BenchmarkFannkuch11      3462631800   3215908600   -7.13%
BenchmarkGobDecode         20887622     20899164   +0.06%
BenchmarkGobEncode          9548772      9439083   -1.15%
BenchmarkGzip                151687       152060   +0.25%
BenchmarkGunzip                8742         8711   -0.35%
BenchmarkJSONEncode        62730560     62686700   -0.07%
BenchmarkJSONDecode       252569180    252368960   -0.08%
BenchmarkMandelbrot200      5267599      5252531   -0.29%
BenchmarkRevcomp25M       980813500    985248400   +0.45%
BenchmarkTemplate         361259100    357414680   -1.06%

Compared to tip (aligned functions):
benchmark                 old ns/op    new ns/op    delta
BenchmarkBinaryTree17    4140739800   4201958800   +1.48%
BenchmarkFannkuch11      3259914400   3215908600   -1.35%
BenchmarkGobDecode         20620222     20899164   +1.35%
BenchmarkGobEncode          9384886      9439083   +0.58%
BenchmarkGzip                150333       152060   +1.15%
BenchmarkGunzip                8741         8711   -0.34%
BenchmarkJSONEncode        65210990     62686700   -3.87%
BenchmarkJSONDecode       249394860    252368960   +1.19%
BenchmarkMandelbrot200      5273394      5252531   -0.40%
BenchmarkRevcomp25M       996013800    985248400   -1.08%
BenchmarkTemplate         360620840    357414680   -0.89%

R=ken2
CC=golang-dev
https://golang.org/cl/6245069
2012-05-30 18:07:39 -04:00

550 lines
11 KiB
C

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <u.h>
#include <libc.h>
#include "gg.h"
/*
 * generate code for the 64-bit operation n and store the result in res.
 * res must be an OINDREG or ONAME node.
 * handles OMINUS, OCOM, OADD, OSUB, OMUL, OLROT, OLSH, ORSH,
 * OAND, OOR and OXOR; any other op, or any other kind of res,
 * is reported with fatal. the function returns nothing.
 */
void
cgen64(Node *n, Node *res)
{
// t1, t2: temporaries for operands that are not addable.
// ax, dx, cx: nodes for the fixed registers AX, DX, CX.
// ex, fx: two extra registers allocated only for OMUL.
// lo1/hi1, lo2/hi2: low and high words filled in by split64.
Node t1, t2, ax, dx, cx, ex, fx, *l, *r;
Node lo1, lo2, hi1, hi2;
Prog *p1, *p2;
uint64 v;
uint32 lv, hv;
// the destination must be OINDREG or ONAME; dump both nodes and stop otherwise.
if(res->op != OINDREG && res->op != ONAME) {
dump("n", n);
dump("res", res);
fatal("cgen64 %O of %O", n->op, res->op);
}
// unary operators are generated in place on res and return directly;
// binary operators break out to the common setup below.
switch(n->op) {
default:
fatal("cgen64 %O", n->op);
case OMINUS:
// evaluate the operand into res, then negate the two words in place:
// NEGL low, ADCL $0 into high (picks up the carry from the first NEGL),
// NEGL high.
cgen(n->left, res);
split64(res, &lo1, &hi1);
gins(ANEGL, N, &lo1);
gins(AADCL, ncon(0), &hi1);
gins(ANEGL, N, &hi1);
splitclean();
return;
case OCOM:
// evaluate the operand into res, then complement each word in place.
cgen(n->left, res);
split64(res, &lo1, &hi1);
gins(ANOTL, N, &lo1);
gins(ANOTL, N, &hi1);
splitclean();
return;
case OADD:
case OSUB:
case OMUL:
case OLROT:
case OLSH:
case ORSH:
case OAND:
case OOR:
case OXOR:
// binary operators.
// common setup below.
break;
}
l = n->left;
r = n->right;
// an operand that is not addable is first evaluated into a temporary,
// and the temporary is used in its place from here on.
if(!l->addable) {
tempname(&t1, l->type);
cgen(l, &t1);
l = &t1;
}
if(r != N && !r->addable) {
tempname(&t2, r->type);
cgen(r, &t2);
r = &t2;
}
nodreg(&ax, types[TINT32], D_AX);
nodreg(&cx, types[TINT32], D_CX);
nodreg(&dx, types[TINT32], D_DX);
// Setup for binary operation.
// l is always split. r is split only when it is 64-bit, so lo2/hi2
// are not set otherwise; the shift and rotate cases below handle a
// right operand that is not 64-bit.
// NOTE(review): splitclean presumably releases the most recent split64;
// every exit path below keeps the two calls paired -- confirm against
// the split64/splitclean definitions.
split64(l, &lo1, &hi1);
if(is64(r->type))
split64(r, &lo2, &hi2);
// Do op. Leave result in DX:AX.
// (cases that write res directly instead jump to out and skip the
// final store.)
switch(n->op) {
case OADD:
// TODO: Constants
// add low words, then add high words with the carry.
gins(AMOVL, &lo1, &ax);
gins(AMOVL, &hi1, &dx);
gins(AADDL, &lo2, &ax);
gins(AADCL, &hi2, &dx);
break;
case OSUB:
// TODO: Constants.
// subtract low words, then subtract high words with the borrow.
gins(AMOVL, &lo1, &ax);
gins(AMOVL, &hi1, &dx);
gins(ASUBL, &lo2, &ax);
gins(ASBBL, &hi2, &dx);
break;
case OMUL:
// let's call the next two EX and FX.
regalloc(&ex, types[TPTR32], N);
regalloc(&fx, types[TPTR32], N);
// load args into DX:AX and EX:CX.
gins(AMOVL, &lo1, &ax);
gins(AMOVL, &hi1, &dx);
gins(AMOVL, &lo2, &cx);
gins(AMOVL, &hi2, &ex);
// if DX and EX are zero, use 32 x 32 -> 64 unsigned multiply.
// FX = DX | EX is used only to set the flags for the branch.
gins(AMOVL, &dx, &fx);
gins(AORL, &ex, &fx);
p1 = gbranch(AJNE, T, 0);
gins(AMULL, &cx, N); // implicit &ax
p2 = gbranch(AJMP, T, 0);
patch(p1, pc);
// full 64x64 -> 64, from 32x32 -> 64.
// FX collects the cross terms hi1*lo2 + lo1*hi2, which only affect
// the high word; DX:AX = lo1*lo2; finally DX += FX.
gins(AIMULL, &cx, &dx);
gins(AMOVL, &ax, &fx);
gins(AIMULL, &ex, &fx);
gins(AADDL, &dx, &fx);
gins(AMOVL, &cx, &dx);
gins(AMULL, &dx, N); // implicit &ax
gins(AADDL, &fx, &dx);
patch(p2, pc);
regfree(&ex);
regfree(&fx);
break;
case OLROT:
// We only rotate by a constant c in [0,64).
// if c >= 32:
// lo, hi = hi, lo
// c -= 32
// if c == 0:
// no-op
// else:
// t = hi
// shld hi:lo, c
// shld lo:t, c
v = mpgetfix(r->val.u.xval);
if(v >= 32) {
// reverse during load to do the first 32 bits of rotate
v -= 32;
gins(AMOVL, &lo1, &dx);
gins(AMOVL, &hi1, &ax);
} else {
gins(AMOVL, &lo1, &ax);
gins(AMOVL, &hi1, &dx);
}
if(v == 0) {
// done
} else {
// keep the unshifted high word in CX: the second shift needs it
// after DX has already been modified by the first.
gins(AMOVL, &dx, &cx);
p1 = gins(ASHLL, ncon(v), &dx);
p1->from.index = D_AX; // double-width shift
p1->from.scale = 0;
p1 = gins(ASHLL, ncon(v), &ax);
p1->from.index = D_CX; // double-width shift
p1->from.scale = 0;
}
break;
case OLSH:
// constant shift counts are resolved here, at code generation time.
if(r->op == OLITERAL) {
v = mpgetfix(r->val.u.xval);
if(v >= 64) {
// everything is shifted out: clean up r (if it was split) and l,
// then store zero straight into res.
if(is64(r->type))
splitclean();
splitclean();
split64(res, &lo2, &hi2);
gins(AMOVL, ncon(0), &lo2);
gins(AMOVL, ncon(0), &hi2);
splitclean();
goto out;
}
if(v >= 32) {
// high word = low word << (v-32), low word = 0.
// l stays split until lo1 has been read; lo2/hi2 are reused
// for the halves of res.
if(is64(r->type))
splitclean();
split64(res, &lo2, &hi2);
gmove(&lo1, &hi2);
if(v > 32) {
gins(ASHLL, ncon(v - 32), &hi2);
}
gins(AMOVL, ncon(0), &lo2);
splitclean();
splitclean();
goto out;
}
// general shift
gins(AMOVL, &lo1, &ax);
gins(AMOVL, &hi1, &dx);
p1 = gins(ASHLL, ncon(v), &dx);
p1->from.index = D_AX; // double-width shift
p1->from.scale = 0;
gins(ASHLL, ncon(v), &ax);
break;
}
// load value into DX:AX.
gins(AMOVL, &lo1, &ax);
gins(AMOVL, &hi1, &dx);
// load shift value into register.
// if high bits are set, zero value.
// (p1 jumps to the zeroing code below without loading CX; that is
// harmless because the later shifts then operate on zero.)
p1 = P;
if(is64(r->type)) {
gins(ACMPL, &hi2, ncon(0));
p1 = gbranch(AJNE, T, +1);
gins(AMOVL, &lo2, &cx);
} else {
cx.type = types[TUINT32];
gmove(r, &cx);
}
// if shift count is >=64, zero value
// (p2 skips the zeroing when the unsigned count is below 64.)
gins(ACMPL, &cx, ncon(64));
p2 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
if(p1 != P)
patch(p1, pc);
gins(AXORL, &dx, &dx);
gins(AXORL, &ax, &ax);
patch(p2, pc);
// if shift count is >= 32, zero low.
// (p1 is reused here: it now skips to the general shift when the
// count is below 32.)
gins(ACMPL, &cx, ncon(32));
p1 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
gins(AMOVL, &ax, &dx);
gins(ASHLL, &cx, &dx); // SHLL only uses bottom 5 bits of count
gins(AXORL, &ax, &ax);
p2 = gbranch(AJMP, T, 0);
patch(p1, pc);
// general shift
p1 = gins(ASHLL, &cx, &dx);
p1->from.index = D_AX; // double-width shift
p1->from.scale = 0;
gins(ASHLL, &cx, &ax);
patch(p2, pc);
break;
case ORSH:
// same structure as OLSH, except that vacated bits are filled with
// copies of the top bit (SARL $31) when hi1 has etype TINT32 and
// with zero otherwise.
// NOTE(review): presumably split64 gives the high word a signed
// type exactly when the 64-bit operand is signed -- confirm.
if(r->op == OLITERAL) {
v = mpgetfix(r->val.u.xval);
if(v >= 64) {
// every original bit is shifted out. hi1 is read after the
// splitclean for l here, unlike the v >= 32 case below.
if(is64(r->type))
splitclean();
splitclean();
split64(res, &lo2, &hi2);
if(hi1.type->etype == TINT32) {
gmove(&hi1, &lo2);
gins(ASARL, ncon(31), &lo2);
gmove(&hi1, &hi2);
gins(ASARL, ncon(31), &hi2);
} else {
gins(AMOVL, ncon(0), &lo2);
gins(AMOVL, ncon(0), &hi2);
}
splitclean();
goto out;
}
if(v >= 32) {
// low word = high word >> (v-32); high word = fill.
// l stays split until hi1 has been read.
if(is64(r->type))
splitclean();
split64(res, &lo2, &hi2);
gmove(&hi1, &lo2);
if(v > 32)
gins(optoas(ORSH, hi1.type), ncon(v-32), &lo2);
if(hi1.type->etype == TINT32) {
gmove(&hi1, &hi2);
gins(ASARL, ncon(31), &hi2);
} else
gins(AMOVL, ncon(0), &hi2);
splitclean();
splitclean();
goto out;
}
// general shift
gins(AMOVL, &lo1, &ax);
gins(AMOVL, &hi1, &dx);
p1 = gins(ASHRL, ncon(v), &ax);
p1->from.index = D_DX; // double-width shift
p1->from.scale = 0;
gins(optoas(ORSH, hi1.type), ncon(v), &dx);
break;
}
// load value into DX:AX.
gins(AMOVL, &lo1, &ax);
gins(AMOVL, &hi1, &dx);
// load shift value into register.
// if high bits are set, the count is >= 64: jump to the
// zero or sign-extend code below.
// (CX is not loaded on that path; the value is then uniform fill,
// which the later shifts leave unchanged.)
p1 = P;
if(is64(r->type)) {
gins(ACMPL, &hi2, ncon(0));
p1 = gbranch(AJNE, T, +1);
gins(AMOVL, &lo2, &cx);
} else {
cx.type = types[TUINT32];
gmove(r, &cx);
}
// if shift count is >=64, zero or sign-extend value
gins(ACMPL, &cx, ncon(64));
p2 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
if(p1 != P)
patch(p1, pc);
if(hi1.type->etype == TINT32) {
gins(ASARL, ncon(31), &dx);
gins(AMOVL, &dx, &ax);
} else {
gins(AXORL, &dx, &dx);
gins(AXORL, &ax, &ax);
}
patch(p2, pc);
// if shift count is >= 32, sign-extend hi.
// (low word = high word shifted by the count; high word = fill.)
gins(ACMPL, &cx, ncon(32));
p1 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
gins(AMOVL, &dx, &ax);
if(hi1.type->etype == TINT32) {
gins(ASARL, &cx, &ax); // SARL only uses bottom 5 bits of count
gins(ASARL, ncon(31), &dx);
} else {
gins(ASHRL, &cx, &ax);
gins(AXORL, &dx, &dx);
}
p2 = gbranch(AJMP, T, 0);
patch(p1, pc);
// general shift
p1 = gins(ASHRL, &cx, &ax);
p1->from.index = D_DX; // double-width shift
p1->from.scale = 0;
gins(optoas(ORSH, hi1.type), &cx, &dx);
patch(p2, pc);
break;
case OXOR:
case OAND:
case OOR:
// make constant the right side (it usually is anyway).
if(lo1.op == OLITERAL) {
nswap(&lo1, &lo2);
nswap(&hi1, &hi2);
}
if(lo2.op == OLITERAL) {
// special cases for constants.
// the constant words are captured in lv/hv, after which lo2/hi2
// are reused for the halves of res and each word is generated
// directly into res.
lv = mpgetfix(lo2.val.u.xval);
hv = mpgetfix(hi2.val.u.xval);
splitclean(); // right side
split64(res, &lo2, &hi2);
switch(n->op) {
case OXOR:
// x^0 is a plain copy; x^0xffffffff is a NOTL.
gmove(&lo1, &lo2);
gmove(&hi1, &hi2);
switch(lv) {
case 0:
break;
case 0xffffffffu:
gins(ANOTL, N, &lo2);
break;
default:
gins(AXORL, ncon(lv), &lo2);
break;
}
switch(hv) {
case 0:
break;
case 0xffffffffu:
gins(ANOTL, N, &hi2);
break;
default:
gins(AXORL, ncon(hv), &hi2);
break;
}
break;
case OAND:
// x&0 stores zero without reading x; x&0xffffffff is a plain copy.
switch(lv) {
case 0:
gins(AMOVL, ncon(0), &lo2);
break;
default:
gmove(&lo1, &lo2);
if(lv != 0xffffffffu)
gins(AANDL, ncon(lv), &lo2);
break;
}
switch(hv) {
case 0:
gins(AMOVL, ncon(0), &hi2);
break;
default:
gmove(&hi1, &hi2);
if(hv != 0xffffffffu)
gins(AANDL, ncon(hv), &hi2);
break;
}
break;
case OOR:
// x|0 is a plain copy; x|0xffffffff stores all ones without reading x.
switch(lv) {
case 0:
gmove(&lo1, &lo2);
break;
case 0xffffffffu:
gins(AMOVL, ncon(0xffffffffu), &lo2);
break;
default:
gmove(&lo1, &lo2);
gins(AORL, ncon(lv), &lo2);
break;
}
switch(hv) {
case 0:
gmove(&hi1, &hi2);
break;
case 0xffffffffu:
gins(AMOVL, ncon(0xffffffffu), &hi2);
break;
default:
gmove(&hi1, &hi2);
gins(AORL, ncon(hv), &hi2);
break;
}
break;
}
// one splitclean for res, one for the left operand.
splitclean();
splitclean();
goto out;
}
// neither side is constant: apply the op word by word in DX:AX.
gins(AMOVL, &lo1, &ax);
gins(AMOVL, &hi1, &dx);
gins(optoas(n->op, lo1.type), &lo2, &ax);
gins(optoas(n->op, lo1.type), &hi2, &dx);
break;
}
// common exit for results left in DX:AX: clean up r (if it was
// split) and l, then store AX and DX into the two words of res.
if(is64(r->type))
splitclean();
splitclean();
split64(res, &lo1, &hi1);
gins(AMOVL, &ax, &lo1);
gins(AMOVL, &dx, &hi1);
splitclean();
// target for the cases that have already written res themselves.
out:;
}
/*
 * emit a single CMPL of word a against word b on behalf of cmp64.
 * nl and nr are the 64-bit operands the two words were split from.
 * if either operand is a literal the words are compared directly;
 * otherwise a is first copied into a scratch register, which is
 * released again right after the compare.
 */
static void
cmp64word(Node *nl, Node *nr, Node *a, Node *b)
{
	Node scratch;

	if(nl->op != OLITERAL && nr->op != OLITERAL) {
		regalloc(&scratch, types[TINT32], N);
		gins(AMOVL, a, &scratch);
		gins(ACMPL, &scratch, b);
		regfree(&scratch);
		return;
	}
	gins(ACMPL, a, b);
}

/*
 * generate comparison of nl, nr, both 64-bit.
 * nl is memory; nr is constant or memory.
 *
 * op is the comparison (OEQ, ONE, OLT, OLE, OGT or OGE; anything
 * else is fatal). a jump to `to` is emitted for the case where the
 * comparison holds; otherwise control falls through. likely is
 * passed on to gbranch, negated for the branch that skips the
 * low-word test.
 *
 * the high words are compared first and settle the result whenever
 * they differ; only when they are equal are the low words compared.
 */
void
cmp64(Node *nl, Node *nr, int op, int likely, Prog *to)
{
	Node llo, lhi, rlo, rhi;
	Prog *skip, *hit;
	Type *wt;
	int greater;

	split64(nl, &llo, &lhi);
	split64(nr, &rlo, &rhi);

	// most significant words first; their type selects the
	// jump instructions used for the ordered comparisons.
	wt = lhi.type;
	cmp64word(nl, nr, &lhi, &rhi);

	// skip, when set, is the branch that jumps past the
	// low-word test because the high words already decided
	// that the comparison is false.
	skip = P;
	switch(op) {
	default:
		fatal("cmp64 %O %T", op, wt);
	case OEQ:
		//	cmp hi
		//	jne L
		//	cmp lo
		//	jeq to
		// L:
		skip = gbranch(AJNE, T, -likely);
		break;
	case ONE:
		//	cmp hi
		//	jne to
		//	cmp lo
		//	jne to
		hit = gbranch(AJNE, T, likely);
		patch(hit, to);
		break;
	case OGE:
	case OGT:
	case OLE:
	case OLT:
		//	cmp hi
		//	jgt to		(jlt to for OLE/OLT)
		//	jlt L		(jgt L  for OLE/OLT)
		//	cmp lo
		//	jge to		(or jgt/jle/jlt, i.e. op itself)
		// L:
		greater = (op == OGE || op == OGT);
		hit = gbranch(optoas(greater ? OGT : OLT, wt), T, likely);
		patch(hit, to);
		skip = gbranch(optoas(greater ? OLT : OGT, wt), T, -likely);
		break;
	}

	// least significant words; the jump for op itself is
	// chosen using the low word's type.
	wt = llo.type;
	cmp64word(nl, nr, &llo, &rlo);
	hit = gbranch(optoas(op, wt), T, likely);
	patch(hit, to);

	// land the early-out branch here, after the low-word test.
	if(skip != P)
		patch(skip, pc);

	// one cleanup per split64 above.
	splitclean();
	splitclean();
}