1
0
mirror of https://github.com/golang/go synced 2024-09-25 03:00:12 -06:00

runtime: get rid of most uses of REP for copying/zeroing.

REP MOVSQ and REP STOSQ have a really high startup overhead.
Use a Duff's device to do the repetition instead.

benchmark                 old ns/op     new ns/op     delta
BenchmarkClearFat32       7.20          1.60          -77.78%
BenchmarkCopyFat32        6.88          2.38          -65.41%
BenchmarkClearFat64       7.15          3.20          -55.24%
BenchmarkCopyFat64        6.88          3.44          -50.00%
BenchmarkClearFat128      9.53          5.34          -43.97%
BenchmarkCopyFat128       9.27          5.56          -40.02%
BenchmarkClearFat256      13.8          9.53          -30.94%
BenchmarkCopyFat256       13.5          10.3          -23.70%
BenchmarkClearFat512      22.3          18.0          -19.28%
BenchmarkCopyFat512       22.0          19.7          -10.45%
BenchmarkCopyFat1024      36.5          38.4          +5.21%
BenchmarkClearFat1024     35.1          35.0          -0.28%

TODO: use for stack frame zeroing
TODO: REP prefixes are still used for "reverse" copying when src/dst
regions overlap.  Might be worth fixing.

LGTM=rsc
R=golang-codereviews, rsc
CC=golang-codereviews, r
https://golang.org/cl/81370046
This commit is contained in:
Keith Randall 2014-04-01 12:51:02 -07:00
parent cfb347fc0a
commit 6c7cbf086c
13 changed files with 1726 additions and 4 deletions

View File

@ -1345,6 +1345,7 @@ sgen(Node *n, Node *ns, int64 w)
Node nodl, nodr, nodsi, noddi, cx, oldcx, tmp;
vlong c, q, odst, osrc;
NodeList *l;
Prog *p;
if(debug['g']) {
print("\nsgen w=%lld\n", w);
@ -1447,10 +1448,16 @@ sgen(Node *n, Node *ns, int64 w)
gins(ACLD, N, N);
} else {
// normal direction
if(q >= 4) {
if(q > 128) {
gconreg(movptr, q, D_CX);
gins(AREP, N, N); // repeat
gins(AMOVSQ, N, N); // MOVQ *(SI)+,*(DI)+
} else if (q >= 4) {
p = gins(ADUFFCOPY, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
// 14 and 128 = magic constants: see ../../pkg/runtime/asm_amd64.s
p->to.offset = 14*(128-q);
} else
while(q > 0) {
gins(AMOVSQ, N, N); // MOVQ *(SI)+,*(DI)+

View File

@ -1088,10 +1088,16 @@ clearfat(Node *nl)
savex(D_AX, &ax, &oldax, N, types[tptr]);
gconreg(AMOVL, 0, D_AX);
if(q >= 4) {
if(q > 128) {
gconreg(movptr, q, D_CX);
gins(AREP, N, N); // repeat
gins(ASTOSQ, N, N); // STOQ AL,*(DI)+
} else if(q >= 4) {
p = gins(ADUFFZERO, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
// 2 and 128 = magic constants: see ../../pkg/runtime/asm_amd64.s
p->to.offset = 2*(128-q);
} else
while(q > 0) {
gins(ASTOSQ, N, N); // STOQ AL,*(DI)+

View File

@ -170,6 +170,7 @@ static ProgInfo progtable[ALAST] = {
[AMOVSL]= {OK, DI|SI, DI|SI},
[AMOVSQ]= {OK, DI|SI, DI|SI},
[AMOVSW]= {OK, DI|SI, DI|SI},
[ADUFFCOPY]= {OK, DI|SI, DI|SI|CX},
[AMOVSD]= {SizeD | LeftRead | RightWrite | Move},
[AMOVSS]= {SizeF | LeftRead | RightWrite | Move},
@ -257,6 +258,7 @@ static ProgInfo progtable[ALAST] = {
[ASTOSL]= {OK, AX|DI, DI},
[ASTOSQ]= {OK, AX|DI, DI},
[ASTOSW]= {OK, AX|DI, DI},
[ADUFFZERO]= {OK, AX|DI, DI},
[ASUBB]= {SizeB | LeftRead | RightRdwr | SetCarry},
[ASUBL]= {SizeL | LeftRead | RightRdwr | SetCarry},

View File

@ -764,6 +764,8 @@ enum as
ACHECKNIL,
AVARDEF,
AVARKILL,
ADUFFCOPY,
ADUFFZERO,
ALAST
};

View File

@ -1212,6 +1212,7 @@ sgen(Node *n, Node *res, int64 w)
Node dst, src, tdst, tsrc;
int32 c, q, odst, osrc;
NodeList *l;
Prog *p;
if(debug['g']) {
print("\nsgen w=%lld\n", w);
@ -1314,10 +1315,16 @@ sgen(Node *n, Node *res, int64 w)
} else {
gins(ACLD, N, N); // paranoia. TODO(rsc): remove?
// normal direction
if(q >= 4) {
if(q > 128) {
gconreg(AMOVL, q, D_CX);
gins(AREP, N, N); // repeat
gins(AMOVSL, N, N); // MOVL *(SI)+,*(DI)+
} else if(q >= 4) {
p = gins(ADUFFCOPY, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
// 10 and 128 = magic constants: see ../../pkg/runtime/asm_386.s
p->to.offset = 10*(128-q);
} else
while(q > 0) {
gins(AMOVSL, N, N); // MOVL *(SI)+,*(DI)+

View File

@ -130,6 +130,7 @@ clearfat(Node *nl)
{
uint32 w, c, q;
Node n1;
Prog *p;
/* clear a fat object */
if(debug['g'])
@ -147,10 +148,16 @@ clearfat(Node *nl)
agen(nl, &n1);
gconreg(AMOVL, 0, D_AX);
if(q >= 4) {
if(q > 128) {
gconreg(AMOVL, q, D_CX);
gins(AREP, N, N); // repeat
gins(ASTOSL, N, N); // STOL AL,*(DI)+
} else if(q >= 4) {
p = gins(ADUFFZERO, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
// 1 and 128 = magic constants: see ../../pkg/runtime/asm_386.s
p->to.offset = 1*(128-q);
} else
while(q > 0) {
gins(ASTOSL, N, N); // STOL AL,*(DI)+

View File

@ -195,6 +195,7 @@ static ProgInfo progtable[ALAST] = {
[AMOVSB]= {OK, DI|SI, DI|SI},
[AMOVSL]= {OK, DI|SI, DI|SI},
[AMOVSW]= {OK, DI|SI, DI|SI},
[ADUFFCOPY]= {OK, DI|SI, DI|SI|CX},
[AMOVSD]= {SizeD | LeftRead | RightWrite | Move},
[AMOVSS]= {SizeF | LeftRead | RightWrite | Move},
@ -287,6 +288,7 @@ static ProgInfo progtable[ALAST] = {
[ASTOSB]= {OK, AX|DI, DI},
[ASTOSL]= {OK, AX|DI, DI},
[ASTOSW]= {OK, AX|DI, DI},
[ADUFFZERO]= {OK, AX|DI, DI},
[ASUBB]= {SizeB | LeftRead | RightRdwr | SetCarry},
[ASUBL]= {SizeL | LeftRead | RightRdwr | SetCarry},

View File

@ -581,6 +581,8 @@ enum as
ACHECKNIL,
AVARDEF,
AVARKILL,
ADUFFCOPY,
ADUFFZERO,
ALAST
};

View File

@ -507,6 +507,11 @@ static uchar ycall[] =
Ynone, Ybr, Zcall, 1,
0
};
// Operand-pattern table referenced by the ADUFFCOPY and ADUFFZERO
// entries in optab. It has the same row layout as ycall above but
// takes a Yi32 operand instead of Ybr, and is assembled through the
// Zcall encoding case.
// NOTE(review): presumably the row reads {from-class, to-class,
// encoding kind, opcode-byte count} followed by a 0 terminator —
// confirm against the ytab conventions elsewhere in this file.
static uchar yduff[] =
{
Ynone, Yi32, Zcall, 1,
0
};
static uchar yjmp[] =
{
Ynone, Yml, Zo_m64, 2,
@ -1519,6 +1524,9 @@ Optab optab[] =
{ APCDATA, ypcdata, Px, 0,0 },
{ ACHECKNIL },
{ AVARDEF },
{ AVARKILL },
{ ADUFFCOPY, yduff, Px, 0xe8 },
{ ADUFFZERO, yduff, Px, 0xe8 },
{ AEND },
0
@ -3030,6 +3038,7 @@ found:
r = addrel(ctxt->cursym);
r->off = p->pc + ctxt->andptr - ctxt->and;
r->sym = p->to.sym;
r->add = p->to.offset;
r->type = D_PCREL;
r->siz = 4;
put4(ctxt, 0);

View File

@ -420,6 +420,11 @@ static uchar ycall[] =
Ynone, Yi32, Zcallcon, 1,
0
};
// Operand-pattern table referenced by the ADUFFCOPY and ADUFFZERO
// entries in optab. Unlike the ycall row shown above (Zcallcon), this
// one selects the Zcall encoding case for its Yi32 operand.
// NOTE(review): presumably the row reads {from-class, to-class,
// encoding kind, opcode-byte count} followed by a 0 terminator —
// confirm against the ytab conventions elsewhere in this file.
static uchar yduff[] =
{
Ynone, Yi32, Zcall, 1,
0
};
static uchar yjmp[] =
{
Ynone, Yml, Zo_m, 2,
@ -1147,6 +1152,9 @@ static Optab optab[] =
{ APCDATA, ypcdata, Px, 0,0 },
{ ACHECKNIL },
{ AVARDEF },
{ AVARKILL },
{ ADUFFCOPY, yduff, Px, 0xe8 },
{ ADUFFZERO, yduff, Px, 0xe8 },
0
};
@ -2377,6 +2385,7 @@ found:
r->type = D_PCREL;
r->siz = 4;
r->sym = p->to.sym;
r->add = p->to.offset;
put4(ctxt, 0);
break;

View File

@ -1348,3 +1348,797 @@ cmp_allsame:
SETEQ CX // 1 if alen == blen
LEAL -1(CX)(AX*2), AX // 1,0,-1 result
RET
// A Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// this routine to zero chunks of memory. Do not
// change this code without also changing the code
// in ../../cmd/8g/ggen.c:clearfat.
// AX: zero
// DI: ptr to memory to be zeroed
// DI is updated as a side effect.
TEXT runtime·duffzero(SB), NOSPLIT, $0-0
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
RET
// A Duff's device for copying memory.
// The compiler jumps to computed addresses within
// this routine to copy chunks of memory. Source
// and destination must not overlap. Do not
// change this code without also changing the code
// in ../../cmd/8g/cgen.c:sgen.
// SI: ptr to source memory
// DI: ptr to destination memory
// SI and DI are updated as a side effect; CX is clobbered as scratch.
// NOTE: this is equivalent to a sequence of MOVSL but
// for some reason MOVSL is really slow.
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
RET

View File

@ -1380,3 +1380,798 @@ TEXT bytes·Equal(SB),NOSPLIT,$0-49
eqret:
MOVB AX, ret+48(FP)
RET
// A Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// this routine to zero chunks of memory. Do not
// change this code without also changing the code
// in ../../cmd/6g/ggen.c:clearfat.
// AX: zero
// DI: ptr to memory to be zeroed
// DI is updated as a side effect.
TEXT runtime·duffzero(SB), NOSPLIT, $0-0
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
RET
// A Duff's device for copying memory.
// The compiler jumps to computed addresses within
// this routine to copy chunks of memory. Source
// and destination must not overlap. Do not
// change this code without also changing the code
// in ../../cmd/6g/cgen.c:sgen.
// SI: ptr to source memory
// DI: ptr to destination memory
// SI and DI are updated as a side effect; CX is clobbered as scratch.
// NOTE: this is equivalent to a sequence of MOVSQ but
// for some reason that is 3.5x slower than this code.
// The STOSQ above seem fine, though.
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
RET

View File

@ -161,3 +161,83 @@ func BenchmarkMemclr64(b *testing.B) { bmMemclr(b, 64) }
func BenchmarkMemclr256(b *testing.B) { bmMemclr(b, 256) }
func BenchmarkMemclr4096(b *testing.B) { bmMemclr(b, 4096) }
func BenchmarkMemclr65536(b *testing.B) { bmMemclr(b, 65536) }
// BenchmarkClearFat32 declares a fresh zero-valued [32]byte on each of
// the b.N iterations, so the timed work is the initialization of a
// 32-byte object.
func BenchmarkClearFat32(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [32]byte
// Blank assignment counts as a use of x, so the declaration compiles.
_ = x
}
}
// BenchmarkClearFat64 declares a fresh zero-valued [64]byte on each of
// the b.N iterations, so the timed work is the initialization of a
// 64-byte object.
func BenchmarkClearFat64(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [64]byte
// Blank assignment counts as a use of x, so the declaration compiles.
_ = x
}
}
// BenchmarkClearFat128 declares a fresh zero-valued [128]byte on each
// of the b.N iterations, so the timed work is the initialization of a
// 128-byte object.
func BenchmarkClearFat128(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [128]byte
// Blank assignment counts as a use of x, so the declaration compiles.
_ = x
}
}
// BenchmarkClearFat256 declares a fresh zero-valued [256]byte on each
// of the b.N iterations, so the timed work is the initialization of a
// 256-byte object.
func BenchmarkClearFat256(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [256]byte
// Blank assignment counts as a use of x, so the declaration compiles.
_ = x
}
}
// BenchmarkClearFat512 declares a fresh zero-valued [512]byte on each
// of the b.N iterations, so the timed work is the initialization of a
// 512-byte object.
func BenchmarkClearFat512(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [512]byte
// Blank assignment counts as a use of x, so the declaration compiles.
_ = x
}
}
// BenchmarkClearFat1024 declares a fresh zero-valued [1024]byte on each
// of the b.N iterations, so the timed work is the initialization of a
// 1024-byte object.
func BenchmarkClearFat1024(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [1024]byte
// Blank assignment counts as a use of x, so the declaration compiles.
_ = x
}
}
// BenchmarkCopyFat32 assigns a [32]byte array by value to a new
// variable on each of the b.N iterations, so the timed work is a
// 32-byte copy. The source array is declared once, outside the loop.
func BenchmarkCopyFat32(b *testing.B) {
var x [32]byte
for i := 0; i < b.N; i++ {
// Arrays are values in Go: this copies all of x into y.
y := x
// Blank assignment counts as a use of y, so the declaration compiles.
_ = y
}
}
// BenchmarkCopyFat64 assigns a [64]byte array by value to a new
// variable on each of the b.N iterations, so the timed work is a
// 64-byte copy. The source array is declared once, outside the loop.
func BenchmarkCopyFat64(b *testing.B) {
var x [64]byte
for i := 0; i < b.N; i++ {
// Arrays are values in Go: this copies all of x into y.
y := x
// Blank assignment counts as a use of y, so the declaration compiles.
_ = y
}
}
// BenchmarkCopyFat128 assigns a [128]byte array by value to a new
// variable on each of the b.N iterations, so the timed work is a
// 128-byte copy. The source array is declared once, outside the loop.
func BenchmarkCopyFat128(b *testing.B) {
var x [128]byte
for i := 0; i < b.N; i++ {
// Arrays are values in Go: this copies all of x into y.
y := x
// Blank assignment counts as a use of y, so the declaration compiles.
_ = y
}
}
// BenchmarkCopyFat256 assigns a [256]byte array by value to a new
// variable on each of the b.N iterations, so the timed work is a
// 256-byte copy. The source array is declared once, outside the loop.
func BenchmarkCopyFat256(b *testing.B) {
var x [256]byte
for i := 0; i < b.N; i++ {
// Arrays are values in Go: this copies all of x into y.
y := x
// Blank assignment counts as a use of y, so the declaration compiles.
_ = y
}
}
// BenchmarkCopyFat512 assigns a [512]byte array by value to a new
// variable on each of the b.N iterations, so the timed work is a
// 512-byte copy. The source array is declared once, outside the loop.
func BenchmarkCopyFat512(b *testing.B) {
var x [512]byte
for i := 0; i < b.N; i++ {
// Arrays are values in Go: this copies all of x into y.
y := x
// Blank assignment counts as a use of y, so the declaration compiles.
_ = y
}
}
// BenchmarkCopyFat1024 assigns a [1024]byte array by value to a new
// variable on each of the b.N iterations, so the timed work is a
// 1024-byte copy. The source array is declared once, outside the loop.
func BenchmarkCopyFat1024(b *testing.B) {
var x [1024]byte
for i := 0; i < b.N; i++ {
// Arrays are values in Go: this copies all of x into y.
y := x
// Blank assignment counts as a use of y, so the declaration compiles.
_ = y
}
}