1
0
mirror of https://github.com/golang/go synced 2024-10-02 06:18:32 -06:00
go/src/runtime/duff_amd64.s
Ilya Tocar df70982825 cmd/compile/internal/ssa: use sse to zero on amd64
Use 16-byte stores instead of 8-byte stores to zero small blocks.
Also switch to duffzero for 65+ bytes only, because for each
duffzero call we also save/restore BP, so the call requires 4 instructions
and replacing it with 4 SSE stores doesn't cause code bloat.
Also switch duffzero to use LEAQ instead of ADDQ, to avoid clobbering flags.

ClearFat8-6     0.54ns ± 0%  0.54ns ± 0%     ~     (all equal)
ClearFat12-6    1.07ns ± 0%  1.07ns ± 0%     ~     (all equal)
ClearFat16-6    1.07ns ± 0%  0.69ns ± 0%  -35.51%  (p=0.001 n=8+9)
ClearFat24-6    1.61ns ± 1%  1.07ns ± 0%  -33.33%  (p=0.000 n=10+10)
ClearFat32-6    2.14ns ± 0%  1.07ns ± 0%  -50.00%  (p=0.001 n=8+9)
ClearFat40-6    2.67ns ± 1%  1.61ns ± 0%  -39.72%  (p=0.000 n=10+8)
ClearFat48-6    3.75ns ± 0%  2.68ns ± 0%  -28.59%  (p=0.000 n=9+9)
ClearFat56-6    4.29ns ± 0%  3.22ns ± 0%  -25.10%  (p=0.000 n=9+9)
ClearFat64-6    4.30ns ± 0%  3.22ns ± 0%  -25.15%  (p=0.000 n=8+8)
ClearFat128-6   7.50ns ± 1%  7.51ns ± 0%     ~     (p=0.767 n=10+9)
ClearFat256-6   13.9ns ± 1%  13.9ns ± 1%     ~     (p=0.257 n=10+10)
ClearFat512-6   26.8ns ± 0%  26.8ns ± 0%     ~     (p=0.467 n=8+8)
ClearFat1024-6  52.5ns ± 0%  52.5ns ± 0%     ~     (p=1.000 n=8+8)

Also shaves ~20kb from go tool:

go_old 10384994
go_new 10364514 [-20480 bytes]

section differences
global text (code) = -20585 bytes (-0.532047%)
read-only data = -302 bytes (-0.018101%)
Total difference -20887 bytes (-0.348731%)

Change-Id: I15854e87544545c1af24775df895e38e16e12694
Reviewed-on: https://go-review.googlesource.com/54410
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2017-08-16 15:52:27 +00:00

428 lines
5.5 KiB
ArmAsm

// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
#include "textflag.h"
// duffzero stores the 16-byte register X0 to consecutive memory starting at
// the address in DI. The body is 16 identical groups; each group writes 64
// bytes with four unaligned MOVUPS stores and then advances DI by 64. Entered
// at the top it writes 1024 bytes and returns with DI just past that region.
//
// The routine is NOSPLIT with a zero-size frame and no stack arguments
// ($0-0); its operands are the DI and X0 registers.
//
// NOTE(review): X0 is never written here, so the caller presumably zeroes it
// before the call — confirm against the compiler's call sequence.
// NOTE(review): the uniform, unrolled layout suggests callers may enter at an
// interior group to clear fewer than 1024 bytes; if so, instruction sizes and
// order are part of the contract. See mkduff.go to confirm.
//
// This file is generated; lasting changes belong in mkduff.go.
TEXT runtime·duffzero(SB), NOSPLIT, $0-0
// Group 1 of 16: store X0 at offsets 0, 16, 32 and 48 from DI.
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
// Advance DI by 64 with LEAQ rather than ADDQ so the flags are left untouched.
LEAQ 64(DI),DI
// Groups 2 through 16 repeat the same five instructions.
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
MOVUPS X0,(DI)
MOVUPS X0,16(DI)
MOVUPS X0,32(DI)
MOVUPS X0,48(DI)
LEAQ 64(DI),DI
RET
// duffcopy copies memory from the address in SI to the address in DI in
// 16-byte steps. The body is 64 identical groups; each group loads 16 bytes
// from (SI) into X0, advances SI by 16, stores X0 to (DI), and advances DI by
// 16. Entered at the top it copies 1024 bytes and returns with SI and DI each
// advanced by 1024.
//
// The routine is NOSPLIT with a zero-size frame and no stack arguments
// ($0-0); its operands are the SI and DI registers. X0 is overwritten as
// scratch, and the ADDQ instructions modify the flags.
//
// NOTE(review): the uniform, unrolled layout suggests callers may enter at an
// interior group to copy fewer than 1024 bytes; if so, instruction sizes and
// order are part of the contract. See mkduff.go to confirm.
// NOTE(review): each 16-byte chunk is fully loaded before it is stored, but
// nothing here handles overlapping source and destination ranges in general —
// confirm what callers guarantee.
//
// This file is generated; lasting changes belong in mkduff.go.
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
// Group 1 of 64: load 16 bytes, bump SI, store 16 bytes, bump DI.
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
// Groups 2 through 64 repeat the same four instructions.
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
RET