mirror of
https://github.com/golang/go
synced 2024-11-20 03:24:41 -07:00
df70982825
Use 16-byte stores instead of 8-byte stores to zero small blocks. Also switch to duffzero for 65+ bytes only, because for each duffzero call we also save/restore BP, so call requires 4 instructions and replacing it with 4 sse stores doesn't cause code-bloat. Also switch duffzero to use leaq, instead of addq to avoid clobbering flags. ClearFat8-6 0.54ns ± 0% 0.54ns ± 0% ~ (all equal) ClearFat12-6 1.07ns ± 0% 1.07ns ± 0% ~ (all equal) ClearFat16-6 1.07ns ± 0% 0.69ns ± 0% -35.51% (p=0.001 n=8+9) ClearFat24-6 1.61ns ± 1% 1.07ns ± 0% -33.33% (p=0.000 n=10+10) ClearFat32-6 2.14ns ± 0% 1.07ns ± 0% -50.00% (p=0.001 n=8+9) ClearFat40-6 2.67ns ± 1% 1.61ns ± 0% -39.72% (p=0.000 n=10+8) ClearFat48-6 3.75ns ± 0% 2.68ns ± 0% -28.59% (p=0.000 n=9+9) ClearFat56-6 4.29ns ± 0% 3.22ns ± 0% -25.10% (p=0.000 n=9+9) ClearFat64-6 4.30ns ± 0% 3.22ns ± 0% -25.15% (p=0.000 n=8+8) ClearFat128-6 7.50ns ± 1% 7.51ns ± 0% ~ (p=0.767 n=10+9) ClearFat256-6 13.9ns ± 1% 13.9ns ± 1% ~ (p=0.257 n=10+10) ClearFat512-6 26.8ns ± 0% 26.8ns ± 0% ~ (p=0.467 n=8+8) ClearFat1024-6 52.5ns ± 0% 52.5ns ± 0% ~ (p=1.000 n=8+8) Also shaves ~20kb from go tool: go_old 10384994 go_new 10364514 [-20480 bytes] section differences global text (code) = -20585 bytes (-0.532047%) read-only data = -302 bytes (-0.018101%) Total difference -20887 bytes (-0.348731%) Change-Id: I15854e87544545c1af24775df895e38e16e12694 Reviewed-on: https://go-review.googlesource.com/54410 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
428 lines
5.5 KiB
ArmAsm
428 lines
5.5 KiB
ArmAsm
// Code generated by mkduff.go; DO NOT EDIT.
|
|
// Run go generate from src/runtime to update.
|
|
// See mkduff.go for comments.
|
|
|
|
#include "textflag.h"
|
|
|
|
// duffzero stores the contents of X0 over consecutive memory starting at the
// address held in DI. It is declared NOSPLIT with a zero-size frame ($0-0).
//
// The body is 16 identical units. Each unit issues four 16-byte MOVUPS stores
// at offsets 0, 16, 32 and 48 from DI (64 bytes in total) and then advances DI
// by 64. Executed from the TEXT label to RET, the routine therefore writes
// 16*64 = 1024 bytes and leaves DI 1024 above its value on entry.
//
// DI is advanced with LEAQ, which does not modify the flags register
// (ADDQ would).
//
// NOTE(review): presumably the caller clears X0 beforehand and may enter
// part-way through the body to handle a shorter region — confirm against
// mkduff.go and the compiler's call sites.
TEXT runtime·duffzero(SB), NOSPLIT, $0-0
|
|
// Unit 1 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 2 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 3 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 4 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 5 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 6 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 7 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 8 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 9 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 10 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 11 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 12 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 13 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 14 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 15 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
// Unit 16 of 16.
MOVUPS X0,(DI)
|
|
MOVUPS X0,16(DI)
|
|
MOVUPS X0,32(DI)
|
|
MOVUPS X0,48(DI)
|
|
LEAQ 64(DI),DI
|
|
|
|
RET
|
|
|
|
// duffcopy copies memory from the address held in SI to the address held in
// DI, 16 bytes at a time, using X0 as the transfer register. It is declared
// NOSPLIT with a zero-size frame ($0-0).
//
// The body is 64 identical units. Each unit loads 16 bytes from (SI) into X0,
// advances SI by 16, stores X0 to (DI), and advances DI by 16. Executed from
// the TEXT label to RET, the routine therefore copies 64*16 = 1024 bytes and
// leaves both SI and DI 1024 above their values on entry. X0 is overwritten,
// and the ADDQ instructions modify the flags register.
//
// NOTE(review): presumably callers may enter part-way through the body to
// copy a shorter region — confirm against mkduff.go and the compiler's call
// sites.
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
|
|
// Unit 1 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 2 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 3 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 4 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 5 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 6 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 7 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 8 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 9 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 10 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 11 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 12 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 13 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 14 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 15 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 16 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 17 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 18 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 19 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 20 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 21 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 22 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 23 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 24 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 25 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 26 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 27 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 28 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 29 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 30 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 31 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 32 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 33 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 34 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 35 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 36 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 37 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 38 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 39 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 40 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 41 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 42 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 43 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 44 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 45 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 46 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 47 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 48 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 49 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 50 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 51 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 52 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 53 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 54 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 55 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 56 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 57 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 58 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 59 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 60 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 61 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 62 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 63 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
// Unit 64 of 64.
MOVUPS (SI), X0
|
|
ADDQ $16, SI
|
|
MOVUPS X0, (DI)
|
|
ADDQ $16, DI
|
|
|
|
RET
|