87e48c5afd

Since barrier-less memclr is only safe in very narrow circumstances, this
commit renames memclr to memclrNoHeapPointers to avoid accidental calls
to memclr on typed memory. Such a call can cause subtle, non-deterministic
bugs, so it's worth some effort to prevent. In the near term, this will
also prevent bugs creeping in from any concurrent CLs that add calls to
memclr; if this happens, whichever patch hits master second will fail to
compile.

This also adds the other new memclr variants to the compiler's builtin.go
to minimize the churn on that binary blob. We'll use these in future
commits.

Updates #17503.

Change-Id: I00eead049f5bd35ca107ea525966831f3d1ed9ca
Reviewed-on: https://go-review.googlesource.com/31369
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
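As context for the file below: the Go side of this routine is just a stub
declaration that the assembly implements. A minimal sketch of that
declaration, assuming the runtime's usual convention for assembly-backed
functions (the doc comment and //go:noescape annotation here are
illustrative assumptions, not copied from the tree):

    package runtime

    import "unsafe"

    // memclrNoHeapPointers clears n bytes starting at ptr.
    // The memory must not contain heap pointers: no write barriers
    // are executed, so using it on typed memory can hide pointers
    // from the garbage collector.
    //
    //go:noescape
    func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)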
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.

// void runtime·memclrNoHeapPointers(void*, uintptr)
TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), DI
	MOVQ	n+8(FP), BX
	XORQ	AX, AX
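	// Invariants for the code below: DI is the current destination,
	// BX is the number of bytes still to clear, and AX (later also
	// X0 and Y0) holds the zero pattern.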

// MOVOU seems always faster than REP STOSQ.
tail:
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	PXOR	X0, X0
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256
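	// runtime·support_avx2 is a byte flag the runtime sets at startup
	// from CPUID feature detection.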
	CMPB	runtime·support_avx2(SB), $1
	JE	loop_preheader_avx2
	// TODO: use branch table and BSR to make this just a single dispatch
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
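	// More than 256 bytes and no AVX2: fall through to the SSE loop,
	// which clears 256 bytes per iteration.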

loop:
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, 128(DI)
	MOVOU	X0, 144(DI)
	MOVOU	X0, 160(DI)
	MOVOU	X0, 176(DI)
	MOVOU	X0, 192(DI)
	MOVOU	X0, 208(DI)
	MOVOU	X0, 224(DI)
	MOVOU	X0, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail

loop_preheader_avx2:
	VPXOR	Y0, Y0, Y0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E.g. glibc uses LLC size/2.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge
loop_avx2:
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
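	// Fewer than 128 bytes remain. Finish with four overlapping
	// unaligned stores covering the last 128 bytes ending at DI+BX;
	// the overlap only re-zeroes bytes the loop already cleared.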
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

loop_preheader_avx2_huge:
	// Align to 32 byte boundary
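	// The 32 bytes at DI are cleared unconditionally, so DI can be
	// rounded up to the next 32-byte boundary for the non-temporal
	// loop. SI ends up holding the (negative) count of bytes skipped,
	// and adding it to BX shrinks the remaining length to match.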
	VMOVDQU	Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI
	ADDQ	SI, BX
loop_avx2_huge:
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET
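
	// Each small case below clears its range with stores anchored at
	// both ends of the buffer; for in-between lengths the two stores
	// simply overlap. Only _0 and _8 need no second store.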
_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X0, (DI)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, -128(DI)(BX*1)
	MOVOU	X0, -112(DI)(BX*1)
	MOVOU	X0, -96(DI)(BX*1)
	MOVOU	X0, -80(DI)(BX*1)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
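
For readers following the assembly, a pure-Go sketch of the routine's
observable effect. This is an illustration only, not runtime code: it
ignores performance entirely and the atomic-8-byte guarantee called out
at the _8 case, and memclrSketch is a hypothetical name.

    package main

    import "unsafe"

    // memclrSketch zeroes n bytes starting at ptr, one byte at a time.
    func memclrSketch(ptr unsafe.Pointer, n uintptr) {
    	for i := uintptr(0); i < n; i++ {
    		*(*byte)(unsafe.Pointer(uintptr(ptr) + i)) = 0
    	}
    }

    func main() {
    	buf := [16]byte{1, 2, 3, 4, 5, 6, 7, 8}
    	memclrSketch(unsafe.Pointer(&buf[0]), uintptr(len(buf)))
    	// buf is now all zeros.
    }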