mirror of
https://github.com/golang/go
synced 2024-10-04 00:21:20 -06:00
ee6c6d96b6
The difference between the old and the new (from earlier) code is that we set stackguard = stack.lo + StackGuard, while the old code set stackguard = stack.lo. That 512 bytes appears to be the difference between the profileloop function running and not running. We don't know how big the system stack is, but it is likely MUCH bigger than 4k. Give Go/C 8k. TBR=iant CC=golang-codereviews https://golang.org/cl/140440044
131 lines
2.3 KiB
ArmAsm
131 lines
2.3 KiB
ArmAsm
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.

// void runtime·memclr(void*, uintptr)
//
// Zeroes n bytes starting at ptr.
//
// Register use:
//	DI = address of the next byte to clear
//	BX = number of bytes still to clear
//	AX = 0, source for the byte/word/long stores and for STOSL
//	X0 = 0, source for the 16-byte MOVOU stores (SSE2 path only)
//	CX = long count for REP STOSL (non-SSE2 path only)
// DX is never written (see the NOTE above).
//
// Each fixed-size case below writes one run starting at DI and one run
// ending at DI+BX. For lengths below the top of a bucket the two runs
// overlap, which is harmless because both write zeros, and it avoids a
// branch per byte count.
TEXT runtime·memclr(SB), NOSPLIT, $0-8
	MOVL	ptr+0(FP), DI
	MOVL	n+4(FP), BX
	XORL	AX, AX

	// MOVOU seems always faster than REP STOSL.
clr_tail:
	// Dispatch on the remaining length. Also re-entered from clr_loop
	// and nosse2 with whatever is left after their bulk stores.
	// All comparisons are unsigned (JBE/JAE): BX is a uintptr.
	TESTL	BX, BX
	JEQ	clr_0
	CMPL	BX, $2
	JBE	clr_1or2
	CMPL	BX, $4
	JBE	clr_3or4
	CMPL	BX, $8
	JBE	clr_5through8
	CMPL	BX, $16
	JBE	clr_9through16
	// More than 16 bytes: the wider cases need 16-byte SSE2 stores.
	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
	JEQ	nosse2
	PXOR	X0, X0
	CMPL	BX, $32
	JBE	clr_17through32
	CMPL	BX, $64
	JBE	clr_33through64
	CMPL	BX, $128
	JBE	clr_65through128
	CMPL	BX, $256
	JBE	clr_129through256
	// TODO: use branch table and BSR to make this just a single dispatch

clr_loop:
	// BX >= 256 here: clear 256 bytes per iteration with 16 stores.
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, 128(DI)
	MOVOU	X0, 144(DI)
	MOVOU	X0, 160(DI)
	MOVOU	X0, 176(DI)
	MOVOU	X0, 192(DI)
	MOVOU	X0, 208(DI)
	MOVOU	X0, 224(DI)
	MOVOU	X0, 240(DI)
	SUBL	$256, BX
	ADDL	$256, DI
	CMPL	BX, $256
	JAE	clr_loop
	// 0..255 bytes remain; let the dispatcher pick the right case.
	JMP	clr_tail

clr_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
clr_0:
	RET
clr_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
clr_5through8:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
clr_9through16:
	// First 8 bytes, then last 8 bytes.
	MOVL	AX, (DI)
	MOVL	AX, 4(DI)
	MOVL	AX, -8(DI)(BX*1)
	MOVL	AX, -4(DI)(BX*1)
	RET
clr_17through32:
	MOVOU	X0, (DI)
	MOVOU	X0, -16(DI)(BX*1)
	RET
clr_33through64:
	// First 32 bytes, then last 32 bytes.
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
clr_65through128:
	// First 64 bytes, then last 64 bytes.
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
clr_129through256:
	// First 128 bytes, then last 128 bytes.
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, -128(DI)(BX*1)
	MOVOU	X0, -112(DI)(BX*1)
	MOVOU	X0, -96(DI)(BX*1)
	MOVOU	X0, -80(DI)(BX*1)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
nosse2:
	// No SSE2 and BX > 16: store BX/4 zero longs with REP STOSL, which
	// uses CX as its count and advances DI as it goes.
	// NOTE(review): relies on the direction flag being clear on entry;
	// nothing in this function sets or checks it.
	MOVL	BX, CX
	SHRL	$2, CX
	REP
	STOSL
	// 0..3 bytes may be left over; clr_tail handles 1..3.
	ANDL	$3, BX
	JNE	clr_tail
	RET
|