mirror of
https://github.com/golang/go
synced 2024-11-19 02:54:42 -07:00
runtime: don't use REP;MOVSB if CPUID doesn't say it is fast
Only use REP;MOVSB if: 1) the CPUID flag says it is fast, and 2) the pointers are unaligned. Otherwise, use REP;MOVSQ. Update #14630. Change-Id: I946b28b87880c08e5eed1ce2945016466c89db66 Reviewed-on: https://go-review.googlesource.com/21300 Reviewed-by: Nigel Tao <nigeltao@golang.org>
This commit is contained in:
parent
1a9373bc57
commit
4b209dbf0b
@ -54,6 +54,7 @@ bad_proc: // show that the program requires MMX.
|
|||||||
has_cpuid:
|
has_cpuid:
|
||||||
MOVL $0, AX
|
MOVL $0, AX
|
||||||
CPUID
|
CPUID
|
||||||
|
MOVL AX, SI
|
||||||
CMPL AX, $0
|
CMPL AX, $0
|
||||||
JE nocpuinfo
|
JE nocpuinfo
|
||||||
|
|
||||||
@ -69,6 +70,7 @@ has_cpuid:
|
|||||||
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
|
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
|
||||||
notintel:
|
notintel:
|
||||||
|
|
||||||
|
// Load EAX=1 cpuid flags
|
||||||
MOVL $1, AX
|
MOVL $1, AX
|
||||||
CPUID
|
CPUID
|
||||||
MOVL CX, AX // Move to global variable clobbers CX when generating PIC
|
MOVL CX, AX // Move to global variable clobbers CX when generating PIC
|
||||||
@ -79,6 +81,14 @@ notintel:
|
|||||||
TESTL $(1<<23), DX // MMX
|
TESTL $(1<<23), DX // MMX
|
||||||
JZ bad_proc
|
JZ bad_proc
|
||||||
|
|
||||||
|
// Load EAX=7/ECX=0 cpuid flags
|
||||||
|
CMPL SI, $7
|
||||||
|
JLT nocpuinfo
|
||||||
|
MOVL $7, AX
|
||||||
|
MOVL $0, CX
|
||||||
|
CPUID
|
||||||
|
MOVL BX, runtime·cpuid_ebx7(SB)
|
||||||
|
|
||||||
nocpuinfo:
|
nocpuinfo:
|
||||||
|
|
||||||
// if there is an _cgo_init, call it to let it
|
// if there is an _cgo_init, call it to let it
|
||||||
|
@ -28,6 +28,7 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
|
|||||||
// find out information about the processor we're on
|
// find out information about the processor we're on
|
||||||
MOVQ $0, AX
|
MOVQ $0, AX
|
||||||
CPUID
|
CPUID
|
||||||
|
MOVQ AX, SI
|
||||||
CMPQ AX, $0
|
CMPQ AX, $0
|
||||||
JE nocpuinfo
|
JE nocpuinfo
|
||||||
|
|
||||||
@ -42,15 +43,25 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
|
|||||||
JNE notintel
|
JNE notintel
|
||||||
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
|
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
|
||||||
notintel:
|
notintel:
|
||||||
// Do nothing.
|
|
||||||
|
|
||||||
|
// Load EAX=1 cpuid flags
|
||||||
MOVQ $1, AX
|
MOVQ $1, AX
|
||||||
CPUID
|
CPUID
|
||||||
MOVL CX, runtime·cpuid_ecx(SB)
|
MOVL CX, runtime·cpuid_ecx(SB)
|
||||||
MOVL DX, runtime·cpuid_edx(SB)
|
MOVL DX, runtime·cpuid_edx(SB)
|
||||||
|
|
||||||
|
// Load EAX=7/ECX=0 cpuid flags
|
||||||
|
CMPQ SI, $7
|
||||||
|
JLT no7
|
||||||
|
MOVL $7, AX
|
||||||
|
MOVL $0, CX
|
||||||
|
CPUID
|
||||||
|
MOVL BX, runtime·cpuid_ebx7(SB)
|
||||||
|
no7:
|
||||||
// Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
|
// Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
|
||||||
// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
|
// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
|
||||||
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
|
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
|
||||||
|
MOVL runtime·cpuid_ecx(SB), CX
|
||||||
ANDL $0x18000000, CX // check for OSXSAVE and AVX bits
|
ANDL $0x18000000, CX // check for OSXSAVE and AVX bits
|
||||||
CMPL CX, $0x18000000
|
CMPL CX, $0x18000000
|
||||||
JNE noavx
|
JNE noavx
|
||||||
@ -61,12 +72,8 @@ notintel:
|
|||||||
CMPL AX, $6 // Check for OS support of YMM registers
|
CMPL AX, $6 // Check for OS support of YMM registers
|
||||||
JNE noavx
|
JNE noavx
|
||||||
MOVB $1, runtime·support_avx(SB)
|
MOVB $1, runtime·support_avx(SB)
|
||||||
MOVL $7, AX
|
TESTL $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
|
||||||
MOVL $0, CX
|
JEQ noavx2
|
||||||
CPUID
|
|
||||||
ANDL $0x20, BX // check for AVX2 bit
|
|
||||||
CMPL BX, $0x20
|
|
||||||
JNE noavx2
|
|
||||||
MOVB $1, runtime·support_avx2(SB)
|
MOVB $1, runtime·support_avx2(SB)
|
||||||
JMP nocpuinfo
|
JMP nocpuinfo
|
||||||
noavx:
|
noavx:
|
||||||
|
@ -70,24 +70,29 @@ nosse2:
|
|||||||
* forward copy loop
|
* forward copy loop
|
||||||
*/
|
*/
|
||||||
forward:
|
forward:
|
||||||
|
// If REP MOVSB isn't fast, don't use it
|
||||||
|
TESTL $(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
|
||||||
|
JEQ fwdBy4
|
||||||
|
|
||||||
// Check alignment
|
// Check alignment
|
||||||
MOVL SI, AX
|
MOVL SI, AX
|
||||||
ORL DI, AX
|
ORL DI, AX
|
||||||
TESTL $3, AX
|
TESTL $3, AX
|
||||||
JNE unaligned_fwd
|
JEQ fwdBy4
|
||||||
|
|
||||||
MOVL BX, CX
|
// Do 1 byte at a time
|
||||||
SHRL $2, CX
|
|
||||||
ANDL $3, BX
|
|
||||||
|
|
||||||
REP; MOVSL
|
|
||||||
JMP tail
|
|
||||||
|
|
||||||
unaligned_fwd:
|
|
||||||
MOVL BX, CX
|
MOVL BX, CX
|
||||||
REP; MOVSB
|
REP; MOVSB
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
fwdBy4:
|
||||||
|
// Do 4 bytes at a time
|
||||||
|
MOVL BX, CX
|
||||||
|
SHRL $2, CX
|
||||||
|
ANDL $3, BX
|
||||||
|
REP; MOVSL
|
||||||
|
JMP tail
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* check overlap
|
* check overlap
|
||||||
*/
|
*/
|
||||||
|
@ -77,25 +77,29 @@ forward:
|
|||||||
CMPQ BX, $2048
|
CMPQ BX, $2048
|
||||||
JLS move_256through2048
|
JLS move_256through2048
|
||||||
|
|
||||||
// Check alignment
|
// If REP MOVSB isn't fast, don't use it
|
||||||
MOVQ SI, AX
|
TESTL $(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
|
||||||
ORQ DI, AX
|
JEQ fwdBy8
|
||||||
TESTL $7, AX
|
|
||||||
JNE unaligned_fwd
|
|
||||||
|
|
||||||
// Aligned - do 8 bytes at a time
|
// Check alignment
|
||||||
|
MOVL SI, AX
|
||||||
|
ORL DI, AX
|
||||||
|
TESTL $7, AX
|
||||||
|
JEQ fwdBy8
|
||||||
|
|
||||||
|
// Do 1 byte at a time
|
||||||
|
MOVQ BX, CX
|
||||||
|
REP; MOVSB
|
||||||
|
RET
|
||||||
|
|
||||||
|
fwdBy8:
|
||||||
|
// Do 8 bytes at a time
|
||||||
MOVQ BX, CX
|
MOVQ BX, CX
|
||||||
SHRQ $3, CX
|
SHRQ $3, CX
|
||||||
ANDQ $7, BX
|
ANDQ $7, BX
|
||||||
REP; MOVSQ
|
REP; MOVSQ
|
||||||
JMP tail
|
JMP tail
|
||||||
|
|
||||||
unaligned_fwd:
|
|
||||||
// Unaligned - do 1 byte at a time
|
|
||||||
MOVQ BX, CX
|
|
||||||
REP; MOVSB
|
|
||||||
RET
|
|
||||||
|
|
||||||
back:
|
back:
|
||||||
/*
|
/*
|
||||||
* check overlap
|
* check overlap
|
||||||
|
@ -701,6 +701,7 @@ var (
|
|||||||
// Set on startup in asm_{x86,amd64}.s.
|
// Set on startup in asm_{x86,amd64}.s.
|
||||||
cpuid_ecx uint32
|
cpuid_ecx uint32
|
||||||
cpuid_edx uint32
|
cpuid_edx uint32
|
||||||
|
cpuid_ebx7 uint32
|
||||||
lfenceBeforeRdtsc bool
|
lfenceBeforeRdtsc bool
|
||||||
support_avx bool
|
support_avx bool
|
||||||
support_avx2 bool
|
support_avx2 bool
|
||||||
|
Loading…
Reference in New Issue
Block a user