From 5a6c58099085a8156bc42b68a7cf51b5b9c72802 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=B6hrmann?= Date: Thu, 27 Apr 2017 08:30:27 +0200 Subject: [PATCH] runtime: refactor cpu feature detection for 386 & amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes all cpu features to be detected and stored in bools in rt0_go. Updates: #15403 Change-Id: I5a9961cdec789b331d09c44d86beb53833d5dc3e Reviewed-on: https://go-review.googlesource.com/41950 Run-TryBot: Martin Möhrmann TryBot-Result: Gobot Gobot Reviewed-by: Ilya Tocar Reviewed-by: Keith Randall --- src/runtime/alg.go | 6 +- src/runtime/asm_386.s | 79 ++++++++++++++++++++---- src/runtime/asm_amd64.s | 112 +++++++++++++++++++--------------- src/runtime/asm_amd64p32.s | 80 ++++++++++++++++++++++-- src/runtime/cpuflags_amd64.go | 10 +-- src/runtime/memclr_386.s | 4 +- src/runtime/memmove_386.s | 8 +-- src/runtime/memmove_amd64.s | 4 +- src/runtime/runtime2.go | 34 +++++++---- 9 files changed, 243 insertions(+), 94 deletions(-) diff --git a/src/runtime/alg.go b/src/runtime/alg.go index 504be61cd01..8d388da5a2b 100644 --- a/src/runtime/alg.go +++ b/src/runtime/alg.go @@ -283,9 +283,9 @@ func alginit() { // Install aes hash algorithm if we have the instructions we need if (GOARCH == "386" || GOARCH == "amd64") && GOOS != "nacl" && - cpuid_ecx&(1<<25) != 0 && // aes (aesenc) - cpuid_ecx&(1<<9) != 0 && // sse3 (pshufb) - cpuid_ecx&(1<<19) != 0 { // sse4.1 (pinsr{d,q}) + support_aes && // AESENC + support_ssse3 && // PSHUFB + support_sse41 { // PINSR{D,Q} useAeshash = true algarray[alg_MEM32].hash = aeshash32 algarray[alg_MEM64].hash = aeshash64 diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s index eaf8c935ad3..dc5db91ea85 100644 --- a/src/runtime/asm_386.s +++ b/src/runtime/asm_386.s @@ -75,24 +75,81 @@ notintel: MOVL $1, AX CPUID MOVL CX, DI // Move to global variable clobbers CX when generating PIC - MOVL AX, runtime·cpuid_eax(SB) + MOVL AX, runtime·processorVersionInfo(SB) MOVL DI, runtime·cpuid_ecx(SB) MOVL DX, runtime·cpuid_edx(SB) // Check for MMX support - TESTL $(1<<23), DX // MMX - JZ bad_proc + TESTL $(1<<23), DX // MMX + JZ bad_proc + TESTL $(1<<26), DX // SSE2 + SETNE runtime·support_sse2(SB) + + TESTL $(1<<9), DI // SSSE3 + SETNE runtime·support_ssse3(SB) + + TESTL $(1<<19), DI // SSE4.1 + SETNE runtime·support_sse41(SB) + + TESTL $(1<<20), DI // SSE4.2 + SETNE runtime·support_sse42(SB) + + TESTL $(1<<23), DI // POPCNT + SETNE runtime·support_popcnt(SB) + + TESTL $(1<<25), DI // AES + SETNE runtime·support_aes(SB) + + TESTL $(1<<27), DI // OSXSAVE + SETNE runtime·support_osxsave(SB) + + // If OS support for XMM and YMM is not present + // support_avx will be set back to false later. + TESTL $(1<<28), DI // AVX + SETNE runtime·support_avx(SB) + +eax7: // Load EAX=7/ECX=0 cpuid flags CMPL SI, $7 - JLT nocpuinfo + JLT osavx MOVL $7, AX MOVL $0, CX CPUID MOVL BX, runtime·cpuid_ebx7(SB) -nocpuinfo: + TESTL $(1<<3), BX // BMI1 + SETNE runtime·support_bmi1(SB) + // If OS support for XMM and YMM is not present + // support_avx2 will be set back to false later. + TESTL $(1<<5), BX + SETNE runtime·support_avx2(SB) + + TESTL $(1<<8), BX // BMI2 + SETNE runtime·support_bmi2(SB) + + TESTL $(1<<9), BX // ERMS + SETNE runtime·support_erms(SB) + +osavx: + // nacl does not support XGETBV to test + // for XMM and YMM OS support. +#ifndef GOOS_nacl + CMPB runtime·support_osxsave(SB), $1 + JNE noavx + MOVL $0, CX + // For XGETBV, OSXSAVE bit is required and sufficient + XGETBV + ANDL $6, AX + CMPL AX, $6 // Check for OS support of XMM and YMM registers. + JE nocpuinfo +#endif +noavx: + MOVB $0, runtime·support_avx(SB) + MOVB $0, runtime·support_avx2(SB) + +nocpuinfo: // if there is an _cgo_init, call it to let it // initialize and to set up GS. if not, // we set up GS ourselves. @@ -803,8 +860,8 @@ TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8 // func cputicks() int64 TEXT runtime·cputicks(SB),NOSPLIT,$0-8 - TESTL $0x4000000, runtime·cpuid_edx(SB) // no sse2, no mfence - JEQ done + CMPB runtime·support_sse2(SB), $1 + JNE done CMPB runtime·lfenceBeforeRdtsc(SB), $1 JNE mfence BYTE $0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE @@ -1311,8 +1368,8 @@ TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 hugeloop: CMPL BX, $64 JB bigloop - TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 - JE bigloop + CMPB runtime·support_sse2(SB), $1 + JNE bigloop MOVOU (SI), X0 MOVOU (DI), X1 MOVOU 16(SI), X2 @@ -1455,8 +1512,8 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 JEQ allsame CMPL BP, $4 JB small - TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 - JE mediumloop + CMPB runtime·support_sse2(SB), $1 + JNE mediumloop largeloop: CMPL BP, $16 JB mediumloop diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 65bbf63bf10..0dc9a9c5427 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -26,10 +26,10 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 MOVQ SP, (g_stack+stack_hi)(DI) // find out information about the processor we're on - MOVQ $0, AX + MOVL $0, AX CPUID - MOVQ AX, SI - CMPQ AX, $0 + MOVL AX, SI + CMPL AX, $0 JE nocpuinfo // Figure out how to serialize RDTSC. @@ -46,62 +46,75 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 notintel: // Load EAX=1 cpuid flags - MOVQ $1, AX + MOVL $1, AX CPUID - MOVL AX, runtime·cpuid_eax(SB) + MOVL AX, runtime·processorVersionInfo(SB) MOVL CX, runtime·cpuid_ecx(SB) MOVL DX, runtime·cpuid_edx(SB) + TESTL $(1<<26), DX // SSE2 + SETNE runtime·support_sse2(SB) + + TESTL $(1<<9), CX // SSSE3 + SETNE runtime·support_ssse3(SB) + + TESTL $(1<<19), CX // SSE4.1 + SETNE runtime·support_sse41(SB) + + TESTL $(1<<20), CX // SSE4.2 + SETNE runtime·support_sse42(SB) + + TESTL $(1<<23), CX // POPCNT + SETNE runtime·support_popcnt(SB) + + TESTL $(1<<25), CX // AES + SETNE runtime·support_aes(SB) + + TESTL $(1<<27), CX // OSXSAVE + SETNE runtime·support_osxsave(SB) + + // If OS support for XMM and YMM is not present + // support_avx will be set back to false later. + TESTL $(1<<28), CX // AVX + SETNE runtime·support_avx(SB) + +eax7: // Load EAX=7/ECX=0 cpuid flags - CMPQ SI, $7 - JLT no7 + CMPL SI, $7 + JLT osavx MOVL $7, AX MOVL $0, CX CPUID MOVL BX, runtime·cpuid_ebx7(SB) -no7: - // Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1] - // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf - // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf - MOVL runtime·cpuid_ecx(SB), CX - ANDL $0x18000000, CX // check for OSXSAVE and AVX bits - CMPL CX, $0x18000000 - JNE noavx - MOVL $0, CX + + TESTL $(1<<3), BX // BMI1 + SETNE runtime·support_bmi1(SB) + + // If OS support for XMM and YMM is not present + // support_avx2 will be set back to false later. + TESTL $(1<<5), BX + SETNE runtime·support_avx2(SB) + + TESTL $(1<<8), BX // BMI2 + SETNE runtime·support_bmi2(SB) + + TESTL $(1<<9), BX // ERMS + SETNE runtime·support_erms(SB) + +osavx: + CMPB runtime·support_osxsave(SB), $1 + JNE noavx + MOVL $0, CX // For XGETBV, OSXSAVE bit is required and sufficient XGETBV - ANDL $6, AX - CMPL AX, $6 // Check for OS support of YMM registers - JNE noavx - MOVB $1, runtime·support_avx(SB) - TESTL $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit - JEQ noavx2 - MOVB $1, runtime·support_avx2(SB) - JMP testbmi1 + ANDL $6, AX + CMPL AX, $6 // Check for OS support of XMM and YMM registers. + JE nocpuinfo noavx: - MOVB $0, runtime·support_avx(SB) -noavx2: - MOVB $0, runtime·support_avx2(SB) -testbmi1: - // Detect BMI1 and BMI2 extensions as per - // 5.1.16.1 Detection of VEX-encoded GPR Instructions, - // LZCNT and TZCNT, PREFETCHW chapter of [1] - MOVB $0, runtime·support_bmi1(SB) - TESTL $(1<<3), runtime·cpuid_ebx7(SB) // check for BMI1 bit - JEQ testbmi2 - MOVB $1, runtime·support_bmi1(SB) -testbmi2: - MOVB $0, runtime·support_bmi2(SB) - TESTL $(1<<8), runtime·cpuid_ebx7(SB) // check for BMI2 bit - JEQ testpopcnt - MOVB $1, runtime·support_bmi2(SB) -testpopcnt: - MOVB $0, runtime·support_popcnt(SB) - TESTL $(1<<23), runtime·cpuid_ecx(SB) // check for POPCNT bit - JEQ nocpuinfo - MOVB $1, runtime·support_popcnt(SB) -nocpuinfo: - + MOVB $0, runtime·support_avx(SB) + MOVB $0, runtime·support_avx2(SB) + +nocpuinfo: // if there is an _cgo_init, call it. MOVQ _cgo_init(SB), AX TESTQ AX, AX @@ -1942,9 +1955,8 @@ success_avx2: VZEROUPPER JMP success sse42: - MOVL runtime·cpuid_ecx(SB), CX - ANDL $0x100000, CX - JZ no_sse42 + CMPB runtime·support_sse42(SB), $1 + JNE no_sse42 CMPQ AX, $12 // PCMPESTRI is slower than normal compare, // so using it makes sense only if we advance 4+ bytes per compare diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s index 14c22133843..e97674cc842 100644 --- a/src/runtime/asm_amd64p32.s +++ b/src/runtime/asm_amd64p32.s @@ -28,9 +28,9 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 MOVL SP, (g_stack+stack_hi)(DI) // find out information about the processor we're on - MOVQ $0, AX + MOVL $0, AX CPUID - CMPQ AX, $0 + CMPL AX, $0 JE nocpuinfo CMPL BX, $0x756E6547 // "Genu" @@ -42,13 +42,81 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 MOVB $1, runtime·isIntel(SB) notintel: - MOVQ $1, AX + // Load EAX=1 cpuid flags + MOVL $1, AX CPUID - MOVL AX, runtime·cpuid_eax(SB) + MOVL AX, runtime·processorVersionInfo(SB) MOVL CX, runtime·cpuid_ecx(SB) MOVL DX, runtime·cpuid_edx(SB) -nocpuinfo: - + + TESTL $(1<<26), DX // SSE2 + SETNE runtime·support_sse2(SB) + + TESTL $(1<<9), CX // SSSE3 + SETNE runtime·support_ssse3(SB) + + TESTL $(1<<19), CX // SSE4.1 + SETNE runtime·support_sse41(SB) + + TESTL $(1<<20), CX // SSE4.2 + SETNE runtime·support_sse42(SB) + + TESTL $(1<<23), CX // POPCNT + SETNE runtime·support_popcnt(SB) + + TESTL $(1<<25), CX // AES + SETNE runtime·support_aes(SB) + + TESTL $(1<<27), CX // OSXSAVE + SETNE runtime·support_osxsave(SB) + + // If OS support for XMM and YMM is not present + // support_avx will be set back to false later. + TESTL $(1<<28), CX // AVX + SETNE runtime·support_avx(SB) + +eax7: + // Load EAX=7/ECX=0 cpuid flags + CMPL SI, $7 + JLT osavx + MOVL $7, AX + MOVL $0, CX + CPUID + MOVL BX, runtime·cpuid_ebx7(SB) + + TESTL $(1<<3), BX // BMI1 + SETNE runtime·support_bmi1(SB) + + // If OS support for XMM and YMM is not present + // support_avx2 will be set back to false later. + TESTL $(1<<5), BX + SETNE runtime·support_avx2(SB) + + TESTL $(1<<8), BX // BMI2 + SETNE runtime·support_bmi2(SB) + + TESTL $(1<<9), BX // ERMS + SETNE runtime·support_erms(SB) + +osavx: + // nacl does not support XGETBV to test + // for XMM and YMM OS support. +#ifndef GOOS_nacl + CMPB runtime·support_osxsave(SB), $1 + JNE noavx + MOVL $0, CX + // For XGETBV, OSXSAVE bit is required and sufficient + XGETBV + ANDL $6, AX + CMPL AX, $6 // Check for OS support of XMM and YMM registers. + JE nocpuinfo +#endif +noavx: + MOVB $0, runtime·support_avx(SB) + MOVB $0, runtime·support_avx2(SB) + +nocpuinfo: + needtls: LEAL runtime·m0+m_tls(SB), DI CALL runtime·settls(SB) diff --git a/src/runtime/cpuflags_amd64.go b/src/runtime/cpuflags_amd64.go index 3a463487a9f..3e408dae5f1 100644 --- a/src/runtime/cpuflags_amd64.go +++ b/src/runtime/cpuflags_amd64.go @@ -8,13 +8,13 @@ var useAVXmemmove bool func init() { // Let's remove stepping and reserved fields - processorVersionInfo := cpuid_eax & 0x0FFF3FF0 + processor := processorVersionInfo & 0x0FFF3FF0 isIntelBridgeFamily := isIntel && - (processorVersionInfo == 0x206A0 || - processorVersionInfo == 0x206D0 || - processorVersionInfo == 0x306A0 || - processorVersionInfo == 0x306E0) + processor == 0x206A0 || + processor == 0x206D0 || + processor == 0x306A0 || + processor == 0x306E0 useAVXmemmove = support_avx && !isIntelBridgeFamily } diff --git a/src/runtime/memclr_386.s b/src/runtime/memclr_386.s index ef6e60287cd..1adb26b4524 100644 --- a/src/runtime/memclr_386.s +++ b/src/runtime/memclr_386.s @@ -27,8 +27,8 @@ tail: JBE _5through8 CMPL BX, $16 JBE _9through16 - TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 - JEQ nosse2 + CMPB runtime·support_sse2(SB), $1 + JNE nosse2 PXOR X0, X0 CMPL BX, $32 JBE _17through32 diff --git a/src/runtime/memmove_386.s b/src/runtime/memmove_386.s index b712ea182ae..e76201b48b4 100644 --- a/src/runtime/memmove_386.s +++ b/src/runtime/memmove_386.s @@ -49,8 +49,8 @@ tail: JBE move_5through8 CMPL BX, $16 JBE move_9through16 - TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 - JEQ nosse2 + CMPB runtime·support_sse2(SB), $1 + JNE nosse2 CMPL BX, $32 JBE move_17through32 CMPL BX, $64 @@ -71,8 +71,8 @@ nosse2: */ forward: // If REP MOVSB isn't fast, don't use it - TESTL $(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB - JEQ fwdBy4 + CMPB runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB + JNE fwdBy4 // Check alignment MOVL SI, AX diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s index 510d0d694b4..21bf8e47e0f 100644 --- a/src/runtime/memmove_amd64.s +++ b/src/runtime/memmove_amd64.s @@ -81,8 +81,8 @@ forward: JLS move_256through2048 // If REP MOVSB isn't fast, don't use it - TESTL $(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB - JEQ fwdBy8 + CMPB runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB + JNE fwdBy8 // Check alignment MOVL SI, AX diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 13360a9ad34..8c4d41d9289 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -728,17 +728,29 @@ var ( // Information about what cpu features are available. // Set on startup in asm_{386,amd64,amd64p32}.s. - cpuid_eax uint32 - cpuid_ecx uint32 - cpuid_edx uint32 - cpuid_ebx7 uint32 // not set on amd64p32 - isIntel bool - lfenceBeforeRdtsc bool - support_avx bool - support_avx2 bool - support_bmi1 bool - support_bmi2 bool - support_popcnt bool + // Packages outside the runtime should not use these + // as they are not an external api. + processorVersionInfo uint32 + isIntel bool + lfenceBeforeRdtsc bool + support_aes bool + support_avx bool + support_avx2 bool + support_bmi1 bool + support_bmi2 bool + support_erms bool + support_osxsave bool + support_popcnt bool + support_sse2 bool + support_sse41 bool + support_sse42 bool + support_ssse3 bool + + // TODO(moehrmann) delete below variables once external + // packages have their dependencies on these removed. + cpuid_ecx uint32 + cpuid_edx uint32 + cpuid_ebx7 uint32 // not set on amd64p32 goarm uint8 // set by cmd/link on arm systems framepointer_enabled bool // set by cmd/link