diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go
index 2829945af0b..0ceedcd7d22 100644
--- a/src/internal/cpu/cpu.go
+++ b/src/internal/cpu/cpu.go
@@ -56,32 +56,34 @@ var ARM struct {
 // The booleans in ARM64 contain the correspondingly named cpu feature bit.
 // The struct is padded to avoid false sharing.
 var ARM64 struct {
-	_           CacheLinePad
-	HasFP       bool
-	HasASIMD    bool
-	HasEVTSTRM  bool
-	HasAES      bool
-	HasPMULL    bool
-	HasSHA1     bool
-	HasSHA2     bool
-	HasCRC32    bool
-	HasATOMICS  bool
-	HasFPHP     bool
-	HasASIMDHP  bool
-	HasCPUID    bool
-	HasASIMDRDM bool
-	HasJSCVT    bool
-	HasFCMA     bool
-	HasLRCPC    bool
-	HasDCPOP    bool
-	HasSHA3     bool
-	HasSM3      bool
-	HasSM4      bool
-	HasASIMDDP  bool
-	HasSHA512   bool
-	HasSVE      bool
-	HasASIMDFHM bool
-	_           CacheLinePad
+	_            CacheLinePad
+	HasFP        bool
+	HasASIMD     bool
+	HasEVTSTRM   bool
+	HasAES       bool
+	HasPMULL     bool
+	HasSHA1      bool
+	HasSHA2      bool
+	HasCRC32     bool
+	HasATOMICS   bool
+	HasFPHP      bool
+	HasASIMDHP   bool
+	HasCPUID     bool
+	HasASIMDRDM  bool
+	HasJSCVT     bool
+	HasFCMA      bool
+	HasLRCPC     bool
+	HasDCPOP     bool
+	HasSHA3      bool
+	HasSM3       bool
+	HasSM4       bool
+	HasASIMDDP   bool
+	HasSHA512    bool
+	HasSVE       bool
+	HasASIMDFHM  bool
+	IsNeoverseN1 bool
+	IsZeus       bool
+	_            CacheLinePad
 }
 
 var MIPS64X struct {
diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go
index 533bea24700..8fde39f03e1 100644
--- a/src/internal/cpu/cpu_arm64.go
+++ b/src/internal/cpu/cpu_arm64.go
@@ -18,6 +18,7 @@ const (
 	hwcap_SHA2    = 1 << 6
 	hwcap_CRC32   = 1 << 7
 	hwcap_ATOMICS = 1 << 8
+	hwcap_CPUID   = 1 << 11
 )
 
 func doinit() {
@@ -28,6 +29,8 @@ func doinit() {
 		{Name: "sha2", Feature: &ARM64.HasSHA2},
 		{Name: "crc32", Feature: &ARM64.HasCRC32},
 		{Name: "atomics", Feature: &ARM64.HasATOMICS},
+		{Name: "isNeoverseN1", Feature: &ARM64.IsNeoverseN1},
+		{Name: "isZeus", Feature: &ARM64.IsZeus},
 	}
 
 	switch GOOS {
@@ -40,12 +43,32 @@ func doinit() {
 		ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1)
 		ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2)
 		ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32)
+		ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID)
 
 		// The Samsung S9+ kernel reports support for atomics, but not all cores
 		// actually support them, resulting in SIGILL. See issue #28431.
 		// TODO(elias.naur): Only disable the optimization on bad chipsets on android.
 		ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && GOOS != "android"
 
+		// Check whether we are running on a Neoverse N1 or Zeus core. Before
+		// reading MIDR_EL1, check the AUXV for the CPUID bit: getMIDR executes
+		// an MRS instruction that is normally illegal in user mode, but the
+		// kernel traps it, sanitizes the value and returns it. Without the
+		// CPUID bit the kernel does not trap the instruction and the process
+		// is terminated with SIGILL.
+		if ARM64.HasCPUID {
+			midr := getMIDR()
+			part_num := uint16((midr >> 4) & 0xfff)
+			implementor := byte((midr >> 24) & 0xff)
+
+			if implementor == 'A' && part_num == 0xd0c {
+				ARM64.IsNeoverseN1 = true
+			}
+			if implementor == 'A' && part_num == 0xd40 {
+				ARM64.IsZeus = true
+			}
+		}
+
 	case "freebsd":
 		// Retrieve info from system register ID_AA64ISAR0_EL1.
 		isar0 := getisar0()
@@ -93,3 +116,5 @@ func isSet(hwc uint, value uint) bool {
 }
 
 func getisar0() uint64
+
+func getMIDR() uint64
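// Illustrative sketch (not part of the patch): how the MIDR_EL1 fields used in
// doinit above are laid out. The implementer code sits in bits [31:24] and the
// part number in bits [15:4]; those are the only two fields the runtime looks
// at. The package and helper names below are hypothetical.
package midrsketch

type midrInfo struct {
	implementer byte   // 'A' (0x41) identifies Arm Ltd.
	partNum     uint16 // 0xd0c is the Neoverse N1, 0xd40 is the part the patch calls Zeus
}

// decodeMIDR extracts the implementer and part number with the same shifts
// and masks used in doinit.
func decodeMIDR(midr uint64) midrInfo {
	return midrInfo{
		implementer: byte((midr >> 24) & 0xff),
		partNum:     uint16((midr >> 4) & 0xfff),
	}
}

// isNeoverseN1 mirrors the comparison that sets ARM64.IsNeoverseN1.
func isNeoverseN1(midr uint64) bool {
	info := decodeMIDR(midr)
	return info.implementer == 'A' && info.partNum == 0xd0c
}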
diff --git a/src/internal/cpu/cpu_arm64.s b/src/internal/cpu/cpu_arm64.s
index d85914973f9..d6e7f443739 100644
--- a/src/internal/cpu/cpu_arm64.s
+++ b/src/internal/cpu/cpu_arm64.s
@@ -10,3 +10,9 @@ TEXT ·getisar0(SB),NOSPLIT,$0
 	MRS	ID_AA64ISAR0_EL1, R0
 	MOVD	R0, ret+0(FP)
 	RET
+
+// func getMIDR() uint64
+TEXT ·getMIDR(SB), NOSPLIT, $0-8
+	MRS	MIDR_EL1, R0
+	MOVD	R0, ret+0(FP)
+	RET
diff --git a/src/runtime/cpuflags_arm64.go b/src/runtime/cpuflags_arm64.go
new file mode 100644
index 00000000000..7576bef4a75
--- /dev/null
+++ b/src/runtime/cpuflags_arm64.go
@@ -0,0 +1,17 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"internal/cpu"
+)
+
+var arm64UseAlignedLoads bool
+
+func init() {
+	if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsZeus {
+		arm64UseAlignedLoads = true
+	}
+}
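// Illustrative sketch (not part of the patch): the decide-once-at-startup
// dispatch that cpuflags_arm64.go implements, restated as ordinary Go. The
// runtime variable is written during init and never changes afterwards, so the
// memmove assembly further below can read it with a single MOVBU and a
// well-predicted branch. All names here are hypothetical.
package dispatchsketch

var useAlignedLoads bool // written once in init, read-only afterwards

func init() {
	useAlignedLoads = detectAlignedLoadPreference()
}

// detectAlignedLoadPreference stands in for the cpu.ARM64.IsNeoverseN1 ||
// cpu.ARM64.IsZeus check; the real detection lives in internal/cpu.
func detectAlignedLoadPreference() bool { return false }

// copyBytes picks an implementation per call, the way memmove chooses between
// aligning its loads and aligning its stores for large copies.
func copyBytes(dst, src []byte) int {
	if useAlignedLoads {
		return copyAligned(dst, src)
	}
	return copyUnaligned(dst, src)
}

func copyAligned(dst, src []byte) int   { return copy(dst, src) }
func copyUnaligned(dst, src []byte) int { return copy(dst, src) }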
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
index dbb7e9a28a0..43d27629e5b 100644
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -6,152 +6,236 @@
 
 // See memmove Go doc for important implementation constraints.
 
+// Register map
+//
+// dstin  R0
+// src    R1
+// count  R2
+// dst    R3 (same as R0, but gets modified in unaligned cases)
+// srcend R4
+// dstend R5
+// data   R6-R17
+// tmp1   R14
+
+// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+// copies of up to 128 bytes, and large copies. The overhead of the overlap
+// check is negligible since it is only required for large copies.
+//
+// Large copies use a software pipelined loop processing 64 bytes per iteration.
+// The destination pointer is 16-byte aligned to minimize unaligned accesses.
+// The loop tail is handled by always copying 64 bytes from the end.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
-	MOVD	to+0(FP), R3
-	MOVD	from+8(FP), R4
-	MOVD	n+16(FP), R5
-	CBNZ	R5, check
-	RET
+	MOVD	to+0(FP), R0
+	MOVD	from+8(FP), R1
+	MOVD	n+16(FP), R2
+	CBZ	R2, copy0
 
-check:
-	CMP	$16, R5
+	// Small copies: 1..16 bytes
+	CMP	$16, R2
 	BLE	copy16
 
-	AND	$~31, R5, R7	// R7 is N&~31
-	SUB	R7, R5, R6	// R6 is N&31
+	// Large copies
+	CMP	$128, R2
+	BHI	copy_long
+	CMP	$32, R2
+	BHI	copy32_128
 
-	CMP	R3, R4
-	BLT	backward
-
-	// Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
-	// R3 and R4 are advanced as we copy.
-
-	// (There may be implementations of armv8 where copying by bytes until
-	// at least one of source or dest is word aligned is a worthwhile
-	// optimization, but the on the one tested so far (xgene) it did not
-	// make a significance difference.)
-
-	CBZ	R7, noforwardlarge	// Do we need to do any quadword copying?
-
-	ADD	R3, R7, R9	// R9 points just past where we copy by word
-
-forwardlargeloop:
-	// Copy 32 bytes at a time.
-	LDP.P	32(R4), (R8, R10)
-	STP.P	(R8, R10), 32(R3)
-	LDP	-16(R4), (R11, R12)
-	STP	(R11, R12), -16(R3)
-	SUB	$32, R7, R7
-	CBNZ	R7, forwardlargeloop
-
-noforwardlarge:
-	CBNZ	R6, forwardtail	// Do we need to copy any tail bytes?
+	// Small copies: 17..32 bytes.
+	LDP	(R1), (R6, R7)
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	LDP	-16(R4), (R12, R13)
+	STP	(R6, R7), (R0)
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	STP	(R12, R13), -16(R5)
 	RET
 
-forwardtail:
-	// There are R6 <= 31 bytes remaining to copy.
-	// This is large enough to still contain pointers,
-	// which must be copied atomically.
-	// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
-	TBZ	$4, R6, 3(PC)	// write 16 bytes if R6&16 != 0
-	LDP.P	16(R4), (R8, R10)
-	STP.P	(R8, R10), 16(R3)
-
-	TBZ	$3, R6, 3(PC)	// write 8 bytes if R6&8 != 0
-	MOVD.P	8(R4), R8
-	MOVD.P	R8, 8(R3)
-
-	AND	$7, R6
-	CBNZ	R6, 2(PC)
-	RET
-
-	ADD	R3, R6, R9	// R9 points just past the destination memory
-
-forwardtailloop:
-	MOVBU.P	1(R4), R8
-	MOVBU.P	R8, 1(R3)
-	CMP	R3, R9
-	BNE	forwardtailloop
-	RET
-
-	// Small copies: 1..16 bytes.
+// Small copies: 1..16 bytes.
 copy16:
-	ADD	R4, R5, R8	// R8 points just past the last source byte
-	ADD	R3, R5, R9	// R9 points just past the last destination byte
-	CMP	$8, R5
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	CMP	$8, R2
 	BLT	copy7
-	MOVD	(R4), R6
-	MOVD	-8(R8), R7
-	MOVD	R6, (R3)
-	MOVD	R7, -8(R9)
+	MOVD	(R1), R6
+	MOVD	-8(R4), R7
+	MOVD	R6, (R0)
+	MOVD	R7, -8(R5)
 	RET
 
 copy7:
-	TBZ	$2, R5, copy3
-	MOVWU	(R4), R6
-	MOVWU	-4(R8), R7
-	MOVW	R6, (R3)
-	MOVW	R7, -4(R9)
+	TBZ	$2, R2, copy3
+	MOVWU	(R1), R6
+	MOVWU	-4(R4), R7
+	MOVW	R6, (R0)
+	MOVW	R7, -4(R5)
 	RET
 
 copy3:
-	TBZ	$1, R5, copy1
-	MOVHU	(R4), R6
-	MOVHU	-2(R8), R7
-	MOVH	R6, (R3)
-	MOVH	R7, -2(R9)
+	TBZ	$1, R2, copy1
+	MOVHU	(R1), R6
+	MOVHU	-2(R4), R7
+	MOVH	R6, (R0)
+	MOVH	R7, -2(R5)
 	RET
 
 copy1:
-	MOVBU	(R4), R6
-	MOVB	R6, (R3)
+	MOVBU	(R1), R6
+	MOVB	R6, (R0)
+
+copy0:
 	RET
 
-backward:
-	// Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
-	// R3 and R4 are advanced to the end of the destination/source buffers
-	// respectively and moved back as we copy.
-
-	ADD	R4, R5, R4	// R4 points just past the last source byte
-	ADD	R3, R5, R3	// R3 points just past the last destination byte
-
-	CBZ	R6, nobackwardtail	// Do we need to do any byte-by-byte copying?
-
-	AND	$7, R6, R12
-	CBZ	R12, backwardtaillarge
-
-	SUB	R12, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
-backwardtailloop:
-	// Copy sub-pointer-size tail.
-	MOVBU.W	-1(R4), R8
-	MOVBU.W	R8, -1(R3)
-	CMP	R9, R3
-	BNE	backwardtailloop
-
-backwardtaillarge:
-	// Do 8/16-byte write if possible.
-	// See comment at forwardtail.
-	TBZ	$3, R6, 3(PC)
-	MOVD.W	-8(R4), R8
-	MOVD.W	R8, -8(R3)
-
-	TBZ	$4, R6, 3(PC)
-	LDP.W	-16(R4), (R8, R10)
-	STP.W	(R8, R10), -16(R3)
-
-nobackwardtail:
-	CBNZ	R7, backwardlarge	// Do we need to do any doubleword-by-doubleword copying?
+	// Medium copies: 33..128 bytes.
+copy32_128:
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	LDP	(R1), (R6, R7)
+	LDP	16(R1), (R8, R9)
+	LDP	-32(R4), (R10, R11)
+	LDP	-16(R4), (R12, R13)
+	CMP	$64, R2
+	BHI	copy128
+	STP	(R6, R7), (R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R10, R11), -32(R5)
+	STP	(R12, R13), -16(R5)
 	RET
 
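// Illustrative sketch (not part of the patch): the overlapping head/tail trick
// used by the 17..32 byte case and by copy32_128 above, written in Go. For
// 17 <= n <= 32, copying the first 16 and the last 16 bytes covers the whole
// range; when n < 32 the two stores simply overlap. Loading both halves before
// storing keeps the result correct even when dst and src overlap. The package
// and function names are hypothetical.
package smallcopysketch

// copy17to32 copies n bytes (17 <= n <= 32) from src to dst with two 16-byte
// moves, mirroring the "Small copies: 17..32 bytes" block in the assembly.
func copy17to32(dst, src []byte, n int) {
	var head, tail [16]byte
	copy(head[:], src[:16])    // Load A: first 16 source bytes
	copy(tail[:], src[n-16:n]) // Load B: last 16 source bytes
	copy(dst[:16], head[:])    // Store A
	copy(dst[n-16:n], tail[:]) // Store B, overlapping Store A when n < 32
}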
-backwardlarge:
-	SUB	R7, R3, R9	// R9 points at the lowest destination byte
+	// Copy 65..128 bytes.
+copy128:
+	LDP	32(R1), (R14, R15)
+	LDP	48(R1), (R16, R17)
+	CMP	$96, R2
+	BLS	copy96
+	LDP	-64(R4), (R2, R3)
+	LDP	-48(R4), (R1, R4)
+	STP	(R2, R3), -64(R5)
+	STP	(R1, R4), -48(R5)
 
-backwardlargeloop:
-	LDP	-16(R4), (R8, R10)
-	STP	(R8, R10), -16(R3)
-	LDP.W	-32(R4), (R11, R12)
-	STP.W	(R11, R12), -32(R3)
-	CMP	R9, R3
-	BNE	backwardlargeloop
+copy96:
+	STP	(R6, R7), (R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R14, R15), 32(R0)
+	STP	(R16, R17), 48(R0)
+	STP	(R10, R11), -32(R5)
+	STP	(R12, R13), -16(R5)
+	RET
+
+	// Copy more than 128 bytes.
+copy_long:
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	MOVD	ZR, R7
+	MOVD	ZR, R8
+
+	CMP	$1024, R2
+	BLT	backward_check
+	// feature detect to decide how to align
+	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
+	CBNZ	R6, use_aligned_loads
+	MOVD	R0, R7
+	MOVD	R5, R8
+	B	backward_check
+use_aligned_loads:
+	MOVD	R1, R7
+	MOVD	R4, R8
+	// R7 and R8 are used here for the realignment calculation. In
+	// the use_aligned_loads case, R7 is the src pointer and R8 is
+	// the srcend pointer, which is used in the backward copy case.
+	// When doing aligned stores, R7 is the dst pointer and R8 is
+	// the dstend pointer.
+
+backward_check:
+	// Use backward copy if there is an overlap.
+	SUB	R1, R0, R14
+	CBZ	R14, copy0
+	CMP	R2, R14
+	BCC	copy_long_backward
+
+	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
+	LDP	(R1), (R12, R13)	// Load A
+	AND	$15, R7, R14		// Calculate the realignment offset
+	SUB	R14, R1, R1
+	SUB	R14, R0, R3		// move dst back same amount as src
+	ADD	R14, R2, R2
+	LDP	16(R1), (R6, R7)	// Load B
+	STP	(R12, R13), (R0)	// Store A
+	LDP	32(R1), (R8, R9)	// Load C
+	LDP	48(R1), (R10, R11)	// Load D
+	LDP.W	64(R1), (R12, R13)	// Load E
+	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
+	SUBS	$144, R2, R2
+	BLS	copy64_from_end
+
+loop64:
+	STP	(R6, R7), 16(R3)	// Store B
+	LDP	16(R1), (R6, R7)	// Load B (next iteration)
+	STP	(R8, R9), 32(R3)	// Store C
+	LDP	32(R1), (R8, R9)	// Load C
+	STP	(R10, R11), 48(R3)	// Store D
+	LDP	48(R1), (R10, R11)	// Load D
+	STP.W	(R12, R13), 64(R3)	// Store E
+	LDP.W	64(R1), (R12, R13)	// Load E
+	SUBS	$64, R2, R2
+	BHI	loop64
+
+	// Write the last iteration and copy 64 bytes from the end.
+copy64_from_end:
+	LDP	-64(R4), (R14, R15)	// Load F
+	STP	(R6, R7), 16(R3)	// Store B
+	LDP	-48(R4), (R6, R7)	// Load G
+	STP	(R8, R9), 32(R3)	// Store C
+	LDP	-32(R4), (R8, R9)	// Load H
+	STP	(R10, R11), 48(R3)	// Store D
+	LDP	-16(R4), (R10, R11)	// Load I
+	STP	(R12, R13), 64(R3)	// Store E
+	STP	(R14, R15), -64(R5)	// Store F
+	STP	(R6, R7), -48(R5)	// Store G
+	STP	(R8, R9), -32(R5)	// Store H
+	STP	(R10, R11), -16(R5)	// Store I
+	RET
+
+	// Large backward copy for overlapping copies.
+	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
+copy_long_backward:
+	LDP	-16(R4), (R12, R13)
+	AND	$15, R8, R14
+	SUB	R14, R4, R4
+	SUB	R14, R2, R2
+	LDP	-16(R4), (R6, R7)
+	STP	(R12, R13), -16(R5)
+	LDP	-32(R4), (R8, R9)
+	LDP	-48(R4), (R10, R11)
+	LDP.W	-64(R4), (R12, R13)
+	SUB	R14, R5, R5
+	SUBS	$128, R2, R2
+	BLS	copy64_from_start
+
+loop64_backward:
+	STP	(R6, R7), -16(R5)
+	LDP	-16(R4), (R6, R7)
+	STP	(R8, R9), -32(R5)
+	LDP	-32(R4), (R8, R9)
+	STP	(R10, R11), -48(R5)
+	LDP	-48(R4), (R10, R11)
+	STP.W	(R12, R13), -64(R5)
+	LDP.W	-64(R4), (R12, R13)
+	SUBS	$64, R2, R2
+	BHI	loop64_backward
+
+	// Write the last iteration and copy 64 bytes from the start.
+copy64_from_start:
+	LDP	48(R1), (R2, R3)
+	STP	(R6, R7), -16(R5)
+	LDP	32(R1), (R6, R7)
+	STP	(R8, R9), -32(R5)
+	LDP	16(R1), (R8, R9)
+	STP	(R10, R11), -48(R5)
+	LDP	(R1), (R10, R11)
+	STP	(R12, R13), -64(R5)
+	STP	(R2, R3), 48(R0)
+	STP	(R6, R7), 32(R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R10, R11), (R0)
 	RET
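// Illustrative sketch (not part of the patch): the overlap test performed at
// backward_check in the assembly above. dst-src is computed as an unsigned
// value, so it is smaller than n exactly when dst lies inside [src, src+n);
// that is the case where a forward copy would overwrite source bytes before
// reading them, so the backward path is taken instead. The package and
// function names are hypothetical.
package overlapsketch

import "unsafe"

// needsBackwardCopy reports whether an n-byte move from src to dst must run
// from high addresses down, mirroring the SUB/CMP/BCC sequence. The dst == src
// case is excluded because the assembly branches to copy0 and does nothing.
func needsBackwardCopy(dst, src unsafe.Pointer, n uintptr) bool {
	diff := uintptr(dst) - uintptr(src) // wraps around when dst < src
	return diff != 0 && diff < n
}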
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index b549433f71c..7c9d2ada45f 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -286,6 +286,9 @@ var bufSizes = []int{
 	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
 	16, 32, 64, 128, 256, 512, 1024, 2048, 4096,
 }
+var bufSizesOverlap = []int{
+	32, 64, 128, 256, 512, 1024, 2048, 4096,
+}
 
 func BenchmarkMemmove(b *testing.B) {
 	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
@@ -297,6 +300,15 @@ func BenchmarkMemmove(b *testing.B) {
 	})
 }
 
+func BenchmarkMemmoveOverlap(b *testing.B) {
+	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+		x := make([]byte, n+16)
+		for i := 0; i < b.N; i++ {
+			copy(x[16:n+16], x[:n])
+		}
+	})
+}
+
 func BenchmarkMemmoveUnalignedDst(b *testing.B) {
 	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
 		x := make([]byte, n+1)
@@ -307,6 +319,15 @@ func BenchmarkMemmoveUnalignedDst(b *testing.B) {
 	})
 }
 
+func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) {
+	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+		x := make([]byte, n+16)
+		for i := 0; i < b.N; i++ {
+			copy(x[16:n+16], x[1:n+1])
+		}
+	})
+}
+
 func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
 	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
 		x := make([]byte, n)
@@ -317,6 +338,15 @@ func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
 	})
 }
 
+func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) {
+	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+		x := make([]byte, n+1)
+		for i := 0; i < b.N; i++ {
+			copy(x[1:n+1], x[:n])
+		}
+	})
+}
+
 func TestMemclr(t *testing.T) {
 	size := 512
 	if testing.Short() {
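// Illustrative sketch (not part of the patch): the overlap benchmarks added
// above always move data toward higher addresses (dst > src), which drives the
// copy_long_backward path for large sizes. A hypothetical companion that moves
// data toward lower addresses would exercise the forward large-copy loop on
// overlapping buffers instead. It would slot into memmove_test.go next to the
// benchmarks above and reuses benchmarkSizes and bufSizesOverlap from there.
func BenchmarkMemmoveOverlapTowardLowerAddresses(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+16)
		for i := 0; i < b.N; i++ {
			copy(x[:n], x[16:n+16])
		}
	})
}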