
runtime: improve memmove performance on arm64

Replace the memmove implementation for moves of 17 bytes or larger
with an implementation from ARM's optimized software routines. Moves of
16 bytes or fewer are unchanged, but the registers used are updated to
match the rest of the implementation.

This implementation makes use of new optimizations:
 - software pipelined loop for large (>128 byte) moves
 - medium size moves (17..128 bytes) have a new implementation
 - address realignment when src or dst is unaligned
 - preference for aligned src (loads) or dst (stores) depending on CPU

To support the preference for aligned loads or aligned stores, a new CPU
flag is added. This flag indicates that the detected microarchitecture
performs better with aligned loads. Some tested CPUs did not exhibit a
significant difference and are left with the default behavior of
realigning based on the destination address (stores).
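
For reference, the detection added in internal/cpu keys off MIDR_EL1: the
implementer byte sits in bits [31:24] and the part number in bits [15:4].
A minimal Go sketch of that decoding, using the part numbers from the diff
below (0xd0c for Neoverse N1, 0xd40 for Zeus); the helper name is
illustrative only:

// decodeMIDR mirrors the detection logic added to doinit in internal/cpu.
// midr is the raw MIDR_EL1 value returned by getMIDR.
func decodeMIDR(midr uint64) (isNeoverseN1, isZeus bool) {
	partNum := uint16((midr >> 4) & 0xfff)   // bits [15:4]
	implementor := byte((midr >> 24) & 0xff) // bits [31:24], 'A' = Arm Ltd.
	isNeoverseN1 = implementor == 'A' && partNum == 0xd0c
	isZeus = implementor == 'A' && partNum == 0xd40
	return isNeoverseN1, isZeus
}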

Neoverse N1 (Tested on Graviton 2)
name                               old time/op    new time/op     delta
Memmove/0-4                          1.88ns ± 1%     1.87ns ± 1%   -0.58%  (p=0.020 n=10+10)
Memmove/1-4                          4.40ns ± 0%     4.40ns ± 0%     ~     (all equal)
Memmove/8-4                          3.88ns ± 3%     3.80ns ± 0%   -1.97%  (p=0.001 n=10+9)
Memmove/16-4                         3.90ns ± 3%     3.80ns ± 0%   -2.49%  (p=0.000 n=10+9)
Memmove/32-4                         4.80ns ± 0%     4.40ns ± 0%   -8.33%  (p=0.000 n=9+8)
Memmove/64-4                         5.86ns ± 0%     5.00ns ± 0%  -14.76%  (p=0.000 n=8+8)
Memmove/128-4                        8.46ns ± 0%     8.06ns ± 0%   -4.62%  (p=0.000 n=10+10)
Memmove/256-4                        12.4ns ± 0%     12.2ns ± 0%   -1.61%  (p=0.000 n=10+10)
Memmove/512-4                        19.5ns ± 0%     19.1ns ± 0%   -2.05%  (p=0.000 n=10+10)
Memmove/1024-4                       33.7ns ± 0%     33.5ns ± 0%   -0.59%  (p=0.000 n=10+10)
Memmove/2048-4                       62.1ns ± 0%     59.0ns ± 0%   -4.99%  (p=0.000 n=10+10)
Memmove/4096-4                        117ns ± 1%      110ns ± 0%   -5.66%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4             6.41ns ± 0%     5.62ns ± 0%  -12.32%  (p=0.000 n=10+7)
MemmoveUnalignedDst/128-4            9.40ns ± 0%     8.34ns ± 0%  -11.24%  (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4            12.8ns ± 0%     12.8ns ± 0%     ~     (all equal)
MemmoveUnalignedDst/512-4            20.4ns ± 0%     19.7ns ± 0%   -3.43%  (p=0.000 n=9+10)
MemmoveUnalignedDst/1024-4           34.1ns ± 0%     35.1ns ± 0%   +2.93%  (p=0.000 n=9+9)
MemmoveUnalignedDst/2048-4           61.5ns ± 0%     60.4ns ± 0%   -1.77%  (p=0.000 n=10+10)
MemmoveUnalignedDst/4096-4            122ns ± 0%      113ns ± 0%   -7.38%  (p=0.002 n=8+10)
MemmoveUnalignedSrc/64-4             7.25ns ± 1%     6.26ns ± 0%  -13.64%  (p=0.000 n=9+9)
MemmoveUnalignedSrc/128-4            10.5ns ± 0%      9.7ns ± 0%   -7.52%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4            17.1ns ± 0%     17.3ns ± 0%   +1.17%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/512-4            27.0ns ± 0%     27.0ns ± 0%     ~     (all equal)
MemmoveUnalignedSrc/1024-4           46.7ns ± 0%     35.7ns ± 0%  -23.55%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/2048-4           85.2ns ± 0%     61.2ns ± 0%  -28.17%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/4096-4            162ns ± 0%      113ns ± 0%  -30.25%  (p=0.000 n=10+10)

name                               old speed      new speed       delta
Memmove/4096-4                     35.2GB/s ± 0%   37.1GB/s ± 0%   +5.56%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/1024-4         21.9GB/s ± 0%   28.7GB/s ± 0%  +30.90%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/2048-4         24.0GB/s ± 0%   33.5GB/s ± 0%  +39.18%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/4096-4         25.3GB/s ± 0%   36.2GB/s ± 0%  +43.50%  (p=0.000 n=10+7)

Cortex-A72 (Graviton 1)
name                               old time/op    new time/op    delta
Memmove/0-4                          3.06ns ± 3%    3.08ns ± 1%     ~     (p=0.958 n=10+9)
Memmove/1-4                          8.72ns ± 0%    7.85ns ± 0%   -9.98%  (p=0.002 n=8+10)
Memmove/8-4                          8.29ns ± 0%    8.29ns ± 0%     ~     (all equal)
Memmove/16-4                         8.29ns ± 0%    8.29ns ± 0%     ~     (all equal)
Memmove/32-4                         8.19ns ± 2%    8.29ns ± 0%     ~     (p=0.114 n=10+10)
Memmove/64-4                         18.3ns ± 4%    10.0ns ± 0%  -45.36%  (p=0.000 n=10+10)
Memmove/128-4                        14.8ns ± 0%    17.4ns ± 0%  +17.77%  (p=0.000 n=10+10)
Memmove/256-4                        21.8ns ± 0%    23.1ns ± 0%   +5.96%  (p=0.000 n=10+10)
Memmove/512-4                        35.8ns ± 0%    37.2ns ± 0%   +3.91%  (p=0.000 n=10+10)
Memmove/1024-4                       63.7ns ± 0%    67.2ns ± 0%   +5.49%  (p=0.000 n=10+10)
Memmove/2048-4                        126ns ± 0%     123ns ± 0%   -2.38%  (p=0.000 n=10+10)
Memmove/4096-4                        238ns ± 1%     243ns ± 1%   +1.93%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4             19.3ns ± 1%    12.0ns ± 1%  -37.49%  (p=0.000 n=10+10)
MemmoveUnalignedDst/128-4            17.2ns ± 0%    17.4ns ± 0%   +1.16%  (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4            28.2ns ± 8%    29.2ns ± 0%     ~     (p=0.352 n=10+10)
MemmoveUnalignedDst/512-4            49.8ns ± 3%    48.9ns ± 0%     ~     (p=1.000 n=10+10)
MemmoveUnalignedDst/1024-4           89.5ns ± 0%    80.5ns ± 1%  -10.02%  (p=0.000 n=10+10)
MemmoveUnalignedDst/2048-4            180ns ± 0%     127ns ± 0%  -29.44%  (p=0.000 n=9+10)
MemmoveUnalignedDst/4096-4            347ns ± 0%     244ns ± 0%  -29.59%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/128-4            16.1ns ± 0%    21.8ns ± 0%  +35.40%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4            24.9ns ± 8%    26.6ns ± 0%   +6.70%  (p=0.015 n=10+10)
MemmoveUnalignedSrc/512-4            39.4ns ± 6%    40.6ns ± 0%     ~     (p=0.352 n=10+10)
MemmoveUnalignedSrc/1024-4           72.5ns ± 0%    83.0ns ± 1%  +14.44%  (p=0.000 n=9+10)
MemmoveUnalignedSrc/2048-4            129ns ± 1%     128ns ± 1%     ~     (p=0.179 n=10+10)
MemmoveUnalignedSrc/4096-4            241ns ± 0%     253ns ± 1%   +4.99%  (p=0.000 n=9+9)

Cortex-A53 (Raspberry Pi 3)
name                               old time/op    new time/op    delta
Memmove/0-4                          11.0ns ± 0%    11.0ns ± 1%     ~     (p=0.294 n=8+10)
Memmove/1-4                          29.6ns ± 0%    28.0ns ± 1%   -5.41%  (p=0.000 n=9+10)
Memmove/8-4                          23.5ns ± 0%    22.1ns ± 0%   -6.11%  (p=0.000 n=8+8)
Memmove/16-4                         23.7ns ± 1%    22.1ns ± 0%   -6.59%  (p=0.000 n=10+8)
Memmove/32-4                         27.9ns ± 0%    27.1ns ± 0%   -3.13%  (p=0.000 n=8+8)
Memmove/64-4                         33.8ns ± 0%    31.5ns ± 1%   -6.99%  (p=0.000 n=8+10)
Memmove/128-4                        45.6ns ± 0%    44.2ns ± 1%   -3.23%  (p=0.000 n=9+10)
Memmove/256-4                        69.3ns ± 0%    69.3ns ± 0%     ~     (p=0.072 n=8+8)
Memmove/512-4                         127ns ± 0%     110ns ± 0%  -13.39%  (p=0.000 n=8+8)
Memmove/1024-4                        222ns ± 0%     205ns ± 1%   -7.66%  (p=0.000 n=7+10)
Memmove/2048-4                        411ns ± 0%     366ns ± 0%  -10.98%  (p=0.000 n=8+9)
Memmove/4096-4                        795ns ± 1%     695ns ± 1%  -12.63%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4             44.0ns ± 0%    40.5ns ± 0%   -7.93%  (p=0.000 n=8+8)
MemmoveUnalignedDst/128-4            59.6ns ± 0%    54.9ns ± 0%   -7.85%  (p=0.000 n=9+9)
MemmoveUnalignedDst/256-4            98.2ns ±11%    90.0ns ± 1%     ~     (p=0.130 n=10+10)
MemmoveUnalignedDst/512-4             161ns ± 2%     145ns ± 1%   -9.96%  (p=0.000 n=10+10)
MemmoveUnalignedDst/1024-4            281ns ± 0%     265ns ± 0%   -5.65%  (p=0.000 n=9+8)
MemmoveUnalignedDst/2048-4            528ns ± 0%     482ns ± 0%   -8.73%  (p=0.000 n=8+9)
MemmoveUnalignedDst/4096-4           1.02µs ± 1%    0.92µs ± 0%  -10.00%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/64-4             42.4ns ± 1%    40.5ns ± 0%   -4.39%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/128-4            57.4ns ± 0%    57.0ns ± 1%   -0.75%  (p=0.048 n=9+10)
MemmoveUnalignedSrc/256-4            88.1ns ± 1%    89.6ns ± 0%   +1.70%  (p=0.000 n=9+8)
MemmoveUnalignedSrc/512-4             160ns ± 2%     144ns ± 0%   -9.89%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/1024-4            286ns ± 0%     266ns ± 1%   -6.69%  (p=0.000 n=8+10)
MemmoveUnalignedSrc/2048-4            525ns ± 0%     483ns ± 1%   -7.96%  (p=0.000 n=9+10)
MemmoveUnalignedSrc/4096-4           1.01µs ± 0%    0.92µs ± 1%   -9.40%  (p=0.000 n=8+10)

Change-Id: Ia1144e9d4dfafdece6e167c5e576bf80f254c8ab
Reviewed-on: https://go-review.googlesource.com/c/go/+/243357
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Martin Möhrmann <moehrmann@google.com>
Reviewed-by: eric fang <eric.fang@arm.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Author: Jonathan Swinney, 2020-10-30 18:46:23 +00:00 (committed by Cherry Zhang)
parent 7be8358f70
commit d5388e23b5
6 changed files with 314 additions and 150 deletions


@@ -56,32 +56,34 @@ var ARM struct {
// The booleans in ARM64 contain the correspondingly named cpu feature bit.
// The struct is padded to avoid false sharing.
var ARM64 struct {
_ CacheLinePad
HasFP bool
HasASIMD bool
HasEVTSTRM bool
HasAES bool
HasPMULL bool
HasSHA1 bool
HasSHA2 bool
HasCRC32 bool
HasATOMICS bool
HasFPHP bool
HasASIMDHP bool
HasCPUID bool
HasASIMDRDM bool
HasJSCVT bool
HasFCMA bool
HasLRCPC bool
HasDCPOP bool
HasSHA3 bool
HasSM3 bool
HasSM4 bool
HasASIMDDP bool
HasSHA512 bool
HasSVE bool
HasASIMDFHM bool
_ CacheLinePad
_ CacheLinePad
HasFP bool
HasASIMD bool
HasEVTSTRM bool
HasAES bool
HasPMULL bool
HasSHA1 bool
HasSHA2 bool
HasCRC32 bool
HasATOMICS bool
HasFPHP bool
HasASIMDHP bool
HasCPUID bool
HasASIMDRDM bool
HasJSCVT bool
HasFCMA bool
HasLRCPC bool
HasDCPOP bool
HasSHA3 bool
HasSM3 bool
HasSM4 bool
HasASIMDDP bool
HasSHA512 bool
HasSVE bool
HasASIMDFHM bool
IsNeoverseN1 bool
IsZeus bool
_ CacheLinePad
}
var MIPS64X struct {


@@ -18,6 +18,7 @@ const (
	hwcap_SHA2    = 1 << 6
	hwcap_CRC32   = 1 << 7
	hwcap_ATOMICS = 1 << 8
	hwcap_CPUID   = 1 << 11
)

func doinit() {
@@ -28,6 +29,8 @@ func doinit() {
		{Name: "sha2", Feature: &ARM64.HasSHA2},
		{Name: "crc32", Feature: &ARM64.HasCRC32},
		{Name: "atomics", Feature: &ARM64.HasATOMICS},
		{Name: "isNeoverseN1", Feature: &ARM64.IsNeoverseN1},
		{Name: "isZeus", Feature: &ARM64.IsZeus},
	}

	switch GOOS {
@@ -40,12 +43,32 @@ func doinit() {
		ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1)
		ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2)
		ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32)
		ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID)

		// The Samsung S9+ kernel reports support for atomics, but not all cores
		// actually support them, resulting in SIGILL. See issue #28431.
		// TODO(elias.naur): Only disable the optimization on bad chipsets on android.
		ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && GOOS != "android"

		// To identify a Neoverse N1 or Zeus core we need to read MIDR_EL1, and
		// to do that safely we must first check the AUXV for the CPUID bit. The
		// getMIDR function executes an instruction which would normally be
		// illegal, but the kernel traps it, sanitizes the value and returns it.
		// Without the CPUID bit the kernel will not trap the instruction and
		// the process would be terminated with SIGILL.
		if ARM64.HasCPUID {
			midr := getMIDR()
			part_num := uint16((midr >> 4) & 0xfff)
			implementor := byte((midr >> 24) & 0xff)

			if implementor == 'A' && part_num == 0xd0c {
				ARM64.IsNeoverseN1 = true
			}
			if implementor == 'A' && part_num == 0xd40 {
				ARM64.IsZeus = true
			}
		}

	case "freebsd":
		// Retrieve info from system register ID_AA64ISAR0_EL1.
		isar0 := getisar0()
@@ -93,3 +116,5 @@ func isSet(hwc uint, value uint) bool {
}

func getisar0() uint64

func getMIDR() uint64


@@ -10,3 +10,9 @@ TEXT ·getisar0(SB),NOSPLIT,$0
MRS ID_AA64ISAR0_EL1, R0
MOVD R0, ret+0(FP)
RET
// func getMIDR() uint64
TEXT ·getMIDR(SB), NOSPLIT, $0-8
MRS MIDR_EL1, R0
MOVD R0, ret+0(FP)
RET


@@ -0,0 +1,17 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"internal/cpu"
)

var arm64UseAlignedLoads bool

func init() {
	if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsZeus {
		arm64UseAlignedLoads = true
	}
}


@@ -6,152 +6,236 @@
// See memmove Go doc for important implementation constraints.
// Register map
//
// dstin R0
// src R1
// count R2
// dst R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data R6-R17
// tmp1 R14
// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.
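
As a rough orientation before the assembly, the dispatch described above can
be sketched in Go. This is only an illustration of the size classes and of
the copy-the-tail-from-the-end trick for the forward path; it ignores the
overlap check, the 16-byte realignment, and the requirement that
pointer-sized words be copied atomically:

// memmoveForwardSketch mirrors the size-class dispatch of the assembly for
// non-overlapping dst/src of length n.
func memmoveForwardSketch(dst, src []byte, n int) {
	if n == 0 {
		return
	}
	if n <= 32 {
		// Small copies: at most two (possibly overlapping) 16-byte chunks,
		// one anchored at the start and one at the end.
		copy(dst[:n], src[:n])
		return
	}
	if n <= 128 {
		// Medium copies: a chunk anchored at the start and one anchored at
		// the end (the assembly uses 32- or 64-byte chunks depending on n);
		// the two chunks may overlap in the middle.
		k := 32
		if n > 64 {
			k = 64
		}
		copy(dst[:k], src[:k])
		copy(dst[n-k:n], src[n-k:n])
		return
	}
	// Large copies: the main loop moves 64 bytes per iteration (software
	// pipelined in the assembly), and the tail is handled by always copying
	// the last 64 bytes from the end, which may overlap the final iteration.
	i := 0
	for ; n-i > 64; i += 64 {
		copy(dst[i:i+64], src[i:i+64])
	}
	copy(dst[n-64:n], src[n-64:n])
}
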
// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
MOVD to+0(FP), R3
MOVD from+8(FP), R4
MOVD n+16(FP), R5
CBNZ R5, check
RET
MOVD to+0(FP), R0
MOVD from+8(FP), R1
MOVD n+16(FP), R2
CBZ R2, copy0
check:
CMP $16, R5
// Small copies: 1..16 bytes
CMP $16, R2
BLE copy16
AND $~31, R5, R7 // R7 is N&~31
SUB R7, R5, R6 // R6 is N&31
// Large copies
CMP $128, R2
BHI copy_long
CMP $32, R2
BHI copy32_128
CMP R3, R4
BLT backward
// Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
// R3 and R4 are advanced as we copy.
// (There may be implementations of armv8 where copying by bytes until
// at least one of source or dest is word aligned is a worthwhile
// optimization, but on the one tested so far (xgene) it did not
// make a significant difference.)
CBZ R7, noforwardlarge // Do we need to do any quadword copying?
ADD R3, R7, R9 // R9 points just past where we copy by word
forwardlargeloop:
// Copy 32 bytes at a time.
LDP.P 32(R4), (R8, R10)
STP.P (R8, R10), 32(R3)
LDP -16(R4), (R11, R12)
STP (R11, R12), -16(R3)
SUB $32, R7, R7
CBNZ R7, forwardlargeloop
noforwardlarge:
CBNZ R6, forwardtail // Do we need to copy any tail bytes?
// Small copies: 17..32 bytes.
LDP (R1), (R6, R7)
ADD R1, R2, R4 // R4 points just past the last source byte
LDP -16(R4), (R12, R13)
STP (R6, R7), (R0)
ADD R0, R2, R5 // R5 points just past the last destination byte
STP (R12, R13), -16(R5)
RET
forwardtail:
// There are R6 <= 31 bytes remaining to copy.
// This is large enough to still contain pointers,
// which must be copied atomically.
// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
TBZ $4, R6, 3(PC) // write 16 bytes if R6&16 != 0
LDP.P 16(R4), (R8, R10)
STP.P (R8, R10), 16(R3)
TBZ $3, R6, 3(PC) // write 8 bytes if R6&8 != 0
MOVD.P 8(R4), R8
MOVD.P R8, 8(R3)
AND $7, R6
CBNZ R6, 2(PC)
RET
ADD R3, R6, R9 // R9 points just past the destination memory
forwardtailloop:
MOVBU.P 1(R4), R8
MOVBU.P R8, 1(R3)
CMP R3, R9
BNE forwardtailloop
RET
// Small copies: 1..16 bytes.
copy16:
ADD R4, R5, R8 // R8 points just past the last source byte
ADD R3, R5, R9 // R9 points just past the last destination byte
CMP $8, R5
ADD R1, R2, R4 // R4 points just past the last source byte
ADD R0, R2, R5 // R5 points just past the last destination byte
CMP $8, R2
BLT copy7
MOVD (R4), R6
MOVD -8(R8), R7
MOVD R6, (R3)
MOVD R7, -8(R9)
MOVD (R1), R6
MOVD -8(R4), R7
MOVD R6, (R0)
MOVD R7, -8(R5)
RET
copy7:
TBZ $2, R5, copy3
MOVWU (R4), R6
MOVWU -4(R8), R7
MOVW R6, (R3)
MOVW R7, -4(R9)
TBZ $2, R2, copy3
MOVWU (R1), R6
MOVWU -4(R4), R7
MOVW R6, (R0)
MOVW R7, -4(R5)
RET
copy3:
TBZ $1, R5, copy1
MOVHU (R4), R6
MOVHU -2(R8), R7
MOVH R6, (R3)
MOVH R7, -2(R9)
TBZ $1, R2, copy1
MOVHU (R1), R6
MOVHU -2(R4), R7
MOVH R6, (R0)
MOVH R7, -2(R5)
RET
copy1:
MOVBU (R4), R6
MOVB R6, (R3)
MOVBU (R1), R6
MOVB R6, (R0)
copy0:
RET
backward:
// Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
// R3 and R4 are advanced to the end of the destination/source buffers
// respectively and moved back as we copy.
ADD R4, R5, R4 // R4 points just past the last source byte
ADD R3, R5, R3 // R3 points just past the last destination byte
CBZ R6, nobackwardtail // Do we need to do any byte-by-byte copying?
AND $7, R6, R12
CBZ R12, backwardtaillarge
SUB R12, R3, R9 // R9 points at the lowest destination byte that should be copied by byte.
backwardtailloop:
// Copy sub-pointer-size tail.
MOVBU.W -1(R4), R8
MOVBU.W R8, -1(R3)
CMP R9, R3
BNE backwardtailloop
backwardtaillarge:
// Do 8/16-byte write if possible.
// See comment at forwardtail.
TBZ $3, R6, 3(PC)
MOVD.W -8(R4), R8
MOVD.W R8, -8(R3)
TBZ $4, R6, 3(PC)
LDP.W -16(R4), (R8, R10)
STP.W (R8, R10), -16(R3)
nobackwardtail:
CBNZ R7, backwardlarge // Do we need to do any doubleword-by-doubleword copying?
// Medium copies: 33..128 bytes.
copy32_128:
ADD R1, R2, R4 // R4 points just past the last source byte
ADD R0, R2, R5 // R5 points just past the last destination byte
LDP (R1), (R6, R7)
LDP 16(R1), (R8, R9)
LDP -32(R4), (R10, R11)
LDP -16(R4), (R12, R13)
CMP $64, R2
BHI copy128
STP (R6, R7), (R0)
STP (R8, R9), 16(R0)
STP (R10, R11), -32(R5)
STP (R12, R13), -16(R5)
RET
backwardlarge:
SUB R7, R3, R9 // R9 points at the lowest destination byte
// Copy 65..128 bytes.
copy128:
LDP 32(R1), (R14, R15)
LDP 48(R1), (R16, R17)
CMP $96, R2
BLS copy96
LDP -64(R4), (R2, R3)
LDP -48(R4), (R1, R4)
STP (R2, R3), -64(R5)
STP (R1, R4), -48(R5)
backwardlargeloop:
LDP -16(R4), (R8, R10)
STP (R8, R10), -16(R3)
LDP.W -32(R4), (R11, R12)
STP.W (R11, R12), -32(R3)
CMP R9, R3
BNE backwardlargeloop
copy96:
STP (R6, R7), (R0)
STP (R8, R9), 16(R0)
STP (R14, R15), 32(R0)
STP (R16, R17), 48(R0)
STP (R10, R11), -32(R5)
STP (R12, R13), -16(R5)
RET
// Copy more than 128 bytes.
copy_long:
ADD R1, R2, R4 // R4 points just past the last source byte
ADD R0, R2, R5 // R5 points just past the last destination byte
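// R7/R8 below hold the base address used for the realignment calculation:
// they stay zero (so no realignment happens) for copies shorter than 1024
// bytes, and otherwise point at dst/dstend or src/srcend depending on
// arm64UseAlignedLoads.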
MOVD ZR, R7
MOVD ZR, R8
CMP $1024, R2
BLT backward_check
// feature detect to decide how to align
MOVBU runtime·arm64UseAlignedLoads(SB), R6
CBNZ R6, use_aligned_loads
MOVD R0, R7
MOVD R5, R8
B backward_check
use_aligned_loads:
MOVD R1, R7
MOVD R4, R8
// R7 and R8 are used here for the realignment calculation. In
// the use_aligned_loads case, R7 is the src pointer and R8 is
// srcend pointer, which is used in the backward copy case.
// When doing aligned stores, R7 is the dst pointer and R8 is
// the dstend pointer.
backward_check:
// Use backward copy if there is an overlap.
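// The overlap test computes dst-src and compares it (unsigned) against n:
// if 0 < dst-src < n, dst starts inside the source range, so a forward copy
// would overwrite source bytes before they are read.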
SUB R1, R0, R14
CBZ R14, copy0
CMP R2, R14
BCC copy_long_backward
// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
LDP (R1), (R12, R13) // Load A
AND $15, R7, R14 // Calculate the realignment offset
SUB R14, R1, R1
SUB R14, R0, R3 // move dst back same amount as src
ADD R14, R2, R2
LDP 16(R1), (R6, R7) // Load B
STP (R12, R13), (R0) // Store A
LDP 32(R1), (R8, R9) // Load C
LDP 48(R1), (R10, R11) // Load D
LDP.W 64(R1), (R12, R13) // Load E
// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
SUBS $144, R2, R2
BLS copy64_from_end
loop64:
STP (R6, R7), 16(R3) // Store B
LDP 16(R1), (R6, R7) // Load B (next iteration)
STP (R8, R9), 32(R3) // Store C
LDP 32(R1), (R8, R9) // Load C
STP (R10, R11), 48(R3) // Store D
LDP 48(R1), (R10, R11) // Load D
STP.W (R12, R13), 64(R3) // Store E
LDP.W 64(R1), (R12, R13) // Load E
SUBS $64, R2, R2
BHI loop64
// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
LDP -64(R4), (R14, R15) // Load F
STP (R6, R7), 16(R3) // Store B
LDP -48(R4), (R6, R7) // Load G
STP (R8, R9), 32(R3) // Store C
LDP -32(R4), (R8, R9) // Load H
STP (R10, R11), 48(R3) // Store D
LDP -16(R4), (R10, R11) // Load I
STP (R12, R13), 64(R3) // Store E
STP (R14, R15), -64(R5) // Store F
STP (R6, R7), -48(R5) // Store G
STP (R8, R9), -32(R5) // Store H
STP (R10, R11), -16(R5) // Store I
RET
// Large backward copy for overlapping copies.
// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
LDP -16(R4), (R12, R13)
AND $15, R8, R14
SUB R14, R4, R4
SUB R14, R2, R2
LDP -16(R4), (R6, R7)
STP (R12, R13), -16(R5)
LDP -32(R4), (R8, R9)
LDP -48(R4), (R10, R11)
LDP.W -64(R4), (R12, R13)
SUB R14, R5, R5
SUBS $128, R2, R2
BLS copy64_from_start
loop64_backward:
STP (R6, R7), -16(R5)
LDP -16(R4), (R6, R7)
STP (R8, R9), -32(R5)
LDP -32(R4), (R8, R9)
STP (R10, R11), -48(R5)
LDP -48(R4), (R10, R11)
STP.W (R12, R13), -64(R5)
LDP.W -64(R4), (R12, R13)
SUBS $64, R2, R2
BHI loop64_backward
// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
LDP 48(R1), (R2, R3)
STP (R6, R7), -16(R5)
LDP 32(R1), (R6, R7)
STP (R8, R9), -32(R5)
LDP 16(R1), (R8, R9)
STP (R10, R11), -48(R5)
LDP (R1), (R10, R11)
STP (R12, R13), -64(R5)
STP (R2, R3), 48(R0)
STP (R6, R7), 32(R0)
STP (R8, R9), 16(R0)
STP (R10, R11), (R0)
RET


@@ -286,6 +286,9 @@ var bufSizes = []int{
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
	32, 64, 128, 256, 512, 1024, 2048, 4096,
}
var bufSizesOverlap = []int{
	32, 64, 128, 256, 512, 1024, 2048, 4096,
}

func BenchmarkMemmove(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
@@ -297,6 +300,15 @@ func BenchmarkMemmove(b *testing.B) {
	})
}

func BenchmarkMemmoveOverlap(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+16)
		for i := 0; i < b.N; i++ {
			copy(x[16:n+16], x[:n])
		}
	})
}

func BenchmarkMemmoveUnalignedDst(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n+1)
@@ -307,6 +319,15 @@ func BenchmarkMemmoveUnalignedDst(b *testing.B) {
	})
}

func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+16)
		for i := 0; i < b.N; i++ {
			copy(x[16:n+16], x[1:n+1])
		}
	})
}

func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n)
@@ -317,6 +338,15 @@ func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
	})
}

func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+1)
		for i := 0; i < b.N; i++ {
			copy(x[1:n+1], x[:n])
		}
	})
}

func TestMemclr(t *testing.T) {
	size := 512
	if testing.Short() {