mirror of
https://github.com/golang/go
synced 2024-11-19 13:54:56 -07:00
213a75171d
Improve runtime memmove_arm64.s specializing for small copies and processing 32 bytes per iteration for 32 bytes or more. Benchmark results of runtime/Memmove on Amberwing: name old time/op new time/op delta Memmove/0 7.61ns ± 0% 7.20ns ± 0% ~ (p=0.053 n=5+7) Memmove/1 9.28ns ± 0% 8.80ns ± 0% -5.17% (p=0.000 n=4+8) Memmove/2 9.65ns ± 0% 9.20ns ± 0% -4.68% (p=0.000 n=5+8) Memmove/3 10.0ns ± 0% 9.2ns ± 0% -7.83% (p=0.000 n=5+8) Memmove/4 10.6ns ± 0% 9.2ns ± 0% -13.21% (p=0.000 n=5+8) Memmove/5 11.0ns ± 0% 9.2ns ± 0% -16.36% (p=0.000 n=5+8) Memmove/6 12.4ns ± 0% 9.2ns ± 0% -25.81% (p=0.000 n=5+8) Memmove/7 13.1ns ± 0% 9.2ns ± 0% -29.56% (p=0.000 n=5+8) Memmove/8 9.10ns ± 1% 9.20ns ± 0% +1.08% (p=0.002 n=5+8) Memmove/9 9.67ns ± 0% 9.20ns ± 0% -4.88% (p=0.000 n=5+8) Memmove/10 10.4ns ± 0% 9.2ns ± 0% -11.54% (p=0.000 n=5+8) Memmove/11 10.9ns ± 0% 9.2ns ± 0% -15.60% (p=0.000 n=5+8) Memmove/12 11.5ns ± 0% 9.2ns ± 0% -20.00% (p=0.000 n=5+8) Memmove/13 12.4ns ± 0% 9.2ns ± 0% -25.81% (p=0.000 n=5+8) Memmove/14 13.1ns ± 0% 9.2ns ± 0% -29.77% (p=0.000 n=5+8) Memmove/15 13.8ns ± 0% 9.2ns ± 0% -33.33% (p=0.000 n=5+8) Memmove/16 9.70ns ± 0% 9.20ns ± 0% -5.19% (p=0.000 n=5+8) Memmove/32 10.6ns ± 0% 9.2ns ± 0% -13.21% (p=0.000 n=4+8) Memmove/64 13.4ns ± 0% 10.2ns ± 0% -23.88% (p=0.000 n=4+8) Memmove/128 18.1ns ± 1% 13.2ns ± 0% -26.99% (p=0.000 n=5+8) Memmove/256 25.2ns ± 0% 16.4ns ± 0% -34.92% (p=0.000 n=5+8) Memmove/512 36.4ns ± 0% 22.8ns ± 0% -37.36% (p=0.000 n=5+8) Memmove/1024 70.1ns ± 0% 36.8ns ±11% -47.49% (p=0.002 n=5+8) Memmove/2048 121ns ± 0% 61ns ± 0% ~ (p=0.053 n=5+7) Memmove/4096 224ns ± 0% 120ns ± 0% -46.43% (p=0.000 n=5+8) MemmoveUnalignedDst/0 8.40ns ± 0% 8.00ns ± 0% -4.76% (p=0.000 n=5+8) MemmoveUnalignedDst/1 9.87ns ± 1% 10.00ns ± 0% ~ (p=0.070 n=5+8) MemmoveUnalignedDst/2 10.6ns ± 0% 10.4ns ± 0% -1.89% (p=0.000 n=5+8) MemmoveUnalignedDst/3 10.8ns ± 0% 10.4ns ± 0% -3.70% (p=0.000 n=5+8) MemmoveUnalignedDst/4 10.9ns ± 0% 10.3ns ± 0% ~ (p=0.053 n=5+7) MemmoveUnalignedDst/5 11.5ns ± 0% 10.3ns ± 1% -10.22% (p=0.000 n=4+8) MemmoveUnalignedDst/6 13.2ns ± 0% 10.4ns ± 1% -21.50% (p=0.000 n=5+8) MemmoveUnalignedDst/7 13.7ns ± 0% 10.3ns ± 1% -24.64% (p=0.000 n=4+8) MemmoveUnalignedDst/8 10.1ns ± 0% 10.4ns ± 0% +2.97% (p=0.002 n=5+8) MemmoveUnalignedDst/9 10.7ns ± 0% 10.4ns ± 0% -2.80% (p=0.000 n=5+8) MemmoveUnalignedDst/10 11.2ns ± 1% 10.4ns ± 0% -6.81% (p=0.000 n=5+8) MemmoveUnalignedDst/11 11.6ns ± 0% 10.4ns ± 0% -10.34% (p=0.000 n=5+8) MemmoveUnalignedDst/12 12.5ns ± 2% 10.4ns ± 0% -16.53% (p=0.000 n=5+8) MemmoveUnalignedDst/13 13.7ns ± 0% 10.4ns ± 0% -24.09% (p=0.000 n=5+8) MemmoveUnalignedDst/14 14.0ns ± 0% 10.4ns ± 0% -25.71% (p=0.000 n=5+8) MemmoveUnalignedDst/15 14.6ns ± 0% 10.4ns ± 0% -28.77% (p=0.000 n=5+8) MemmoveUnalignedDst/16 10.5ns ± 0% 10.4ns ± 0% -0.95% (p=0.000 n=5+8) MemmoveUnalignedDst/32 12.4ns ± 0% 11.6ns ± 0% -6.05% (p=0.000 n=5+8) MemmoveUnalignedDst/64 15.2ns ± 0% 12.3ns ± 0% -19.08% (p=0.000 n=5+8) MemmoveUnalignedDst/128 18.7ns ± 0% 15.2ns ± 0% -18.72% (p=0.000 n=5+8) MemmoveUnalignedDst/256 25.1ns ± 0% 18.6ns ± 0% -25.90% (p=0.000 n=5+8) MemmoveUnalignedDst/512 37.8ns ± 0% 24.4ns ± 0% -35.45% (p=0.000 n=5+8) MemmoveUnalignedDst/1024 74.6ns ± 0% 40.4ns ± 0% ~ (p=0.053 n=5+7) MemmoveUnalignedDst/2048 133ns ± 0% 75ns ± 0% -43.91% (p=0.000 n=5+8) MemmoveUnalignedDst/4096 247ns ± 0% 141ns ± 0% -42.91% (p=0.000 n=5+8) MemmoveUnalignedSrc/0 8.40ns ± 0% 8.00ns ± 0% -4.76% (p=0.000 n=5+8) MemmoveUnalignedSrc/1 9.81ns ± 0% 10.00ns ± 0% +1.98% (p=0.002 n=5+8) MemmoveUnalignedSrc/2 10.5ns ± 0% 10.0ns ± 0% -4.76% (p=0.000 n=5+8) MemmoveUnalignedSrc/3 10.7ns ± 1% 10.0ns ± 0% -6.89% (p=0.000 n=5+8) MemmoveUnalignedSrc/4 11.3ns ± 0% 10.0ns ± 0% -11.50% (p=0.000 n=5+8) MemmoveUnalignedSrc/5 11.6ns ± 0% 10.0ns ± 0% -13.79% (p=0.000 n=5+8) MemmoveUnalignedSrc/6 13.6ns ± 0% 10.0ns ± 0% -26.47% (p=0.000 n=5+8) MemmoveUnalignedSrc/7 14.4ns ± 0% 10.0ns ± 0% -30.75% (p=0.000 n=5+8) MemmoveUnalignedSrc/8 9.87ns ± 1% 10.00ns ± 0% ~ (p=0.070 n=5+8) MemmoveUnalignedSrc/9 10.4ns ± 0% 10.0ns ± 0% -3.85% (p=0.000 n=5+8) MemmoveUnalignedSrc/10 11.2ns ± 0% 10.0ns ± 0% -10.71% (p=0.000 n=5+8) MemmoveUnalignedSrc/11 11.8ns ± 0% 10.0ns ± 0% -15.25% (p=0.000 n=5+8) MemmoveUnalignedSrc/12 12.1ns ± 0% 10.0ns ± 0% -17.36% (p=0.000 n=5+8) MemmoveUnalignedSrc/13 13.6ns ± 0% 10.0ns ± 0% -26.47% (p=0.000 n=5+8) MemmoveUnalignedSrc/14 14.7ns ± 0% 10.0ns ± 0% -31.79% (p=0.000 n=5+8) MemmoveUnalignedSrc/15 14.4ns ± 0% 10.0ns ± 0% -30.56% (p=0.000 n=5+8) MemmoveUnalignedSrc/16 11.0ns ± 0% 10.0ns ± 0% -9.09% (p=0.000 n=5+8) MemmoveUnalignedSrc/32 11.5ns ± 0% 10.0ns ± 0% -13.04% (p=0.000 n=5+8) MemmoveUnalignedSrc/64 14.9ns ± 0% 11.2ns ± 0% -24.83% (p=0.000 n=4+8) MemmoveUnalignedSrc/128 19.5ns ± 0% 15.2ns ± 0% -22.05% (p=0.000 n=5+8) MemmoveUnalignedSrc/256 27.3ns ± 2% 19.2ns ± 0% -29.62% (p=0.000 n=5+8) MemmoveUnalignedSrc/512 40.4ns ± 0% 27.2ns ± 0% -32.67% (p=0.000 n=5+8) MemmoveUnalignedSrc/1024 75.4ns ± 0% 44.4ns ± 0% -41.15% (p=0.000 n=5+8) MemmoveUnalignedSrc/2048 131ns ± 0% 77ns ± 3% -41.56% (p=0.002 n=5+8) MemmoveUnalignedSrc/4096 248ns ± 0% 145ns ± 0% -41.53% (p=0.000 n=5+8) name old speed new speed delta Memmove/1 108MB/s ± 0% 114MB/s ± 0% +5.37% (p=0.004 n=4+8) Memmove/2 207MB/s ± 0% 217MB/s ± 0% +4.85% (p=0.002 n=5+8) Memmove/3 301MB/s ± 0% 326MB/s ± 0% +8.45% (p=0.002 n=5+8) Memmove/4 377MB/s ± 0% 435MB/s ± 0% +15.31% (p=0.004 n=4+8) Memmove/5 455MB/s ± 0% 543MB/s ± 0% +19.46% (p=0.002 n=5+8) Memmove/6 483MB/s ± 0% 652MB/s ± 0% +34.88% (p=0.003 n=5+7) Memmove/7 537MB/s ± 0% 761MB/s ± 0% +41.71% (p=0.002 n=5+8) Memmove/8 879MB/s ± 1% 869MB/s ± 0% -1.15% (p=0.000 n=5+7) Memmove/9 931MB/s ± 0% 978MB/s ± 0% +5.05% (p=0.002 n=5+8) Memmove/10 960MB/s ± 0% 1086MB/s ± 0% +13.13% (p=0.002 n=5+8) Memmove/11 1.00GB/s ± 0% 1.20GB/s ± 0% +18.92% (p=0.003 n=5+7) Memmove/12 1.04GB/s ± 0% 1.30GB/s ± 0% +25.40% (p=0.002 n=5+8) Memmove/13 1.05GB/s ± 0% 1.41GB/s ± 0% +34.87% (p=0.002 n=5+8) Memmove/14 1.07GB/s ± 0% 1.52GB/s ± 0% +42.14% (p=0.002 n=5+8) Memmove/15 1.09GB/s ± 0% 1.63GB/s ± 0% +49.91% (p=0.002 n=5+8) Memmove/16 1.65GB/s ± 0% 1.74GB/s ± 0% +5.40% (p=0.003 n=5+7) Memmove/32 3.01GB/s ± 0% 3.48GB/s ± 0% +15.58% (p=0.003 n=5+7) Memmove/64 4.76GB/s ± 0% 6.27GB/s ± 0% +31.75% (p=0.003 n=5+7) Memmove/128 7.08GB/s ± 1% 9.69GB/s ± 0% +36.96% (p=0.002 n=5+8) Memmove/256 10.2GB/s ± 0% 15.6GB/s ± 0% +53.58% (p=0.002 n=5+8) Memmove/512 14.1GB/s ± 0% 22.4GB/s ± 0% +59.57% (p=0.003 n=5+7) Memmove/1024 14.6GB/s ± 0% 27.9GB/s ±10% +91.00% (p=0.002 n=5+8) Memmove/2048 16.9GB/s ± 0% 33.4GB/s ± 0% +98.32% (p=0.003 n=5+7) Memmove/4096 18.3GB/s ± 0% 33.9GB/s ± 0% +85.80% (p=0.002 n=5+8) MemmoveUnalignedDst/1 101MB/s ± 1% 100MB/s ± 0% ~ (p=0.586 n=5+8) MemmoveUnalignedDst/2 189MB/s ± 0% 192MB/s ± 0% +1.82% (p=0.002 n=5+8) MemmoveUnalignedDst/3 278MB/s ± 0% 288MB/s ± 0% +3.88% (p=0.003 n=5+7) MemmoveUnalignedDst/4 368MB/s ± 0% 387MB/s ± 0% +5.41% (p=0.003 n=5+7) MemmoveUnalignedDst/5 434MB/s ± 0% 484MB/s ± 0% +11.52% (p=0.002 n=5+8) MemmoveUnalignedDst/6 454MB/s ± 0% 580MB/s ± 0% +27.62% (p=0.002 n=5+8) MemmoveUnalignedDst/7 509MB/s ± 0% 677MB/s ± 0% +33.01% (p=0.002 n=5+8) MemmoveUnalignedDst/8 792MB/s ± 0% 770MB/s ± 0% -2.77% (p=0.002 n=5+8) MemmoveUnalignedDst/9 841MB/s ± 0% 866MB/s ± 0% +2.92% (p=0.002 n=5+8) MemmoveUnalignedDst/10 896MB/s ± 0% 962MB/s ± 0% +7.35% (p=0.003 n=5+7) MemmoveUnalignedDst/11 947MB/s ± 0% 1058MB/s ± 0% +11.80% (p=0.002 n=5+8) MemmoveUnalignedDst/12 962MB/s ± 2% 1154MB/s ± 0% +19.97% (p=0.002 n=5+8) MemmoveUnalignedDst/13 947MB/s ± 0% 1251MB/s ± 0% +32.08% (p=0.002 n=5+8) MemmoveUnalignedDst/14 1.00GB/s ± 0% 1.35GB/s ± 0% +34.55% (p=0.002 n=5+8) MemmoveUnalignedDst/15 1.03GB/s ± 0% 1.44GB/s ± 0% +40.50% (p=0.002 n=5+8) MemmoveUnalignedDst/16 1.53GB/s ± 0% 1.54GB/s ± 0% +0.77% (p=0.002 n=5+8) MemmoveUnalignedDst/32 2.58GB/s ± 0% 2.75GB/s ± 0% +6.52% (p=0.003 n=5+7) MemmoveUnalignedDst/64 4.21GB/s ± 0% 5.19GB/s ± 0% +23.40% (p=0.004 n=5+6) MemmoveUnalignedDst/128 6.86GB/s ± 0% 8.42GB/s ± 0% +22.78% (p=0.003 n=5+7) MemmoveUnalignedDst/256 10.2GB/s ± 0% 13.8GB/s ± 0% +35.15% (p=0.002 n=5+8) MemmoveUnalignedDst/512 13.5GB/s ± 0% 21.0GB/s ± 0% +54.90% (p=0.002 n=5+8) MemmoveUnalignedDst/1024 13.7GB/s ± 0% 25.3GB/s ± 0% +84.61% (p=0.003 n=5+7) MemmoveUnalignedDst/2048 15.3GB/s ± 0% 27.5GB/s ± 0% +79.52% (p=0.002 n=5+8) MemmoveUnalignedDst/4096 16.5GB/s ± 0% 28.9GB/s ± 0% +74.74% (p=0.002 n=5+8) MemmoveUnalignedSrc/1 102MB/s ± 0% 100MB/s ± 0% -2.02% (p=0.000 n=5+7) MemmoveUnalignedSrc/2 191MB/s ± 0% 200MB/s ± 0% +4.78% (p=0.002 n=5+8) MemmoveUnalignedSrc/3 279MB/s ± 0% 300MB/s ± 0% +7.45% (p=0.002 n=5+8) MemmoveUnalignedSrc/4 354MB/s ± 0% 400MB/s ± 0% +13.10% (p=0.002 n=5+8) MemmoveUnalignedSrc/5 431MB/s ± 0% 500MB/s ± 0% +16.02% (p=0.002 n=5+8) MemmoveUnalignedSrc/6 441MB/s ± 0% 600MB/s ± 0% +36.03% (p=0.002 n=5+8) MemmoveUnalignedSrc/7 485MB/s ± 0% 700MB/s ± 0% +44.29% (p=0.002 n=5+8) MemmoveUnalignedSrc/8 811MB/s ± 1% 800MB/s ± 0% -1.36% (p=0.016 n=5+8) MemmoveUnalignedSrc/9 864MB/s ± 0% 900MB/s ± 0% +4.07% (p=0.002 n=5+8) MemmoveUnalignedSrc/10 893MB/s ± 0% 999MB/s ± 0% +11.97% (p=0.002 n=5+8) MemmoveUnalignedSrc/11 932MB/s ± 0% 1099MB/s ± 0% +18.01% (p=0.002 n=5+8) MemmoveUnalignedSrc/12 988MB/s ± 0% 1199MB/s ± 0% +21.35% (p=0.002 n=5+8) MemmoveUnalignedSrc/13 955MB/s ± 0% 1299MB/s ± 0% +36.02% (p=0.002 n=5+8) MemmoveUnalignedSrc/14 955MB/s ± 0% 1399MB/s ± 0% +46.52% (p=0.002 n=5+8) MemmoveUnalignedSrc/15 1.04GB/s ± 0% 1.50GB/s ± 0% +44.18% (p=0.002 n=5+8) MemmoveUnalignedSrc/16 1.45GB/s ± 0% 1.60GB/s ± 0% +10.14% (p=0.002 n=5+8) MemmoveUnalignedSrc/32 2.78GB/s ± 0% 3.20GB/s ± 0% +15.16% (p=0.003 n=5+7) MemmoveUnalignedSrc/64 4.30GB/s ± 0% 5.72GB/s ± 0% +32.90% (p=0.003 n=5+7) MemmoveUnalignedSrc/128 6.57GB/s ± 0% 8.42GB/s ± 0% +28.06% (p=0.002 n=5+8) MemmoveUnalignedSrc/256 9.39GB/s ± 1% 13.33GB/s ± 0% +41.96% (p=0.002 n=5+8) MemmoveUnalignedSrc/512 12.7GB/s ± 0% 18.8GB/s ± 0% +48.53% (p=0.003 n=5+7) MemmoveUnalignedSrc/1024 13.6GB/s ± 0% 23.0GB/s ± 0% +69.82% (p=0.002 n=5+8) MemmoveUnalignedSrc/2048 15.6GB/s ± 0% 26.8GB/s ± 3% +71.37% (p=0.002 n=5+8) MemmoveUnalignedSrc/4096 16.5GB/s ± 0% 28.2GB/s ± 0% +71.40% (p=0.002 n=5+8) Fixes #22925 Change-Id: I38c1a9ad5c6e3f4f95fc521c4b7e3140b58b4737 Reviewed-on: https://go-review.googlesource.com/83799 Run-TryBot: Cherry Zhang <cherryyz@google.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
124 lines
2.8 KiB
ArmAsm
124 lines
2.8 KiB
ArmAsm
// Copyright 2014 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
#include "textflag.h"
|
|
|
|
// void runtime·memmove(void*, void*, uintptr)
|
|
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
|
|
MOVD to+0(FP), R3
|
|
MOVD from+8(FP), R4
|
|
MOVD n+16(FP), R5
|
|
CBNZ R5, check
|
|
RET
|
|
|
|
check:
|
|
CMP $16, R5
|
|
BLE copy16
|
|
|
|
AND $~31, R5, R7 // R7 is N&~31
|
|
SUB R7, R5, R6 // R6 is N&31
|
|
|
|
CMP R3, R4
|
|
BLT backward
|
|
|
|
// Copying forward proceeds by copying R7/8 words then copying R6 bytes.
|
|
// R3 and R4 are advanced as we copy.
|
|
|
|
// (There may be implementations of armv8 where copying by bytes until
|
|
// at least one of source or dest is word aligned is a worthwhile
|
|
// optimization, but the on the one tested so far (xgene) it did not
|
|
// make a significance difference.)
|
|
|
|
CBZ R7, noforwardlarge // Do we need to do any doubleword-by-doubleword copying?
|
|
|
|
ADD R3, R7, R9 // R9 points just past where we copy by word
|
|
|
|
forwardlargeloop:
|
|
LDP.P 32(R4), (R8, R10)
|
|
STP.P (R8, R10), 32(R3)
|
|
LDP -16(R4), (R11, R12)
|
|
STP (R11, R12), -16(R3)
|
|
SUB $32, R7, R7
|
|
CBNZ R7, forwardlargeloop
|
|
|
|
noforwardlarge:
|
|
CBNZ R6, forwardtail // Do we need to do any byte-by-byte copying?
|
|
RET
|
|
|
|
forwardtail:
|
|
ADD R3, R6, R9 // R9 points just past the destination memory
|
|
|
|
forwardtailloop:
|
|
MOVBU.P 1(R4), R8
|
|
MOVBU.P R8, 1(R3)
|
|
CMP R3, R9
|
|
BNE forwardtailloop
|
|
RET
|
|
|
|
// Small copies: 1..16 bytes.
|
|
copy16:
|
|
ADD R4, R5, R8 // R8 points just past the last source byte
|
|
ADD R3, R5, R9 // R9 points just past the last destination byte
|
|
CMP $8, R5
|
|
BLT copy7
|
|
MOVD (R4), R6
|
|
MOVD -8(R8), R7
|
|
MOVD R6, (R3)
|
|
MOVD R7, -8(R9)
|
|
RET
|
|
|
|
copy7:
|
|
TBZ $2, R5, copy3
|
|
MOVWU (R4), R6
|
|
MOVWU -4(R8), R7
|
|
MOVW R6, (R3)
|
|
MOVW R7, -4(R9)
|
|
RET
|
|
|
|
copy3:
|
|
TBZ $1, R5, copy1
|
|
MOVHU (R4), R6
|
|
MOVHU -2(R8), R7
|
|
MOVH R6, (R3)
|
|
MOVH R7, -2(R9)
|
|
RET
|
|
|
|
copy1:
|
|
MOVBU (R4), R6
|
|
MOVB R6, (R3)
|
|
RET
|
|
|
|
backward:
|
|
// Copying backwards proceeds by copying R6 bytes then copying R7/8 words.
|
|
// R3 and R4 are advanced to the end of the destination/source buffers
|
|
// respectively and moved back as we copy.
|
|
|
|
ADD R4, R5, R4 // R4 points just past the last source byte
|
|
ADD R3, R5, R3 // R3 points just past the last destination byte
|
|
|
|
CBZ R6, nobackwardtail // Do we need to do any byte-by-byte copying?
|
|
|
|
SUB R6, R3, R9 // R9 points at the lowest destination byte that should be copied by byte.
|
|
backwardtailloop:
|
|
MOVBU.W -1(R4), R8
|
|
MOVBU.W R8, -1(R3)
|
|
CMP R9, R3
|
|
BNE backwardtailloop
|
|
|
|
nobackwardtail:
|
|
CBNZ R7, backwardlarge // Do we need to do any doubleword-by-doubleword copying?
|
|
RET
|
|
|
|
backwardlarge:
|
|
SUB R7, R3, R9 // R9 points at the lowest destination byte
|
|
|
|
backwardlargeloop:
|
|
LDP -16(R4), (R8, R10)
|
|
STP (R8, R10), -16(R3)
|
|
LDP.W -32(R4), (R11, R12)
|
|
STP.W (R11, R12), -32(R3)
|
|
CMP R9, R3
|
|
BNE backwardlargeloop
|
|
RET
|