Mirror of https://github.com/golang/go (synced 2024-11-23 06:30:06 -07:00)
Commit 9552a1122f
The riscv64 implementation of memmove has two optimizations that are
applied when both the source and destination pointers share the same
alignment but that alignment is not 8 bytes. Both optimizations attempt
to align the source and destination pointers to 8 byte boundaries
before performing 8 byte aligned loads and stores. Both optimizations
are incorrect. The first optimization is applied when the destination
pointer is smaller than the source pointer. In this case the code
increments both pointers by (pointer & 3) bytes rather than
(8 - (pointer & 7)) bytes. The second optimization is applied when the
destination pointer is larger than the source pointer. In this case the
existing code decrements the pointers by (pointer & 3) bytes instead of
(pointer & 7).

This commit fixes both optimizations, avoiding unaligned 8 byte
accesses. As this particular optimization is not covered by any of the
existing benchmarks, a new benchmark, BenchmarkMemmoveUnalignedSrcDst,
is provided that exercises both optimizations. Results of the new
benchmark, which were run on a SiFive HiFive Unmatched A00 with 16GB of
RAM running Ubuntu 23.04, are presented below.

MemmoveUnalignedSrcDst/f_16_0-4        39.48n ±  5%    43.47n ± 2%   +10.13%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_0-4        45.39n ±  5%    41.55n ± 4%    -8.47%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_16_1-4      1230.50n ±  1%    83.44n ± 5%   -93.22%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_1-4        69.34n ±  4%    67.83n ± 8%         ~  (p=0.436 n=10)
MemmoveUnalignedSrcDst/f_16_4-4      2349.00n ±  1%    72.09n ± 4%   -96.93%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_4-4      2357.00n ±  0%    77.61n ± 4%   -96.71%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_16_7-4      1235.00n ±  0%    62.02n ± 2%   -94.98%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_7-4      1246.00n ±  0%    84.05n ± 6%   -93.25%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_64_0-4        49.96n ±  2%    50.01n ± 2%         ~  (p=0.755 n=10)
MemmoveUnalignedSrcDst/b_64_0-4        52.06n ±  3%    51.65n ± 3%         ~  (p=0.631 n=10)
MemmoveUnalignedSrcDst/f_64_1-4      8105.50n ±  0%    97.63n ± 1%   -98.80%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_1-4        84.07n ±  4%    84.90n ± 5%         ~  (p=0.315 n=10)
MemmoveUnalignedSrcDst/f_64_4-4      9192.00n ±  0%    86.16n ± 3%   -99.06%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_4-4      9195.50n ±  1%    91.88n ± 5%   -99.00%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_64_7-4      8106.50n ±  0%    78.44n ± 9%   -99.03%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_7-4      8107.00n ±  0%    99.19n ± 1%   -98.78%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_256_0-4       90.95n ±  1%    92.16n ± 8%         ~  (p=0.123 n=10)
MemmoveUnalignedSrcDst/b_256_0-4       96.09n ± 12%    94.90n ± 2%         ~  (p=0.143 n=10)
MemmoveUnalignedSrcDst/f_256_1-4     35492.5n ±  0%    133.5n ± 0%   -99.62%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_1-4       128.7n ±  1%    130.1n ± 1%    +1.13%  (p=0.005 n=10)
MemmoveUnalignedSrcDst/f_256_4-4     36599.0n ±  0%    123.0n ± 1%   -99.66%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_4-4     36675.5n ±  0%    130.7n ± 1%   -99.64%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_256_7-4     35555.5n ±  0%    121.6n ± 2%   -99.66%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_7-4     35584.0n ±  0%    139.1n ± 1%   -99.61%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_4096_0-4      956.3n ±  2%    960.8n ± 1%         ~  (p=0.306 n=10)
MemmoveUnalignedSrcDst/b_4096_0-4      1.015µ ±  2%    1.012µ ± 2%         ~  (p=0.076 n=10)
MemmoveUnalignedSrcDst/f_4096_1-4    584.406µ ±  0%    1.002µ ± 1%   -99.83%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_1-4      1.044µ ±  1%    1.040µ ± 2%         ~  (p=0.090 n=10)
MemmoveUnalignedSrcDst/f_4096_4-4   585113.5n ±  0%    988.6n ± 2%   -99.83%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_4-4    586.521µ ±  0%    1.044µ ± 1%   -99.82%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_4096_7-4   585374.5n ±  0%    986.2n ± 0%   -99.83%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_7-4    584.595µ ±  1%    1.055µ ± 0%   -99.82%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_65536_0-4     54.83µ ±  0%    55.00µ ± 0%    +0.31%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_65536_0-4     56.54µ ±  0%    56.64µ ± 0%    +0.19%  (p=0.011 n=10)
MemmoveUnalignedSrcDst/f_65536_1-4   9450.51µ ±  0%    58.25µ ± 0%   -99.38%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_65536_1-4     56.65µ ±  0%    56.68µ ± 0%         ~  (p=0.353 n=10)
MemmoveUnalignedSrcDst/f_65536_4-4   9449.48µ ±  0%    58.24µ ± 0%   -99.38%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_65536_4-4   9462.91µ ±  0%    56.69µ ± 0%   -99.40%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_65536_7-4   9477.37µ ±  0%    58.26µ ± 0%   -99.39%  (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_65536_7-4   9467.96µ ±  0%    56.68µ ± 0%   -99.40%  (p=0.000 n=10)
geomean                                11.16µ          509.8n        -95.43%

Change-Id: Idfa1873b81fece3b2b1a0aed398fa5663cc73b83
Reviewed-on: https://go-review.googlesource.com/c/go/+/498377
Run-TryBot: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
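For illustration only (not part of the commit): a minimal Go sketch of the
alignment arithmetic the commit message describes. The helper names
forwardAlignBytes and backwardAlignBytes are hypothetical; the point is that
a forward copy must advance by 8 - (ptr & 7) bytes to reach the next 8 byte
boundary, while a backward copy must retreat by ptr & 7 bytes.

package main

import "fmt"

// forwardAlignBytes: for an address p with p%8 != 0, the number of
// single-byte copies a forward memmove needs before p reaches the next
// 8 byte boundary — the corrected 8 - (p & 7), not the old p & 3.
func forwardAlignBytes(p uintptr) uintptr { return 8 - (p & 7) }

// backwardAlignBytes: copying from the end of the buffer, p must
// retreat by p & 7 bytes (not p & 3) to land on an 8 byte boundary.
func backwardAlignBytes(p uintptr) uintptr { return p & 7 }

func main() {
	for _, p := range []uintptr{0x1001, 0x1004, 0x1007} {
		fmt.Printf("p=%#x forward=%d backward=%d\n",
			p, forwardAlignBytes(p), backwardAlignBytes(p))
	}
}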
320 lines · 5.5 KiB · ArmAsm
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
	// X10 = to
	// X11 = from
	// X12 = n
	BEQ	X10, X11, done
	BEQZ	X12, done

	// If the destination is ahead of the source, start at the end of the
	// buffer and go backward.
	BGTU	X10, X11, backward

	// If less than 8 bytes, do single byte copies.
	MOV	$8, X9
	BLT	X12, X9, f_loop4_check

	// Check alignment - if alignment differs we have to do one byte at a time.
	AND	$7, X10, X5
	AND	$7, X11, X6
	BNE	X5, X6, f_loop8_unaligned_check
	BEQZ	X5, f_loop_check

	// Move one byte at a time until we reach 8 byte alignment.
	SUB	X5, X9, X5
	SUB	X5, X12, X12
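	// X5 = 8 - (to & 7) is now the number of bytes to copy to reach
	// 8 byte alignment; the remaining length in X12 has been reduced
	// by the same amount.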
f_align:
	ADD	$-1, X5
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	ADD	$1, X10
	ADD	$1, X11
	BNEZ	X5, f_align

f_loop_check:
	MOV	$16, X9
	BLT	X12, X9, f_loop8_check
	MOV	$32, X9
	BLT	X12, X9, f_loop16_check
	MOV	$64, X9
	BLT	X12, X9, f_loop32_check
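	// Both pointers are now 8 byte aligned: copy 64 bytes per
	// iteration while at least 64 bytes remain.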
f_loop64:
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	32(X11), X18
	MOV	40(X11), X19
	MOV	48(X11), X20
	MOV	56(X11), X21
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	MOV	X18, 32(X10)
	MOV	X19, 40(X10)
	MOV	X20, 48(X10)
	MOV	X21, 56(X10)
	ADD	$64, X10
	ADD	$64, X11
	ADD	$-64, X12
	BGE	X12, X9, f_loop64
	BEQZ	X12, done

f_loop32_check:
	MOV	$32, X9
	BLT	X12, X9, f_loop16_check
f_loop32:
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	ADD	$32, X10
	ADD	$32, X11
	ADD	$-32, X12
	BGE	X12, X9, f_loop32
	BEQZ	X12, done

f_loop16_check:
	MOV	$16, X9
	BLT	X12, X9, f_loop8_check
f_loop16:
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	ADD	$16, X10
	ADD	$16, X11
	ADD	$-16, X12
	BGE	X12, X9, f_loop16
	BEQZ	X12, done

f_loop8_check:
	MOV	$8, X9
	BLT	X12, X9, f_loop4_check
f_loop8:
	MOV	0(X11), X14
	MOV	X14, 0(X10)
	ADD	$8, X10
	ADD	$8, X11
	ADD	$-8, X12
	BGE	X12, X9, f_loop8
	BEQZ	X12, done
	JMP	f_loop4_check

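	// Source and destination have different alignments, so 8 byte
	// aligned loads and stores are not possible: copy 8 bytes per
	// iteration using individual byte loads and stores.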
f_loop8_unaligned_check:
	MOV	$8, X9
	BLT	X12, X9, f_loop4_check
f_loop8_unaligned:
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	4(X11), X18
	MOVB	5(X11), X19
	MOVB	6(X11), X20
	MOVB	7(X11), X21
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	MOVB	X18, 4(X10)
	MOVB	X19, 5(X10)
	MOVB	X20, 6(X10)
	MOVB	X21, 7(X10)
	ADD	$8, X10
	ADD	$8, X11
	ADD	$-8, X12
	BGE	X12, X9, f_loop8_unaligned

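	// Copy any remaining tail four bytes at a time, then byte by byte.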
f_loop4_check:
	MOV	$4, X9
	BLT	X12, X9, f_loop1
f_loop4:
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	ADD	$4, X10
	ADD	$4, X11
	ADD	$-4, X12
	BGE	X12, X9, f_loop4

f_loop1:
	BEQZ	X12, done
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	ADD	$1, X10
	ADD	$1, X11
	ADD	$-1, X12
	JMP	f_loop1

backward:
	ADD	X10, X12, X10
	ADD	X11, X12, X11

	// If less than 8 bytes, do single byte copies.
	MOV	$8, X9
	BLT	X12, X9, b_loop4_check

	// Check alignment - if alignment differs we have to do one byte at a time.
	AND	$7, X10, X5
	AND	$7, X11, X6
	BNE	X5, X6, b_loop8_unaligned_check
	BEQZ	X5, b_loop_check

	// Move one byte at a time until we reach 8 byte alignment.
	SUB	X5, X12, X12
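	// X5 = to & 7 is the number of bytes to copy backward to reach
	// 8 byte alignment; the remaining length in X12 has been reduced
	// by the same amount.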
b_align:
	ADD	$-1, X5
	ADD	$-1, X10
	ADD	$-1, X11
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	BNEZ	X5, b_align

b_loop_check:
	MOV	$16, X9
	BLT	X12, X9, b_loop8_check
	MOV	$32, X9
	BLT	X12, X9, b_loop16_check
	MOV	$64, X9
	BLT	X12, X9, b_loop32_check
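	// Both pointers are now 8 byte aligned: copy 64 bytes per
	// iteration while at least 64 bytes remain, working backward.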
b_loop64:
	ADD	$-64, X10
	ADD	$-64, X11
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	32(X11), X18
	MOV	40(X11), X19
	MOV	48(X11), X20
	MOV	56(X11), X21
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	MOV	X18, 32(X10)
	MOV	X19, 40(X10)
	MOV	X20, 48(X10)
	MOV	X21, 56(X10)
	ADD	$-64, X12
	BGE	X12, X9, b_loop64
	BEQZ	X12, done

b_loop32_check:
	MOV	$32, X9
	BLT	X12, X9, b_loop16_check
b_loop32:
	ADD	$-32, X10
	ADD	$-32, X11
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	ADD	$-32, X12
	BGE	X12, X9, b_loop32
	BEQZ	X12, done

b_loop16_check:
	MOV	$16, X9
	BLT	X12, X9, b_loop8_check
b_loop16:
	ADD	$-16, X10
	ADD	$-16, X11
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	ADD	$-16, X12
	BGE	X12, X9, b_loop16
	BEQZ	X12, done

b_loop8_check:
	MOV	$8, X9
	BLT	X12, X9, b_loop4_check
b_loop8:
	ADD	$-8, X10
	ADD	$-8, X11
	MOV	0(X11), X14
	MOV	X14, 0(X10)
	ADD	$-8, X12
	BGE	X12, X9, b_loop8
	BEQZ	X12, done
	JMP	b_loop4_check

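	// Source and destination have different alignments, so 8 byte
	// aligned loads and stores are not possible: copy 8 bytes per
	// iteration using individual byte loads and stores, working backward.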
b_loop8_unaligned_check:
	MOV	$8, X9
	BLT	X12, X9, b_loop4_check
b_loop8_unaligned:
	ADD	$-8, X10
	ADD	$-8, X11
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	4(X11), X18
	MOVB	5(X11), X19
	MOVB	6(X11), X20
	MOVB	7(X11), X21
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	MOVB	X18, 4(X10)
	MOVB	X19, 5(X10)
	MOVB	X20, 6(X10)
	MOVB	X21, 7(X10)
	ADD	$-8, X12
	BGE	X12, X9, b_loop8_unaligned

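	// Copy any remaining tail four bytes at a time, then byte by byte,
	// working backward.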
b_loop4_check:
	MOV	$4, X9
	BLT	X12, X9, b_loop1
b_loop4:
	ADD	$-4, X10
	ADD	$-4, X11
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	ADD	$-4, X12
	BGE	X12, X9, b_loop4

b_loop1:
	BEQZ	X12, done
	ADD	$-1, X10
	ADD	$-1, X11
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	ADD	$-1, X12
	JMP	b_loop1

done:
	RET