go/src/runtime/memmove_riscv64.s
Mark Ryan 9552a1122f runtime: fix alignment code in memmove_riscv64.s
The riscv64 implementation of memmove has two optimizations that are
applied when the source and destination pointers share the same alignment
but that alignment is not 8 bytes. Both optimizations attempt to align the
source and destination pointers to an 8 byte boundary before performing
8 byte aligned loads and stores. Both optimizations are incorrect. The
first optimization is applied when the destination pointer is smaller than
the source pointer. In this case the code increments both pointers by
(pointer & 3) bytes rather than (8 - (pointer & 7)) bytes. The second
optimization is applied when the destination pointer is larger than the
source pointer. In this case the existing code decrements the pointers by
(pointer & 3) bytes instead of (pointer & 7) bytes.
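
To make the arithmetic concrete, here is a small illustrative Go snippet
(not part of the patch; the real logic is in assembly). For a pointer p
that is not already 8 byte aligned, a forward copy must first move
8 - (p & 7) bytes, while a backward copy, where p points one past the end
of the region, must first move p & 7 bytes; the old code moved p & 3 bytes
in both cases:

package main

import "fmt"

func main() {
	// Toy illustration of the alignment arithmetic described above; the
	// real code is RISC-V assembly, not Go.
	for _, p := range []uintptr{0x1001, 0x1004, 0x1007} {
		old := p & 3           // what the broken code advanced by
		forward := 8 - (p & 7) // head bytes before a forward copy is 8 byte aligned
		backward := p & 7      // head bytes before a backward copy is 8 byte aligned
		fmt.Printf("p=%#x old=%d forward=%d backward=%d\n", p, old, forward, backward)
	}
}

Note that for p & 7 == 4, for example, the old computation yields 0, so
the pointers were not adjusted at all before the 8 byte loads and stores.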

This commit fixes both optimizations, avoiding unaligned 8 byte accesses.
As this particular code path is not covered by any of the existing
benchmarks, a new benchmark, BenchmarkMemmoveUnalignedSrcDst, is provided
that exercises both optimizations (a sketch of such a benchmark follows
the results). Results of the new benchmark, run on a SiFive HiFive
Unmatched A00 with 16GB of RAM running Ubuntu 23.04, are presented below
(old timings in the first column, new timings in the second).

MemmoveUnalignedSrcDst/f_16_0-4            39.48n ±  5%   43.47n ± 2%  +10.13% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_0-4            45.39n ±  5%   41.55n ± 4%   -8.47% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_16_1-4          1230.50n ±  1%   83.44n ± 5%  -93.22% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_1-4            69.34n ±  4%   67.83n ± 8%        ~ (p=0.436 n=10)
MemmoveUnalignedSrcDst/f_16_4-4          2349.00n ±  1%   72.09n ± 4%  -96.93% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_4-4          2357.00n ±  0%   77.61n ± 4%  -96.71% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_16_7-4          1235.00n ±  0%   62.02n ± 2%  -94.98% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_7-4          1246.00n ±  0%   84.05n ± 6%  -93.25% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_64_0-4            49.96n ±  2%   50.01n ± 2%        ~ (p=0.755 n=10)
MemmoveUnalignedSrcDst/b_64_0-4            52.06n ±  3%   51.65n ± 3%        ~ (p=0.631 n=10)
MemmoveUnalignedSrcDst/f_64_1-4          8105.50n ±  0%   97.63n ± 1%  -98.80% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_1-4            84.07n ±  4%   84.90n ± 5%        ~ (p=0.315 n=10)
MemmoveUnalignedSrcDst/f_64_4-4          9192.00n ±  0%   86.16n ± 3%  -99.06% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_4-4          9195.50n ±  1%   91.88n ± 5%  -99.00% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_64_7-4          8106.50n ±  0%   78.44n ± 9%  -99.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_7-4          8107.00n ±  0%   99.19n ± 1%  -98.78% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_256_0-4           90.95n ±  1%   92.16n ± 8%        ~ (p=0.123 n=10)
MemmoveUnalignedSrcDst/b_256_0-4           96.09n ± 12%   94.90n ± 2%        ~ (p=0.143 n=10)
MemmoveUnalignedSrcDst/f_256_1-4         35492.5n ±  0%   133.5n ± 0%  -99.62% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_1-4           128.7n ±  1%   130.1n ± 1%   +1.13% (p=0.005 n=10)
MemmoveUnalignedSrcDst/f_256_4-4         36599.0n ±  0%   123.0n ± 1%  -99.66% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_4-4         36675.5n ±  0%   130.7n ± 1%  -99.64% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_256_7-4         35555.5n ±  0%   121.6n ± 2%  -99.66% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_7-4         35584.0n ±  0%   139.1n ± 1%  -99.61% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_4096_0-4          956.3n ±  2%   960.8n ± 1%        ~ (p=0.306 n=10)
MemmoveUnalignedSrcDst/b_4096_0-4          1.015µ ±  2%   1.012µ ± 2%        ~ (p=0.076 n=10)
MemmoveUnalignedSrcDst/f_4096_1-4        584.406µ ±  0%   1.002µ ± 1%  -99.83% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_1-4          1.044µ ±  1%   1.040µ ± 2%        ~ (p=0.090 n=10)
MemmoveUnalignedSrcDst/f_4096_4-4       585113.5n ±  0%   988.6n ± 2%  -99.83% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_4-4        586.521µ ±  0%   1.044µ ± 1%  -99.82% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_4096_7-4       585374.5n ±  0%   986.2n ± 0%  -99.83% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_7-4        584.595µ ±  1%   1.055µ ± 0%  -99.82% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_65536_0-4         54.83µ ±  0%   55.00µ ± 0%   +0.31% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_65536_0-4         56.54µ ±  0%   56.64µ ± 0%   +0.19% (p=0.011 n=10)
MemmoveUnalignedSrcDst/f_65536_1-4       9450.51µ ±  0%   58.25µ ± 0%  -99.38% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_65536_1-4         56.65µ ±  0%   56.68µ ± 0%        ~ (p=0.353 n=10)
MemmoveUnalignedSrcDst/f_65536_4-4       9449.48µ ±  0%   58.24µ ± 0%  -99.38% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_65536_4-4       9462.91µ ±  0%   56.69µ ± 0%  -99.40% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_65536_7-4       9477.37µ ±  0%   58.26µ ± 0%  -99.39% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_65536_7-4       9467.96µ ±  0%   56.68µ ± 0%  -99.40% (p=0.000 n=10)
geomean                                    11.16µ         509.8n       -95.43%
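
For reference, the following is a rough sketch of how such a benchmark can
be structured. It is an illustrative approximation, not the actual code in
the runtime's tests; the sub-benchmark names f_<size>_<offset> and
b_<size>_<offset> mirror the results above, with the forward case copying
to a lower address and the backward case copying to a higher one:

package memmove_test

import (
	"fmt"
	"testing"
)

func BenchmarkMemmoveUnalignedSrcDstSketch(b *testing.B) {
	for _, n := range []int{16, 64, 256, 4096, 65536} {
		// A single backing array keeps the relative position of source and
		// destination fixed; n+8 is a multiple of 8, so lo[off] and hi[off]
		// always share the same offset from 8 byte alignment.
		buf := make([]byte, (n+8)*2)
		lo, hi := buf[:n+8], buf[n+8:]
		for _, off := range []int{0, 1, 4, 7} {
			n, off := n, off
			b.Run(fmt.Sprintf("f_%d_%d", n, off), func(b *testing.B) {
				b.SetBytes(int64(n))
				for i := 0; i < b.N; i++ {
					copy(lo[off:off+n], hi[off:off+n]) // dst below src: forward path
				}
			})
			b.Run(fmt.Sprintf("b_%d_%d", n, off), func(b *testing.B) {
				b.SetBytes(int64(n))
				for i := 0; i < b.N; i++ {
					copy(hi[off:off+n], lo[off:off+n]) // dst above src: backward path
				}
			})
		}
	}
}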

Change-Id: Idfa1873b81fece3b2b1a0aed398fa5663cc73b83
Reviewed-on: https://go-review.googlesource.com/c/go/+/498377
Run-TryBot: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2023-05-31 19:56:05 +00:00


// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
	// X10 = to
	// X11 = from
	// X12 = n
	BEQ X10, X11, done
	BEQZ X12, done

	// If the destination is ahead of the source, start at the end of the
	// buffer and go backward.
	BGTU X10, X11, backward

	// If less than 8 bytes, do single byte copies.
	MOV $8, X9
	BLT X12, X9, f_loop4_check

	// Check alignment - if alignment differs we have to do one byte at a time.
	AND $7, X10, X5
	AND $7, X11, X6
	BNE X5, X6, f_loop8_unaligned_check
	BEQZ X5, f_loop_check

	// Move one byte at a time until we reach 8 byte alignment.
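	// X9 holds 8 at this point, so the SUB below leaves X5 = 8 - (dst & 7),
	// the number of head bytes to copy before both pointers are 8 byte
	// aligned; X12 is reduced by the same amount.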
	SUB X5, X9, X5
	SUB X5, X12, X12
f_align:
	ADD $-1, X5
	MOVB 0(X11), X14
	MOVB X14, 0(X10)
	ADD $1, X10
	ADD $1, X11
	BNEZ X5, f_align

f_loop_check:
	MOV $16, X9
	BLT X12, X9, f_loop8_check
	MOV $32, X9
	BLT X12, X9, f_loop16_check
	MOV $64, X9
	BLT X12, X9, f_loop32_check
f_loop64:
	MOV 0(X11), X14
	MOV 8(X11), X15
	MOV 16(X11), X16
	MOV 24(X11), X17
	MOV 32(X11), X18
	MOV 40(X11), X19
	MOV 48(X11), X20
	MOV 56(X11), X21
	MOV X14, 0(X10)
	MOV X15, 8(X10)
	MOV X16, 16(X10)
	MOV X17, 24(X10)
	MOV X18, 32(X10)
	MOV X19, 40(X10)
	MOV X20, 48(X10)
	MOV X21, 56(X10)
	ADD $64, X10
	ADD $64, X11
	ADD $-64, X12
	BGE X12, X9, f_loop64
	BEQZ X12, done

f_loop32_check:
	MOV $32, X9
	BLT X12, X9, f_loop16_check
f_loop32:
	MOV 0(X11), X14
	MOV 8(X11), X15
	MOV 16(X11), X16
	MOV 24(X11), X17
	MOV X14, 0(X10)
	MOV X15, 8(X10)
	MOV X16, 16(X10)
	MOV X17, 24(X10)
	ADD $32, X10
	ADD $32, X11
	ADD $-32, X12
	BGE X12, X9, f_loop32
	BEQZ X12, done

f_loop16_check:
	MOV $16, X9
	BLT X12, X9, f_loop8_check
f_loop16:
	MOV 0(X11), X14
	MOV 8(X11), X15
	MOV X14, 0(X10)
	MOV X15, 8(X10)
	ADD $16, X10
	ADD $16, X11
	ADD $-16, X12
	BGE X12, X9, f_loop16
	BEQZ X12, done

f_loop8_check:
	MOV $8, X9
	BLT X12, X9, f_loop4_check
f_loop8:
	MOV 0(X11), X14
	MOV X14, 0(X10)
	ADD $8, X10
	ADD $8, X11
	ADD $-8, X12
	BGE X12, X9, f_loop8
	BEQZ X12, done
	JMP f_loop4_check

f_loop8_unaligned_check:
	MOV $8, X9
	BLT X12, X9, f_loop4_check
f_loop8_unaligned:
	MOVB 0(X11), X14
	MOVB 1(X11), X15
	MOVB 2(X11), X16
	MOVB 3(X11), X17
	MOVB 4(X11), X18
	MOVB 5(X11), X19
	MOVB 6(X11), X20
	MOVB 7(X11), X21
	MOVB X14, 0(X10)
	MOVB X15, 1(X10)
	MOVB X16, 2(X10)
	MOVB X17, 3(X10)
	MOVB X18, 4(X10)
	MOVB X19, 5(X10)
	MOVB X20, 6(X10)
	MOVB X21, 7(X10)
	ADD $8, X10
	ADD $8, X11
	ADD $-8, X12
	BGE X12, X9, f_loop8_unaligned

f_loop4_check:
	MOV $4, X9
	BLT X12, X9, f_loop1
f_loop4:
	MOVB 0(X11), X14
	MOVB 1(X11), X15
	MOVB 2(X11), X16
	MOVB 3(X11), X17
	MOVB X14, 0(X10)
	MOVB X15, 1(X10)
	MOVB X16, 2(X10)
	MOVB X17, 3(X10)
	ADD $4, X10
	ADD $4, X11
	ADD $-4, X12
	BGE X12, X9, f_loop4
f_loop1:
	BEQZ X12, done
	MOVB 0(X11), X14
	MOVB X14, 0(X10)
	ADD $1, X10
	ADD $1, X11
	ADD $-1, X12
	JMP f_loop1

backward:
	ADD X10, X12, X10
	ADD X11, X12, X11

	// If less than 8 bytes, do single byte copies.
	MOV $8, X9
	BLT X12, X9, b_loop4_check

	// Check alignment - if alignment differs we have to do one byte at a time.
	AND $7, X10, X5
	AND $7, X11, X6
	BNE X5, X6, b_loop8_unaligned_check
	BEQZ X5, b_loop_check

	// Move one byte at a time until we reach 8 byte alignment.
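	// The pointers point one past the end of their buffers here, so only
	// dst & 7 bytes (already in X5) need to be copied before both pointers
	// are 8 byte aligned.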
	SUB X5, X12, X12
b_align:
	ADD $-1, X5
	ADD $-1, X10
	ADD $-1, X11
	MOVB 0(X11), X14
	MOVB X14, 0(X10)
	BNEZ X5, b_align

b_loop_check:
	MOV $16, X9
	BLT X12, X9, b_loop8_check
	MOV $32, X9
	BLT X12, X9, b_loop16_check
	MOV $64, X9
	BLT X12, X9, b_loop32_check
b_loop64:
	ADD $-64, X10
	ADD $-64, X11
	MOV 0(X11), X14
	MOV 8(X11), X15
	MOV 16(X11), X16
	MOV 24(X11), X17
	MOV 32(X11), X18
	MOV 40(X11), X19
	MOV 48(X11), X20
	MOV 56(X11), X21
	MOV X14, 0(X10)
	MOV X15, 8(X10)
	MOV X16, 16(X10)
	MOV X17, 24(X10)
	MOV X18, 32(X10)
	MOV X19, 40(X10)
	MOV X20, 48(X10)
	MOV X21, 56(X10)
	ADD $-64, X12
	BGE X12, X9, b_loop64
	BEQZ X12, done

b_loop32_check:
	MOV $32, X9
	BLT X12, X9, b_loop16_check
b_loop32:
	ADD $-32, X10
	ADD $-32, X11
	MOV 0(X11), X14
	MOV 8(X11), X15
	MOV 16(X11), X16
	MOV 24(X11), X17
	MOV X14, 0(X10)
	MOV X15, 8(X10)
	MOV X16, 16(X10)
	MOV X17, 24(X10)
	ADD $-32, X12
	BGE X12, X9, b_loop32
	BEQZ X12, done

b_loop16_check:
	MOV $16, X9
	BLT X12, X9, b_loop8_check
b_loop16:
	ADD $-16, X10
	ADD $-16, X11
	MOV 0(X11), X14
	MOV 8(X11), X15
	MOV X14, 0(X10)
	MOV X15, 8(X10)
	ADD $-16, X12
	BGE X12, X9, b_loop16
	BEQZ X12, done

b_loop8_check:
	MOV $8, X9
	BLT X12, X9, b_loop4_check
b_loop8:
	ADD $-8, X10
	ADD $-8, X11
	MOV 0(X11), X14
	MOV X14, 0(X10)
	ADD $-8, X12
	BGE X12, X9, b_loop8
	BEQZ X12, done
	JMP b_loop4_check

b_loop8_unaligned_check:
	MOV $8, X9
	BLT X12, X9, b_loop4_check
b_loop8_unaligned:
	ADD $-8, X10
	ADD $-8, X11
	MOVB 0(X11), X14
	MOVB 1(X11), X15
	MOVB 2(X11), X16
	MOVB 3(X11), X17
	MOVB 4(X11), X18
	MOVB 5(X11), X19
	MOVB 6(X11), X20
	MOVB 7(X11), X21
	MOVB X14, 0(X10)
	MOVB X15, 1(X10)
	MOVB X16, 2(X10)
	MOVB X17, 3(X10)
	MOVB X18, 4(X10)
	MOVB X19, 5(X10)
	MOVB X20, 6(X10)
	MOVB X21, 7(X10)
	ADD $-8, X12
	BGE X12, X9, b_loop8_unaligned

b_loop4_check:
	MOV $4, X9
	BLT X12, X9, b_loop1
b_loop4:
	ADD $-4, X10
	ADD $-4, X11
	MOVB 0(X11), X14
	MOVB 1(X11), X15
	MOVB 2(X11), X16
	MOVB 3(X11), X17
	MOVB X14, 0(X10)
	MOVB X15, 1(X10)
	MOVB X16, 2(X10)
	MOVB X17, 3(X10)
	ADD $-4, X12
	BGE X12, X9, b_loop4
b_loop1:
	BEQZ X12, done
	ADD $-1, X10
	ADD $-1, X11
	MOVB 0(X11), X14
	MOVB X14, 0(X10)
	ADD $-1, X12
	JMP b_loop1

done:
	RET