1
0
mirror of https://github.com/golang/go synced 2024-11-23 00:40:08 -07:00
go/src/runtime/memmove_riscv64.s

320 lines
5.5 KiB
ArmAsm
Raw Normal View History

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// See memmove Go doc for important implementation constraints.
// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
// X10 = to
// X11 = from
// X12 = n
BEQ X10, X11, done
BEQZ X12, done
// If the destination is ahead of the source, start at the end of the
// buffer and go backward.
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BGTU X10, X11, backward
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
// If less than 8 bytes, do single byte copies.
MOV $8, X9
BLT X12, X9, f_loop4_check
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
// Check alignment - if alignment differs we have to do one byte at a time.
runtime: fix alignment code in memmove_riscv64.s The riscv64 implementation of memmove has two optimizations that are applied when both source and destination pointers share the same alignment but that alignment is not 8 bytes. Both optimizations attempt to align the source and destination pointers to 8 byte boundaries before performing 8 byte aligned loads and stores. Both optimizations are incorrect. The first optimization is applied when the destination pointer is smaller than the source pointer. In this case the code increments both pointers by (pointer & 3) bytes rather than (8 - (pointer & 7)) bytes. The second optimization is applied when the destination pointer is larger than the source pointer. In this case the existing code decrements the pointers by (pointer & 3) bytes instead of (pointer & 7). This commit fixes both optimizations avoiding unaligned 8 byte accesses. As this particular optimization is not covered by any of the existing benchmarks a new benchmark, BenchmarkMemmoveUnalignedSrcDst, is provided that exercises both optimizations. Results of the new benchmark, which were run on a SiFive HiFive Unmatched A00 with 16GB of RAM running Ubuntu 23.04 are presented below. 
MemmoveUnalignedSrcDst/f_16_0-4 39.48n ± 5% 43.47n ± 2% +10.13% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_0-4 45.39n ± 5% 41.55n ± 4% -8.47% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_16_1-4 1230.50n ± 1% 83.44n ± 5% -93.22% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_1-4 69.34n ± 4% 67.83n ± 8% ~ (p=0.436 n=10) MemmoveUnalignedSrcDst/f_16_4-4 2349.00n ± 1% 72.09n ± 4% -96.93% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_4-4 2357.00n ± 0% 77.61n ± 4% -96.71% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_16_7-4 1235.00n ± 0% 62.02n ± 2% -94.98% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_7-4 1246.00n ± 0% 84.05n ± 6% -93.25% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_64_0-4 49.96n ± 2% 50.01n ± 2% ~ (p=0.755 n=10) MemmoveUnalignedSrcDst/b_64_0-4 52.06n ± 3% 51.65n ± 3% ~ (p=0.631 n=10) MemmoveUnalignedSrcDst/f_64_1-4 8105.50n ± 0% 97.63n ± 1% -98.80% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_64_1-4 84.07n ± 4% 84.90n ± 5% ~ (p=0.315 n=10) MemmoveUnalignedSrcDst/f_64_4-4 9192.00n ± 0% 86.16n ± 3% -99.06% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_64_4-4 9195.50n ± 1% 91.88n ± 5% -99.00% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_64_7-4 8106.50n ± 0% 78.44n ± 9% -99.03% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_64_7-4 8107.00n ± 0% 99.19n ± 1% -98.78% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_256_0-4 90.95n ± 1% 92.16n ± 8% ~ (p=0.123 n=10) MemmoveUnalignedSrcDst/b_256_0-4 96.09n ± 12% 94.90n ± 2% ~ (p=0.143 n=10) MemmoveUnalignedSrcDst/f_256_1-4 35492.5n ± 0% 133.5n ± 0% -99.62% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_256_1-4 128.7n ± 1% 130.1n ± 1% +1.13% (p=0.005 n=10) MemmoveUnalignedSrcDst/f_256_4-4 36599.0n ± 0% 123.0n ± 1% -99.66% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_256_4-4 36675.5n ± 0% 130.7n ± 1% -99.64% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_256_7-4 35555.5n ± 0% 121.6n ± 2% -99.66% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_256_7-4 35584.0n ± 0% 139.1n ± 1% -99.61% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_4096_0-4 956.3n ± 2% 960.8n ± 1% ~ (p=0.306 n=10) 
MemmoveUnalignedSrcDst/b_4096_0-4 1.015µ ± 2% 1.012µ ± 2% ~ (p=0.076 n=10) MemmoveUnalignedSrcDst/f_4096_1-4 584.406µ ± 0% 1.002µ ± 1% -99.83% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_4096_1-4 1.044µ ± 1% 1.040µ ± 2% ~ (p=0.090 n=10) MemmoveUnalignedSrcDst/f_4096_4-4 585113.5n ± 0% 988.6n ± 2% -99.83% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_4096_4-4 586.521µ ± 0% 1.044µ ± 1% -99.82% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_4096_7-4 585374.5n ± 0% 986.2n ± 0% -99.83% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_4096_7-4 584.595µ ± 1% 1.055µ ± 0% -99.82% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_65536_0-4 54.83µ ± 0% 55.00µ ± 0% +0.31% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_0-4 56.54µ ± 0% 56.64µ ± 0% +0.19% (p=0.011 n=10) MemmoveUnalignedSrcDst/f_65536_1-4 9450.51µ ± 0% 58.25µ ± 0% -99.38% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_1-4 56.65µ ± 0% 56.68µ ± 0% ~ (p=0.353 n=10) MemmoveUnalignedSrcDst/f_65536_4-4 9449.48µ ± 0% 58.24µ ± 0% -99.38% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_4-4 9462.91µ ± 0% 56.69µ ± 0% -99.40% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_65536_7-4 9477.37µ ± 0% 58.26µ ± 0% -99.39% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_7-4 9467.96µ ± 0% 56.68µ ± 0% -99.40% (p=0.000 n=10) geomean 11.16µ 509.8n -95.43% Change-Id: Idfa1873b81fece3b2b1a0aed398fa5663cc73b83 Reviewed-on: https://go-review.googlesource.com/c/go/+/498377 Run-TryBot: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Keith Randall <khr@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2023-05-26 02:51:21 -06:00
AND $7, X10, X5
AND $7, X11, X6
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BNE X5, X6, f_loop8_unaligned_check
BEQZ X5, f_loop_check
// Move one byte at a time until we reach 8 byte alignment.
runtime: fix alignment code in memmove_riscv64.s The riscv64 implementation of memmove has two optimizations that are applied when both source and destination pointers share the same alignment but that alignment is not 8 bytes. Both optimizations attempt to align the source and destination pointers to 8 byte boundaries before performing 8 byte aligned loads and stores. Both optimizations are incorrect. The first optimization is applied when the destination pointer is smaller than the source pointer. In this case the code increments both pointers by (pointer & 3) bytes rather than (8 - (pointer & 7)) bytes. The second optimization is applied when the destination pointer is larger than the source pointer. In this case the existing code decrements the pointers by (pointer & 3) bytes instead of (pointer & 7). This commit fixes both optimizations avoiding unaligned 8 byte accesses. As this particular optimization is not covered by any of the existing benchmarks a new benchmark, BenchmarkMemmoveUnalignedSrcDst, is provided that exercises both optimizations. Results of the new benchmark, which were run on a SiFive HiFive Unmatched A00 with 16GB of RAM running Ubuntu 23.04 are presented below. 
MemmoveUnalignedSrcDst/f_16_0-4 39.48n ± 5% 43.47n ± 2% +10.13% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_0-4 45.39n ± 5% 41.55n ± 4% -8.47% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_16_1-4 1230.50n ± 1% 83.44n ± 5% -93.22% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_1-4 69.34n ± 4% 67.83n ± 8% ~ (p=0.436 n=10) MemmoveUnalignedSrcDst/f_16_4-4 2349.00n ± 1% 72.09n ± 4% -96.93% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_4-4 2357.00n ± 0% 77.61n ± 4% -96.71% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_16_7-4 1235.00n ± 0% 62.02n ± 2% -94.98% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_7-4 1246.00n ± 0% 84.05n ± 6% -93.25% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_64_0-4 49.96n ± 2% 50.01n ± 2% ~ (p=0.755 n=10) MemmoveUnalignedSrcDst/b_64_0-4 52.06n ± 3% 51.65n ± 3% ~ (p=0.631 n=10) MemmoveUnalignedSrcDst/f_64_1-4 8105.50n ± 0% 97.63n ± 1% -98.80% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_64_1-4 84.07n ± 4% 84.90n ± 5% ~ (p=0.315 n=10) MemmoveUnalignedSrcDst/f_64_4-4 9192.00n ± 0% 86.16n ± 3% -99.06% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_64_4-4 9195.50n ± 1% 91.88n ± 5% -99.00% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_64_7-4 8106.50n ± 0% 78.44n ± 9% -99.03% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_64_7-4 8107.00n ± 0% 99.19n ± 1% -98.78% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_256_0-4 90.95n ± 1% 92.16n ± 8% ~ (p=0.123 n=10) MemmoveUnalignedSrcDst/b_256_0-4 96.09n ± 12% 94.90n ± 2% ~ (p=0.143 n=10) MemmoveUnalignedSrcDst/f_256_1-4 35492.5n ± 0% 133.5n ± 0% -99.62% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_256_1-4 128.7n ± 1% 130.1n ± 1% +1.13% (p=0.005 n=10) MemmoveUnalignedSrcDst/f_256_4-4 36599.0n ± 0% 123.0n ± 1% -99.66% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_256_4-4 36675.5n ± 0% 130.7n ± 1% -99.64% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_256_7-4 35555.5n ± 0% 121.6n ± 2% -99.66% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_256_7-4 35584.0n ± 0% 139.1n ± 1% -99.61% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_4096_0-4 956.3n ± 2% 960.8n ± 1% ~ (p=0.306 n=10) 
MemmoveUnalignedSrcDst/b_4096_0-4 1.015µ ± 2% 1.012µ ± 2% ~ (p=0.076 n=10) MemmoveUnalignedSrcDst/f_4096_1-4 584.406µ ± 0% 1.002µ ± 1% -99.83% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_4096_1-4 1.044µ ± 1% 1.040µ ± 2% ~ (p=0.090 n=10) MemmoveUnalignedSrcDst/f_4096_4-4 585113.5n ± 0% 988.6n ± 2% -99.83% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_4096_4-4 586.521µ ± 0% 1.044µ ± 1% -99.82% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_4096_7-4 585374.5n ± 0% 986.2n ± 0% -99.83% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_4096_7-4 584.595µ ± 1% 1.055µ ± 0% -99.82% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_65536_0-4 54.83µ ± 0% 55.00µ ± 0% +0.31% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_0-4 56.54µ ± 0% 56.64µ ± 0% +0.19% (p=0.011 n=10) MemmoveUnalignedSrcDst/f_65536_1-4 9450.51µ ± 0% 58.25µ ± 0% -99.38% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_1-4 56.65µ ± 0% 56.68µ ± 0% ~ (p=0.353 n=10) MemmoveUnalignedSrcDst/f_65536_4-4 9449.48µ ± 0% 58.24µ ± 0% -99.38% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_4-4 9462.91µ ± 0% 56.69µ ± 0% -99.40% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_65536_7-4 9477.37µ ± 0% 58.26µ ± 0% -99.39% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_7-4 9467.96µ ± 0% 56.68µ ± 0% -99.40% (p=0.000 n=10) geomean 11.16µ 509.8n -95.43% Change-Id: Idfa1873b81fece3b2b1a0aed398fa5663cc73b83 Reviewed-on: https://go-review.googlesource.com/c/go/+/498377 Run-TryBot: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Keith Randall <khr@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2023-05-26 02:51:21 -06:00
SUB X5, X9, X5
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
SUB X5, X12, X12
f_align:
SUB $1, X5
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
MOVB 0(X11), X14
MOVB X14, 0(X10)
ADD $1, X10
ADD $1, X11
BNEZ X5, f_align
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
f_loop_check:
MOV $16, X9
BLT X12, X9, f_loop8_check
MOV $32, X9
BLT X12, X9, f_loop16_check
MOV $64, X9
BLT X12, X9, f_loop32_check
f_loop64:
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV 24(X11), X17
MOV 32(X11), X18
MOV 40(X11), X19
MOV 48(X11), X20
MOV 56(X11), X21
MOV X14, 0(X10)
MOV X15, 8(X10)
MOV X16, 16(X10)
MOV X17, 24(X10)
MOV X18, 32(X10)
MOV X19, 40(X10)
MOV X20, 48(X10)
MOV X21, 56(X10)
ADD $64, X10
ADD $64, X11
SUB $64, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BGE X12, X9, f_loop64
BEQZ X12, done
f_loop32_check:
MOV $32, X9
BLT X12, X9, f_loop16_check
f_loop32:
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV 24(X11), X17
MOV X14, 0(X10)
MOV X15, 8(X10)
MOV X16, 16(X10)
MOV X17, 24(X10)
ADD $32, X10
ADD $32, X11
SUB $32, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BGE X12, X9, f_loop32
BEQZ X12, done
f_loop16_check:
MOV $16, X9
BLT X12, X9, f_loop8_check
f_loop16:
MOV 0(X11), X14
MOV 8(X11), X15
MOV X14, 0(X10)
MOV X15, 8(X10)
ADD $16, X10
ADD $16, X11
SUB $16, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
// Closing branch of the f_loop16 loop, whose body lies above this point:
// go round again while the count in X12 is still >= the threshold in X9.
// NOTE(review): X9 is loaded outside this section — presumably 16; confirm.
BGE X12, X9, f_loop16
// Nothing left to copy: finish.
BEQZ X12, done
// 8-byte stage. Throughout this code X11 is the address loaded from (source),
// X10 the address stored to (destination) and X12 the number of bytes still
// to be copied; both addresses move upward as bytes are copied.
f_loop8_check:
// Fewer than 8 bytes left: skip to the 4-byte stage.
MOV $8, X9
BLT X12, X9, f_loop4_check
f_loop8:
// Move one 8-byte word from 0(X11) to 0(X10) via X14.
// NOTE(review): a full-width MOV presumably relies on both addresses being
// 8-byte aligned here; the alignment test is not visible in this section.
MOV 0(X11), X14
MOV X14, 0(X10)
// Step both addresses past the word and take 8 off the remaining count.
// The loop is closed by the BGE to f_loop8 that follows.
ADD $8, X10
ADD $8, X11
SUB $8, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penality. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
// Closing branch of f_loop8: X9 holds 8 (set at f_loop8_check), so keep
// copying words while at least 8 bytes remain in X12.
BGE X12, X9, f_loop8
// Count reached exactly zero: finish.
BEQZ X12, done
// 1..7 bytes left: hand over to the smaller stages, jumping over the
// unaligned 8-byte stage below.
JMP f_loop4_check
// 8-bytes-per-iteration stage that touches memory one byte at a time, so no
// access is wider than a byte. X11 is the source address, X10 the destination
// address, X12 the remaining byte count.
f_loop8_unaligned_check:
// Fewer than 8 bytes left: skip to the 4-byte stage.
MOV $8, X9
BLT X12, X9, f_loop4_check
f_loop8_unaligned:
// Load bytes 0..7 from the source into X14..X21. All eight loads are issued
// before any store.
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB 3(X11), X17
MOVB 4(X11), X18
MOVB 5(X11), X19
MOVB 6(X11), X20
MOVB 7(X11), X21
// Store the same eight bytes at offsets 0..7 of the destination.
MOVB X14, 0(X10)
MOVB X15, 1(X10)
MOVB X16, 2(X10)
MOVB X17, 3(X10)
MOVB X18, 4(X10)
MOVB X19, 5(X10)
MOVB X20, 6(X10)
MOVB X21, 7(X10)
// Step both addresses past the 8 bytes and take 8 off the remaining count.
// The loop is closed by the BGE to f_loop8_unaligned that follows.
ADD $8, X10
ADD $8, X11
SUB $8, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penality. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
// Closing branch of f_loop8_unaligned: X9 holds 8 (set at
// f_loop8_unaligned_check), so repeat while at least 8 bytes remain in X12.
BGE X12, X9, f_loop8_unaligned
// 4-byte stage, reached by fall-through from above and by the branches to
// f_loop4_check in the 8-byte stages. X11 is the source address, X10 the
// destination address, X12 the remaining byte count.
f_loop4_check:
// Fewer than 4 bytes left: go straight to the single-byte loop.
MOV $4, X9
BLT X12, X9, f_loop1
f_loop4:
// Load bytes 0..3 from the source into X14..X17 (all loads before any store).
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB 3(X11), X17
// Store them at offsets 0..3 of the destination.
MOVB X14, 0(X10)
MOVB X15, 1(X10)
MOVB X16, 2(X10)
MOVB X17, 3(X10)
// Step both addresses past the 4 bytes and take 4 off the remaining count.
// The loop is closed by the BGE to f_loop4 that follows.
ADD $4, X10
ADD $4, X11
SUB $4, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penality. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
// Closing branch of f_loop4: X9 holds 4 (set at f_loop4_check), so repeat
// while at least 4 bytes remain in X12.
BGE X12, X9, f_loop4
// Single-byte stage for whatever is left. X11 is the source address, X10 the
// destination address, X12 the remaining byte count.
f_loop1:
// Nothing left to copy: finish.
BEQZ X12, done
// Copy one byte via X14.
MOVB 0(X11), X14
MOVB X14, 0(X10)
// Step both addresses by one byte and take 1 off the remaining count.
ADD $1, X10
ADD $1, X11
SUB $1, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penality. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
JMP f_loop1
backward:
ADD X10, X12, X10
ADD X11, X12, X11
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which result in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
// If less than 8 bytes, do single byte copies.
MOV $8, X9
BLT X12, X9, b_loop4_check
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which result in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
// Check alignment - if alignment differs we have to do one byte at a time.
runtime: fix alignment code in memmove_riscv64.s The riscv64 implementation of memmove has two optimizations that are applied when both source and destination pointers share the same alignment but that alignment is not 8 bytes. Both optimizations attempt to align the source and destination pointers to 8-byte boundaries before performing 8-byte-aligned loads and stores. Both optimizations are incorrect. The first optimization is applied when the destination pointer is smaller than the source pointer. In this case, the code increments both pointers by (pointer & 3) bytes rather than (8 - (pointer & 7)) bytes. The second optimization is applied when the destination pointer is larger than the source pointer. In this case, the existing code decrements the pointers by (pointer & 3) bytes instead of (pointer & 7) bytes. This commit fixes both optimizations, avoiding unaligned 8-byte accesses. As this particular optimization is not covered by any of the existing benchmarks, a new benchmark, BenchmarkMemmoveUnalignedSrcDst, is provided that exercises both optimizations. Results of the new benchmark, which were run on a SiFive HiFive Unmatched A00 with 16GB of RAM running Ubuntu 23.04, are presented below. 
MemmoveUnalignedSrcDst/f_16_0-4 39.48n ± 5% 43.47n ± 2% +10.13% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_0-4 45.39n ± 5% 41.55n ± 4% -8.47% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_16_1-4 1230.50n ± 1% 83.44n ± 5% -93.22% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_1-4 69.34n ± 4% 67.83n ± 8% ~ (p=0.436 n=10) MemmoveUnalignedSrcDst/f_16_4-4 2349.00n ± 1% 72.09n ± 4% -96.93% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_4-4 2357.00n ± 0% 77.61n ± 4% -96.71% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_16_7-4 1235.00n ± 0% 62.02n ± 2% -94.98% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_16_7-4 1246.00n ± 0% 84.05n ± 6% -93.25% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_64_0-4 49.96n ± 2% 50.01n ± 2% ~ (p=0.755 n=10) MemmoveUnalignedSrcDst/b_64_0-4 52.06n ± 3% 51.65n ± 3% ~ (p=0.631 n=10) MemmoveUnalignedSrcDst/f_64_1-4 8105.50n ± 0% 97.63n ± 1% -98.80% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_64_1-4 84.07n ± 4% 84.90n ± 5% ~ (p=0.315 n=10) MemmoveUnalignedSrcDst/f_64_4-4 9192.00n ± 0% 86.16n ± 3% -99.06% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_64_4-4 9195.50n ± 1% 91.88n ± 5% -99.00% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_64_7-4 8106.50n ± 0% 78.44n ± 9% -99.03% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_64_7-4 8107.00n ± 0% 99.19n ± 1% -98.78% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_256_0-4 90.95n ± 1% 92.16n ± 8% ~ (p=0.123 n=10) MemmoveUnalignedSrcDst/b_256_0-4 96.09n ± 12% 94.90n ± 2% ~ (p=0.143 n=10) MemmoveUnalignedSrcDst/f_256_1-4 35492.5n ± 0% 133.5n ± 0% -99.62% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_256_1-4 128.7n ± 1% 130.1n ± 1% +1.13% (p=0.005 n=10) MemmoveUnalignedSrcDst/f_256_4-4 36599.0n ± 0% 123.0n ± 1% -99.66% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_256_4-4 36675.5n ± 0% 130.7n ± 1% -99.64% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_256_7-4 35555.5n ± 0% 121.6n ± 2% -99.66% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_256_7-4 35584.0n ± 0% 139.1n ± 1% -99.61% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_4096_0-4 956.3n ± 2% 960.8n ± 1% ~ (p=0.306 n=10) 
MemmoveUnalignedSrcDst/b_4096_0-4 1.015µ ± 2% 1.012µ ± 2% ~ (p=0.076 n=10) MemmoveUnalignedSrcDst/f_4096_1-4 584.406µ ± 0% 1.002µ ± 1% -99.83% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_4096_1-4 1.044µ ± 1% 1.040µ ± 2% ~ (p=0.090 n=10) MemmoveUnalignedSrcDst/f_4096_4-4 585113.5n ± 0% 988.6n ± 2% -99.83% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_4096_4-4 586.521µ ± 0% 1.044µ ± 1% -99.82% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_4096_7-4 585374.5n ± 0% 986.2n ± 0% -99.83% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_4096_7-4 584.595µ ± 1% 1.055µ ± 0% -99.82% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_65536_0-4 54.83µ ± 0% 55.00µ ± 0% +0.31% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_0-4 56.54µ ± 0% 56.64µ ± 0% +0.19% (p=0.011 n=10) MemmoveUnalignedSrcDst/f_65536_1-4 9450.51µ ± 0% 58.25µ ± 0% -99.38% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_1-4 56.65µ ± 0% 56.68µ ± 0% ~ (p=0.353 n=10) MemmoveUnalignedSrcDst/f_65536_4-4 9449.48µ ± 0% 58.24µ ± 0% -99.38% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_4-4 9462.91µ ± 0% 56.69µ ± 0% -99.40% (p=0.000 n=10) MemmoveUnalignedSrcDst/f_65536_7-4 9477.37µ ± 0% 58.26µ ± 0% -99.39% (p=0.000 n=10) MemmoveUnalignedSrcDst/b_65536_7-4 9467.96µ ± 0% 56.68µ ± 0% -99.40% (p=0.000 n=10) geomean 11.16µ 509.8n -95.43% Change-Id: Idfa1873b81fece3b2b1a0aed398fa5663cc73b83 Reviewed-on: https://go-review.googlesource.com/c/go/+/498377 Run-TryBot: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Keith Randall <khr@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2023-05-26 02:51:21 -06:00
AND $7, X10, X5
AND $7, X11, X6
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which result in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BNE X5, X6, b_loop8_unaligned_check
BEQZ X5, b_loop_check
// Move one byte at a time until we reach 8 byte alignment.
SUB X5, X12, X12
b_align:
SUB $1, X5
SUB $1, X10
SUB $1, X11
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
MOVB 0(X11), X14
MOVB X14, 0(X10)
BNEZ X5, b_align
b_loop_check:
MOV $16, X9
BLT X12, X9, b_loop8_check
MOV $32, X9
BLT X12, X9, b_loop16_check
MOV $64, X9
BLT X12, X9, b_loop32_check
b_loop64:
SUB $64, X10
SUB $64, X11
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV 24(X11), X17
MOV 32(X11), X18
MOV 40(X11), X19
MOV 48(X11), X20
MOV 56(X11), X21
MOV X14, 0(X10)
MOV X15, 8(X10)
MOV X16, 16(X10)
MOV X17, 24(X10)
MOV X18, 32(X10)
MOV X19, 40(X10)
MOV X20, 48(X10)
MOV X21, 56(X10)
SUB $64, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BGE X12, X9, b_loop64
BEQZ X12, done
b_loop32_check:
MOV $32, X9
BLT X12, X9, b_loop16_check
b_loop32:
SUB $32, X10
SUB $32, X11
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV 24(X11), X17
MOV X14, 0(X10)
MOV X15, 8(X10)
MOV X16, 16(X10)
MOV X17, 24(X10)
SUB $32, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BGE X12, X9, b_loop32
BEQZ X12, done
b_loop16_check:
MOV $16, X9
BLT X12, X9, b_loop8_check
b_loop16:
SUB $16, X10
SUB $16, X11
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
MOV 0(X11), X14
MOV 8(X11), X15
MOV X14, 0(X10)
MOV X15, 8(X10)
SUB $16, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BGE X12, X9, b_loop16
BEQZ X12, done
b_loop8_check:
MOV $8, X9
BLT X12, X9, b_loop4_check
b_loop8:
SUB $8, X10
SUB $8, X11
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
MOV 0(X11), X14
MOV X14, 0(X10)
SUB $8, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BGE X12, X9, b_loop8
BEQZ X12, done
JMP b_loop4_check
b_loop8_unaligned_check:
MOV $8, X9
BLT X12, X9, b_loop4_check
b_loop8_unaligned:
SUB $8, X10
SUB $8, X11
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB 3(X11), X17
MOVB 4(X11), X18
MOVB 5(X11), X19
MOVB 6(X11), X20
MOVB 7(X11), X21
MOVB X14, 0(X10)
MOVB X15, 1(X10)
MOVB X16, 2(X10)
MOVB X17, 3(X10)
MOVB X18, 4(X10)
MOVB X19, 5(X10)
MOVB X20, 6(X10)
MOVB X21, 7(X10)
SUB $8, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BGE X12, X9, b_loop8_unaligned
b_loop4_check:
MOV $4, X9
BLT X12, X9, b_loop1
b_loop4:
SUB $4, X10
SUB $4, X11
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB 3(X11), X17
MOVB X14, 0(X10)
MOVB X15, 1(X10)
MOVB X16, 2(X10)
MOVB X17, 3(X10)
SUB $4, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
BGE X12, X9, b_loop4
b_loop1:
BEQZ X12, done
SUB $1, X10
SUB $1, X11
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
MOVB 0(X11), X14
MOVB X14, 0(X10)
SUB $1, X12
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
JMP b_loop1
runtime: optimise memmove on riscv64 Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 
635MB/s _ 0% 908MB/s _ 0% +42.92% (p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) 
MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 
160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Joedian 
Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-27 13:35:31 -06:00
done:
RET