go/src/runtime/memmove_test.go

commit d5388e23b5
Author: Jonathan Swinney

runtime: improve memmove performance on arm64
Replace the memmove implementation for moves of 17 bytes or larger
with an implementation from ARM's optimized-routines project. Moves of
16 bytes or fewer are unchanged, but the registers they use are
updated to match the rest of the implementation.

This implementation makes use of new optimizations:
 - a software-pipelined loop for large (>128 byte) moves
 - a new implementation for medium-size moves (17..128 bytes)
 - address realignment when src or dst is unaligned
 - preference for aligned src (loads) or dst (stores) depending on
   the CPU (see the sketch below)

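The realignment step can be sketched in Go. The model below is
illustrative only, assuming non-overlapping buffers and hypothetical
helper names; the real implementation is arm64 assembly that also
software-pipelines the chunk loop and uses a backward variant for
overlapping moves.

package main

import (
	"bytes"
	"fmt"
	"unsafe"
)

// headLen reports how many bytes precede the next 16-byte boundary.
func headLen(p uintptr) int { return int((16 - p%16) % 16) }

// copyLargeModel models the >128-byte path: copy a short unaligned
// head so that either the loads (src) or the stores (dst) land on a
// 16-byte boundary, stream aligned 16-byte chunks, then copy the
// tail. useAlignedLoads stands in for the new CPU flag.
func copyLargeModel(dst, src []byte, useAlignedLoads bool) {
	base := uintptr(unsafe.Pointer(&dst[0])) // default: align the stores
	if useAlignedLoads {
		base = uintptr(unsafe.Pointer(&src[0])) // flag set: align the loads
	}
	head := headLen(base)
	copy(dst, src[:head]) // unaligned head, at most 15 bytes
	i := head
	for ; i+16 <= len(src); i += 16 { // aligned 16-byte chunks
		copy(dst[i:i+16], src[i:i+16])
	}
	copy(dst[i:], src[i:]) // unaligned tail
}

func main() {
	src := make([]byte, 300)
	for i := range src {
		src[i] = byte(i)
	}
	dst := make([]byte, 300)
	copyLargeModel(dst, src, false)
	fmt.Println(bytes.Equal(dst, src)) // true
}

Realigning on src moves the unaligned accesses to the store side and
vice versa; the large MemmoveUnalignedSrc improvements on Neoverse N1
below are consistent with that trade-off.
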
To support the preference for aligned loads or aligned stores, a new
CPU flag is added. This flag indicates that the detected
microarchitecture performs better with aligned loads. Some tested
CPUs did not exhibit a significant difference and are left with the
default behavior of realigning based on the destination address
(stores).
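
As a purely hypothetical sketch of that flag, assuming detection keys
off the MIDR_EL1 part number (the identifiers and the tuning choice
below are illustrative, not the shipped names or CPU list):

package main

import "fmt"

const partNeoverseN1 = 0xd0c // MIDR_EL1 part number for Neoverse N1

// preferAlignedLoads is a stand-in for the new CPU flag.
var preferAlignedLoads bool

// initAlignmentPreference picks the realignment strategy per core.
// Which cores benefit is an assumption here, not the shipped list.
func initAlignmentPreference(midrPart uint64) {
	switch midrPart {
	case partNeoverseN1:
		preferAlignedLoads = true // suppose this core favors aligned loads
	default:
		preferAlignedLoads = false // default: realign on dst (aligned stores)
	}
}

func main() {
	initAlignmentPreference(partNeoverseN1)
	fmt.Println("prefer aligned loads:", preferAlignedLoads)
}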

Neoverse N1 (Tested on Graviton 2)
name                               old time/op    new time/op     delta
Memmove/0-4                          1.88ns ± 1%     1.87ns ± 1%   -0.58%  (p=0.020 n=10+10)
Memmove/1-4                          4.40ns ± 0%     4.40ns ± 0%     ~     (all equal)
Memmove/8-4                          3.88ns ± 3%     3.80ns ± 0%   -1.97%  (p=0.001 n=10+9)
Memmove/16-4                         3.90ns ± 3%     3.80ns ± 0%   -2.49%  (p=0.000 n=10+9)
Memmove/32-4                         4.80ns ± 0%     4.40ns ± 0%   -8.33%  (p=0.000 n=9+8)
Memmove/64-4                         5.86ns ± 0%     5.00ns ± 0%  -14.76%  (p=0.000 n=8+8)
Memmove/128-4                        8.46ns ± 0%     8.06ns ± 0%   -4.62%  (p=0.000 n=10+10)
Memmove/256-4                        12.4ns ± 0%     12.2ns ± 0%   -1.61%  (p=0.000 n=10+10)
Memmove/512-4                        19.5ns ± 0%     19.1ns ± 0%   -2.05%  (p=0.000 n=10+10)
Memmove/1024-4                       33.7ns ± 0%     33.5ns ± 0%   -0.59%  (p=0.000 n=10+10)
Memmove/2048-4                       62.1ns ± 0%     59.0ns ± 0%   -4.99%  (p=0.000 n=10+10)
Memmove/4096-4                        117ns ± 1%      110ns ± 0%   -5.66%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4             6.41ns ± 0%     5.62ns ± 0%  -12.32%  (p=0.000 n=10+7)
MemmoveUnalignedDst/128-4            9.40ns ± 0%     8.34ns ± 0%  -11.24%  (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4            12.8ns ± 0%     12.8ns ± 0%     ~     (all equal)
MemmoveUnalignedDst/512-4            20.4ns ± 0%     19.7ns ± 0%   -3.43%  (p=0.000 n=9+10)
MemmoveUnalignedDst/1024-4           34.1ns ± 0%     35.1ns ± 0%   +2.93%  (p=0.000 n=9+9)
MemmoveUnalignedDst/2048-4           61.5ns ± 0%     60.4ns ± 0%   -1.77%  (p=0.000 n=10+10)
MemmoveUnalignedDst/4096-4            122ns ± 0%      113ns ± 0%   -7.38%  (p=0.002 n=8+10)
MemmoveUnalignedSrc/64-4             7.25ns ± 1%     6.26ns ± 0%  -13.64%  (p=0.000 n=9+9)
MemmoveUnalignedSrc/128-4            10.5ns ± 0%      9.7ns ± 0%   -7.52%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4            17.1ns ± 0%     17.3ns ± 0%   +1.17%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/512-4            27.0ns ± 0%     27.0ns ± 0%     ~     (all equal)
MemmoveUnalignedSrc/1024-4           46.7ns ± 0%     35.7ns ± 0%  -23.55%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/2048-4           85.2ns ± 0%     61.2ns ± 0%  -28.17%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/4096-4            162ns ± 0%      113ns ± 0%  -30.25%  (p=0.000 n=10+10)

name                               old speed      new speed       delta
Memmove/4096-4                     35.2GB/s ± 0%   37.1GB/s ± 0%   +5.56%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/1024-4         21.9GB/s ± 0%   28.7GB/s ± 0%  +30.90%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/2048-4         24.0GB/s ± 0%   33.5GB/s ± 0%  +39.18%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/4096-4         25.3GB/s ± 0%   36.2GB/s ± 0%  +43.50%  (p=0.000 n=10+7)

Cortex-A72 (Graviton 1)
name                               old time/op    new time/op    delta
Memmove/0-4                          3.06ns ± 3%    3.08ns ± 1%     ~     (p=0.958 n=10+9)
Memmove/1-4                          8.72ns ± 0%    7.85ns ± 0%   -9.98%  (p=0.002 n=8+10)
Memmove/8-4                          8.29ns ± 0%    8.29ns ± 0%     ~     (all equal)
Memmove/16-4                         8.29ns ± 0%    8.29ns ± 0%     ~     (all equal)
Memmove/32-4                         8.19ns ± 2%    8.29ns ± 0%     ~     (p=0.114 n=10+10)
Memmove/64-4                         18.3ns ± 4%    10.0ns ± 0%  -45.36%  (p=0.000 n=10+10)
Memmove/128-4                        14.8ns ± 0%    17.4ns ± 0%  +17.77%  (p=0.000 n=10+10)
Memmove/256-4                        21.8ns ± 0%    23.1ns ± 0%   +5.96%  (p=0.000 n=10+10)
Memmove/512-4                        35.8ns ± 0%    37.2ns ± 0%   +3.91%  (p=0.000 n=10+10)
Memmove/1024-4                       63.7ns ± 0%    67.2ns ± 0%   +5.49%  (p=0.000 n=10+10)
Memmove/2048-4                        126ns ± 0%     123ns ± 0%   -2.38%  (p=0.000 n=10+10)
Memmove/4096-4                        238ns ± 1%     243ns ± 1%   +1.93%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4             19.3ns ± 1%    12.0ns ± 1%  -37.49%  (p=0.000 n=10+10)
MemmoveUnalignedDst/128-4            17.2ns ± 0%    17.4ns ± 0%   +1.16%  (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4            28.2ns ± 8%    29.2ns ± 0%     ~     (p=0.352 n=10+10)
MemmoveUnalignedDst/512-4            49.8ns ± 3%    48.9ns ± 0%     ~     (p=1.000 n=10+10)
MemmoveUnalignedDst/1024-4           89.5ns ± 0%    80.5ns ± 1%  -10.02%  (p=0.000 n=10+10)
MemmoveUnalignedDst/2048-4            180ns ± 0%     127ns ± 0%  -29.44%  (p=0.000 n=9+10)
MemmoveUnalignedDst/4096-4            347ns ± 0%     244ns ± 0%  -29.59%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/128-4            16.1ns ± 0%    21.8ns ± 0%  +35.40%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4            24.9ns ± 8%    26.6ns ± 0%   +6.70%  (p=0.015 n=10+10)
MemmoveUnalignedSrc/512-4            39.4ns ± 6%    40.6ns ± 0%     ~     (p=0.352 n=10+10)
MemmoveUnalignedSrc/1024-4           72.5ns ± 0%    83.0ns ± 1%  +14.44%  (p=0.000 n=9+10)
MemmoveUnalignedSrc/2048-4            129ns ± 1%     128ns ± 1%     ~     (p=0.179 n=10+10)
MemmoveUnalignedSrc/4096-4            241ns ± 0%     253ns ± 1%   +4.99%  (p=0.000 n=9+9)

Cortex-A53 (Raspberry Pi 3)
name                               old time/op    new time/op    delta
Memmove/0-4                          11.0ns ± 0%    11.0ns ± 1%     ~     (p=0.294 n=8+10)
Memmove/1-4                          29.6ns ± 0%    28.0ns ± 1%   -5.41%  (p=0.000 n=9+10)
Memmove/8-4                          23.5ns ± 0%    22.1ns ± 0%   -6.11%  (p=0.000 n=8+8)
Memmove/16-4                         23.7ns ± 1%    22.1ns ± 0%   -6.59%  (p=0.000 n=10+8)
Memmove/32-4                         27.9ns ± 0%    27.1ns ± 0%   -3.13%  (p=0.000 n=8+8)
Memmove/64-4                         33.8ns ± 0%    31.5ns ± 1%   -6.99%  (p=0.000 n=8+10)
Memmove/128-4                        45.6ns ± 0%    44.2ns ± 1%   -3.23%  (p=0.000 n=9+10)
Memmove/256-4                        69.3ns ± 0%    69.3ns ± 0%     ~     (p=0.072 n=8+8)
Memmove/512-4                         127ns ± 0%     110ns ± 0%  -13.39%  (p=0.000 n=8+8)
Memmove/1024-4                        222ns ± 0%     205ns ± 1%   -7.66%  (p=0.000 n=7+10)
Memmove/2048-4                        411ns ± 0%     366ns ± 0%  -10.98%  (p=0.000 n=8+9)
Memmove/4096-4                        795ns ± 1%     695ns ± 1%  -12.63%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4             44.0ns ± 0%    40.5ns ± 0%   -7.93%  (p=0.000 n=8+8)
MemmoveUnalignedDst/128-4            59.6ns ± 0%    54.9ns ± 0%   -7.85%  (p=0.000 n=9+9)
MemmoveUnalignedDst/256-4            98.2ns ±11%    90.0ns ± 1%     ~     (p=0.130 n=10+10)
MemmoveUnalignedDst/512-4             161ns ± 2%     145ns ± 1%   -9.96%  (p=0.000 n=10+10)
MemmoveUnalignedDst/1024-4            281ns ± 0%     265ns ± 0%   -5.65%  (p=0.000 n=9+8)
MemmoveUnalignedDst/2048-4            528ns ± 0%     482ns ± 0%   -8.73%  (p=0.000 n=8+9)
MemmoveUnalignedDst/4096-4           1.02µs ± 1%    0.92µs ± 0%  -10.00%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/64-4             42.4ns ± 1%    40.5ns ± 0%   -4.39%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/128-4            57.4ns ± 0%    57.0ns ± 1%   -0.75%  (p=0.048 n=9+10)
MemmoveUnalignedSrc/256-4            88.1ns ± 1%    89.6ns ± 0%   +1.70%  (p=0.000 n=9+8)
MemmoveUnalignedSrc/512-4             160ns ± 2%     144ns ± 0%   -9.89%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/1024-4            286ns ± 0%     266ns ± 1%   -6.69%  (p=0.000 n=8+10)
MemmoveUnalignedSrc/2048-4            525ns ± 0%     483ns ± 1%   -7.96%  (p=0.000 n=9+10)
MemmoveUnalignedSrc/4096-4           1.01µs ± 0%    0.92µs ± 1%   -9.40%  (p=0.000 n=8+10)

Change-Id: Ia1144e9d4dfafdece6e167c5e576bf80f254c8ab
Reviewed-on: https://go-review.googlesource.com/c/go/+/243357
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Martin Möhrmann <moehrmann@google.com>
Reviewed-by: eric fang <eric.fang@arm.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-11-02 15:23:43 +00:00


// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime_test

import (
	"crypto/rand"
	"encoding/binary"
	"fmt"
	"internal/race"
	"internal/testenv"
	. "runtime"
	"sync/atomic"
	"testing"
	"unsafe"
)

func TestMemmove(t *testing.T) {
	if *flagQuick {
		t.Skip("-quick")
	}
	t.Parallel()
	size := 256
	if testing.Short() {
		size = 128 + 16
	}
	src := make([]byte, size)
	dst := make([]byte, size)
	for i := 0; i < size; i++ {
		src[i] = byte(128 + (i & 127))
	}
	for i := 0; i < size; i++ {
		dst[i] = byte(i & 127)
	}
	for n := 0; n <= size; n++ {
		for x := 0; x <= size-n; x++ { // offset in src
			for y := 0; y <= size-n; y++ { // offset in dst
				copy(dst[y:y+n], src[x:x+n])
				for i := 0; i < y; i++ {
					if dst[i] != byte(i&127) {
						t.Fatalf("prefix dst[%d] = %d", i, dst[i])
					}
				}
				for i := y; i < y+n; i++ {
					if dst[i] != byte(128+((i-y+x)&127)) {
						t.Fatalf("copied dst[%d] = %d", i, dst[i])
					}
					dst[i] = byte(i & 127) // reset dst
				}
				for i := y + n; i < size; i++ {
					if dst[i] != byte(i&127) {
						t.Fatalf("suffix dst[%d] = %d", i, dst[i])
					}
				}
			}
		}
	}
}

func TestMemmoveAlias(t *testing.T) {
	if *flagQuick {
		t.Skip("-quick")
	}
	t.Parallel()
	size := 256
	if testing.Short() {
		size = 128 + 16
	}
	buf := make([]byte, size)
	for i := 0; i < size; i++ {
		buf[i] = byte(i)
	}
	for n := 0; n <= size; n++ {
		for x := 0; x <= size-n; x++ { // src offset
			for y := 0; y <= size-n; y++ { // dst offset
				copy(buf[y:y+n], buf[x:x+n])
				for i := 0; i < y; i++ {
					if buf[i] != byte(i) {
						t.Fatalf("prefix buf[%d] = %d", i, buf[i])
					}
				}
				for i := y; i < y+n; i++ {
					if buf[i] != byte(i-y+x) {
						t.Fatalf("copied buf[%d] = %d", i, buf[i])
					}
					buf[i] = byte(i) // reset buf
				}
				for i := y + n; i < size; i++ {
					if buf[i] != byte(i) {
						t.Fatalf("suffix buf[%d] = %d", i, buf[i])
					}
				}
			}
		}
	}
}

func TestMemmoveLarge0x180000(t *testing.T) {
	if testing.Short() && testenv.Builder() == "" {
		t.Skip("-short")
	}
	t.Parallel()
	if race.Enabled {
		t.Skip("skipping large memmove test under race detector")
	}
	testSize(t, 0x180000)
}

func TestMemmoveOverlapLarge0x120000(t *testing.T) {
	if testing.Short() && testenv.Builder() == "" {
		t.Skip("-short")
	}
	t.Parallel()
	if race.Enabled {
		t.Skip("skipping large memmove test under race detector")
	}
	testOverlap(t, 0x120000)
}

func testSize(t *testing.T, size int) {
	src := make([]byte, size)
	dst := make([]byte, size)
	_, _ = rand.Read(src)
	_, _ = rand.Read(dst)

	ref := make([]byte, size)
	copyref(ref, dst)

	for n := size - 50; n > 1; n >>= 1 {
		for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
			for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
				copy(dst[y:y+n], src[x:x+n])
				copyref(ref[y:y+n], src[x:x+n])
				p := cmpb(dst, ref)
				if p >= 0 {
					t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, dst[p], ref[p])
				}
			}
		}
	}
}

func testOverlap(t *testing.T, size int) {
	src := make([]byte, size)
	test := make([]byte, size)
	ref := make([]byte, size)
	_, _ = rand.Read(src)

	for n := size - 50; n > 1; n >>= 1 {
		for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
			for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
				// Reset input
				copyref(test, src)
				copyref(ref, src)
				copy(test[y:y+n], test[x:x+n])
				if y <= x {
					copyref(ref[y:y+n], ref[x:x+n])
				} else {
					copybw(ref[y:y+n], ref[x:x+n])
				}
				p := cmpb(test, ref)
				if p >= 0 {
					t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, test[p], ref[p])
				}
			}
		}
	}
}

// copyref is a simple forward copy, used as a reference implementation.
func copyref(dst, src []byte) {
	for i, v := range src {
		dst[i] = v
	}
}

// copybw is a backwards copy, for the overlapping case where dst is
// ahead of src.
func copybw(dst, src []byte) {
	if len(src) == 0 {
		return
	}
	for i := len(src) - 1; i >= 0; i-- {
		dst[i] = src[i]
	}
}

// matchLen returns the offset of the first difference between a and b,
// or max if the first max bytes are equal.
func matchLen(a, b []byte, max int) int {
	a = a[:max]
	b = b[:max]
	for i, av := range a {
		if b[i] != av {
			return i
		}
	}
	return max
}

// cmpb returns the offset of the first difference between a and b, or
// -1 if they are equal.
func cmpb(a, b []byte) int {
	l := matchLen(a, b, len(a))
	if l == len(a) {
		return -1
	}
	return l
}

// Ensure that memmove writes pointers atomically, so the GC won't
// observe a partially updated pointer.
func TestMemmoveAtomicity(t *testing.T) {
	if race.Enabled {
		t.Skip("skip under the race detector -- this test is intentionally racy")
	}
	var x int
	for _, backward := range []bool{true, false} {
		for _, n := range []int{3, 4, 5, 6, 7, 8, 9, 10, 15, 25, 49} {
			n := n
			// test copying [N]*int.
			sz := uintptr(n * PtrSize)
			name := fmt.Sprint(sz)
			if backward {
				name += "-backward"
			} else {
				name += "-forward"
			}
			t.Run(name, func(t *testing.T) {
				// Use overlapping src and dst to force forward/backward copy.
				var s [100]*int
				src := s[n-1 : 2*n-1]
				dst := s[:n]
				if backward {
					src, dst = dst, src
				}
				for i := range src {
					src[i] = &x
				}
				for i := range dst {
					dst[i] = nil
				}
				var ready uint32
				go func() {
					sp := unsafe.Pointer(&src[0])
					dp := unsafe.Pointer(&dst[0])
					atomic.StoreUint32(&ready, 1)
					for i := 0; i < 10000; i++ {
						Memmove(dp, sp, sz)
						MemclrNoHeapPointers(dp, sz)
					}
					atomic.StoreUint32(&ready, 2)
				}()
				for atomic.LoadUint32(&ready) == 0 {
					Gosched()
				}
				for atomic.LoadUint32(&ready) != 2 {
					for i := range dst {
						p := dst[i]
						if p != nil && p != &x {
							t.Fatalf("got partially updated pointer %p at dst[%d], want either nil or %p", p, i, &x)
						}
					}
				}
			})
		}
	}
}

func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) {
	for _, n := range sizes {
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			fn(b, n)
		})
	}
}

var bufSizes = []int{
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
	32, 64, 128, 256, 512, 1024, 2048, 4096,
}

var bufSizesOverlap = []int{
	32, 64, 128, 256, 512, 1024, 2048, 4096,
}

func BenchmarkMemmove(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n)
		y := make([]byte, n)
		for i := 0; i < b.N; i++ {
			copy(x, y)
		}
	})
}

func BenchmarkMemmoveOverlap(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+16)
		for i := 0; i < b.N; i++ {
			copy(x[16:n+16], x[:n])
		}
	})
}

func BenchmarkMemmoveUnalignedDst(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n+1)
		y := make([]byte, n)
		for i := 0; i < b.N; i++ {
			copy(x[1:], y)
		}
	})
}

func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+16)
		for i := 0; i < b.N; i++ {
			copy(x[16:n+16], x[1:n+1])
		}
	})
}

func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n)
		y := make([]byte, n+1)
		for i := 0; i < b.N; i++ {
			copy(x, y[1:])
		}
	})
}

func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+1)
		for i := 0; i < b.N; i++ {
			copy(x[1:n+1], x[:n])
		}
	})
}

func TestMemclr(t *testing.T) {
	size := 512
	if testing.Short() {
		size = 128 + 16
	}
	mem := make([]byte, size)
	for i := 0; i < size; i++ {
		mem[i] = 0xee
	}
	for n := 0; n < size; n++ {
		for x := 0; x <= size-n; x++ { // offset in mem
			MemclrBytes(mem[x : x+n])
			for i := 0; i < x; i++ {
				if mem[i] != 0xee {
					t.Fatalf("overwrite prefix mem[%d] = %d", i, mem[i])
				}
			}
			for i := x; i < x+n; i++ {
				if mem[i] != 0 {
					t.Fatalf("failed clear mem[%d] = %d", i, mem[i])
				}
				mem[i] = 0xee
			}
			for i := x + n; i < size; i++ {
				if mem[i] != 0xee {
					t.Fatalf("overwrite suffix mem[%d] = %d", i, mem[i])
				}
			}
		}
	}
}

func BenchmarkMemclr(b *testing.B) {
	for _, n := range []int{5, 16, 64, 256, 4096, 65536} {
		x := make([]byte, n)
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				MemclrBytes(x)
			}
		})
	}
	for _, m := range []int{1, 4, 8, 16, 64} {
		x := make([]byte, m<<20)
		b.Run(fmt.Sprint(m, "M"), func(b *testing.B) {
			b.SetBytes(int64(m << 20))
			for i := 0; i < b.N; i++ {
				MemclrBytes(x)
			}
		})
	}
}

func BenchmarkGoMemclr(b *testing.B) {
	benchmarkSizes(b, []int{5, 16, 64, 256}, func(b *testing.B, n int) {
		x := make([]byte, n)
		for i := 0; i < b.N; i++ {
			for j := range x {
				x[j] = 0
			}
		}
	})
}

func BenchmarkClearFat8(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [8 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat12(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [12 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat16(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [16 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat24(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [24 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat32(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [32 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat40(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [40 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat48(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [48 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat56(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [56 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat64(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [64 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat128(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [128 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat256(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [256 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat512(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [512 / 4]uint32
		_ = x
	}
}

func BenchmarkClearFat1024(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [1024 / 4]uint32
		_ = x
	}
}

func BenchmarkCopyFat8(b *testing.B) {
	var x [8 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat12(b *testing.B) {
	var x [12 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat16(b *testing.B) {
	var x [16 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat24(b *testing.B) {
	var x [24 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat32(b *testing.B) {
	var x [32 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat64(b *testing.B) {
	var x [64 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat128(b *testing.B) {
	var x [128 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat256(b *testing.B) {
	var x [256 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat512(b *testing.B) {
	var x [512 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat520(b *testing.B) {
	var x [520 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

func BenchmarkCopyFat1024(b *testing.B) {
	var x [1024 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

// BenchmarkIssue18740 ensures that memmove uses 4- and 8-byte
// load/store instructions to move 4 and 8 bytes. It used to do two
// 2-byte load/stores, which led to a pipeline stall when the result
// was read back with one 4-byte load.
func BenchmarkIssue18740(b *testing.B) {
	benchmarks := []struct {
		name  string
		nbyte int
		f     func([]byte) uint64
	}{
		{"2byte", 2, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint16(buf)) }},
		{"4byte", 4, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint32(buf)) }},
		{"8byte", 8, func(buf []byte) uint64 { return binary.LittleEndian.Uint64(buf) }},
	}
	var g [4096]byte
	for _, bm := range benchmarks {
		buf := make([]byte, bm.nbyte)
		b.Run(bm.name, func(b *testing.B) {
			for j := 0; j < b.N; j++ {
				for i := 0; i < 4096; i += bm.nbyte {
					copy(buf[:], g[i:])
					sink += bm.f(buf[:])
				}
			}
		})
	}
}