Mirror of https://github.com/golang/go, synced 2024-11-07 12:36:27 -07:00

Commit d5388e23b5
Replace the memmove implementation for moves of 17 bytes or larger
with an implementation from ARM optimized software. The moves of 16
bytes or fewer are unchanged, but the registers used are updated to
match the rest of the implementation.

This implementation makes use of new optimizations:

- software pipelined loop for large (>128 byte) moves
- medium size moves (17..128 bytes) have a new implementation
- address realignment when src or dst is unaligned
- preference for aligned src (loads) or dst (stores) depending on CPU

To support preference for aligned loads or aligned stores, a new CPU
flag is added. This flag indicates that the detected microarchitecture
performs better with aligned loads. Some tested CPUs did not exhibit a
significant difference and are left with the default behavior of
realigning based on the destination address (stores).

Neoverse N1 (Tested on Graviton 2)

name old time/op new time/op delta
Memmove/0-4 1.88ns ± 1% 1.87ns ± 1% -0.58% (p=0.020 n=10+10)
Memmove/1-4 4.40ns ± 0% 4.40ns ± 0% ~ (all equal)
Memmove/8-4 3.88ns ± 3% 3.80ns ± 0% -1.97% (p=0.001 n=10+9)
Memmove/16-4 3.90ns ± 3% 3.80ns ± 0% -2.49% (p=0.000 n=10+9)
Memmove/32-4 4.80ns ± 0% 4.40ns ± 0% -8.33% (p=0.000 n=9+8)
Memmove/64-4 5.86ns ± 0% 5.00ns ± 0% -14.76% (p=0.000 n=8+8)
Memmove/128-4 8.46ns ± 0% 8.06ns ± 0% -4.62% (p=0.000 n=10+10)
Memmove/256-4 12.4ns ± 0% 12.2ns ± 0% -1.61% (p=0.000 n=10+10)
Memmove/512-4 19.5ns ± 0% 19.1ns ± 0% -2.05% (p=0.000 n=10+10)
Memmove/1024-4 33.7ns ± 0% 33.5ns ± 0% -0.59% (p=0.000 n=10+10)
Memmove/2048-4 62.1ns ± 0% 59.0ns ± 0% -4.99% (p=0.000 n=10+10)
Memmove/4096-4 117ns ± 1% 110ns ± 0% -5.66% (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4 6.41ns ± 0% 5.62ns ± 0% -12.32% (p=0.000 n=10+7)
MemmoveUnalignedDst/128-4 9.40ns ± 0% 8.34ns ± 0% -11.24% (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4 12.8ns ± 0% 12.8ns ± 0% ~ (all equal)
MemmoveUnalignedDst/512-4 20.4ns ± 0% 19.7ns ± 0% -3.43% (p=0.000 n=9+10)
MemmoveUnalignedDst/1024-4 34.1ns ± 0% 35.1ns ± 0% +2.93% (p=0.000 n=9+9)
MemmoveUnalignedDst/2048-4 61.5ns ± 0% 60.4ns ± 0% -1.77% (p=0.000 n=10+10)
MemmoveUnalignedDst/4096-4 122ns ± 0% 113ns ± 0% -7.38% (p=0.002 n=8+10)
MemmoveUnalignedSrc/64-4 7.25ns ± 1% 6.26ns ± 0% -13.64% (p=0.000 n=9+9)
MemmoveUnalignedSrc/128-4 10.5ns ± 0% 9.7ns ± 0% -7.52% (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4 17.1ns ± 0% 17.3ns ± 0% +1.17% (p=0.000 n=10+10)
MemmoveUnalignedSrc/512-4 27.0ns ± 0% 27.0ns ± 0% ~ (all equal)
MemmoveUnalignedSrc/1024-4 46.7ns ± 0% 35.7ns ± 0% -23.55% (p=0.000 n=10+9)
MemmoveUnalignedSrc/2048-4 85.2ns ± 0% 61.2ns ± 0% -28.17% (p=0.000 n=10+8)
MemmoveUnalignedSrc/4096-4 162ns ± 0% 113ns ± 0% -30.25% (p=0.000 n=10+10)

name old speed new speed delta
Memmove/4096-4 35.2GB/s ± 0% 37.1GB/s ± 0% +5.56% (p=0.000 n=10+9)
MemmoveUnalignedSrc/1024-4 21.9GB/s ± 0% 28.7GB/s ± 0% +30.90% (p=0.000 n=10+10)
MemmoveUnalignedSrc/2048-4 24.0GB/s ± 0% 33.5GB/s ± 0% +39.18% (p=0.000 n=10+9)
MemmoveUnalignedSrc/4096-4 25.3GB/s ± 0% 36.2GB/s ± 0% +43.50% (p=0.000 n=10+7)

Cortex-A72 (Graviton 1)

name old time/op new time/op delta
Memmove/0-4 3.06ns ± 3% 3.08ns ± 1% ~ (p=0.958 n=10+9)
Memmove/1-4 8.72ns ± 0% 7.85ns ± 0% -9.98% (p=0.002 n=8+10)
Memmove/8-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal)
Memmove/16-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal)
Memmove/32-4 8.19ns ± 2% 8.29ns ± 0% ~ (p=0.114 n=10+10)
Memmove/64-4 18.3ns ± 4% 10.0ns ± 0% -45.36% (p=0.000 n=10+10)
Memmove/128-4 14.8ns ± 0% 17.4ns ± 0% +17.77% (p=0.000 n=10+10)
Memmove/256-4 21.8ns ± 0% 23.1ns ± 0% +5.96% (p=0.000 n=10+10)
Memmove/512-4 35.8ns ± 0% 37.2ns ± 0% +3.91% (p=0.000 n=10+10)
Memmove/1024-4 63.7ns ± 0% 67.2ns ± 0% +5.49% (p=0.000 n=10+10)
Memmove/2048-4 126ns ± 0% 123ns ± 0% -2.38% (p=0.000 n=10+10)
Memmove/4096-4 238ns ± 1% 243ns ± 1% +1.93% (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4 19.3ns ± 1% 12.0ns ± 1% -37.49% (p=0.000 n=10+10)
MemmoveUnalignedDst/128-4 17.2ns ± 0% 17.4ns ± 0% +1.16% (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4 28.2ns ± 8% 29.2ns ± 0% ~ (p=0.352 n=10+10)
MemmoveUnalignedDst/512-4 49.8ns ± 3% 48.9ns ± 0% ~ (p=1.000 n=10+10)
MemmoveUnalignedDst/1024-4 89.5ns ± 0% 80.5ns ± 1% -10.02% (p=0.000 n=10+10)
MemmoveUnalignedDst/2048-4 180ns ± 0% 127ns ± 0% -29.44% (p=0.000 n=9+10)
MemmoveUnalignedDst/4096-4 347ns ± 0% 244ns ± 0% -29.59% (p=0.000 n=10+9)
MemmoveUnalignedSrc/128-4 16.1ns ± 0% 21.8ns ± 0% +35.40% (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4 24.9ns ± 8% 26.6ns ± 0% +6.70% (p=0.015 n=10+10)
MemmoveUnalignedSrc/512-4 39.4ns ± 6% 40.6ns ± 0% ~ (p=0.352 n=10+10)
MemmoveUnalignedSrc/1024-4 72.5ns ± 0% 83.0ns ± 1% +14.44% (p=0.000 n=9+10)
MemmoveUnalignedSrc/2048-4 129ns ± 1% 128ns ± 1% ~ (p=0.179 n=10+10)
MemmoveUnalignedSrc/4096-4 241ns ± 0% 253ns ± 1% +4.99% (p=0.000 n=9+9)

Cortex-A53 (Raspberry Pi 3)

name old time/op new time/op delta
Memmove/0-4 11.0ns ± 0% 11.0ns ± 1% ~ (p=0.294 n=8+10)
Memmove/1-4 29.6ns ± 0% 28.0ns ± 1% -5.41% (p=0.000 n=9+10)
Memmove/8-4 23.5ns ± 0% 22.1ns ± 0% -6.11% (p=0.000 n=8+8)
Memmove/16-4 23.7ns ± 1% 22.1ns ± 0% -6.59% (p=0.000 n=10+8)
Memmove/32-4 27.9ns ± 0% 27.1ns ± 0% -3.13% (p=0.000 n=8+8)
Memmove/64-4 33.8ns ± 0% 31.5ns ± 1% -6.99% (p=0.000 n=8+10)
Memmove/128-4 45.6ns ± 0% 44.2ns ± 1% -3.23% (p=0.000 n=9+10)
Memmove/256-4 69.3ns ± 0% 69.3ns ± 0% ~ (p=0.072 n=8+8)
Memmove/512-4 127ns ± 0% 110ns ± 0% -13.39% (p=0.000 n=8+8)
Memmove/1024-4 222ns ± 0% 205ns ± 1% -7.66% (p=0.000 n=7+10)
Memmove/2048-4 411ns ± 0% 366ns ± 0% -10.98% (p=0.000 n=8+9)
Memmove/4096-4 795ns ± 1% 695ns ± 1% -12.63% (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4 44.0ns ± 0% 40.5ns ± 0% -7.93% (p=0.000 n=8+8)
MemmoveUnalignedDst/128-4 59.6ns ± 0% 54.9ns ± 0% -7.85% (p=0.000 n=9+9)
MemmoveUnalignedDst/256-4 98.2ns ±11% 90.0ns ± 1% ~ (p=0.130 n=10+10)
MemmoveUnalignedDst/512-4 161ns ± 2% 145ns ± 1% -9.96% (p=0.000 n=10+10)
MemmoveUnalignedDst/1024-4 281ns ± 0% 265ns ± 0% -5.65% (p=0.000 n=9+8)
MemmoveUnalignedDst/2048-4 528ns ± 0% 482ns ± 0% -8.73% (p=0.000 n=8+9)
MemmoveUnalignedDst/4096-4 1.02µs ± 1% 0.92µs ± 0% -10.00% (p=0.000 n=10+8)
MemmoveUnalignedSrc/64-4 42.4ns ± 1% 40.5ns ± 0% -4.39% (p=0.000 n=10+8)
MemmoveUnalignedSrc/128-4 57.4ns ± 0% 57.0ns ± 1% -0.75% (p=0.048 n=9+10)
MemmoveUnalignedSrc/256-4 88.1ns ± 1% 89.6ns ± 0% +1.70% (p=0.000 n=9+8)
MemmoveUnalignedSrc/512-4 160ns ± 2% 144ns ± 0% -9.89% (p=0.000 n=10+8)
MemmoveUnalignedSrc/1024-4 286ns ± 0% 266ns ± 1% -6.69% (p=0.000 n=8+10)
MemmoveUnalignedSrc/2048-4 525ns ± 0% 483ns ± 1% -7.96% (p=0.000 n=9+10)
MemmoveUnalignedSrc/4096-4 1.01µs ± 0% 0.92µs ± 1% -9.40% (p=0.000 n=8+10)

Change-Id: Ia1144e9d4dfafdece6e167c5e576bf80f254c8ab
Reviewed-on: https://go-review.googlesource.com/c/go/+/243357
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Martin Möhrmann <moehrmann@google.com>
Reviewed-by: eric fang <eric.fang@arm.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
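As a rough illustration of the realignment optimization described above: the actual change is AArch64 assembly in the runtime, but the idea can be sketched in Go. Everything below (the moveRealigned name, the 16-byte chunk size, the restriction to a forward, non-overlapping copy) is an illustrative assumption, not code from the commit.

package main

import (
	"fmt"
	"unsafe"
)

// moveRealigned copies non-overlapping src into dst (equal lengths).
// It mimics the destination-realignment trick: write one unaligned
// 16-byte head chunk, then step forward so each bulk store starts on a
// 16-byte boundary, and finish with one (possibly unaligned) 16-byte
// tail. Re-writing a few head/tail bytes twice is harmless for a
// forward, non-overlapping copy.
func moveRealigned(dst, src []byte) {
	n := len(dst)
	if n <= 16 {
		copy(dst, src) // small moves keep a simple path, as in the commit
		return
	}
	copy(dst[:16], src[:16]) // unaligned head chunk
	// Advance to the next 16-byte boundary of dst (off == 16 if dst is
	// already aligned, so the head chunk is simply not re-written).
	off := 16 - int(uintptr(unsafe.Pointer(&dst[0]))&15)
	for ; off+16 <= n; off += 16 {
		copy(dst[off:off+16], src[off:off+16]) // destination-aligned stores
	}
	copy(dst[n-16:], src[n-16:]) // unaligned tail chunk
}

func main() {
	src := []byte("0123456789abcdefghijklmnopqrstuvwxyz")
	dst := make([]byte, len(src))
	moveRealigned(dst, src)
	fmt.Println(string(dst)) // the input string, copied intact
}

On microarchitectures where the new CPU flag reports a preference for aligned loads, the commit applies the same realignment to the source address instead of the destination.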
598 lines · 12 KiB · Go
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime_test

import (
	"crypto/rand"
	"encoding/binary"
	"fmt"
	"internal/race"
	"internal/testenv"
	. "runtime"
	"sync/atomic"
	"testing"
	"unsafe"
)

func TestMemmove(t *testing.T) {
	if *flagQuick {
		t.Skip("-quick")
	}
	t.Parallel()
	size := 256
	if testing.Short() {
		size = 128 + 16
	}
	src := make([]byte, size)
	dst := make([]byte, size)
	for i := 0; i < size; i++ {
		src[i] = byte(128 + (i & 127))
	}
	for i := 0; i < size; i++ {
		dst[i] = byte(i & 127)
	}
	for n := 0; n <= size; n++ {
		for x := 0; x <= size-n; x++ { // offset in src
			for y := 0; y <= size-n; y++ { // offset in dst
				copy(dst[y:y+n], src[x:x+n])
				for i := 0; i < y; i++ {
					if dst[i] != byte(i&127) {
						t.Fatalf("prefix dst[%d] = %d", i, dst[i])
					}
				}
				for i := y; i < y+n; i++ {
					if dst[i] != byte(128+((i-y+x)&127)) {
						t.Fatalf("copied dst[%d] = %d", i, dst[i])
					}
					dst[i] = byte(i & 127) // reset dst
				}
				for i := y + n; i < size; i++ {
					if dst[i] != byte(i&127) {
						t.Fatalf("suffix dst[%d] = %d", i, dst[i])
					}
				}
			}
		}
	}
}

func TestMemmoveAlias(t *testing.T) {
	if *flagQuick {
		t.Skip("-quick")
	}
	t.Parallel()
	size := 256
	if testing.Short() {
		size = 128 + 16
	}
	buf := make([]byte, size)
	for i := 0; i < size; i++ {
		buf[i] = byte(i)
	}
	for n := 0; n <= size; n++ {
		for x := 0; x <= size-n; x++ { // src offset
			for y := 0; y <= size-n; y++ { // dst offset
				copy(buf[y:y+n], buf[x:x+n])
				for i := 0; i < y; i++ {
					if buf[i] != byte(i) {
						t.Fatalf("prefix buf[%d] = %d", i, buf[i])
					}
				}
				for i := y; i < y+n; i++ {
					if buf[i] != byte(i-y+x) {
						t.Fatalf("copied buf[%d] = %d", i, buf[i])
					}
					buf[i] = byte(i) // reset buf
				}
				for i := y + n; i < size; i++ {
					if buf[i] != byte(i) {
						t.Fatalf("suffix buf[%d] = %d", i, buf[i])
					}
				}
			}
		}
	}
}

func TestMemmoveLarge0x180000(t *testing.T) {
	if testing.Short() && testenv.Builder() == "" {
		t.Skip("-short")
	}

	t.Parallel()
	if race.Enabled {
		t.Skip("skipping large memmove test under race detector")
	}
	testSize(t, 0x180000)
}

func TestMemmoveOverlapLarge0x120000(t *testing.T) {
	if testing.Short() && testenv.Builder() == "" {
		t.Skip("-short")
	}

	t.Parallel()
	if race.Enabled {
		t.Skip("skipping large memmove test under race detector")
	}
	testOverlap(t, 0x120000)
}

func testSize(t *testing.T, size int) {
	src := make([]byte, size)
	dst := make([]byte, size)
	_, _ = rand.Read(src)
	_, _ = rand.Read(dst)

	ref := make([]byte, size)
	copyref(ref, dst)

	for n := size - 50; n > 1; n >>= 1 {
		for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
			for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
				copy(dst[y:y+n], src[x:x+n])
				copyref(ref[y:y+n], src[x:x+n])
				p := cmpb(dst, ref)
				if p >= 0 {
					t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, dst[p], ref[p])
				}
			}
		}
	}
}

func testOverlap(t *testing.T, size int) {
	src := make([]byte, size)
	test := make([]byte, size)
	ref := make([]byte, size)
	_, _ = rand.Read(src)

	for n := size - 50; n > 1; n >>= 1 {
		for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
			for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
				// Reset input
				copyref(test, src)
				copyref(ref, src)
				copy(test[y:y+n], test[x:x+n])
				if y <= x {
					copyref(ref[y:y+n], ref[x:x+n])
				} else {
					copybw(ref[y:y+n], ref[x:x+n])
				}
				p := cmpb(test, ref)
				if p >= 0 {
					t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, test[p], ref[p])
				}
			}
		}
	}
}

// copyref is a simple forward-copy reference implementation.
func copyref(dst, src []byte) {
	for i, v := range src {
		dst[i] = v
	}
}

// copybw is a backwards-copy reference implementation, for overlapping
// ranges where dst starts after src.
func copybw(dst, src []byte) {
	if len(src) == 0 {
		return
	}
	for i := len(src) - 1; i >= 0; i-- {
		dst[i] = src[i]
	}
}

// matchLen returns the length of the common prefix of a and b, at most max.
func matchLen(a, b []byte, max int) int {
	a = a[:max]
	b = b[:max]
	for i, av := range a {
		if b[i] != av {
			return i
		}
	}
	return max
}

// cmpb returns the offset of the first byte that differs between a and b,
// or -1 if the first len(a) bytes are equal.
func cmpb(a, b []byte) int {
	l := matchLen(a, b, len(a))
	if l == len(a) {
		return -1
	}
	return l
}

// Ensure that memmove writes pointers atomically, so the GC won't
// observe a partially updated pointer.
func TestMemmoveAtomicity(t *testing.T) {
	if race.Enabled {
		t.Skip("skip under the race detector -- this test is intentionally racy")
	}

	var x int

	for _, backward := range []bool{true, false} {
		for _, n := range []int{3, 4, 5, 6, 7, 8, 9, 10, 15, 25, 49} {
			n := n

			// test copying [N]*int.
			sz := uintptr(n * PtrSize)
			name := fmt.Sprint(sz)
			if backward {
				name += "-backward"
			} else {
				name += "-forward"
			}
			t.Run(name, func(t *testing.T) {
				// Use overlapping src and dst to force forward/backward copy.
				var s [100]*int
				src := s[n-1 : 2*n-1]
				dst := s[:n]
				if backward {
					src, dst = dst, src
				}
				for i := range src {
					src[i] = &x
				}
				for i := range dst {
					dst[i] = nil
				}

				var ready uint32
				go func() {
					sp := unsafe.Pointer(&src[0])
					dp := unsafe.Pointer(&dst[0])
					atomic.StoreUint32(&ready, 1)
					for i := 0; i < 10000; i++ {
						Memmove(dp, sp, sz)
						MemclrNoHeapPointers(dp, sz)
					}
					atomic.StoreUint32(&ready, 2)
				}()

				for atomic.LoadUint32(&ready) == 0 {
					Gosched()
				}

				for atomic.LoadUint32(&ready) != 2 {
					for i := range dst {
						p := dst[i]
						if p != nil && p != &x {
							t.Fatalf("got partially updated pointer %p at dst[%d], want either nil or %p", p, i, &x)
						}
					}
				}
			})
		}
	}
}

func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) {
	for _, n := range sizes {
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			fn(b, n)
		})
	}
}

var bufSizes = []int{
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
	32, 64, 128, 256, 512, 1024, 2048, 4096,
}
var bufSizesOverlap = []int{
	32, 64, 128, 256, 512, 1024, 2048, 4096,
}

func BenchmarkMemmove(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n)
		y := make([]byte, n)
		for i := 0; i < b.N; i++ {
			copy(x, y)
		}
	})
}

func BenchmarkMemmoveOverlap(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+16)
		for i := 0; i < b.N; i++ {
			copy(x[16:n+16], x[:n])
		}
	})
}

func BenchmarkMemmoveUnalignedDst(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n+1)
		y := make([]byte, n)
		for i := 0; i < b.N; i++ {
			copy(x[1:], y)
		}
	})
}

func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+16)
		for i := 0; i < b.N; i++ {
			copy(x[16:n+16], x[1:n+1])
		}
	})
}

func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n)
		y := make([]byte, n+1)
		for i := 0; i < b.N; i++ {
			copy(x, y[1:])
		}
	})
}

func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+1)
		for i := 0; i < b.N; i++ {
			copy(x[1:n+1], x[:n])
		}
	})
}

func TestMemclr(t *testing.T) {
	size := 512
	if testing.Short() {
		size = 128 + 16
	}
	mem := make([]byte, size)
	for i := 0; i < size; i++ {
		mem[i] = 0xee
	}
	for n := 0; n < size; n++ {
		for x := 0; x <= size-n; x++ { // offset in mem
			MemclrBytes(mem[x : x+n])
			for i := 0; i < x; i++ {
				if mem[i] != 0xee {
					t.Fatalf("overwrite prefix mem[%d] = %d", i, mem[i])
				}
			}
			for i := x; i < x+n; i++ {
				if mem[i] != 0 {
					t.Fatalf("failed clear mem[%d] = %d", i, mem[i])
				}
				mem[i] = 0xee
			}
			for i := x + n; i < size; i++ {
				if mem[i] != 0xee {
					t.Fatalf("overwrite suffix mem[%d] = %d", i, mem[i])
				}
			}
		}
	}
}

func BenchmarkMemclr(b *testing.B) {
	for _, n := range []int{5, 16, 64, 256, 4096, 65536} {
		x := make([]byte, n)
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				MemclrBytes(x)
			}
		})
	}
	for _, m := range []int{1, 4, 8, 16, 64} {
		x := make([]byte, m<<20)
		b.Run(fmt.Sprint(m, "M"), func(b *testing.B) {
			b.SetBytes(int64(m << 20))
			for i := 0; i < b.N; i++ {
				MemclrBytes(x)
			}
		})
	}
}

func BenchmarkGoMemclr(b *testing.B) {
	benchmarkSizes(b, []int{5, 16, 64, 256}, func(b *testing.B, n int) {
		x := make([]byte, n)
		for i := 0; i < b.N; i++ {
			for j := range x {
				x[j] = 0
			}
		}
	})
}

func BenchmarkClearFat8(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [8 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat12(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [12 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat16(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [16 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat24(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [24 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat32(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [32 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat40(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [40 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat48(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [48 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat56(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [56 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat64(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [64 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat128(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [128 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat256(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [256 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat512(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [512 / 4]uint32
		_ = x
	}
}
func BenchmarkClearFat1024(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [1024 / 4]uint32
		_ = x
	}
}

func BenchmarkCopyFat8(b *testing.B) {
	var x [8 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat12(b *testing.B) {
	var x [12 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat16(b *testing.B) {
	var x [16 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat24(b *testing.B) {
	var x [24 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat32(b *testing.B) {
	var x [32 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat64(b *testing.B) {
	var x [64 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat128(b *testing.B) {
	var x [128 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat256(b *testing.B) {
	var x [256 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat512(b *testing.B) {
	var x [512 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat520(b *testing.B) {
	var x [520 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}
func BenchmarkCopyFat1024(b *testing.B) {
	var x [1024 / 4]uint32
	for i := 0; i < b.N; i++ {
		y := x
		_ = y
	}
}

// BenchmarkIssue18740 ensures that memmove uses 4 and 8 byte load/store to move 4 and 8 bytes.
// It used to do 2 2-byte load/stores, which leads to a pipeline stall
// when we try to read the result with one 4-byte load.
func BenchmarkIssue18740(b *testing.B) {
	benchmarks := []struct {
		name  string
		nbyte int
		f     func([]byte) uint64
	}{
		{"2byte", 2, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint16(buf)) }},
		{"4byte", 4, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint32(buf)) }},
		{"8byte", 8, func(buf []byte) uint64 { return binary.LittleEndian.Uint64(buf) }},
	}

	var g [4096]byte
	for _, bm := range benchmarks {
		buf := make([]byte, bm.nbyte)
		b.Run(bm.name, func(b *testing.B) {
			for j := 0; j < b.N; j++ {
				for i := 0; i < 4096; i += bm.nbyte {
					copy(buf[:], g[i:])
					sink += bm.f(buf[:])
				}
			}
		})
	}
}