1
0
mirror of https://github.com/golang/go synced 2024-11-20 00:34:43 -07:00
go/src/runtime/memmove_test.go
Denis Nagorny d7507e9d11 runtime: improve memmove for amd64
Use AVX if available on 4th generation of Intel(TM) Core(TM) processors.

    (collected on E5 2609v3 @1.9GHz)
    name                        old speed      new speed       delta
    Memmove/1-6                  158MB/s ± 0%    172MB/s ± 0%    +9.09% (p=0.000 n=16+16)
    Memmove/2-6                  316MB/s ± 0%    345MB/s ± 0%    +9.09% (p=0.000 n=18+16)
    Memmove/3-6                  517MB/s ± 0%    517MB/s ± 0%      ~ (p=0.445 n=16+16)
    Memmove/4-6                  687MB/s ± 1%    690MB/s ± 0%    +0.35% (p=0.000 n=20+17)
    Memmove/5-6                  729MB/s ± 0%    729MB/s ± 0%    +0.01% (p=0.000 n=16+18)
    Memmove/6-6                  875MB/s ± 0%    875MB/s ± 0%    +0.01% (p=0.000 n=18+18)
    Memmove/7-6                 1.02GB/s ± 0%   1.02GB/s ± 1%      ~ (p=0.139 n=19+20)
    Memmove/8-6                 1.26GB/s ± 0%   1.26GB/s ± 0%    +0.00% (p=0.000 n=18+18)
    Memmove/9-6                 1.42GB/s ± 0%   1.42GB/s ± 0%    +0.00% (p=0.000 n=17+18)
    Memmove/10-6                1.58GB/s ± 0%   1.58GB/s ± 0%    +0.00% (p=0.000 n=19+19)
    Memmove/11-6                1.74GB/s ± 0%   1.74GB/s ± 0%    +0.00% (p=0.001 n=18+17)
    Memmove/12-6                1.90GB/s ± 0%   1.90GB/s ± 0%    +0.00% (p=0.000 n=19+19)
    Memmove/13-6                2.05GB/s ± 0%   2.05GB/s ± 0%    +0.00% (p=0.000 n=18+19)
    Memmove/14-6                2.21GB/s ± 0%   2.21GB/s ± 0%    +0.00% (p=0.000 n=16+20)
    Memmove/15-6                2.37GB/s ± 0%   2.37GB/s ± 0%    +0.00% (p=0.004 n=19+20)
    Memmove/16-6                2.53GB/s ± 0%   2.53GB/s ± 0%    +0.00% (p=0.000 n=16+16)
    Memmove/32-6                4.67GB/s ± 0%   4.67GB/s ± 0%    +0.00% (p=0.000 n=17+17)
    Memmove/64-6                8.67GB/s ± 0%   8.64GB/s ± 0%    -0.33% (p=0.000 n=18+17)
    Memmove/128-6               12.6GB/s ± 0%   11.6GB/s ± 0%    -8.05% (p=0.000 n=16+19)
    Memmove/256-6               16.3GB/s ± 0%   16.6GB/s ± 0%    +1.66% (p=0.000 n=20+18)
    Memmove/512-6               21.5GB/s ± 0%   24.4GB/s ± 0%   +13.35% (p=0.000 n=18+17)
    Memmove/1024-6              24.7GB/s ± 0%   33.7GB/s ± 0%   +36.12% (p=0.000 n=18+18)
    Memmove/2048-6              27.3GB/s ± 0%   43.3GB/s ± 0%   +58.77% (p=0.000 n=19+17)
    Memmove/4096-6              37.5GB/s ± 0%   50.5GB/s ± 0%   +34.56% (p=0.000 n=19+19)
    MemmoveUnalignedDst/1-6      135MB/s ± 0%    146MB/s ± 0%    +7.69% (p=0.000 n=16+14)
    MemmoveUnalignedDst/2-6      271MB/s ± 0%    292MB/s ± 0%    +7.69% (p=0.000 n=18+18)
    MemmoveUnalignedDst/3-6      438MB/s ± 0%    438MB/s ± 0%      ~ (p=0.352 n=16+19)
    MemmoveUnalignedDst/4-6      584MB/s ± 0%    584MB/s ± 0%      ~ (p=0.876 n=17+17)
    MemmoveUnalignedDst/5-6      631MB/s ± 1%    632MB/s ± 0%    +0.25% (p=0.000 n=20+17)
    MemmoveUnalignedDst/6-6      759MB/s ± 0%    759MB/s ± 0%    +0.00% (p=0.000 n=19+16)
    MemmoveUnalignedDst/7-6      885MB/s ± 0%    883MB/s ± 1%      ~ (p=0.647 n=18+20)
    MemmoveUnalignedDst/8-6     1.08GB/s ± 0%   1.08GB/s ± 0%    +0.00% (p=0.035 n=19+18)
    MemmoveUnalignedDst/9-6     1.22GB/s ± 0%   1.22GB/s ± 0%      ~ (p=0.251 n=18+17)
    MemmoveUnalignedDst/10-6    1.35GB/s ± 0%   1.35GB/s ± 0%      ~ (p=0.327 n=17+18)
    MemmoveUnalignedDst/11-6    1.49GB/s ± 0%   1.49GB/s ± 0%      ~ (p=0.531 n=18+19)
    MemmoveUnalignedDst/12-6    1.63GB/s ± 0%   1.63GB/s ± 0%      ~ (p=0.886 n=19+18)
    MemmoveUnalignedDst/13-6    1.76GB/s ± 0%   1.76GB/s ± 1%    -0.24% (p=0.006 n=18+20)
    MemmoveUnalignedDst/14-6    1.90GB/s ± 0%   1.90GB/s ± 0%      ~ (p=0.818 n=20+19)
    MemmoveUnalignedDst/15-6    2.03GB/s ± 0%   2.03GB/s ± 0%      ~ (p=0.294 n=17+16)
    MemmoveUnalignedDst/16-6    2.17GB/s ± 0%   2.17GB/s ± 0%      ~ (p=0.602 n=16+18)
    MemmoveUnalignedDst/32-6    4.05GB/s ± 0%   4.05GB/s ± 0%    +0.00% (p=0.010 n=18+17)
    MemmoveUnalignedDst/64-6    7.59GB/s ± 0%   7.59GB/s ± 0%    +0.00% (p=0.022 n=18+16)
    MemmoveUnalignedDst/128-6   11.1GB/s ± 0%   11.4GB/s ± 0%    +2.79% (p=0.000 n=18+17)
    MemmoveUnalignedDst/256-6   16.4GB/s ± 0%   16.7GB/s ± 0%    +1.59% (p=0.000 n=20+17)
    MemmoveUnalignedDst/512-6   15.7GB/s ± 0%   21.3GB/s ± 0%   +35.87% (p=0.000 n=18+20)
    MemmoveUnalignedDst/1024-6  16.0GB/s ±20%   31.5GB/s ± 0%   +96.93% (p=0.000 n=20+14)
    MemmoveUnalignedDst/2048-6  19.6GB/s ± 0%   42.1GB/s ± 0%  +115.16% (p=0.000 n=17+18)
    MemmoveUnalignedDst/4096-6  6.41GB/s ± 0%  33.18GB/s ± 0%  +417.56% (p=0.000 n=17+18)
    MemmoveUnalignedSrc/1-6      171MB/s ± 0%    166MB/s ± 0%    -3.33% (p=0.000 n=19+16)
    MemmoveUnalignedSrc/2-6      343MB/s ± 0%    342MB/s ± 1%    -0.41% (p=0.000 n=17+20)
    MemmoveUnalignedSrc/3-6      508MB/s ± 0%    493MB/s ± 1%    -2.90% (p=0.000 n=17+17)
    MemmoveUnalignedSrc/4-6      677MB/s ± 0%    660MB/s ± 2%    -2.55% (p=0.000 n=17+20)
    MemmoveUnalignedSrc/5-6      790MB/s ± 0%    790MB/s ± 0%      ~ (p=0.139 n=17+17)
    MemmoveUnalignedSrc/6-6      948MB/s ± 0%    946MB/s ± 1%      ~ (p=0.330 n=17+19)
    MemmoveUnalignedSrc/7-6     1.11GB/s ± 0%   1.11GB/s ± 0%    -0.05% (p=0.026 n=17+17)
    MemmoveUnalignedSrc/8-6     1.38GB/s ± 0%   1.38GB/s ± 0%      ~ (p=0.091 n=18+16)
    MemmoveUnalignedSrc/9-6     1.42GB/s ± 0%   1.40GB/s ± 1%    -1.04% (p=0.000 n=19+20)
    MemmoveUnalignedSrc/10-6    1.58GB/s ± 0%   1.56GB/s ± 1%    -1.15% (p=0.000 n=18+19)
    MemmoveUnalignedSrc/11-6    1.73GB/s ± 0%   1.71GB/s ± 1%    -1.30% (p=0.000 n=20+20)
    MemmoveUnalignedSrc/12-6    1.89GB/s ± 0%   1.87GB/s ± 1%    -1.18% (p=0.000 n=17+20)
    MemmoveUnalignedSrc/13-6    2.05GB/s ± 0%   2.02GB/s ± 1%    -1.18% (p=0.000 n=17+20)
    MemmoveUnalignedSrc/14-6    2.21GB/s ± 0%   2.18GB/s ± 1%    -1.14% (p=0.000 n=17+20)
    MemmoveUnalignedSrc/15-6    2.36GB/s ± 0%   2.34GB/s ± 1%    -1.04% (p=0.000 n=17+20)
    MemmoveUnalignedSrc/16-6    2.52GB/s ± 0%   2.49GB/s ± 1%    -1.26% (p=0.000 n=19+20)
    MemmoveUnalignedSrc/32-6    4.82GB/s ± 0%   4.61GB/s ± 0%    -4.40% (p=0.000 n=19+20)
    MemmoveUnalignedSrc/64-6    5.03GB/s ± 4%   7.97GB/s ± 0%   +58.55% (p=0.000 n=20+16)
    MemmoveUnalignedSrc/128-6   11.1GB/s ± 0%   11.2GB/s ± 0%    +0.52% (p=0.000 n=17+18)
    MemmoveUnalignedSrc/256-6   16.5GB/s ± 0%   16.4GB/s ± 0%    -0.10% (p=0.000 n=20+18)
    MemmoveUnalignedSrc/512-6   21.0GB/s ± 0%   22.1GB/s ± 0%    +5.48% (p=0.000 n=14+17)
    MemmoveUnalignedSrc/1024-6  24.9GB/s ± 0%   31.9GB/s ± 0%   +28.20% (p=0.000 n=19+20)
    MemmoveUnalignedSrc/2048-6  23.3GB/s ± 0%   33.8GB/s ± 0%   +45.22% (p=0.000 n=17+19)
    MemmoveUnalignedSrc/4096-6  37.3GB/s ± 0%   42.7GB/s ± 0%   +14.30% (p=0.000 n=17+17)

Change-Id: Id66aa3e499ccfb117cb99d623ef326b50d057b64
Reviewed-on: https://go-review.googlesource.com/29590
Run-TryBot: Denis Nagorny <denis.nagorny@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2016-10-06 10:21:58 +00:00

446 lines
8.6 KiB
Go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime_test
import (
"crypto/rand"
"fmt"
"internal/race"
. "runtime"
"testing"
)
func TestMemmove(t *testing.T) {
size := 256
if testing.Short() {
size = 128 + 16
}
src := make([]byte, size)
dst := make([]byte, size)
for i := 0; i < size; i++ {
src[i] = byte(128 + (i & 127))
}
for i := 0; i < size; i++ {
dst[i] = byte(i & 127)
}
for n := 0; n <= size; n++ {
for x := 0; x <= size-n; x++ { // offset in src
for y := 0; y <= size-n; y++ { // offset in dst
copy(dst[y:y+n], src[x:x+n])
for i := 0; i < y; i++ {
if dst[i] != byte(i&127) {
t.Fatalf("prefix dst[%d] = %d", i, dst[i])
}
}
for i := y; i < y+n; i++ {
if dst[i] != byte(128+((i-y+x)&127)) {
t.Fatalf("copied dst[%d] = %d", i, dst[i])
}
dst[i] = byte(i & 127) // reset dst
}
for i := y + n; i < size; i++ {
if dst[i] != byte(i&127) {
t.Fatalf("suffix dst[%d] = %d", i, dst[i])
}
}
}
}
}
}
func TestMemmoveAlias(t *testing.T) {
size := 256
if testing.Short() {
size = 128 + 16
}
buf := make([]byte, size)
for i := 0; i < size; i++ {
buf[i] = byte(i)
}
for n := 0; n <= size; n++ {
for x := 0; x <= size-n; x++ { // src offset
for y := 0; y <= size-n; y++ { // dst offset
copy(buf[y:y+n], buf[x:x+n])
for i := 0; i < y; i++ {
if buf[i] != byte(i) {
t.Fatalf("prefix buf[%d] = %d", i, buf[i])
}
}
for i := y; i < y+n; i++ {
if buf[i] != byte(i-y+x) {
t.Fatalf("copied buf[%d] = %d", i, buf[i])
}
buf[i] = byte(i) // reset buf
}
for i := y + n; i < size; i++ {
if buf[i] != byte(i) {
t.Fatalf("suffix buf[%d] = %d", i, buf[i])
}
}
}
}
}
}
func TestMemmoveLarge0x180000(t *testing.T) {
if race.Enabled {
t.Skip("skipping large memmove test under race detector")
}
testSize(t, 0x180000)
}
func TestMemmoveOverlapLarge0x120000(t *testing.T) {
if race.Enabled {
t.Skip("skipping large memmove test under race detector")
}
testOverlap(t, 0x120000)
}
func testSize(t *testing.T, size int) {
src := make([]byte, size)
dst := make([]byte, size)
_, _ = rand.Read(src)
_, _ = rand.Read(dst)
ref := make([]byte, size)
copyref(ref, dst)
for n := size - 50; n > 1; n >>= 1 {
for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
copy(dst[y:y+n], src[x:x+n])
copyref(ref[y:y+n], src[x:x+n])
p := cmpb(dst, ref)
if p >= 0 {
t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, dst[p], ref[p])
}
}
}
}
}
func testOverlap(t *testing.T, size int) {
src := make([]byte, size)
test := make([]byte, size)
ref := make([]byte, size)
_, _ = rand.Read(src)
for n := size - 50; n > 1; n >>= 1 {
for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
// Reset input
copyref(test, src)
copyref(ref, src)
copy(test[y:y+n], test[x:x+n])
if y <= x {
copyref(ref[y:y+n], ref[x:x+n])
} else {
copybw(ref[y:y+n], ref[x:x+n])
}
p := cmpb(test, ref)
if p >= 0 {
t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, test[p], ref[p])
}
}
}
}
}
// Forward copy.
func copyref(dst, src []byte) {
for i, v := range src {
dst[i] = v
}
}
// Backwards copy
func copybw(dst, src []byte) {
if len(src) == 0 {
return
}
for i := len(src) - 1; i >= 0; i-- {
dst[i] = src[i]
}
}
// Returns offset of difference
func matchLen(a, b []byte, max int) int {
a = a[:max]
b = b[:max]
for i, av := range a {
if b[i] != av {
return i
}
}
return max
}
func cmpb(a, b []byte) int {
l := matchLen(a, b, len(a))
if l == len(a) {
return -1
}
return l
}
func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) {
for _, n := range sizes {
b.Run(fmt.Sprint(n), func(b *testing.B) {
b.SetBytes(int64(n))
fn(b, n)
})
}
}
var bufSizes = []int{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
32, 64, 128, 256, 512, 1024, 2048, 4096,
}
func BenchmarkMemmove(b *testing.B) {
benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
x := make([]byte, n)
y := make([]byte, n)
for i := 0; i < b.N; i++ {
copy(x, y)
}
})
}
func BenchmarkMemmoveUnalignedDst(b *testing.B) {
benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
x := make([]byte, n+1)
y := make([]byte, n)
for i := 0; i < b.N; i++ {
copy(x[1:], y)
}
})
}
func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
x := make([]byte, n)
y := make([]byte, n+1)
for i := 0; i < b.N; i++ {
copy(x, y[1:])
}
})
}
func TestMemclr(t *testing.T) {
size := 512
if testing.Short() {
size = 128 + 16
}
mem := make([]byte, size)
for i := 0; i < size; i++ {
mem[i] = 0xee
}
for n := 0; n < size; n++ {
for x := 0; x <= size-n; x++ { // offset in mem
MemclrBytes(mem[x : x+n])
for i := 0; i < x; i++ {
if mem[i] != 0xee {
t.Fatalf("overwrite prefix mem[%d] = %d", i, mem[i])
}
}
for i := x; i < x+n; i++ {
if mem[i] != 0 {
t.Fatalf("failed clear mem[%d] = %d", i, mem[i])
}
mem[i] = 0xee
}
for i := x + n; i < size; i++ {
if mem[i] != 0xee {
t.Fatalf("overwrite suffix mem[%d] = %d", i, mem[i])
}
}
}
}
}
func BenchmarkMemclr(b *testing.B) {
for _, n := range []int{5, 16, 64, 256, 4096, 65536} {
x := make([]byte, n)
b.Run(fmt.Sprint(n), func(b *testing.B) {
b.SetBytes(int64(n))
for i := 0; i < b.N; i++ {
MemclrBytes(x)
}
})
}
for _, m := range []int{1, 4, 8, 16, 64} {
x := make([]byte, m<<20)
b.Run(fmt.Sprint(m, "M"), func(b *testing.B) {
b.SetBytes(int64(m << 20))
for i := 0; i < b.N; i++ {
MemclrBytes(x)
}
})
}
}
func BenchmarkGoMemclr(b *testing.B) {
benchmarkSizes(b, []int{5, 16, 64, 256}, func(b *testing.B, n int) {
x := make([]byte, n)
for i := 0; i < b.N; i++ {
for j := range x {
x[j] = 0
}
}
})
}
func BenchmarkClearFat8(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [8 / 4]uint32
_ = x
}
}
func BenchmarkClearFat12(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [12 / 4]uint32
_ = x
}
}
func BenchmarkClearFat16(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [16 / 4]uint32
_ = x
}
}
func BenchmarkClearFat24(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [24 / 4]uint32
_ = x
}
}
func BenchmarkClearFat32(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [32 / 4]uint32
_ = x
}
}
func BenchmarkClearFat40(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [40 / 4]uint32
_ = x
}
}
func BenchmarkClearFat48(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [48 / 4]uint32
_ = x
}
}
func BenchmarkClearFat56(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [56 / 4]uint32
_ = x
}
}
func BenchmarkClearFat64(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [64 / 4]uint32
_ = x
}
}
func BenchmarkClearFat128(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [128 / 4]uint32
_ = x
}
}
func BenchmarkClearFat256(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [256 / 4]uint32
_ = x
}
}
func BenchmarkClearFat512(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [512 / 4]uint32
_ = x
}
}
func BenchmarkClearFat1024(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [1024 / 4]uint32
_ = x
}
}
func BenchmarkCopyFat8(b *testing.B) {
var x [8 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat12(b *testing.B) {
var x [12 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat16(b *testing.B) {
var x [16 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat24(b *testing.B) {
var x [24 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat32(b *testing.B) {
var x [32 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat64(b *testing.B) {
var x [64 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat128(b *testing.B) {
var x [128 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat256(b *testing.B) {
var x [256 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat512(b *testing.B) {
var x [512 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat1024(b *testing.B) {
var x [1024 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}