runtime: faster x86 memmove (a.k.a. built-in copy())
REP instructions have a high startup cost, so we handle small sizes
with some straightline code. The REP MOVSx instructions are really fast
for large sizes. The cutover is approximately 1K. We implement up to
128/256 because that is the maximum SSE register load (loading all data
into registers before any stores lets us ignore copy direction).

(on a Sandy Bridge E5-1650 @ 3.20GHz)
benchmark                 old ns/op    new ns/op    delta
BenchmarkMemmove0                 3            3      +0.86%
BenchmarkMemmove1                 5            5      +5.40%
BenchmarkMemmove2                18            8     -56.84%
BenchmarkMemmove3                18            7     -58.45%
BenchmarkMemmove4                36            7     -78.63%
BenchmarkMemmove5                36            8     -77.91%
BenchmarkMemmove6                36            8     -77.76%
BenchmarkMemmove7                36            8     -77.82%
BenchmarkMemmove8                18            8     -56.33%
BenchmarkMemmove9                18            7     -58.34%
BenchmarkMemmove10               18            7     -58.34%
BenchmarkMemmove11               18            7     -58.45%
BenchmarkMemmove12               36            7     -78.51%
BenchmarkMemmove13               36            7     -78.48%
BenchmarkMemmove14               36            7     -78.56%
BenchmarkMemmove15               36            7     -78.56%
BenchmarkMemmove16               18            7     -58.24%
BenchmarkMemmove32               18            8     -54.33%
BenchmarkMemmove64               18            8     -53.37%
BenchmarkMemmove128              20            9     -55.93%
BenchmarkMemmove256              25           11     -55.16%
BenchmarkMemmove512              33           33      -1.19%
BenchmarkMemmove1024             43           44      +2.06%
BenchmarkMemmove2048             61           61      +0.16%
BenchmarkMemmove4096             95           95      +0.00%

R=golang-dev, bradfitz, remyoudompheng, khr, iant, dominik.honnef
CC=golang-dev
https://golang.org/cl/9038048
commit 6021449236 (parent af1dd56d1b)
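The central trick in the straightline cases below (move_1or2 through move_129through256) is to load both ends of the region into registers before issuing any stores, so the same code is correct for overlapping buffers regardless of copy direction. As a rough Go illustration of the 9-to-16-byte case (the helper name and the use of encoding/binary are mine, for illustration only; the real implementation is the assembly in this commit):

package main

import (
	"encoding/binary"
	"fmt"
)

// move9through16 copies n bytes (9 <= n <= 16) from src to dst by loading
// the first and last 8 bytes before storing either, mirroring the
// move_9through16 block in the assembly. The two loads may overlap in the
// middle; because both complete before any store, overlap between src and
// dst is harmless.
func move9through16(dst, src []byte, n int) {
	a := binary.LittleEndian.Uint64(src[:8])
	b := binary.LittleEndian.Uint64(src[n-8 : n])
	binary.LittleEndian.PutUint64(dst[:8], a)
	binary.LittleEndian.PutUint64(dst[n-8:n], b)
}

func main() {
	buf := []byte("abcdefghijklmnop")
	move9through16(buf[2:], buf[:14], 14) // overlapping move within one array
	fmt.Println(string(buf))              // "ababcdefghijklmn"
}

The SSE blocks (move_17through32 and up) apply the same idea with 16-byte MOVOU loads.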
@@ -27,6 +27,34 @@ TEXT runtime·memmove(SB), 7, $0
	MOVL	to+0(FP), DI
	MOVL	fr+4(FP), SI
	MOVL	n+8(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSL instruction is really fast
	// for large sizes. The cutover is approximately 1K. We implement up to
	// 128 because that is the maximum SSE register load (loading all data
	// into registers lets us ignore copy direction).
tail:
	TESTL	BX, BX
	JEQ	move_0
	CMPL	BX, $2
	JBE	move_1or2
	CMPL	BX, $4
	JBE	move_3or4
	CMPL	BX, $8
	JBE	move_5through8
	CMPL	BX, $16
	JBE	move_9through16
	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
	JEQ	nosse2
	CMPL	BX, $32
	JBE	move_17through32
	CMPL	BX, $64
	JBE	move_33through64
	CMPL	BX, $128
	JBE	move_65through128
	// TODO: use branch table and BSR to make this just a single dispatch

nosse2:
	/*
	 * check and set for backwards
	 */
@@ -42,11 +70,7 @@ forward:
	ANDL	$3, BX

	REP;	MOVSL
	MOVL	BX, CX
	REP;	MOVSB

	MOVL	to+0(FP), AX
	RET
	JMP	tail
	/*
	 * check overlap
	 */
@@ -75,12 +99,73 @@ back:
	SUBL	$4, SI
	REP;	MOVSL

	ADDL	$3, DI
	ADDL	$3, SI
	MOVL	BX, CX
	REP;	MOVSB

	CLD
	MOVL	to+0(FP), AX
	RET
	ADDL	$4, DI
	ADDL	$4, SI
	SUBL	BX, DI
	SUBL	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
move_0:
	RET
move_3or4:
	MOVW	(SI), AX
	MOVW	-2(SI)(BX*1), CX
	MOVW	AX, (DI)
	MOVW	CX, -2(DI)(BX*1)
	RET
move_5through8:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_9through16:
	MOVL	(SI), AX
	MOVL	4(SI), CX
	MOVL	-8(SI)(BX*1), DX
	MOVL	-4(SI)(BX*1), BP
	MOVL	AX, (DI)
	MOVL	CX, 4(DI)
	MOVL	DX, -8(DI)(BX*1)
	MOVL	BP, -4(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
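Sizes above the largest straightline case fall through to the REP-based copy, where direction matters: the "check and set for backwards" / "check overlap" comments above guard a test (in context lines not shown in this hunk) that picks the backward loop when the destination starts inside the source. A minimal sketch of the standard condition such a check implements, assuming flat pointer values (not the actual assembly):

package main

import "fmt"

// overlapRequiresBackward reports whether a plain front-to-back copy of n
// bytes from src to dst would read bytes it has already overwritten, i.e.
// whether dst starts strictly inside [src, src+n). This is the condition a
// memmove backward-copy check guards against.
func overlapRequiresBackward(dst, src, n uintptr) bool {
	return dst > src && dst < src+n
}

func main() {
	fmt.Println(overlapRequiresBackward(0x1004, 0x1000, 16)) // true: copy back-to-front
	fmt.Println(overlapRequiresBackward(0x1000, 0x1004, 16)) // false: forward is safe
	fmt.Println(overlapRequiresBackward(0x2000, 0x1000, 16)) // false: regions are disjoint
}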
@@ -30,6 +30,32 @@ TEXT runtime·memmove(SB), 7, $0
	MOVQ	fr+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 1K. We implement up to
	// 256 because that is the maximum SSE register load (loading all data
	// into registers lets us ignore copy direction).
tail:
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JBE	move_5through8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch

	/*
	 * check and set for backwards
	 */
@@ -45,11 +71,8 @@ forward:
	ANDQ	$7, BX

	REP;	MOVSQ
	MOVQ	BX, CX
	REP;	MOVSB
	JMP	tail

	MOVQ	to+0(FP), AX
	RET
back:
	/*
	 * check overlap
@@ -78,12 +101,103 @@ back:
	SUBQ	$8, SI
	REP;	MOVSQ

	ADDQ	$7, DI
	ADDQ	$7, SI
	MOVQ	BX, CX
	REP;	MOVSB

	CLD
	MOVQ	to+0(FP), AX
	RET
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
move_0:
	RET
move_3or4:
	MOVW	(SI), AX
	MOVW	-2(SI)(BX*1), CX
	MOVW	AX, (DI)
	MOVW	CX, -2(DI)(BX*1)
	RET
move_5through8:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
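On the forward path the amd64 code copies whole quadwords with REP MOVSQ and then finishes the 0-to-7 leftover bytes with REP MOVSB (ANDQ $7, BX keeps the remainder; the quadword count itself is loaded into CX in context lines not shown in the hunk). A small, purely illustrative Go sketch of that word-then-byte structure:

package main

import (
	"encoding/binary"
	"fmt"
)

// forwardCopy copies n bytes from src to dst front-to-back the way the
// forward path does: whole 8-byte words first (the REP MOVSQ bulk), then
// the 0..7 leftover bytes (the trailing REP MOVSB). It is only safe when
// dst does not start inside the source region.
func forwardCopy(dst, src []byte, n int) {
	q := n >> 3 // quadword count for the bulk loop
	r := n & 7  // leftover bytes, as in ANDQ $7, BX
	for i := 0; i < q; i++ {
		binary.LittleEndian.PutUint64(dst[i*8:], binary.LittleEndian.Uint64(src[i*8:]))
	}
	for i := n - r; i < n; i++ {
		dst[i] = src[i]
	}
}

func main() {
	src := []byte("0123456789abcdefghij")
	dst := make([]byte, len(src))
	forwardCopy(dst, src, len(src))
	fmt.Println(string(dst)) // "0123456789abcdefghij"
}

The 386 version has the same shape with 4-byte words (ANDL $3, BX and REP MOVSL).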
src/pkg/runtime/memmove_test.go (new file, 116 lines)
@@ -0,0 +1,116 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime_test

import (
	"testing"
)

func TestMemmove(t *testing.T) {
	size := 256
	if testing.Short() {
		size = 128 + 16
	}
	src := make([]byte, size)
	dst := make([]byte, size)
	for i := 0; i < size; i++ {
		src[i] = byte(128 + (i & 127))
	}
	for i := 0; i < size; i++ {
		dst[i] = byte(i & 127)
	}
	for n := 0; n <= size; n++ {
		for x := 0; x <= size-n; x++ { // offset in src
			for y := 0; y <= size-n; y++ { // offset in dst
				copy(dst[y:y+n], src[x:x+n])
				for i := 0; i < y; i++ {
					if dst[i] != byte(i&127) {
						t.Fatalf("prefix dst[%d] = %d", i, dst[i])
					}
				}
				for i := y; i < y+n; i++ {
					if dst[i] != byte(128+((i-y+x)&127)) {
						t.Fatalf("copied dst[%d] = %d", i, dst[i])
					}
					dst[i] = byte(i & 127) // reset dst
				}
				for i := y + n; i < size; i++ {
					if dst[i] != byte(i&127) {
						t.Fatalf("suffix dst[%d] = %d", i, dst[i])
					}
				}
			}
		}
	}
}

func TestMemmoveAlias(t *testing.T) {
	size := 256
	if testing.Short() {
		size = 128 + 16
	}
	buf := make([]byte, size)
	for i := 0; i < size; i++ {
		buf[i] = byte(i)
	}
	for n := 0; n <= size; n++ {
		for x := 0; x <= size-n; x++ { // src offset
			for y := 0; y <= size-n; y++ { // dst offset
				copy(buf[y:y+n], buf[x:x+n])
				for i := 0; i < y; i++ {
					if buf[i] != byte(i) {
						t.Fatalf("prefix buf[%d] = %d", i, buf[i])
					}
				}
				for i := y; i < y+n; i++ {
					if buf[i] != byte(i-y+x) {
						t.Fatalf("copied buf[%d] = %d", i, buf[i])
					}
					buf[i] = byte(i) // reset buf
				}
				for i := y + n; i < size; i++ {
					if buf[i] != byte(i) {
						t.Fatalf("suffix buf[%d] = %d", i, buf[i])
					}
				}
			}
		}
	}
}

func bmMemmove(n int, b *testing.B) {
	x := make([]byte, n)
	y := make([]byte, n)
	b.SetBytes(int64(n))
	for i := 0; i < b.N; i++ {
		copy(x, y)
	}
}

func BenchmarkMemmove0(b *testing.B)    { bmMemmove(0, b) }
func BenchmarkMemmove1(b *testing.B)    { bmMemmove(1, b) }
func BenchmarkMemmove2(b *testing.B)    { bmMemmove(2, b) }
func BenchmarkMemmove3(b *testing.B)    { bmMemmove(3, b) }
func BenchmarkMemmove4(b *testing.B)    { bmMemmove(4, b) }
func BenchmarkMemmove5(b *testing.B)    { bmMemmove(5, b) }
func BenchmarkMemmove6(b *testing.B)    { bmMemmove(6, b) }
func BenchmarkMemmove7(b *testing.B)    { bmMemmove(7, b) }
func BenchmarkMemmove8(b *testing.B)    { bmMemmove(8, b) }
func BenchmarkMemmove9(b *testing.B)    { bmMemmove(9, b) }
func BenchmarkMemmove10(b *testing.B)   { bmMemmove(10, b) }
func BenchmarkMemmove11(b *testing.B)   { bmMemmove(11, b) }
func BenchmarkMemmove12(b *testing.B)   { bmMemmove(12, b) }
func BenchmarkMemmove13(b *testing.B)   { bmMemmove(13, b) }
func BenchmarkMemmove14(b *testing.B)   { bmMemmove(14, b) }
func BenchmarkMemmove15(b *testing.B)   { bmMemmove(15, b) }
func BenchmarkMemmove16(b *testing.B)   { bmMemmove(16, b) }
func BenchmarkMemmove32(b *testing.B)   { bmMemmove(32, b) }
func BenchmarkMemmove64(b *testing.B)   { bmMemmove(64, b) }
func BenchmarkMemmove128(b *testing.B)  { bmMemmove(128, b) }
func BenchmarkMemmove256(b *testing.B)  { bmMemmove(256, b) }
func BenchmarkMemmove512(b *testing.B)  { bmMemmove(512, b) }
func BenchmarkMemmove1024(b *testing.B) { bmMemmove(1024, b) }
func BenchmarkMemmove2048(b *testing.B) { bmMemmove(2048, b) }
func BenchmarkMemmove4096(b *testing.B) { bmMemmove(4096, b) }
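Since the built-in copy is backed by this memmove (as the commit title notes), the aliasing guarantee exercised by TestMemmoveAlias is visible directly at the slice level, for example:

package main

import "fmt"

func main() {
	// Overlapping copy within one backing array: a naive front-to-back
	// byte loop would clobber source bytes before reading them, but
	// copy (i.e. memmove) handles the overlap correctly.
	buf := []byte("abcdefgh")
	copy(buf[1:], buf[:len(buf)-1])
	fmt.Println(string(buf)) // "aabcdefg"
}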