1
0
mirror of https://github.com/golang/go synced 2024-11-26 10:18:12 -07:00

runtime: faster x86 memmove (a.k.a. built-in copy())

REP instructions have a high startup cost, so we handle small
sizes with some straight-line code. The REP MOVSx instructions
are really fast for large sizes. The cutover is approximately
1K. We implement copies up to 128 bytes (386) / 256 bytes (amd64)
because that is the maximum SSE register load (loading all data
into registers before any stores lets us ignore copy direction).

(on a Sandy Bridge E5-1650 @ 3.20GHz)
benchmark               old ns/op    new ns/op    delta
BenchmarkMemmove0               3            3   +0.86%
BenchmarkMemmove1               5            5   +5.40%
BenchmarkMemmove2              18            8  -56.84%
BenchmarkMemmove3              18            7  -58.45%
BenchmarkMemmove4              36            7  -78.63%
BenchmarkMemmove5              36            8  -77.91%
BenchmarkMemmove6              36            8  -77.76%
BenchmarkMemmove7              36            8  -77.82%
BenchmarkMemmove8              18            8  -56.33%
BenchmarkMemmove9              18            7  -58.34%
BenchmarkMemmove10             18            7  -58.34%
BenchmarkMemmove11             18            7  -58.45%
BenchmarkMemmove12             36            7  -78.51%
BenchmarkMemmove13             36            7  -78.48%
BenchmarkMemmove14             36            7  -78.56%
BenchmarkMemmove15             36            7  -78.56%
BenchmarkMemmove16             18            7  -58.24%
BenchmarkMemmove32             18            8  -54.33%
BenchmarkMemmove64             18            8  -53.37%
BenchmarkMemmove128            20            9  -55.93%
BenchmarkMemmove256            25           11  -55.16%
BenchmarkMemmove512            33           33   -1.19%
BenchmarkMemmove1024           43           44   +2.06%
BenchmarkMemmove2048           61           61   +0.16%
BenchmarkMemmove4096           95           95   +0.00%

R=golang-dev, bradfitz, remyoudompheng, khr, iant, dominik.honnef
CC=golang-dev
https://golang.org/cl/9038048
This commit is contained in:
Keith Randall 2013-05-17 12:53:49 -07:00
parent af1dd56d1b
commit 6021449236
3 changed files with 338 additions and 23 deletions

View File

@ -27,6 +27,34 @@ TEXT runtime·memmove(SB), 7, $0
MOVL to+0(FP), DI MOVL to+0(FP), DI
MOVL fr+4(FP), SI MOVL fr+4(FP), SI
MOVL n+8(FP), BX MOVL n+8(FP), BX
// REP instructions have a high startup cost, so we handle small sizes
// with some straightline code. The REP MOVSL instruction is really fast
// for large sizes. The cutover is approximately 1K. We implement up to
// 128 because that is the maximum SSE register load (loading all data
// into registers lets us ignore copy direction).
tail:
TESTL BX, BX
JEQ move_0
CMPL BX, $2
JBE move_1or2
CMPL BX, $4
JBE move_3or4
CMPL BX, $8
JBE move_5through8
CMPL BX, $16
JBE move_9through16
TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
JEQ nosse2
CMPL BX, $32
JBE move_17through32
CMPL BX, $64
JBE move_33through64
CMPL BX, $128
JBE move_65through128
// TODO: use branch table and BSR to make this just a single dispatch
nosse2:
/* /*
* check and set for backwards * check and set for backwards
*/ */
@ -42,11 +70,7 @@ forward:
ANDL $3, BX ANDL $3, BX
REP; MOVSL REP; MOVSL
MOVL BX, CX JMP tail
REP; MOVSB
MOVL to+0(FP),AX
RET
/* /*
* check overlap * check overlap
*/ */
@ -75,12 +99,73 @@ back:
SUBL $4, SI SUBL $4, SI
REP; MOVSL REP; MOVSL
ADDL $3, DI
ADDL $3, SI
MOVL BX, CX
REP; MOVSB
CLD CLD
MOVL to+0(FP),AX ADDL $4, DI
RET ADDL $4, SI
SUBL BX, DI
SUBL BX, SI
JMP tail
move_1or2:
MOVB (SI), AX
MOVB -1(SI)(BX*1), CX
MOVB AX, (DI)
MOVB CX, -1(DI)(BX*1)
move_0:
RET
move_3or4:
MOVW (SI), AX
MOVW -2(SI)(BX*1), CX
MOVW AX, (DI)
MOVW CX, -2(DI)(BX*1)
RET
move_5through8:
MOVL (SI), AX
MOVL -4(SI)(BX*1), CX
MOVL AX, (DI)
MOVL CX, -4(DI)(BX*1)
RET
move_9through16:
MOVL (SI), AX
MOVL 4(SI), CX
MOVL -8(SI)(BX*1), DX
MOVL -4(SI)(BX*1), BP
MOVL AX, (DI)
MOVL CX, 4(DI)
MOVL DX, -8(DI)(BX*1)
MOVL BP, -4(DI)(BX*1)
RET
move_17through32:
MOVOU (SI), X0
MOVOU -16(SI)(BX*1), X1
MOVOU X0, (DI)
MOVOU X1, -16(DI)(BX*1)
RET
move_33through64:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU -32(SI)(BX*1), X2
MOVOU -16(SI)(BX*1), X3
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, -32(DI)(BX*1)
MOVOU X3, -16(DI)(BX*1)
RET
move_65through128:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU 48(SI), X3
MOVOU -64(SI)(BX*1), X4
MOVOU -48(SI)(BX*1), X5
MOVOU -32(SI)(BX*1), X6
MOVOU -16(SI)(BX*1), X7
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
MOVOU X4, -64(DI)(BX*1)
MOVOU X5, -48(DI)(BX*1)
MOVOU X6, -32(DI)(BX*1)
MOVOU X7, -16(DI)(BX*1)
RET

View File

@ -30,6 +30,32 @@ TEXT runtime·memmove(SB), 7, $0
MOVQ fr+8(FP), SI MOVQ fr+8(FP), SI
MOVQ n+16(FP), BX MOVQ n+16(FP), BX
// REP instructions have a high startup cost, so we handle small sizes
// with some straightline code. The REP MOVSQ instruction is really fast
// for large sizes. The cutover is approximately 1K. We implement up to
// 256 because that is the maximum SSE register load (loading all data
// into registers lets us ignore copy direction).
tail:
TESTQ BX, BX
JEQ move_0
CMPQ BX, $2
JBE move_1or2
CMPQ BX, $4
JBE move_3or4
CMPQ BX, $8
JBE move_5through8
CMPQ BX, $16
JBE move_9through16
CMPQ BX, $32
JBE move_17through32
CMPQ BX, $64
JBE move_33through64
CMPQ BX, $128
JBE move_65through128
CMPQ BX, $256
JBE move_129through256
// TODO: use branch table and BSR to make this just a single dispatch
/* /*
* check and set for backwards * check and set for backwards
*/ */
@ -45,11 +71,8 @@ forward:
ANDQ $7, BX ANDQ $7, BX
REP; MOVSQ REP; MOVSQ
MOVQ BX, CX JMP tail
REP; MOVSB
MOVQ to+0(FP),AX
RET
back: back:
/* /*
* check overlap * check overlap
@ -78,12 +101,103 @@ back:
SUBQ $8, SI SUBQ $8, SI
REP; MOVSQ REP; MOVSQ
ADDQ $7, DI
ADDQ $7, SI
MOVQ BX, CX
REP; MOVSB
CLD CLD
MOVQ to+0(FP),AX ADDQ $8, DI
RET ADDQ $8, SI
SUBQ BX, DI
SUBQ BX, SI
JMP tail
move_1or2:
MOVB (SI), AX
MOVB -1(SI)(BX*1), CX
MOVB AX, (DI)
MOVB CX, -1(DI)(BX*1)
move_0:
RET
move_3or4:
MOVW (SI), AX
MOVW -2(SI)(BX*1), CX
MOVW AX, (DI)
MOVW CX, -2(DI)(BX*1)
RET
move_5through8:
MOVL (SI), AX
MOVL -4(SI)(BX*1), CX
MOVL AX, (DI)
MOVL CX, -4(DI)(BX*1)
RET
move_9through16:
MOVQ (SI), AX
MOVQ -8(SI)(BX*1), CX
MOVQ AX, (DI)
MOVQ CX, -8(DI)(BX*1)
RET
move_17through32:
MOVOU (SI), X0
MOVOU -16(SI)(BX*1), X1
MOVOU X0, (DI)
MOVOU X1, -16(DI)(BX*1)
RET
move_33through64:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU -32(SI)(BX*1), X2
MOVOU -16(SI)(BX*1), X3
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, -32(DI)(BX*1)
MOVOU X3, -16(DI)(BX*1)
RET
move_65through128:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU 48(SI), X3
MOVOU -64(SI)(BX*1), X4
MOVOU -48(SI)(BX*1), X5
MOVOU -32(SI)(BX*1), X6
MOVOU -16(SI)(BX*1), X7
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
MOVOU X4, -64(DI)(BX*1)
MOVOU X5, -48(DI)(BX*1)
MOVOU X6, -32(DI)(BX*1)
MOVOU X7, -16(DI)(BX*1)
RET
move_129through256:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU 48(SI), X3
MOVOU 64(SI), X4
MOVOU 80(SI), X5
MOVOU 96(SI), X6
MOVOU 112(SI), X7
MOVOU -128(SI)(BX*1), X8
MOVOU -112(SI)(BX*1), X9
MOVOU -96(SI)(BX*1), X10
MOVOU -80(SI)(BX*1), X11
MOVOU -64(SI)(BX*1), X12
MOVOU -48(SI)(BX*1), X13
MOVOU -32(SI)(BX*1), X14
MOVOU -16(SI)(BX*1), X15
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
MOVOU X4, 64(DI)
MOVOU X5, 80(DI)
MOVOU X6, 96(DI)
MOVOU X7, 112(DI)
MOVOU X8, -128(DI)(BX*1)
MOVOU X9, -112(DI)(BX*1)
MOVOU X10, -96(DI)(BX*1)
MOVOU X11, -80(DI)(BX*1)
MOVOU X12, -64(DI)(BX*1)
MOVOU X13, -48(DI)(BX*1)
MOVOU X14, -32(DI)(BX*1)
MOVOU X15, -16(DI)(BX*1)
RET

View File

@ -0,0 +1,116 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime_test
import (
"testing"
)
// TestMemmove exhaustively checks copy between two distinct byte
// slices: for every length n and every (src, dst) offset pair it
// verifies that exactly n bytes were moved and that the bytes of
// dst outside the copied window were left untouched. Source bytes
// have the high bit set so they are distinguishable from dst's
// original contents.
func TestMemmove(t *testing.T) {
	size := 256
	if testing.Short() {
		size = 128 + 16
	}
	src := make([]byte, size)
	dst := make([]byte, size)
	for i := range src {
		src[i] = byte(128 + (i & 127))
	}
	for i := range dst {
		dst[i] = byte(i & 127)
	}
	for n := 0; n <= size; n++ {
		for x := 0; x <= size-n; x++ { // offset in src
			for y := 0; y <= size-n; y++ { // offset in dst
				copy(dst[y:y+n], src[x:x+n])
				// Walk all of dst once: prefix, copied window, suffix.
				for i := 0; i < size; i++ {
					switch {
					case i < y:
						if dst[i] != byte(i&127) {
							t.Fatalf("prefix dst[%d] = %d", i, dst[i])
						}
					case i < y+n:
						if dst[i] != byte(128+((i-y+x)&127)) {
							t.Fatalf("copied dst[%d] = %d", i, dst[i])
						}
						dst[i] = byte(i & 127) // reset dst
					default:
						if dst[i] != byte(i&127) {
							t.Fatalf("suffix dst[%d] = %d", i, dst[i])
						}
					}
				}
			}
		}
	}
}
// TestMemmoveAlias exhaustively checks copy within a single buffer
// (overlapping src and dst windows): for every length n and every
// (src, dst) offset pair it verifies that exactly n bytes were
// moved and that the rest of the buffer was left untouched.
func TestMemmoveAlias(t *testing.T) {
	size := 256
	if testing.Short() {
		size = 128 + 16
	}
	buf := make([]byte, size)
	for i := range buf {
		buf[i] = byte(i)
	}
	for n := 0; n <= size; n++ {
		for x := 0; x <= size-n; x++ { // src offset
			for y := 0; y <= size-n; y++ { // dst offset
				copy(buf[y:y+n], buf[x:x+n])
				// Walk all of buf once: prefix, copied window, suffix.
				for i := 0; i < size; i++ {
					switch {
					case i < y:
						if buf[i] != byte(i) {
							t.Fatalf("prefix buf[%d] = %d", i, buf[i])
						}
					case i < y+n:
						if buf[i] != byte(i-y+x) {
							t.Fatalf("copied buf[%d] = %d", i, buf[i])
						}
						buf[i] = byte(i) // reset buf
					default:
						if buf[i] != byte(i) {
							t.Fatalf("suffix buf[%d] = %d", i, buf[i])
						}
					}
				}
			}
		}
	}
}
// bmMemmove benchmarks copying n bytes between two non-overlapping
// byte slices, reporting throughput via SetBytes.
func bmMemmove(n int, b *testing.B) {
	dst := make([]byte, n)
	src := make([]byte, n)
	b.SetBytes(int64(n))
	for i := 0; i < b.N; i++ {
		copy(dst, src)
	}
}
// Benchmarks spanning the memmove size classes: every length 0-16
// (the straight-line small-size cases), the SSE block-copy sizes
// (32-256), and the REP MOVSx sizes (512 and up).
func BenchmarkMemmove0(b *testing.B) { bmMemmove(0, b) }
func BenchmarkMemmove1(b *testing.B) { bmMemmove(1, b) }
func BenchmarkMemmove2(b *testing.B) { bmMemmove(2, b) }
func BenchmarkMemmove3(b *testing.B) { bmMemmove(3, b) }
func BenchmarkMemmove4(b *testing.B) { bmMemmove(4, b) }
func BenchmarkMemmove5(b *testing.B) { bmMemmove(5, b) }
func BenchmarkMemmove6(b *testing.B) { bmMemmove(6, b) }
func BenchmarkMemmove7(b *testing.B) { bmMemmove(7, b) }
func BenchmarkMemmove8(b *testing.B) { bmMemmove(8, b) }
func BenchmarkMemmove9(b *testing.B) { bmMemmove(9, b) }
func BenchmarkMemmove10(b *testing.B) { bmMemmove(10, b) }
func BenchmarkMemmove11(b *testing.B) { bmMemmove(11, b) }
func BenchmarkMemmove12(b *testing.B) { bmMemmove(12, b) }
func BenchmarkMemmove13(b *testing.B) { bmMemmove(13, b) }
func BenchmarkMemmove14(b *testing.B) { bmMemmove(14, b) }
func BenchmarkMemmove15(b *testing.B) { bmMemmove(15, b) }
func BenchmarkMemmove16(b *testing.B) { bmMemmove(16, b) }
func BenchmarkMemmove32(b *testing.B) { bmMemmove(32, b) }
func BenchmarkMemmove64(b *testing.B) { bmMemmove(64, b) }
func BenchmarkMemmove128(b *testing.B) { bmMemmove(128, b) }
func BenchmarkMemmove256(b *testing.B) { bmMemmove(256, b) }
func BenchmarkMemmove512(b *testing.B) { bmMemmove(512, b) }
func BenchmarkMemmove1024(b *testing.B) { bmMemmove(1024, b) }
func BenchmarkMemmove2048(b *testing.B) { bmMemmove(2048, b) }
func BenchmarkMemmove4096(b *testing.B) { bmMemmove(4096, b) }