From 49fdfe21dd5d1a08b1edaac5c0caeedf492d32e2 Mon Sep 17 00:00:00 2001 From: Evan Shaw Date: Mon, 8 Nov 2010 17:33:53 -0800 Subject: [PATCH] bytes: SSE for bytes.IndexByte on amd64 Performance on 2.8 GHz Intel Core i7: Before: BenchmarkIndexByte4K 1000000 2997 ns/op 1366.70 MB/s BenchmarkIndexByte4M 500 3049772 ns/op 1375.28 MB/s BenchmarkIndexByte64M 50 49582280 ns/op 1353.48 MB/s After: BenchmarkIndexByte4K 10000000 298 ns/op 13744.97 MB/s BenchmarkIndexByte4M 10000 285993 ns/op 14665.76 MB/s BenchmarkIndexByte64M 500 4618172 ns/op 14531.48 MB/s R=rsc, PeterGo, r2, r CC=golang-dev https://golang.org/cl/2888041 --- src/pkg/bytes/asm_amd64.s | 93 +++++++++++++++++++++++++++++++++---- src/pkg/bytes/bytes_test.go | 53 +++++++++++++++++++++ 2 files changed, 137 insertions(+), 9 deletions(-) diff --git a/src/pkg/bytes/asm_amd64.s b/src/pkg/bytes/asm_amd64.s index 7e78700ecfb..c6793cbdcc7 100644 --- a/src/pkg/bytes/asm_amd64.s +++ b/src/pkg/bytes/asm_amd64.s @@ -3,15 +3,90 @@ // license that can be found in the LICENSE file. TEXT ·IndexByte(SB),7,$0 - MOVQ p+0(FP), SI - MOVL len+8(FP), CX - MOVB b+16(FP), AL - MOVQ SI, DI + MOVQ p+0(FP), SI + MOVL len+8(FP), BX + MOVB b+16(FP), AL + MOVQ SI, DI + + CMPL BX, $16 + JLT small + + // round up to first 16-byte boundary + TESTQ $15, SI + JZ aligned + MOVQ SI, CX + ANDQ $~15, CX + ADDQ $16, CX + + // search the beginning + SUBQ SI, CX REPN; SCASB - JZ 3(PC) - MOVL $-1, ret+24(FP) + JZ success + +// DI is 16-byte aligned; get ready to search using SSE instructions +aligned: + // round down to last 16-byte boundary + MOVQ BX, R11 + ADDQ SI, R11 + ANDQ $~15, R11 + + // shuffle X0 around so that each byte contains c + MOVD AX, X0 + PUNPCKLBW X0, X0 + PUNPCKLBW X0, X0 + PSHUFL $0, X0, X0 + JMP condition + +sse: + // move the next 16-byte chunk of the buffer into X1 + MOVO (DI), X1 + // compare bytes in X0 to X1 + PCMPEQB X0, X1 + // take the top bit of each byte in X1 and put the result in DX + PMOVMSKB X1, DX + TESTL DX, DX + JNZ ssesuccess + ADDQ $16, DI + +condition: + CMPQ DI, R11 + JLT sse + + // search the end + MOVQ SI, CX + ADDQ BX, CX + SUBQ R11, CX + // if CX == 0, the zero flag will be set and we'll end up + // returning a false success + JZ failure + REPN; SCASB + JZ success + +failure: + MOVL $-1, ret+24(FP) RET - SUBQ SI, DI - SUBL $1, DI - MOVL DI, ret+24(FP) + +// handle for lengths < 16 +small: + MOVL BX, CX + REPN; SCASB + JZ success + MOVL $-1, ret+24(FP) RET + +// we've found the chunk containing the byte +// now just figure out which specific byte it is +ssesuccess: + // get the index of the least significant set bit + BSFW DX, DX + SUBQ SI, DI + ADDQ DI, DX + MOVL DX, ret+24(FP) + RET + +success: + SUBQ SI, DI + SUBL $1, DI + MOVL DI, ret+24(FP) + RET + diff --git a/src/pkg/bytes/bytes_test.go b/src/pkg/bytes/bytes_test.go index 6f42338eb8e..f3ca371f83e 100644 --- a/src/pkg/bytes/bytes_test.go +++ b/src/pkg/bytes/bytes_test.go @@ -93,6 +93,9 @@ var indexTests = []BinOpTest{ {"abc", "b", 1}, {"abc", "c", 2}, {"abc", "x", -1}, + {"barfoobarfooyyyzzzyyyzzzyyyzzzyyyxxxzzzyyy", "x", 33}, + {"foofyfoobarfoobar", "y", 4}, + {"oooooooooooooooooooooo", "r", -1}, } var lastIndexTests = []BinOpTest{ @@ -177,6 +180,56 @@ func TestIndexByte(t *testing.T) { } } +// test a larger buffer with different sizes and alignments +func TestIndexByteBig(t *testing.T) { + const n = 1024 + b := make([]byte, n) + for i := 0; i < n; i++ { + // different start alignments + b1 := b[i:] + for j := 0; j < len(b1); j++ { + b1[j] = 'x' + pos := IndexByte(b1, 'x') + if pos != j { + t.Errorf("IndexByte(%q, 'x') = %v", b1, pos) + } + b1[j] = 0 + pos = IndexByte(b1, 'x') + if pos != -1 { + t.Errorf("IndexByte(%q, 'x') = %v", b1, pos) + } + } + // different end alignments + b1 = b[:i] + for j := 0; j < len(b1); j++ { + b1[j] = 'x' + pos := IndexByte(b1, 'x') + if pos != j { + t.Errorf("IndexByte(%q, 'x') = %v", b1, pos) + } + b1[j] = 0 + pos = IndexByte(b1, 'x') + if pos != -1 { + t.Errorf("IndexByte(%q, 'x') = %v", b1, pos) + } + } + // different start and end alignments + b1 = b[i/2 : n-(i+1)/2] + for j := 0; j < len(b1); j++ { + b1[j] = 'x' + pos := IndexByte(b1, 'x') + if pos != j { + t.Errorf("IndexByte(%q, 'x') = %v", b1, pos) + } + b1[j] = 0 + pos = IndexByte(b1, 'x') + if pos != -1 { + t.Errorf("IndexByte(%q, 'x') = %v", b1, pos) + } + } + } +} + func TestIndexRune(t *testing.T) { for _, tt := range indexRuneTests { a := []byte(tt.a)