mirror of
https://github.com/golang/go
synced 2024-11-21 16:04:45 -07:00
bytes: SSE for bytes.IndexByte on amd64
Performance on 2.8 GHz Intel Core i7: Before: BenchmarkIndexByte4K 1000000 2997 ns/op 1366.70 MB/s BenchmarkIndexByte4M 500 3049772 ns/op 1375.28 MB/s BenchmarkIndexByte64M 50 49582280 ns/op 1353.48 MB/s After: BenchmarkIndexByte4K 10000000 298 ns/op 13744.97 MB/s BenchmarkIndexByte4M 10000 285993 ns/op 14665.76 MB/s BenchmarkIndexByte64M 500 4618172 ns/op 14531.48 MB/s R=rsc, PeterGo, r2, r CC=golang-dev https://golang.org/cl/2888041
This commit is contained in:
parent
e9c901dbf4
commit
49fdfe21dd
@ -3,15 +3,90 @@
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
TEXT ·IndexByte(SB),7,$0
|
||||
MOVQ p+0(FP), SI
|
||||
MOVL len+8(FP), CX
|
||||
MOVB b+16(FP), AL
|
||||
MOVQ SI, DI
|
||||
MOVQ p+0(FP), SI
|
||||
MOVL len+8(FP), BX
|
||||
MOVB b+16(FP), AL
|
||||
MOVQ SI, DI
|
||||
|
||||
CMPL BX, $16
|
||||
JLT small
|
||||
|
||||
// round up to first 16-byte boundary
|
||||
TESTQ $15, SI
|
||||
JZ aligned
|
||||
MOVQ SI, CX
|
||||
ANDQ $~15, CX
|
||||
ADDQ $16, CX
|
||||
|
||||
// search the beginning
|
||||
SUBQ SI, CX
|
||||
REPN; SCASB
|
||||
JZ 3(PC)
|
||||
MOVL $-1, ret+24(FP)
|
||||
JZ success
|
||||
|
||||
// DI is 16-byte aligned; get ready to search using SSE instructions
|
||||
aligned:
|
||||
// round down to last 16-byte boundary
|
||||
MOVQ BX, R11
|
||||
ADDQ SI, R11
|
||||
ANDQ $~15, R11
|
||||
|
||||
// shuffle X0 around so that each byte contains c
|
||||
MOVD AX, X0
|
||||
PUNPCKLBW X0, X0
|
||||
PUNPCKLBW X0, X0
|
||||
PSHUFL $0, X0, X0
|
||||
JMP condition
|
||||
|
||||
sse:
|
||||
// move the next 16-byte chunk of the buffer into X1
|
||||
MOVO (DI), X1
|
||||
// compare bytes in X0 to X1
|
||||
PCMPEQB X0, X1
|
||||
// take the top bit of each byte in X1 and put the result in DX
|
||||
PMOVMSKB X1, DX
|
||||
TESTL DX, DX
|
||||
JNZ ssesuccess
|
||||
ADDQ $16, DI
|
||||
|
||||
condition:
|
||||
CMPQ DI, R11
|
||||
JLT sse
|
||||
|
||||
// search the end
|
||||
MOVQ SI, CX
|
||||
ADDQ BX, CX
|
||||
SUBQ R11, CX
|
||||
// if CX == 0, the zero flag will be set and we'll end up
|
||||
// returning a false success
|
||||
JZ failure
|
||||
REPN; SCASB
|
||||
JZ success
|
||||
|
||||
failure:
|
||||
MOVL $-1, ret+24(FP)
|
||||
RET
|
||||
SUBQ SI, DI
|
||||
SUBL $1, DI
|
||||
MOVL DI, ret+24(FP)
|
||||
|
||||
// handle for lengths < 16
|
||||
small:
|
||||
MOVL BX, CX
|
||||
REPN; SCASB
|
||||
JZ success
|
||||
MOVL $-1, ret+24(FP)
|
||||
RET
|
||||
|
||||
// we've found the chunk containing the byte
|
||||
// now just figure out which specific byte it is
|
||||
ssesuccess:
|
||||
// get the index of the least significant set bit
|
||||
BSFW DX, DX
|
||||
SUBQ SI, DI
|
||||
ADDQ DI, DX
|
||||
MOVL DX, ret+24(FP)
|
||||
RET
|
||||
|
||||
success:
|
||||
SUBQ SI, DI
|
||||
SUBL $1, DI
|
||||
MOVL DI, ret+24(FP)
|
||||
RET
|
||||
|
||||
|
@ -93,6 +93,9 @@ var indexTests = []BinOpTest{
|
||||
{"abc", "b", 1},
|
||||
{"abc", "c", 2},
|
||||
{"abc", "x", -1},
|
||||
{"barfoobarfooyyyzzzyyyzzzyyyzzzyyyxxxzzzyyy", "x", 33},
|
||||
{"foofyfoobarfoobar", "y", 4},
|
||||
{"oooooooooooooooooooooo", "r", -1},
|
||||
}
|
||||
|
||||
var lastIndexTests = []BinOpTest{
|
||||
@ -177,6 +180,56 @@ func TestIndexByte(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// test a larger buffer with different sizes and alignments
|
||||
func TestIndexByteBig(t *testing.T) {
|
||||
const n = 1024
|
||||
b := make([]byte, n)
|
||||
for i := 0; i < n; i++ {
|
||||
// different start alignments
|
||||
b1 := b[i:]
|
||||
for j := 0; j < len(b1); j++ {
|
||||
b1[j] = 'x'
|
||||
pos := IndexByte(b1, 'x')
|
||||
if pos != j {
|
||||
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
|
||||
}
|
||||
b1[j] = 0
|
||||
pos = IndexByte(b1, 'x')
|
||||
if pos != -1 {
|
||||
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
|
||||
}
|
||||
}
|
||||
// different end alignments
|
||||
b1 = b[:i]
|
||||
for j := 0; j < len(b1); j++ {
|
||||
b1[j] = 'x'
|
||||
pos := IndexByte(b1, 'x')
|
||||
if pos != j {
|
||||
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
|
||||
}
|
||||
b1[j] = 0
|
||||
pos = IndexByte(b1, 'x')
|
||||
if pos != -1 {
|
||||
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
|
||||
}
|
||||
}
|
||||
// different start and end alignments
|
||||
b1 = b[i/2 : n-(i+1)/2]
|
||||
for j := 0; j < len(b1); j++ {
|
||||
b1[j] = 'x'
|
||||
pos := IndexByte(b1, 'x')
|
||||
if pos != j {
|
||||
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
|
||||
}
|
||||
b1[j] = 0
|
||||
pos = IndexByte(b1, 'x')
|
||||
if pos != -1 {
|
||||
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIndexRune(t *testing.T) {
|
||||
for _, tt := range indexRuneTests {
|
||||
a := []byte(tt.a)
|
||||
|
Loading…
Reference in New Issue
Block a user