1
0
mirror of https://github.com/golang/go synced 2024-10-04 17:21:20 -06:00
go/src/pkg/bytes/asm_amd64.s
Evan Shaw 49fdfe21dd bytes: SSE for bytes.IndexByte on amd64
Performance on 2.8 GHz Intel Core i7:

Before:
BenchmarkIndexByte4K  1000000              2997 ns/op        1366.70 MB/s
BenchmarkIndexByte4M      500           3049772 ns/op        1375.28 MB/s
BenchmarkIndexByte64M      50          49582280 ns/op        1353.48 MB/s

After:
BenchmarkIndexByte4K 10000000               298 ns/op       13744.97 MB/s
BenchmarkIndexByte4M    10000            285993 ns/op       14665.76 MB/s
BenchmarkIndexByte64M     500           4618172 ns/op       14531.48 MB/s

R=rsc, PeterGo, r2, r
CC=golang-dev
https://golang.org/cl/2888041
2010-11-08 17:33:53 -08:00

93 lines
1.6 KiB
ArmAsm

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// IndexByte scans the len+8(FP) bytes starting at p+0(FP) for the first byte
// equal to b+16(FP) and stores its index in ret+24(FP), or -1 if no byte
// matches.
// NOTE(review): the Go-side declaration is not visible in this file;
// presumably func IndexByte(s []byte, c byte) int -- confirm.
//
// Register roles:
//	SI   start of the buffer (never modified)
//	DI   scan cursor
//	BX   length in bytes (loaded as a 32-bit value)
//	AL   byte being searched for
//	CX   iteration count for REPN SCASB
//	R11  end of the last whole 16-byte-aligned chunk (SSE path only)
//	X0   search byte replicated into every byte lane
//	X1   current 16-byte chunk, then the per-byte compare result
//	DX   compare bitmask, then the index of the match
//
// Strategy: buffers shorter than 16 bytes are scanned bytewise. Longer
// buffers are scanned bytewise up to the first 16-byte boundary, then in
// aligned 16-byte chunks with SSE2, then bytewise for whatever is left.
//
// NOTE(review): REPN SCASB only walks upward when the direction flag is
// clear; this routine assumes that on entry and never sets it itself.
TEXT ·IndexByte(SB),7,$0
	MOVQ p+0(FP), SI
	MOVL len+8(FP), BX
	MOVB b+16(FP), AL
	MOVQ SI, DI

	CMPL BX, $16
	JLT small

	// round up to first 16-byte boundary
	TESTQ $15, SI
	JZ aligned
	MOVQ SI, CX
	ANDQ $~15, CX
	ADDQ $16, CX

	// search the beginning: CX = distance from SI to that boundary
	SUBQ SI, CX
	REPN; SCASB
	JZ success

// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
	// round down to last 16-byte boundary: R11 = (SI + BX) &^ 15
	MOVQ BX, R11
	ADDQ SI, R11
	ANDQ $~15, R11

	// shuffle X0 around so that each byte contains b.
	// Only AL was loaded above; the two unpacks spread byte 0 across the
	// low doubleword and the shuffle copies that doubleword to all four
	// positions, so whatever else was in AX does not reach the result.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0
	JMP condition

sse:
	// move the next 16-byte chunk of the buffer into X1
	MOVO (DI), X1
	// compare bytes in X0 to X1
	PCMPEQB X0, X1
	// take the top bit of each byte in X1 and put the result in DX
	PMOVMSKB X1, DX
	TESTL DX, DX
	JNZ ssesuccess
	ADDQ $16, DI

condition:
	// DI and R11 are addresses, so the comparison must be unsigned.
	CMPQ DI, R11
	JLO sse

	// search the end: CX = (SI + BX) - R11, the bytes past the last chunk
	MOVQ SI, CX
	ADDQ BX, CX
	SUBQ R11, CX
	// if CX == 0, the zero flag will be set and we'll end up
	// returning a false success
	JZ failure
	REPN; SCASB
	JZ success

failure:
	MOVL $-1, ret+24(FP)
	RET

// handle for lengths < 16
small:
	// With BX == 0 the scan runs no iterations and leaves the flags from
	// CMPL BX, $16 in place (ZF clear), so the JZ below is not taken.
	MOVL BX, CX
	REPN; SCASB
	JZ success
	MOVL $-1, ret+24(FP)
	RET

// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
	// get the index of the least significant set bit.
	// DX is non-zero here (JNZ above) and only its low 16 bits can be set,
	// so the 32-bit scan finds the same bit a 16-bit one would, without
	// the operand-size prefix and partial-register write.
	BSFL DX, DX
	// index = (chunk address - buffer start) + position within the chunk
	SUBQ SI, DI
	ADDQ DI, DX
	MOVL DX, ret+24(FP)
	RET

success:
	// SCASB has already stepped DI past the matching byte, hence the -1.
	SUBQ SI, DI
	SUBL $1, DI
	MOVL DI, ret+24(FP)
	RET