1
0
mirror of https://github.com/golang/go synced 2024-11-23 07:40:04 -07:00

internal/bytealg: move short string Index implementations into bytealg

Also move the arm64 CountByte implementation while we're here.

Fixes #19792

Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e
Reviewed-on: https://go-review.googlesource.com/98518
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
This commit is contained in:
Keith Randall 2018-03-04 09:47:47 -08:00
parent f6332bb84a
commit ee58eccc56
27 changed files with 932 additions and 1123 deletions

View File

@ -829,6 +829,92 @@ func EqualFold(s, t []byte) bool {
return len(s) == len(t)
}
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		// An empty separator matches at the start of s.
		return 0
	case n == 1:
		// Single-byte needle: use the tuned byte scan.
		return IndexByte(s, sep[0])
	case n == len(s):
		if Equal(sep, s) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	case n <= bytealg.MaxLen:
		// sep is short enough for the assembly bytealg.Index routine.
		// Use brute force when s and sep both are small
		if len(s) <= bytealg.MaxBruteForce {
			return bytealg.Index(s, sep)
		}
		c := sep[0]
		i := 0
		t := s[:len(s)-n+1] // candidate match positions: sep must start within t
		fails := 0
		for i < len(t) {
			if t[i] != c {
				// IndexByte is faster than bytealg.Index, so use it as long as
				// we're not getting lots of false positives.
				o := IndexByte(t[i:], c)
				if o < 0 {
					return -1
				}
				i += o
			}
			if Equal(s[i:i+n], sep) {
				return i
			}
			fails++
			i++
			// Switch to bytealg.Index when IndexByte produces too many false positives.
			if fails > bytealg.Cutover(i) {
				r := bytealg.Index(s[i:], sep)
				if r >= 0 {
					return r + i
				}
				return -1
			}
		}
		return -1
	}
	// Long separator (n > bytealg.MaxLen): scan for the first byte and
	// verify each candidate, falling back to Rabin-Karp when the scan
	// produces too many false positives.
	c := sep[0]
	i := 0
	fails := 0
	t := s[:len(s)-n+1]
	for i < len(t) {
		if t[i] != c {
			o := IndexByte(t[i:], c)
			if o < 0 {
				break
			}
			i += o
		}
		if Equal(s[i:i+n], sep) {
			return i
		}
		i++
		fails++
		if fails >= 4+i>>4 && i < len(t) {
			// Give up on IndexByte, it isn't skipping ahead
			// far enough to be better than Rabin-Karp.
			// Experiments (using IndexPeriodic) suggest
			// the cutover is about 16 byte skips.
			// TODO: if large prefixes of sep are matching
			// we should cutover at even larger average skips,
			// because Equal becomes that much more expensive.
			// This code does not take that effect into account.
			j := indexRabinKarp(s[i:], sep)
			if j < 0 {
				return -1
			}
			return i + j
		}
	}
	return -1
}
func indexRabinKarp(s, sep []byte) int {
// Rabin-Karp search
hashsep, pow := hashStr(sep)

View File

@ -1,79 +0,0 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytes
import "internal/cpu"
//go:noescape
// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
// indexShortStr requires 2 <= len(c) <= shortStringLen
func indexShortStr(s, c []byte) int // ../runtime/asm_amd64.s
func countByte(s []byte, c byte) int // ../runtime/asm_amd64.s
var shortStringLen int
func init() {
if cpu.X86.HasAVX2 {
shortStringLen = 63
} else {
shortStringLen = 31
}
}
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		return 0
	case n == 1:
		return IndexByte(s, sep[0])
	case n == len(s):
		if Equal(sep, s) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	case n <= shortStringLen:
		// Use brute force when s and sep both are small
		if len(s) <= 64 {
			return indexShortStr(s, sep)
		}
		c := sep[0]
		i := 0
		t := s[:len(s)-n+1] // candidate match positions: sep must start within t
		fails := 0
		for i < len(t) {
			if t[i] != c {
				// IndexByte skips 16/32 bytes per iteration,
				// so it's faster than indexShortStr.
				o := IndexByte(t[i:], c)
				if o < 0 {
					return -1
				}
				i += o
			}
			if Equal(s[i:i+n], sep) {
				return i
			}
			fails++
			i++
			// Switch to indexShortStr when IndexByte produces too many false positives.
			// Too many means more than 1 error per 8 characters.
			// Allow some errors in the beginning.
			if fails > (i+16)/8 {
				r := indexShortStr(s[i:], sep)
				if r >= 0 {
					return r + i
				}
				return -1
			}
		}
		return -1
	}
	// Separator too long for the assembly routine: use Rabin-Karp.
	return indexRabinKarp(s, sep)
}

View File

@ -1,72 +0,0 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytes
func countByte(s []byte, c byte) int // bytes_arm64.s
// 8 bytes can be completely loaded into 1 register.
const shortStringLen = 8
//go:noescape
func indexShortStr(s, sep []byte) int
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		return 0
	case n == 1:
		return IndexByte(s, sep[0])
	case n == len(s):
		if Equal(sep, s) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	case n <= shortStringLen:
		// Use brute force when both s and sep are small.
		// Empirical data shows that it can get better
		// performance when len(s) <= 16.
		if len(s) <= 16 {
			return indexShortStr(s, sep)
		}
	}
	// General case: scan for the first byte of sep with IndexByte and
	// verify each candidate, switching to Rabin-Karp if the scan yields
	// too many false positives.
	c := sep[0]
	i := 0
	fails := 0
	t := s[:len(s)-n+1] // candidate match positions: sep must start within t
	for i < len(t) {
		if t[i] != c {
			o := IndexByte(t[i:], c)
			if o < 0 {
				break
			}
			i += o
		}
		if Equal(s[i:i+n], sep) {
			return i
		}
		i++
		fails++
		if fails >= 4+i>>4 && i < len(t) {
			// Give up on IndexByte, it isn't skipping ahead
			// far enough to be better than Rabin-Karp.
			// Experiments (using IndexPeriodic) suggest
			// the cutover is about 16 byte skips.
			// TODO: if large prefixes of sep are matching
			// we should cutover at even larger average skips,
			// because Equal becomes that much more expensive.
			// This code does not take that effect into account.
			j := indexRabinKarp(s[i:], sep)
			if j < 0 {
				return -1
			}
			return i + j
		}
	}
	return -1
}

View File

@ -1,59 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!s390x,!arm64
package bytes
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
	n := len(sep)

	// Dispose of the degenerate lengths up front.
	switch {
	case n == 0:
		return 0
	case n == 1:
		return IndexByte(s, sep[0])
	case n == len(s):
		if Equal(sep, s) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	}

	// Scan for the first byte of sep; each hit is a candidate start
	// that must be confirmed with a full comparison.
	first := sep[0]
	window := s[:len(s)-n+1] // sep must start inside this window
	misses := 0
	for pos := 0; pos < len(window); {
		if window[pos] != first {
			skip := IndexByte(window[pos:], first)
			if skip < 0 {
				break
			}
			pos += skip
		}
		if Equal(s[pos:pos+n], sep) {
			return pos
		}
		pos++
		misses++
		if misses >= 4+pos>>4 && pos < len(window) {
			// IndexByte isn't skipping far enough to beat Rabin-Karp
			// (experiments using IndexPeriodic put the cutover near
			// 16-byte skips), so hand the rest of the search to it.
			// TODO: if large prefixes of sep are matching we should
			// cut over at even larger average skips, because Equal
			// becomes that much more expensive. This code does not
			// take that effect into account.
			if r := indexRabinKarp(s[pos:], sep); r >= 0 {
				return pos + r
			}
			return -1
		}
	}
	return -1
}

View File

@ -1,80 +0,0 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytes
//go:noescape
// indexShortStr returns the index of the first instance of sep in s,
// or -1 if sep is not present in s.
// indexShortStr requires 2 <= len(sep) <= shortStringLen
func indexShortStr(s, c []byte) int // ../runtime/asm_s390x.s
// supportsVX reports whether the vector facility is available.
// indexShortStr must not be called if the vector facility is not
// available.
func supportsVX() bool // ../runtime/asm_s390x.s
// shortStringLen is the maximum separator length that indexShortStr
// can handle. It stays -1 (disabled) unless the vector facility is
// available on this machine.
var shortStringLen = -1

func init() {
	if supportsVX() {
		shortStringLen = 64
	}
}
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		return 0
	case n == 1:
		return IndexByte(s, sep[0])
	case n == len(s):
		if Equal(sep, s) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	case n <= shortStringLen:
		// Use brute force when s and sep both are small
		if len(s) <= 64 {
			return indexShortStr(s, sep)
		}
		c := sep[0]
		i := 0
		t := s[:len(s)-n+1] // candidate match positions: sep must start within t
		fails := 0
		for i < len(t) {
			if t[i] != c {
				// IndexByte skips 16/32 bytes per iteration,
				// so it's faster than indexShortStr.
				o := IndexByte(t[i:], c)
				if o < 0 {
					return -1
				}
				i += o
			}
			if Equal(s[i:i+n], sep) {
				return i
			}
			fails++
			i++
			// Switch to indexShortStr when IndexByte produces too many false positives.
			// Too many means more than 1 error per 8 characters.
			// Allow some errors in the beginning.
			if fails > (i+16)/8 {
				r := indexShortStr(s[i:], sep)
				if r >= 0 {
					return r + i
				}
				return -1
			}
		}
		return -1
	}
	// Separator too long for the assembly routine (or no vector
	// facility): use Rabin-Karp.
	return indexRabinKarp(s, sep)
}

View File

@ -1,20 +1,16 @@
// amd64-specific vet whitelist. See readme.txt for details.
internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
// False positives.
// Nothing much to do about cross-package assembly. Unfortunate.
internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
// reflect trampolines intentionally omit arg size. Same for morestack.
runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
runtime/asm_amd64.s: [amd64] morestack: use of 16(SP) points beyond argument frame
runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
// Nothing much to do about cross-package assembly. Unfortunate.
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package strings
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes
// Intentionally missing declarations. These are special assembly routines.
// Some are jumped into from other routines, with values in specific registers.
// duff* have direct calls from the compiler.
@ -25,4 +21,3 @@ runtime/asm_amd64.s: [amd64] addmoduledata: function addmoduledata missing Go de
runtime/duff_amd64.s: [amd64] duffzero: function duffzero missing Go declaration
runtime/duff_amd64.s: [amd64] duffcopy: function duffcopy missing Go declaration
runtime/asm_amd64.s: [amd64] stackcheck: function stackcheck missing Go declaration
runtime/asm_amd64.s: [amd64] indexShortStr: function indexShortStr missing Go declaration

View File

@ -1,11 +1,6 @@
runtime/asm_s390x.s: [s390x] abort: function abort missing Go declaration
internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: Compare is in package bytes
internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: cmpstring is in package runtime
runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package strings
runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package bytes
runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package strings
runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package bytes
runtime/asm_s390x.s: [s390x] indexShortStr: function indexShortStr missing Go declaration
runtime/asm_s390x.s: [s390x] addmoduledata: function addmoduledata missing Go declaration
runtime/memclr_s390x.s: [s390x] memclr_s390x_exrl_xc: function memclr_s390x_exrl_xc missing Go declaration
runtime/memmove_s390x.s: [s390x] memmove_s390x_exrl_mvc: function memmove_s390x_exrl_mvc missing Go declaration

View File

@ -0,0 +1,22 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
import (
"internal/cpu"
"unsafe"
)
// Offsets into internal/cpu records for use in assembly.
const (
	x86_HasSSE2   = unsafe.Offsetof(cpu.X86.HasSSE2)
	x86_HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
	x86_HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
	x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
	s390x_HasVX   = unsafe.Offsetof(cpu.S390X.HasVX)
)

// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
// It is set by the per-architecture init functions in index_*.go and
// stays 0 on platforms with no assembly Index implementation.
var MaxLen int

View File

@ -0,0 +1,90 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Count counts the bytes in b equal to c.
// func Count(b []byte, c byte) int
TEXT ·Count(SB),NOSPLIT,$0-40
	MOVD	b_base+0(FP), R0
	MOVD	b_len+8(FP), R2
	MOVBU	c+24(FP), R1
	MOVD	$ret+32(FP), R8
	B	countbytebody<>(SB)

// CountString counts the bytes in s equal to c.
// func CountString(s string, c byte) int
TEXT ·CountString(SB),NOSPLIT,$0-32
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+16(FP), R1
	MOVD	$ret+24(FP), R8
	B	countbytebody<>(SB)

// countbytebody counts occurrences of the byte in R1 among the R2
// bytes starting at R0 and stores the count at the address in R8.
// input:
//   R0: data
//   R2: data len
//   R1: byte to find
//   R8: address to put result
TEXT countbytebody<>(SB),NOSPLIT,$0
	// R11 = count of byte to search
	MOVD	$0, R11
	// short path to handle 0-byte case
	CBZ	R2, done
	CMP	$0x20, R2
	// jump directly to tail if length < 32
	BLO	tail
	ANDS	$0x1f, R0, R9
	BEQ	chunk
	// Work with not 32-byte aligned head
	BIC	$0x1f, R0, R3
	ADD	$0x20, R3
head_loop:
	MOVBU.P	1(R0), R5
	CMP	R5, R1
	CINC	EQ, R11, R11
	SUB	$1, R2, R2
	CMP	R0, R3
	BNE	head_loop
	// Work with 32-byte aligned chunks
chunk:
	BIC	$0x1f, R2, R9
	// The first chunk can also be the last
	CBZ	R9, tail
	// R3 = end of 32-byte chunks
	ADD	R0, R9, R3
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// R2 = length of tail
	SUB	R9, R2, R2
	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
	VMOV	R1, V0.B16
	// Clear the low 64-bit element of V7 and V8
	VEOR	V7.B8, V7.B8, V7.B8
	VEOR	V8.B8, V8.B8, V8.B8
	// Count the target byte in 32-byte chunk
chunk_loop:
	VLD1.P	(R0), [V1.B16, V2.B16]
	CMP	R0, R3
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// Clear the higher 7 bits
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Count lanes matching the requested byte
	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
	VUADDLV	V6.B16, V7
	// Accumulate the count in low 64-bit element of V8 when inside the loop
	VADD	V7, V8
	BNE	chunk_loop
	VMOV	V8.D[0], R6
	ADD	R6, R11, R11
	CBZ	R2, done
tail:
	// Work with tail shorter than 32 bytes
	MOVBU.P	1(R0), R5
	SUB	$1, R2, R2
	CMP	R5, R1
	CINC	EQ, R11, R11
	CBNZ	R2, tail
done:
	MOVD	R11, (R8)
	RET

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64
// +build !amd64,!arm64
package bytealg

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64
// +build amd64 arm64
package bytealg

View File

@ -4,24 +4,8 @@
package bytealg
import (
"internal/cpu"
"unsafe"
)
// Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly.
// Because equal_native.go is unconditional, it's a good place to compute asm constants.
// TODO: find a better way to do this?
// Offsets into internal/cpu records for use in assembly.
const (
x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
)
//go:noescape
func Equal(a, b []byte) bool

View File

@ -0,0 +1,26 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
import "internal/cpu"
const MaxBruteForce = 64
func init() {
if cpu.X86.HasAVX2 {
MaxLen = 63
} else {
MaxLen = 31
}
}
// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to Index.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
func Cutover(n int) int {
	// Tolerate one error per 8 characters processed, plus a fixed
	// allowance so a few early misses don't trigger the switch.
	const slop = 16
	return (n + slop) / 8
}

View File

@ -0,0 +1,274 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Index(SB),NOSPLIT,$0-56
MOVQ a_base+0(FP), DI
MOVQ a_len+8(FP), DX
MOVQ b_base+24(FP), BP
MOVQ b_len+32(FP), AX
MOVQ DI, R10
LEAQ ret+48(FP), R11
JMP indexbody<>(SB)
TEXT ·IndexString(SB),NOSPLIT,$0-40
MOVQ a_base+0(FP), DI
MOVQ a_len+8(FP), DX
MOVQ b_base+16(FP), BP
MOVQ b_len+24(FP), AX
MOVQ DI, R10
LEAQ ret+32(FP), R11
JMP indexbody<>(SB)
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// BP: pointer to string, that we are searching for
// R11: address, where to put return value
// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
TEXT indexbody<>(SB),NOSPLIT,$0
CMPQ AX, DX
JA fail
CMPQ DX, $16
JAE sse42
no_sse42:
CMPQ AX, $2
JA _3_or_more
MOVW (BP), BP
LEAQ -1(DI)(DX*1), DX
loop2:
MOVW (DI), SI
CMPW SI,BP
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop2
JMP fail
_3_or_more:
CMPQ AX, $3
JA _4_or_more
MOVW 1(BP), BX
MOVW (BP), BP
LEAQ -2(DI)(DX*1), DX
loop3:
MOVW (DI), SI
CMPW SI,BP
JZ partial_success3
ADDQ $1,DI
CMPQ DI,DX
JB loop3
JMP fail
partial_success3:
MOVW 1(DI), SI
CMPW SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop3
JMP fail
_4_or_more:
CMPQ AX, $4
JA _5_or_more
MOVL (BP), BP
LEAQ -3(DI)(DX*1), DX
loop4:
MOVL (DI), SI
CMPL SI,BP
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop4
JMP fail
_5_or_more:
CMPQ AX, $7
JA _8_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVL -4(BP)(AX*1), BX
MOVL (BP), BP
loop5to7:
MOVL (DI), SI
CMPL SI,BP
JZ partial_success5to7
ADDQ $1,DI
CMPQ DI,DX
JB loop5to7
JMP fail
partial_success5to7:
MOVL -4(AX)(DI*1), SI
CMPL SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop5to7
JMP fail
_8_or_more:
CMPQ AX, $8
JA _9_or_more
MOVQ (BP), BP
LEAQ -7(DI)(DX*1), DX
loop8:
MOVQ (DI), SI
CMPQ SI,BP
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop8
JMP fail
_9_or_more:
CMPQ AX, $15
JA _16_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVQ -8(BP)(AX*1), BX
MOVQ (BP), BP
loop9to15:
MOVQ (DI), SI
CMPQ SI,BP
JZ partial_success9to15
ADDQ $1,DI
CMPQ DI,DX
JB loop9to15
JMP fail
partial_success9to15:
MOVQ -8(AX)(DI*1), SI
CMPQ SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop9to15
JMP fail
_16_or_more:
CMPQ AX, $16
JA _17_or_more
MOVOU (BP), X1
LEAQ -15(DI)(DX*1), DX
loop16:
MOVOU (DI), X2
PCMPEQB X1, X2
PMOVMSKB X2, SI
CMPQ SI, $0xffff
JE success
ADDQ $1,DI
CMPQ DI,DX
JB loop16
JMP fail
_17_or_more:
CMPQ AX, $31
JA _32_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVOU -16(BP)(AX*1), X0
MOVOU (BP), X1
loop17to31:
MOVOU (DI), X2
PCMPEQB X1,X2
PMOVMSKB X2, SI
CMPQ SI, $0xffff
JE partial_success17to31
ADDQ $1,DI
CMPQ DI,DX
JB loop17to31
JMP fail
partial_success17to31:
MOVOU -16(AX)(DI*1), X3
PCMPEQB X0, X3
PMOVMSKB X3, SI
CMPQ SI, $0xffff
JE success
ADDQ $1,DI
CMPQ DI,DX
JB loop17to31
JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
CMPQ AX, $32
JA _33_to_63
VMOVDQU (BP), Y1
LEAQ -31(DI)(DX*1), DX
loop32:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, SI
CMPL SI, $0xffffffff
JE success_avx2
ADDQ $1,DI
CMPQ DI,DX
JB loop32
JMP fail_avx2
_33_to_63:
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
VMOVDQU -32(BP)(AX*1), Y0
VMOVDQU (BP), Y1
loop33to63:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, SI
CMPL SI, $0xffffffff
JE partial_success33to63
ADDQ $1,DI
CMPQ DI,DX
JB loop33to63
JMP fail_avx2
partial_success33to63:
VMOVDQU -32(AX)(DI*1), Y3
VPCMPEQB Y0, Y3, Y4
VPMOVMSKB Y4, SI
CMPL SI, $0xffffffff
JE success_avx2
ADDQ $1,DI
CMPQ DI,DX
JB loop33to63
fail_avx2:
VZEROUPPER
fail:
MOVQ $-1, (R11)
RET
success_avx2:
VZEROUPPER
JMP success
sse42:
CMPB internalcpu·X86+const_x86_HasSSE42(SB), $1
JNE no_sse42
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare
// This value was determined experimentally and is the ~same
// on Nehalem (first with SSE42) and Haswell.
JAE _9_or_more
LEAQ 16(BP), SI
TESTW $0xff0, SI
JEQ no_sse42
MOVOU (BP), X1
LEAQ -15(DI)(DX*1), SI
MOVQ $16, R9
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
// 0x0c means: unsigned byte compare (bits 0,1 are 00)
// for equality (bits 2,3 are 11)
// result is not masked or inverted (bits 4,5 are 00)
// and corresponds to first matching byte (bit 6 is 0)
PCMPESTRI $0x0c, (DI), X1
// CX == 16 means no match,
// CX > R9 means partial match at the end of the string,
// otherwise sep is at offset CX from X1 start
CMPQ CX, R9
JBE sse42_success
ADDQ R9, DI
CMPQ DI, SI
JB loop_sse42
PCMPESTRI $0x0c, -1(SI), X1
CMPQ CX, R9
JA fail
LEAQ -1(SI), DI
sse42_success:
ADDQ CX, DI
success:
SUBQ R10, DI
MOVQ DI, (R11)
RET

View File

@ -0,0 +1,23 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
// MaxBruteForce is the largest haystack bytes.Index hands directly to
// the assembly search. Empirical data shows that using IndexShortStr
// can get better performance when len(s) <= 16.
const MaxBruteForce = 16

func init() {
	// 8 bytes can be completely loaded into 1 register.
	MaxLen = 8
}
// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to IndexShortStr.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
func Cutover(n int) int {
	// Tolerate one error per 16 characters, plus a fixed allowance
	// of 4 to get started.
	const slop = 4
	return slop + n>>4
}

View File

@ -1,88 +1,40 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// countByte(s []byte, c byte) int
TEXT bytes·countByte(SB),NOSPLIT,$0-40
MOVD s_base+0(FP), R0
MOVD s_len+8(FP), R2
MOVBU c+24(FP), R1
// R11 = count of byte to search
MOVD $0, R11
// short path to handle 0-byte case
CBZ R2, done
CMP $0x20, R2
// jump directly to tail if length < 32
BLO tail
ANDS $0x1f, R0, R9
BEQ chunk
// Work with not 32-byte aligned head
BIC $0x1f, R0, R3
ADD $0x20, R3
head_loop:
MOVBU.P 1(R0), R5
CMP R5, R1
CINC EQ, R11, R11
SUB $1, R2, R2
CMP R0, R3
BNE head_loop
// Work with 32-byte aligned chunks
chunk:
BIC $0x1f, R2, R9
// The first chunk can also be the last
CBZ R9, tail
// R3 = end of 32-byte chunks
ADD R0, R9, R3
MOVD $1, R5
VMOV R5, V5.B16
// R2 = length of tail
SUB R9, R2, R2
// Duplicate R1 (byte to search) to 16 1-byte elements of V0
VMOV R1, V0.B16
// Clear the low 64-bit element of V7 and V8
VEOR V7.B8, V7.B8, V7.B8
VEOR V8.B8, V8.B8, V8.B8
// Count the target byte in 32-byte chunk
chunk_loop:
VLD1.P (R0), [V1.B16, V2.B16]
CMP R0, R3
VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16
// Clear the higher 7 bits
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
// Count lanes match the requested byte
VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
VUADDLV V6.B16, V7
// Accumulate the count in low 64-bit element of V8 when inside the loop
VADD V7, V8
BNE chunk_loop
VMOV V8.D[0], R6
ADD R6, R11, R11
CBZ R2, done
tail:
// Work with tail shorter than 32 bytes
MOVBU.P 1(R0), R5
SUB $1, R2, R2
CMP R5, R1
CINC EQ, R11, R11
CBNZ R2, tail
done:
MOVD R11, ret+32(FP)
RET
TEXT ·Index(SB),NOSPLIT,$0-56
MOVD a_base+0(FP), R0
MOVD a_len+8(FP), R1
MOVD b_base+24(FP), R2
MOVD b_len+32(FP), R3
MOVD $ret+48(FP), R9
B indexbody<>(SB)
// indexShortStr(s, sep []byte) int
// precondition: 2 <= len(sep) <= 8
TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
TEXT ·IndexString(SB),NOSPLIT,$0-40
MOVD a_base+0(FP), R0
MOVD a_len+8(FP), R1
MOVD b_base+16(FP), R2
MOVD b_len+24(FP), R3
MOVD $ret+32(FP), R9
B indexbody<>(SB)
// input:
// R0: haystack
// R1: length of haystack
// R2: needle
// R3: length of needle (2 <= len <= 8)
// R9: address to put result
TEXT indexbody<>(SB),NOSPLIT,$0-56
// main idea is to load 'sep' into separate register(s)
// to avoid repeatedly re-load it again and again
// for sebsequent substring comparisons
MOVD s+0(FP), R0
MOVD s_len+8(FP), R1
MOVD sep+24(FP), R2
MOVD sep_len+32(FP), R3
MOVD a_base+0(FP), R0
MOVD a_len+8(FP), R1
MOVD b_base+24(FP), R2
MOVD b_len+32(FP), R3
SUB R3, R1, R4
// R4 contains the start of last substring for comparsion
ADD R0, R4, R4
@ -189,9 +141,9 @@ loop_2:
BLS loop_2
not_found:
MOVD $-1, R0
MOVD R0, ret+48(FP)
MOVD R0, (R9)
RET
found:
SUB R8, R0, R0
MOVD R0, ret+48(FP)
MOVD R0, (R9)
RET

View File

@ -0,0 +1,29 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!arm64,!s390x
package bytealg
// MaxBruteForce is 0 on platforms without an assembly Index:
// bytes.Index never attempts the brute-force path here.
const MaxBruteForce = 0

// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
// Requires 2 <= len(b) <= MaxLen.
func Index(a, b []byte) int {
	// Unreachable: MaxLen stays 0 on these platforms (no init sets it),
	// so callers never enter the bytealg.Index path.
	panic("unimplemented")
}

// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
// Requires 2 <= len(b) <= MaxLen.
func IndexString(a, b string) int {
	// Unreachable for the same reason as Index above.
	panic("unimplemented")
}

// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to Index.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
func Cutover(n int) int {
	// Unreachable: only called on the bytealg.Index path.
	panic("unimplemented")
}

View File

@ -0,0 +1,19 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64 arm64 s390x
package bytealg
//go:noescape
// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
// Requires 2 <= len(b) <= MaxLen.
func Index(a, b []byte) int
//go:noescape
// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
// Requires 2 <= len(b) <= MaxLen.
func IndexString(a, b string) int

View File

@ -0,0 +1,31 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
import "internal/cpu"
// MaxBruteForce is the largest haystack bytes.Index hands directly to
// the assembly search.
const MaxBruteForce = 64

func init() {
	// Note: we're kind of lucky that this flag is available at this point.
	// The runtime sets HasVX when processing auxv records, and that happens
	// to happen *before* running the init functions of packages that
	// the runtime depends on.
	// TODO: it would really be nicer for internal/cpu to figure out this
	// flag by itself. Then we wouldn't need to depend on quirks of
	// early startup initialization order.
	if cpu.S390X.HasVX {
		MaxLen = 64
	}
}
// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to Index.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
func Cutover(n int) int {
	// Allow roughly one miss per 8 bytes processed, with some slop
	// so that early misses are forgiven.
	return (16 + n) / 8
}

View File

@ -0,0 +1,216 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Caller must confirm availability of vx facility before calling.
TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
MOVD $ret+48(FP), R5
BR indexbody<>(SB)
// Caller must confirm availability of vx facility before calling.
TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
MOVD $ret+32(FP), R5
BR indexbody<>(SB)
// s: string we are searching
// sep: string to search for
// R1=&s[0], R2=len(s)
// R3=&sep[0], R4=len(sep)
// R5=&ret (int)
// Caller must confirm availability of vx facility before calling.
TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
CMPBGT R4, R2, notfound
ADD R1, R2
SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
CMPBEQ R4, $0, notfound
SUB $1, R4 // R4=len(sep)-1 for use as VLL index
VLL R4, (R3), V0 // contains first 16 bytes of sep
MOVD R1, R7
index2plus:
CMPBNE R4, $1, index3plus
MOVD $15(R7), R9
CMPBGE R9, R2, index2to16
VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
VONE V16
VREPH $0, V0, V1
CMPBGE R9, R2, index2to16
index2loop:
VL 0(R7), V2 // 16 bytes, even indices
VL 1(R7), V4 // 16 bytes, odd indices
VCEQH V1, V2, V5 // compare even indices
VCEQH V1, V4, V6 // compare odd indices
VSEL V5, V6, V31, V7 // merge even and odd indices
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
BLT foundV17
MOVD $16(R7), R7 // R7+=16
ADD $15, R7, R9
CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
CMPBLE R7, R2, index2to16
BR notfound
index3plus:
CMPBNE R4, $2, index4plus
ADD $15, R7, R9
CMPBGE R9, R2, index2to16
MOVD $1, R0
VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
VONE V16
VREPH $0, V0, V1
VREPB $2, V0, V8
index3loop:
VL (R7), V2 // load 16-bytes into V2
VLL R0, 16(R7), V3 // load 2-bytes into V3
VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
VCEQH V1, V2, V5 // compare 2-byte even indices
VCEQH V1, V4, V6 // compare 2-byte odd indices
VCEQB V8, V9, V10 // compare last bytes
VSEL V5, V6, V31, V7 // merge even and odd indices
VN V7, V10, V7 // AND indices with last byte
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
BLT foundV17
MOVD $16(R7), R7 // R7+=16
ADD $15, R7, R9
CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
CMPBLE R7, R2, index2to16
BR notfound
index4plus:
CMPBNE R4, $3, index5plus
ADD $15, R7, R9
CMPBGE R9, R2, index2to16
MOVD $2, R0
VGBM $0x8888, V29 // 0xff000000ff000000...
VGBM $0x2222, V30 // 0x0000ff000000ff00...
VGBM $0xcccc, V31 // 0xffff0000ffff0000...
VONE V16
VREPF $0, V0, V1
index4loop:
VL (R7), V2 // load 16-bytes into V2
VLL R0, 16(R7), V3 // load 3-bytes into V3
VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<1
VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<1
VCEQF V1, V2, V5 // compare index 0, 4, ...
VCEQF V1, V4, V6 // compare index 1, 5, ...
VCEQF V1, V9, V11 // compare index 2, 6, ...
VCEQF V1, V10, V12 // compare index 3, 7, ...
VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
VSEL V13, V14, V31, V7 // final merge
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
BLT foundV17
MOVD $16(R7), R7 // R7+=16
ADD $15, R7, R9
CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
CMPBLE R7, R2, index2to16
BR notfound
index5plus:
CMPBGT R4, $15, index17plus
index2to16:
CMPBGT R7, R2, notfound
MOVD $1(R7), R8
CMPBGT R8, R2, index2to16tail
index2to16loop:
// unrolled 2x
VLL R4, (R7), V1
VLL R4, 1(R7), V2
VCEQGS V0, V1, V3
BEQ found
MOVD $1(R7), R7
VCEQGS V0, V2, V4
BEQ found
MOVD $1(R7), R7
CMPBLT R7, R2, index2to16loop
CMPBGT R7, R2, notfound
index2to16tail:
VLL R4, (R7), V1
VCEQGS V0, V1, V2
BEQ found
BR notfound
index17plus:
CMPBGT R4, $31, index33plus
SUB $16, R4, R0
VLL R0, 16(R3), V1
VONE V7
index17to32loop:
VL (R7), V2
VLL R0, 16(R7), V3
VCEQG V0, V2, V4
VCEQG V1, V3, V5
VN V4, V5, V6
VCEQGS V6, V7, V8
BEQ found
MOVD $1(R7), R7
CMPBLE R7, R2, index17to32loop
BR notfound
index33plus:
CMPBGT R4, $47, index49plus
SUB $32, R4, R0
VL 16(R3), V1
VLL R0, 32(R3), V2
VONE V11
index33to48loop:
VL (R7), V3
VL 16(R7), V4
VLL R0, 32(R7), V5
VCEQG V0, V3, V6
VCEQG V1, V4, V7
VCEQG V2, V5, V8
VN V6, V7, V9
VN V8, V9, V10
VCEQGS V10, V11, V12
BEQ found
MOVD $1(R7), R7
CMPBLE R7, R2, index33to48loop
BR notfound
index49plus:
CMPBGT R4, $63, index65plus
SUB $48, R4, R0
VL 16(R3), V1
VL 32(R3), V2
VLL R0, 48(R3), V3
VONE V15
index49to64loop:
VL (R7), V4
VL 16(R7), V5
VL 32(R7), V6
VLL R0, 48(R7), V7
VCEQG V0, V4, V8
VCEQG V1, V5, V9
VCEQG V2, V6, V10
VCEQG V3, V7, V11
VN V8, V9, V12
VN V10, V11, V13
VN V12, V13, V14
VCEQGS V14, V15, V16
BEQ found
MOVD $1(R7), R7
CMPBLE R7, R2, index49to64loop
notfound:
MOVD $-1, (R5)
RET
index65plus:
// not implemented
MOVD $0, (R0)
RET
foundV17: // index is in doubleword V17[0]
VLGVG $0, V17, R8
ADD R8, R7
found:
SUB R1, R7
MOVD R7, (R5)
RET

View File

@ -1358,274 +1358,6 @@ DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
// func indexShortStr(s, c string) int
// Trampoline for the strings package: loads the arguments into the
// registers expected by runtime·indexShortStr and tail-jumps there.
// The callee stores the result directly through R11.
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ s_len+8(FP), DX
	MOVQ c+16(FP), BP
	MOVQ c_len+24(FP), AX
	MOVQ DI, R10                    // R10 = original base of s, used to compute the offset on success
	LEAQ ret+32(FP), R11            // R11 = address of the int result slot
	JMP  runtime·indexShortStr(SB)
// func indexShortStr(s, c []byte) int
// Trampoline for the bytes package; identical to the strings variant
// except for the slice argument layout (slices carry a cap word, hence
// the larger frame and shifted offsets).
TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
	MOVQ s+0(FP), DI
	MOVQ s_len+8(FP), DX
	MOVQ c+24(FP), BP
	MOVQ c_len+32(FP), AX
	MOVQ DI, R10                    // R10 = original base of s, for the final offset computation
	LEAQ ret+48(FP), R11            // R11 = address of the int result slot
	JMP  runtime·indexShortStr(SB)
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// BP: pointer to string, that we are searching for
// R11: address, where to put return value
//
// Dispatches on the needle length (AX) to a loop specialized for that
// size; each loop slides a one-byte window over the haystack and compares
// the needle in one or two wide loads. On exit the byte offset of the
// first match (or -1) is stored to (R11).
TEXT runtime·indexShortStr(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA fail                         // needle longer than haystack: no match possible
	CMPQ DX, $16
	JAE sse42                       // big enough haystack: try the PCMPESTRI path
no_sse42:
	// 2-byte needle: one 2-byte compare per position.
	CMPQ AX, $2
	JA   _3_or_more
	MOVW (BP), BP
	LEAQ -1(DI)(DX*1), DX
loop2:
	MOVW (DI), SI
	CMPW SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	// 3-byte needle: match the first 2 bytes, then confirm bytes 1-2.
	CMPQ AX, $3
	JA   _4_or_more
	MOVW 1(BP), BX
	MOVW (BP), BP
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,BP
	JZ   partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	// 4-byte needle: a single 4-byte compare per position.
	CMPQ AX, $4
	JA   _5_or_more
	MOVL (BP), BP
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	// 5-7 bytes: compare the first and last 4-byte words (they overlap).
	CMPQ AX, $7
	JA   _8_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVL -4(BP)(AX*1), BX
	MOVL (BP), BP
loop5to7:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	// 8 bytes: a single 8-byte compare per position.
	CMPQ AX, $8
	JA   _9_or_more
	MOVQ (BP), BP
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	// 9-15 bytes: compare the first and last 8-byte words (overlapping).
	CMPQ AX, $15
	JA   _16_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(BP)(AX*1), BX
	MOVQ (BP), BP
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	// 16 bytes: one XMM compare per position.
	CMPQ AX, $16
	JA   _17_or_more
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_or_more:
	// 17-31 bytes: compare the first and last 16 bytes (overlapping).
	CMPQ AX, $31
	JA   _32_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(BP)(AX*1), X0
	MOVOU (BP), X1
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ  SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	// 32 bytes: one YMM compare per position.
	CMPQ AX, $32
	JA   _33_to_63
	VMOVDQU (BP), Y1
	LEAQ -31(DI)(DX*1), DX
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff
	JE   success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop32
	JMP fail_avx2
_33_to_63:
	// 33-63 bytes: compare the first and last 32 bytes (overlapping).
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	VMOVDQU -32(BP)(AX*1), Y0
	VMOVDQU (BP), Y1
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff
	JE   partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	VMOVDQU -32(AX)(DI*1), Y3
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL  SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
fail_avx2:
	VZEROUPPER
fail:
	// No match: store -1 and return.
	MOVQ $-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP success
sse42:
	CMPB runtime·support_sse42(SB), $1
	JNE no_sse42
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	// Guard: MOVOU (BP) reads a full 16 bytes even when the needle is
	// shorter; bail out to the scalar path when that over-read could
	// cross into the next (possibly unmapped) page.
	LEAQ 16(BP), SI
	TESTW $0xff0, SI
	JEQ no_sse42
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), SI
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	// Final, possibly overlapping, window at the very end of s.
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI
success:
	// DI points at the match; convert to a byte offset from the start (R10).
	SUBQ R10, DI
	MOVQ DI, (R11)
	RET
// func return0()
// Places 0 in AX, the amd64 integer return register.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL $0, AX
	RET

View File

@ -796,230 +796,6 @@ TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
// compile barrier.
RET
// func supportsVX() bool
// Reports whether the s390x vector facility is available, by reading the
// hasVX flag that the runtime records during auxv processing.
TEXT strings·supportsVX(SB),NOSPLIT,$0-1
	MOVBZ runtime·cpu+facilities_hasVX(SB), R0
	MOVB  R0, ret+0(FP)
	RET
// func supportsVX() bool
// bytes-package copy of strings·supportsVX: reads the runtime's cached
// vector-facility flag.
TEXT bytes·supportsVX(SB),NOSPLIT,$0-1
	MOVBZ runtime·cpu+facilities_hasVX(SB), R0
	MOVB  R0, ret+0(FP)
	RET
// func indexShortStr(s, sep string) int
// Caller must confirm availability of vx facility before calling.
// Trampoline: loads string headers into the register contract expected
// by runtime·indexShortStr (R1/R2 = s, R3/R4 = sep, R5 = &ret).
TEXT strings·indexShortStr(SB),NOSPLIT|NOFRAME,$0-40
	LMG  s+0(FP), R1, R2   // R1=&s[0], R2=len(s)
	LMG  sep+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
	MOVD $ret+32(FP), R5
	BR   runtime·indexShortStr(SB)
// func indexShortStr(s, sep []byte) int
// Caller must confirm availability of vx facility before calling.
// Slice variant of the strings trampoline; offsets differ because a
// slice header carries an extra cap word.
TEXT bytes·indexShortStr(SB),NOSPLIT|NOFRAME,$0-56
	LMG  s+0(FP), R1, R2   // R1=&s[0], R2=len(s)
	LMG  sep+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
	MOVD $ret+48(FP), R5
	BR   runtime·indexShortStr(SB)
// s: string we are searching
// sep: string to search for
// R1=&s[0], R2=len(s)
// R3=&sep[0], R4=len(sep)
// R5=&ret (int)
// Caller must confirm availability of vx facility before calling.
//
// Dispatches on len(sep) to vector loops specialized for 2, 3, and 4 byte
// needles, a generic 2-16 byte loop, and overlapped-compare loops for
// 17-32, 33-48, and 49-64 bytes. Stores the byte offset of the first
// match (or -1) through R5.
TEXT runtime·indexShortStr(SB),NOSPLIT|NOFRAME,$0
	CMPBGT R4, R2, notfound
	ADD    R1, R2
	SUB    R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
	CMPBEQ R4, $0, notfound
	SUB    $1, R4 // R4=len(sep)-1 for use as VLL index
	VLL    R4, (R3), V0 // contains first 16 bytes of sep
	MOVD   R1, R7
index2plus:
	// 2-byte needle: compare even and odd 2-byte positions in parallel.
	CMPBNE R4, $1, index3plus
	MOVD   $15(R7), R9
	CMPBGE R9, R2, index2to16
	VGBM   $0xaaaa, V31 // 0xff00ff00ff00ff00...
	VONE   V16
	VREPH  $0, V0, V1
	CMPBGE R9, R2, index2to16
index2loop:
	VL     0(R7), V2 // 16 bytes, even indices
	VL     1(R7), V4 // 16 bytes, odd indices
	VCEQH  V1, V2, V5 // compare even indices
	VCEQH  V1, V4, V6 // compare odd indices
	VSEL   V5, V6, V31, V7 // merge even and odd indices
	VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
	BLT    foundV17
	MOVD   $16(R7), R7 // R7+=16
	ADD    $15, R7, R9
	CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
	CMPBLE R7, R2, index2to16
	BR     notfound
index3plus:
	// 3-byte needle: 2-byte compare at even/odd positions plus a
	// separate compare of the third byte.
	CMPBNE R4, $2, index4plus
	ADD    $15, R7, R9
	CMPBGE R9, R2, index2to16
	MOVD   $1, R0
	VGBM   $0xaaaa, V31 // 0xff00ff00ff00ff00...
	VONE   V16
	VREPH  $0, V0, V1
	VREPB  $2, V0, V8
index3loop:
	VL     (R7), V2 // load 16-bytes into V2
	VLL    R0, 16(R7), V3 // load 2-bytes into V3
	VSLDB  $1, V2, V3, V4 // V4=(V2:V3)<<1
	VSLDB  $2, V2, V3, V9 // V9=(V2:V3)<<2
	VCEQH  V1, V2, V5 // compare 2-byte even indices
	VCEQH  V1, V4, V6 // compare 2-byte odd indices
	VCEQB  V8, V9, V10 // compare last bytes
	VSEL   V5, V6, V31, V7 // merge even and odd indices
	VN     V7, V10, V7 // AND indices with last byte
	VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
	BLT    foundV17
	MOVD   $16(R7), R7 // R7+=16
	ADD    $15, R7, R9
	CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
	CMPBLE R7, R2, index2to16
	BR     notfound
index4plus:
	// 4-byte needle: 4-byte compares at all four phase offsets, merged.
	CMPBNE R4, $3, index5plus
	ADD    $15, R7, R9
	CMPBGE R9, R2, index2to16
	MOVD   $2, R0
	VGBM   $0x8888, V29 // 0xff000000ff000000...
	VGBM   $0x2222, V30 // 0x0000ff000000ff00...
	VGBM   $0xcccc, V31 // 0xffff0000ffff0000...
	VONE   V16
	VREPF  $0, V0, V1
index4loop:
	VL     (R7), V2 // load 16-bytes into V2
	VLL    R0, 16(R7), V3 // load 3-bytes into V3
	VSLDB  $1, V2, V3, V4 // V4=(V2:V3)<<1
	VSLDB  $2, V2, V3, V9 // V9=(V2:V3)<<2
	VSLDB  $3, V2, V3, V10 // V10=(V2:V3)<<3
	VCEQF  V1, V2, V5 // compare index 0, 4, ...
	VCEQF  V1, V4, V6 // compare index 1, 5, ...
	VCEQF  V1, V9, V11 // compare index 2, 6, ...
	VCEQF  V1, V10, V12 // compare index 3, 7, ...
	VSEL   V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
	VSEL   V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
	VSEL   V13, V14, V31, V7 // final merge
	VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
	BLT    foundV17
	MOVD   $16(R7), R7 // R7+=16
	ADD    $15, R7, R9
	CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
	CMPBLE R7, R2, index2to16
	BR     notfound
index5plus:
	CMPBGT R4, $15, index17plus
index2to16:
	// Generic tail: one full-needle VLL compare per candidate position,
	// unrolled 2x. Also the fallback for the specialized loops above
	// when fewer than 16 haystack bytes remain.
	CMPBGT R7, R2, notfound
	MOVD   $1(R7), R8
	CMPBGT R8, R2, index2to16tail
index2to16loop:
	// unrolled 2x
	VLL    R4, (R7), V1
	VLL    R4, 1(R7), V2
	VCEQGS V0, V1, V3
	BEQ    found
	MOVD   $1(R7), R7
	VCEQGS V0, V2, V4
	BEQ    found
	MOVD   $1(R7), R7
	CMPBLT R7, R2, index2to16loop
	CMPBGT R7, R2, notfound
index2to16tail:
	VLL    R4, (R7), V1
	VCEQGS V0, V1, V2
	BEQ    found
	BR     notfound
index17plus:
	// 17-32 byte needle: compare a full 16-byte block and the remainder.
	CMPBGT R4, $31, index33plus
	SUB    $16, R4, R0
	VLL    R0, 16(R3), V1
	VONE   V7
index17to32loop:
	VL     (R7), V2
	VLL    R0, 16(R7), V3
	VCEQG  V0, V2, V4
	VCEQG  V1, V3, V5
	VN     V4, V5, V6
	VCEQGS V6, V7, V8
	BEQ    found
	MOVD   $1(R7), R7
	CMPBLE R7, R2, index17to32loop
	BR     notfound
index33plus:
	// 33-48 byte needle: two full 16-byte blocks plus the remainder.
	CMPBGT R4, $47, index49plus
	SUB    $32, R4, R0
	VL     16(R3), V1
	VLL    R0, 32(R3), V2
	VONE   V11
index33to48loop:
	VL     (R7), V3
	VL     16(R7), V4
	VLL    R0, 32(R7), V5
	VCEQG  V0, V3, V6
	VCEQG  V1, V4, V7
	VCEQG  V2, V5, V8
	VN     V6, V7, V9
	VN     V8, V9, V10
	VCEQGS V10, V11, V12
	BEQ    found
	MOVD   $1(R7), R7
	CMPBLE R7, R2, index33to48loop
	BR     notfound
index49plus:
	// 49-64 byte needle: three full 16-byte blocks plus the remainder.
	CMPBGT R4, $63, index65plus
	SUB    $48, R4, R0
	VL     16(R3), V1
	VL     32(R3), V2
	VLL    R0, 48(R3), V3
	VONE   V15
index49to64loop:
	VL     (R7), V4
	VL     16(R7), V5
	VL     32(R7), V6
	VLL    R0, 48(R7), V7
	VCEQG  V0, V4, V8
	VCEQG  V1, V5, V9
	VCEQG  V2, V6, V10
	VCEQG  V3, V7, V11
	VN     V8, V9, V12
	VN     V10, V11, V13
	VN     V12, V13, V14
	VCEQGS V14, V15, V16
	BEQ    found
	MOVD   $1(R7), R7
	CMPBLE R7, R2, index49to64loop
notfound:
	MOVD   $-1, (R5)
	RET
index65plus:
	// not implemented
	// NOTE(review): deliberately faults by storing to address 0 — (R0)
	// resolves to address 0 on s390x. Callers must keep len(sep) <= 64.
	MOVD   $0, (R0)
	RET
foundV17: // index is in doubleword V17[0]
	VLGVG  $0, V17, R8
	ADD    R8, R7
found:
	// R7 points at the match; convert to an offset from &s[0] (R1).
	SUB    R1, R7
	MOVD   R7, (R5)
	RET
// This is called from .init_array and follows the platform, not Go, ABI.
// We are overly conservative. We could only save the registers we use.
// However, since this function is only called once per loaded module

View File

@ -4,32 +4,16 @@
package runtime
import (
internalcpu "internal/cpu"
"runtime/internal/sys"
)
import "internal/cpu"
const (
// bit masks taken from bits/hwcap.h
_HWCAP_S390_VX = 2048 // vector facility
)
// facilities is padded to avoid false sharing.
type facilities struct {
_ [sys.CacheLineSize]byte
hasVX bool // vector facility
_ [sys.CacheLineSize]byte
}
// cpu indicates the availability of s390x facilities that can be used in
// Go assembly but are optional on models supported by Go.
// TODO: remove this once we're only using internal/cpu.
var cpu facilities
func archauxv(tag, val uintptr) {
switch tag {
case _AT_HWCAP: // CPU capability bit flags
internalcpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
cpu.hasVX = val&_HWCAP_S390_VX != 0
cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
}
}

View File

@ -932,6 +932,85 @@ func EqualFold(s, t string) bool {
return s == t
}
// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
func Index(s, substr string) int {
	m := len(substr)
	switch {
	case m == 0:
		// The empty string matches at the start of s.
		return 0
	case m == 1:
		return IndexByte(s, substr[0])
	case m == len(s):
		if substr == s {
			return 0
		}
		return -1
	case m > len(s):
		return -1
	case m <= bytealg.MaxLen:
		// Use brute force when s and substr both are small.
		if len(s) <= bytealg.MaxBruteForce {
			return bytealg.IndexString(s, substr)
		}
		first := substr[0]
		window := s[:len(s)-m+1]
		misses := 0
		for pos := 0; pos < len(window); {
			if window[pos] != first {
				// IndexByte is faster than bytealg.IndexString, so use it
				// as long as we're not getting lots of false positives.
				skip := IndexByte(window[pos:], first)
				if skip < 0 {
					return -1
				}
				pos += skip
			}
			if s[pos:pos+m] == substr {
				return pos
			}
			misses++
			pos++
			// Switch to bytealg.IndexString when IndexByte produces too
			// many false positives.
			if misses > bytealg.Cutover(pos) {
				if idx := bytealg.IndexString(s[pos:], substr); idx >= 0 {
					return idx + pos
				}
				return -1
			}
		}
		return -1
	}
	first := substr[0]
	window := s[:len(s)-m+1]
	misses := 0
	for pos := 0; pos < len(window); {
		if window[pos] != first {
			skip := IndexByte(window[pos:], first)
			if skip < 0 {
				return -1
			}
			pos += skip
		}
		if s[pos:pos+m] == substr {
			return pos
		}
		pos++
		misses++
		if misses >= 4+pos>>4 && pos < len(window) {
			// See comment in ../bytes/bytes_generic.go.
			if j := indexRabinKarp(s[pos:], substr); j >= 0 {
				return pos + j
			}
			return -1
		}
	}
	return -1
}
func indexRabinKarp(s, substr string) int {
// Rabin-Karp search
hashss, pow := hashStr(substr)

View File

@ -1,79 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package strings
import "internal/cpu"
//go:noescape

// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
// indexShortStr requires 2 <= len(c) <= shortStringLen
func indexShortStr(s, c string) int // ../runtime/asm_amd64.s

// countByte is implemented in assembly (../runtime/asm_amd64.s).
func countByte(s string, c byte) int // ../runtime/asm_amd64.s

// shortStringLen is the largest substr length for which indexShortStr
// may be used; set in init below based on AVX2 availability.
var shortStringLen int
func init() {
if cpu.X86.HasAVX2 {
shortStringLen = 63
} else {
shortStringLen = 31
}
}
// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
func Index(s, substr string) int {
	m := len(substr)
	switch {
	case m == 0:
		return 0
	case m == 1:
		return IndexByte(s, substr[0])
	case m == len(s):
		if substr == s {
			return 0
		}
		return -1
	case m > len(s):
		return -1
	case m <= shortStringLen:
		// Use brute force when s and substr both are small.
		if len(s) <= 64 {
			return indexShortStr(s, substr)
		}
		first := substr[0]
		window := s[:len(s)-m+1]
		misses := 0
		for pos := 0; pos < len(window); {
			if window[pos] != first {
				// IndexByte skips 16/32 bytes per iteration,
				// so it's faster than indexShortStr.
				skip := IndexByte(window[pos:], first)
				if skip < 0 {
					return -1
				}
				pos += skip
			}
			if s[pos:pos+m] == substr {
				return pos
			}
			misses++
			pos++
			// Switch to indexShortStr when IndexByte produces too many
			// false positives: more than one per 8 characters, with some
			// slack allowed at the beginning.
			if misses > (pos+16)/8 {
				if r := indexShortStr(s[pos:], substr); r >= 0 {
					return r + pos
				}
				return -1
			}
		}
		return -1
	}
	return indexRabinKarp(s, substr)
}

View File

@ -1,55 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!s390x
package strings
// TODO: implements short string optimization on non amd64 platforms
// and get rid of strings_amd64.go
// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
func Index(s, substr string) int {
	m := len(substr)
	switch {
	case m == 0:
		return 0
	case m == 1:
		return IndexByte(s, substr[0])
	case m == len(s):
		if substr == s {
			return 0
		}
		return -1
	case m > len(s):
		return -1
	}
	first := substr[0]
	window := s[:len(s)-m+1]
	misses := 0
	for pos := 0; pos < len(window); {
		if window[pos] != first {
			skip := IndexByte(window[pos:], first)
			if skip < 0 {
				return -1
			}
			pos += skip
		}
		if s[pos:pos+m] == substr {
			return pos
		}
		pos++
		misses++
		if misses >= 4+pos>>4 && pos < len(window) {
			// See comment in ../bytes/bytes_generic.go.
			if j := indexRabinKarp(s[pos:], substr); j >= 0 {
				return pos + j
			}
			return -1
		}
	}
	return -1
}

View File

@ -1,80 +0,0 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package strings
//go:noescape

// indexShortStr returns the index of the first instance of sep in s,
// or -1 if sep is not present in s.
// indexShortStr requires 2 <= len(sep) <= shortStringLen
func indexShortStr(s, sep string) int // ../runtime/asm_$GOARCH.s

// supportsVX reports whether the vector facility is available.
// indexShortStr must not be called if the vector facility is not
// available.
func supportsVX() bool // ../runtime/asm_s390x.s

// shortStringLen is the largest substr length for which indexShortStr
// may be used; -1 disables it entirely. Raised to 64 in init when the
// vector facility is present.
var shortStringLen = -1
// init enables the assembly indexShortStr path (needles up to 64 bytes)
// only when the s390x vector facility is available.
func init() {
	if !supportsVX() {
		return
	}
	shortStringLen = 64
}
// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
func Index(s, substr string) int {
	m := len(substr)
	switch {
	case m == 0:
		return 0
	case m == 1:
		return IndexByte(s, substr[0])
	case m == len(s):
		if substr == s {
			return 0
		}
		return -1
	case m > len(s):
		return -1
	case m <= shortStringLen:
		// Use brute force when s and substr both are small.
		if len(s) <= 64 {
			return indexShortStr(s, substr)
		}
		first := substr[0]
		window := s[:len(s)-m+1]
		misses := 0
		for pos := 0; pos < len(window); {
			if window[pos] != first {
				// IndexByte skips 16/32 bytes per iteration,
				// so it's faster than indexShortStr.
				skip := IndexByte(window[pos:], first)
				if skip < 0 {
					return -1
				}
				pos += skip
			}
			if s[pos:pos+m] == substr {
				return pos
			}
			misses++
			pos++
			// Switch to indexShortStr when IndexByte produces too many
			// false positives: more than one per 8 characters, with some
			// slack allowed at the beginning.
			if misses > (pos+16)/8 {
				if r := indexShortStr(s[pos:], substr); r >= 0 {
					return r + pos
				}
				return -1
			}
		}
		return -1
	}
	return indexRabinKarp(s, substr)
}