mirror of
https://github.com/golang/go
synced 2024-11-23 07:40:04 -07:00
internal/bytealg: move short string Index implementations into bytealg
Also move the arm64 CountByte implementation while we're here. Fixes #19792 Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e Reviewed-on: https://go-review.googlesource.com/98518 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
This commit is contained in:
parent
f6332bb84a
commit
ee58eccc56
@ -829,6 +829,92 @@ func EqualFold(s, t []byte) bool {
|
||||
return len(s) == len(t)
|
||||
}
|
||||
|
||||
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
//
// Empty and single-byte separators, and separators at least as long as s,
// are answered directly. Separators of at most bytealg.MaxLen bytes are
// handed to bytealg.Index, either immediately (when s is no longer than
// bytealg.MaxBruteForce) or once IndexByte has produced too many false
// positives. Longer separators are scanned with IndexByte and fall back
// to indexRabinKarp under the same kind of failure heuristic.
|
||||
func Index(s, sep []byte) int {
|
||||
n := len(sep)
|
||||
switch {
|
||||
case n == 0:
|
||||
return 0
|
||||
case n == 1:
|
||||
return IndexByte(s, sep[0])
|
||||
case n == len(s):
|
||||
if Equal(sep, s) {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
case n > len(s):
|
||||
return -1
|
||||
case n <= bytealg.MaxLen:
|
||||
// Use brute force when both s and sep are small
|
||||
if len(s) <= bytealg.MaxBruteForce {
|
||||
return bytealg.Index(s, sep)
|
||||
}
|
||||
c := sep[0]
|
||||
i := 0
|
||||
// t holds exactly the positions of s at which a match of length n can start.
t := s[:len(s)-n+1]
|
||||
fails := 0
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
// IndexByte is faster than bytealg.Index, so use it as long as
|
||||
// we're not getting lots of false positives.
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
// The first byte of sep does not occur at any remaining start position.
return -1
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if Equal(s[i:i+n], sep) {
|
||||
return i
|
||||
}
|
||||
fails++
|
||||
i++
|
||||
// Switch to bytealg.Index when IndexByte produces too many false positives.
|
||||
if fails > bytealg.Cutover(i) {
|
||||
// The search runs on s[i:], so a hit is relative to i.
r := bytealg.Index(s[i:], sep)
|
||||
if r >= 0 {
|
||||
return r + i
|
||||
}
|
||||
return -1
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
// Separator is longer than bytealg.MaxLen: scan with IndexByte and
// fall back to indexRabinKarp when it stops paying off.
c := sep[0]
|
||||
i := 0
|
||||
fails := 0
|
||||
// t holds exactly the positions of s at which a match of length n can start.
t := s[:len(s)-n+1]
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
// No remaining start position begins with c; fall through to return -1.
break
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if Equal(s[i:i+n], sep) {
|
||||
return i
|
||||
}
|
||||
i++
|
||||
fails++
|
||||
// The i < len(t) test skips the fallback when no start positions remain.
if fails >= 4+i>>4 && i < len(t) {
|
||||
// Give up on IndexByte, it isn't skipping ahead
|
||||
// far enough to be better than Rabin-Karp.
|
||||
// Experiments (using IndexPeriodic) suggest
|
||||
// the cutover is about 16 byte skips.
|
||||
// TODO: if large prefixes of sep are matching
|
||||
// we should cutover at even larger average skips,
|
||||
// because Equal becomes that much more expensive.
|
||||
// This code does not take that effect into account.
|
||||
j := indexRabinKarp(s[i:], sep)
|
||||
if j < 0 {
|
||||
return -1
|
||||
}
|
||||
return i + j
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func indexRabinKarp(s, sep []byte) int {
|
||||
// Rabin-Karp search
|
||||
hashsep, pow := hashStr(sep)
|
||||
|
@ -1,79 +0,0 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bytes
|
||||
|
||||
import "internal/cpu"
|
||||
|
||||
//go:noescape
|
||||
|
||||
// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
|
||||
// indexShortStr requires 2 <= len(c) <= shortStringLen
|
||||
func indexShortStr(s, c []byte) int // ../runtime/asm_amd64.s
|
||||
func countByte(s []byte, c byte) int // ../runtime/asm_amd64.s
|
||||
|
||||
var shortStringLen int
|
||||
|
||||
func init() {
|
||||
if cpu.X86.HasAVX2 {
|
||||
shortStringLen = 63
|
||||
} else {
|
||||
shortStringLen = 31
|
||||
}
|
||||
}
|
||||
|
||||
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
|
||||
func Index(s, sep []byte) int {
|
||||
n := len(sep)
|
||||
switch {
|
||||
case n == 0:
|
||||
return 0
|
||||
case n == 1:
|
||||
return IndexByte(s, sep[0])
|
||||
case n == len(s):
|
||||
if Equal(sep, s) {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
case n > len(s):
|
||||
return -1
|
||||
case n <= shortStringLen:
|
||||
// Use brute force when s and sep both are small
|
||||
if len(s) <= 64 {
|
||||
return indexShortStr(s, sep)
|
||||
}
|
||||
c := sep[0]
|
||||
i := 0
|
||||
t := s[:len(s)-n+1]
|
||||
fails := 0
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
// IndexByte skips 16/32 bytes per iteration,
|
||||
// so it's faster than indexShortStr.
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
return -1
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if Equal(s[i:i+n], sep) {
|
||||
return i
|
||||
}
|
||||
fails++
|
||||
i++
|
||||
// Switch to indexShortStr when IndexByte produces too many false positives.
|
||||
// Too many means more than 1 error per 8 characters.
|
||||
// Allow some errors in the beginning.
|
||||
if fails > (i+16)/8 {
|
||||
r := indexShortStr(s[i:], sep)
|
||||
if r >= 0 {
|
||||
return r + i
|
||||
}
|
||||
return -1
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
return indexRabinKarp(s, sep)
|
||||
}
|
@ -1,72 +0,0 @@
|
||||
// Copyright 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bytes
|
||||
|
||||
func countByte(s []byte, c byte) int // bytes_arm64.s
|
||||
|
||||
// 8 bytes can be completely loaded into 1 register.
|
||||
const shortStringLen = 8
|
||||
|
||||
//go:noescape
|
||||
func indexShortStr(s, sep []byte) int
|
||||
|
||||
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
|
||||
func Index(s, sep []byte) int {
|
||||
n := len(sep)
|
||||
switch {
|
||||
case n == 0:
|
||||
return 0
|
||||
case n == 1:
|
||||
return IndexByte(s, sep[0])
|
||||
case n == len(s):
|
||||
if Equal(sep, s) {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
case n > len(s):
|
||||
return -1
|
||||
case n <= shortStringLen:
|
||||
// Use brute force when both s and sep are small.
|
||||
// Empirical data shows that it can get better
|
||||
// performance when len(s) <= 16.
|
||||
if len(s) <= 16 {
|
||||
return indexShortStr(s, sep)
|
||||
}
|
||||
}
|
||||
c := sep[0]
|
||||
i := 0
|
||||
fails := 0
|
||||
t := s[:len(s)-n+1]
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
break
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if Equal(s[i:i+n], sep) {
|
||||
return i
|
||||
}
|
||||
i++
|
||||
fails++
|
||||
if fails >= 4+i>>4 && i < len(t) {
|
||||
// Give up on IndexByte, it isn't skipping ahead
|
||||
// far enough to be better than Rabin-Karp.
|
||||
// Experiments (using IndexPeriodic) suggest
|
||||
// the cutover is about 16 byte skips.
|
||||
// TODO: if large prefixes of sep are matching
|
||||
// we should cutover at even larger average skips,
|
||||
// because Equal becomes that much more expensive.
|
||||
// This code does not take that effect into account.
|
||||
j := indexRabinKarp(s[i:], sep)
|
||||
if j < 0 {
|
||||
return -1
|
||||
}
|
||||
return i + j
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
@ -1,59 +0,0 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!s390x,!arm64
|
||||
|
||||
package bytes
|
||||
|
||||
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
|
||||
func Index(s, sep []byte) int {
|
||||
n := len(sep)
|
||||
switch {
|
||||
case n == 0:
|
||||
return 0
|
||||
case n == 1:
|
||||
return IndexByte(s, sep[0])
|
||||
case n == len(s):
|
||||
if Equal(sep, s) {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
case n > len(s):
|
||||
return -1
|
||||
}
|
||||
c := sep[0]
|
||||
i := 0
|
||||
fails := 0
|
||||
t := s[:len(s)-n+1]
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
break
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if Equal(s[i:i+n], sep) {
|
||||
return i
|
||||
}
|
||||
i++
|
||||
fails++
|
||||
if fails >= 4+i>>4 && i < len(t) {
|
||||
// Give up on IndexByte, it isn't skipping ahead
|
||||
// far enough to be better than Rabin-Karp.
|
||||
// Experiments (using IndexPeriodic) suggest
|
||||
// the cutover is about 16 byte skips.
|
||||
// TODO: if large prefixes of sep are matching
|
||||
// we should cutover at even larger average skips,
|
||||
// because Equal becomes that much more expensive.
|
||||
// This code does not take that effect into account.
|
||||
j := indexRabinKarp(s[i:], sep)
|
||||
if j < 0 {
|
||||
return -1
|
||||
}
|
||||
return i + j
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
@ -1,80 +0,0 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bytes
|
||||
|
||||
//go:noescape
|
||||
|
||||
// indexShortStr returns the index of the first instance of sep in s,
|
||||
// or -1 if sep is not present in s.
|
||||
// indexShortStr requires 2 <= len(sep) <= shortStringLen
|
||||
func indexShortStr(s, c []byte) int // ../runtime/asm_s390x.s
|
||||
|
||||
// supportsVX reports whether the vector facility is available.
|
||||
// indexShortStr must not be called if the vector facility is not
|
||||
// available.
|
||||
func supportsVX() bool // ../runtime/asm_s390x.s
|
||||
|
||||
var shortStringLen = -1
|
||||
|
||||
func init() {
|
||||
if supportsVX() {
|
||||
shortStringLen = 64
|
||||
}
|
||||
}
|
||||
|
||||
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
|
||||
func Index(s, sep []byte) int {
|
||||
n := len(sep)
|
||||
switch {
|
||||
case n == 0:
|
||||
return 0
|
||||
case n == 1:
|
||||
return IndexByte(s, sep[0])
|
||||
case n == len(s):
|
||||
if Equal(sep, s) {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
case n > len(s):
|
||||
return -1
|
||||
case n <= shortStringLen:
|
||||
// Use brute force when s and sep both are small
|
||||
if len(s) <= 64 {
|
||||
return indexShortStr(s, sep)
|
||||
}
|
||||
c := sep[0]
|
||||
i := 0
|
||||
t := s[:len(s)-n+1]
|
||||
fails := 0
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
// IndexByte skips 16/32 bytes per iteration,
|
||||
// so it's faster than indexShortStr.
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
return -1
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if Equal(s[i:i+n], sep) {
|
||||
return i
|
||||
}
|
||||
fails++
|
||||
i++
|
||||
// Switch to indexShortStr when IndexByte produces too many false positives.
|
||||
// Too many means more than 1 error per 8 characters.
|
||||
// Allow some errors in the beginning.
|
||||
if fails > (i+16)/8 {
|
||||
r := indexShortStr(s[i:], sep)
|
||||
if r >= 0 {
|
||||
return r + i
|
||||
}
|
||||
return -1
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
return indexRabinKarp(s, sep)
|
||||
}
|
@ -1,20 +1,16 @@
|
||||
// amd64-specific vet whitelist. See readme.txt for details.
|
||||
|
||||
internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
|
||||
internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
|
||||
|
||||
// False positives.
|
||||
|
||||
// Nothing much to do about cross-package assembly. Unfortunate.
|
||||
internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
|
||||
internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
|
||||
|
||||
// reflect trampolines intentionally omit arg size. Same for morestack.
|
||||
runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
|
||||
runtime/asm_amd64.s: [amd64] morestack: use of 16(SP) points beyond argument frame
|
||||
runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
|
||||
|
||||
// Nothing much to do about cross-package assembly. Unfortunate.
|
||||
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package strings
|
||||
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes
|
||||
|
||||
// Intentionally missing declarations. These are special assembly routines.
|
||||
// Some are jumped into from other routines, with values in specific registers.
|
||||
// duff* have direct calls from the compiler.
|
||||
@ -25,4 +21,3 @@ runtime/asm_amd64.s: [amd64] addmoduledata: function addmoduledata missing Go de
|
||||
runtime/duff_amd64.s: [amd64] duffzero: function duffzero missing Go declaration
|
||||
runtime/duff_amd64.s: [amd64] duffcopy: function duffcopy missing Go declaration
|
||||
runtime/asm_amd64.s: [amd64] stackcheck: function stackcheck missing Go declaration
|
||||
runtime/asm_amd64.s: [amd64] indexShortStr: function indexShortStr missing Go declaration
|
||||
|
@ -1,11 +1,6 @@
|
||||
runtime/asm_s390x.s: [s390x] abort: function abort missing Go declaration
|
||||
internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: Compare is in package bytes
|
||||
internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: cmpstring is in package runtime
|
||||
runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package strings
|
||||
runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package bytes
|
||||
runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package strings
|
||||
runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package bytes
|
||||
runtime/asm_s390x.s: [s390x] indexShortStr: function indexShortStr missing Go declaration
|
||||
runtime/asm_s390x.s: [s390x] addmoduledata: function addmoduledata missing Go declaration
|
||||
runtime/memclr_s390x.s: [s390x] memclr_s390x_exrl_xc: function memclr_s390x_exrl_xc missing Go declaration
|
||||
runtime/memmove_s390x.s: [s390x] memmove_s390x_exrl_mvc: function memmove_s390x_exrl_mvc missing Go declaration
|
||||
|
22
src/internal/bytealg/bytealg.go
Normal file
22
src/internal/bytealg/bytealg.go
Normal file
@ -0,0 +1,22 @@
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bytealg
|
||||
|
||||
import (
|
||||
"internal/cpu"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// Offsets into internal/cpu records for use in assembly.
|
||||
const (
|
||||
x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
|
||||
x86_HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
|
||||
x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
|
||||
x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
|
||||
s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
|
||||
)
|
||||
|
||||
// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
|
||||
var MaxLen int
|
90
src/internal/bytealg/count_arm64.s
Normal file
90
src/internal/bytealg/count_arm64.s
Normal file
@ -0,0 +1,90 @@
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·Count(SB),NOSPLIT,$0-40
|
||||
MOVD b_base+0(FP), R0
|
||||
MOVD b_len+8(FP), R2
|
||||
MOVBU c+24(FP), R1
|
||||
MOVD $ret+32(FP), R8
|
||||
B countbytebody<>(SB)
|
||||
|
||||
TEXT ·CountString(SB),NOSPLIT,$0-32
|
||||
MOVD s_base+0(FP), R0
|
||||
MOVD s_len+8(FP), R2
|
||||
MOVBU c+16(FP), R1
|
||||
MOVD $ret+24(FP), R8
|
||||
B countbytebody<>(SB)
|
||||
|
||||
// input:
|
||||
// R0: data
|
||||
// R2: data len
|
||||
// R1: byte to find
|
||||
// R8: address to put result
|
||||
TEXT countbytebody<>(SB),NOSPLIT,$0
|
||||
// R11 = count of byte to search
|
||||
MOVD $0, R11
|
||||
// short path to handle 0-byte case
|
||||
CBZ R2, done
|
||||
CMP $0x20, R2
|
||||
// jump directly to tail if length < 32
|
||||
BLO tail
|
||||
ANDS $0x1f, R0, R9
|
||||
BEQ chunk
|
||||
// Process the head, which is not 32-byte aligned
|
||||
BIC $0x1f, R0, R3
|
||||
ADD $0x20, R3
|
||||
head_loop:
|
||||
MOVBU.P 1(R0), R5
|
||||
CMP R5, R1
|
||||
CINC EQ, R11, R11
|
||||
SUB $1, R2, R2
|
||||
CMP R0, R3
|
||||
BNE head_loop
|
||||
// Work with 32-byte aligned chunks
|
||||
chunk:
|
||||
BIC $0x1f, R2, R9
|
||||
// The first chunk can also be the last
|
||||
CBZ R9, tail
|
||||
// R3 = end of 32-byte chunks
|
||||
ADD R0, R9, R3
|
||||
MOVD $1, R5
|
||||
VMOV R5, V5.B16
|
||||
// R2 = length of tail
|
||||
SUB R9, R2, R2
|
||||
// Duplicate R1 (byte to search) to 16 1-byte elements of V0
|
||||
VMOV R1, V0.B16
|
||||
// Clear the low 64-bit element of V7 and V8
|
||||
VEOR V7.B8, V7.B8, V7.B8
|
||||
VEOR V8.B8, V8.B8, V8.B8
|
||||
// Count the target byte in 32-byte chunk
|
||||
chunk_loop:
|
||||
VLD1.P (R0), [V1.B16, V2.B16]
|
||||
CMP R0, R3
|
||||
VCMEQ V0.B16, V1.B16, V3.B16
|
||||
VCMEQ V0.B16, V2.B16, V4.B16
|
||||
// Clear the higher 7 bits
|
||||
VAND V5.B16, V3.B16, V3.B16
|
||||
VAND V5.B16, V4.B16, V4.B16
|
||||
// Count the lanes that match the requested byte
|
||||
VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
|
||||
VUADDLV V6.B16, V7
|
||||
// Accumulate the count in low 64-bit element of V8 when inside the loop
|
||||
VADD V7, V8
|
||||
BNE chunk_loop
|
||||
VMOV V8.D[0], R6
|
||||
ADD R6, R11, R11
|
||||
CBZ R2, done
|
||||
tail:
|
||||
// Work with tail shorter than 32 bytes
|
||||
MOVBU.P 1(R0), R5
|
||||
SUB $1, R2, R2
|
||||
CMP R5, R1
|
||||
CINC EQ, R11, R11
|
||||
CBNZ R2, tail
|
||||
done:
|
||||
MOVD R11, (R8)
|
||||
RET
|
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64
|
||||
// +build !amd64,!arm64
|
||||
|
||||
package bytealg
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build amd64
|
||||
// +build amd64 arm64
|
||||
|
||||
package bytealg
|
||||
|
||||
|
@ -4,24 +4,8 @@
|
||||
|
||||
package bytealg
|
||||
|
||||
import (
|
||||
"internal/cpu"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly.
|
||||
|
||||
// Because equal_native.go is unconditional, it's a good place to compute asm constants.
|
||||
// TODO: find a better way to do this?
|
||||
|
||||
// Offsets into internal/cpu records for use in assembly.
|
||||
const (
|
||||
x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
|
||||
x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
|
||||
x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
|
||||
s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
|
||||
)
|
||||
|
||||
//go:noescape
|
||||
func Equal(a, b []byte) bool
|
||||
|
||||
|
26
src/internal/bytealg/index_amd64.go
Normal file
26
src/internal/bytealg/index_amd64.go
Normal file
@ -0,0 +1,26 @@
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bytealg
|
||||
|
||||
import "internal/cpu"
|
||||
|
||||
// MaxBruteForce is the longest haystack for which callers should invoke
// Index directly instead of scanning with IndexByte first.
const MaxBruteForce = 64
|
||||
|
||||
// init chooses MaxLen from the CPU features: 63 bytes when AVX2 is
// available, 31 bytes otherwise.
func init() {
|
||||
if cpu.X86.HasAVX2 {
|
||||
MaxLen = 63
|
||||
} else {
|
||||
MaxLen = 31
|
||||
}
|
||||
}
|
||||
|
||||
// Cutover reports the number of failures of IndexByte we should tolerate
|
||||
// before switching over to Index.
|
||||
// n is the number of bytes processed so far.
|
||||
// See the bytes.Index implementation for details.
|
||||
func Cutover(n int) int {
|
||||
// 1 error per 8 characters, plus a little slop to start.
|
||||
return (n + 16) / 8
|
||||
}
|
274
src/internal/bytealg/index_amd64.s
Normal file
274
src/internal/bytealg/index_amd64.s
Normal file
@ -0,0 +1,274 @@
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·Index(SB),NOSPLIT,$0-56
|
||||
MOVQ a_base+0(FP), DI
|
||||
MOVQ a_len+8(FP), DX
|
||||
MOVQ b_base+24(FP), BP
|
||||
MOVQ b_len+32(FP), AX
|
||||
MOVQ DI, R10
|
||||
LEAQ ret+48(FP), R11
|
||||
JMP indexbody<>(SB)
|
||||
|
||||
TEXT ·IndexString(SB),NOSPLIT,$0-40
|
||||
MOVQ a_base+0(FP), DI
|
||||
MOVQ a_len+8(FP), DX
|
||||
MOVQ b_base+16(FP), BP
|
||||
MOVQ b_len+24(FP), AX
|
||||
MOVQ DI, R10
|
||||
LEAQ ret+32(FP), R11
|
||||
JMP indexbody<>(SB)
|
||||
|
||||
// AX: length of string, that we are searching for
|
||||
// DX: length of string, in which we are searching
|
||||
// DI: pointer to string, in which we are searching
|
||||
// BP: pointer to string, that we are searching for
|
||||
// R10: copy of the initial DI, subtracted from DI on success to produce the index
// R11: address, where to put return value
|
||||
// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
|
||||
TEXT indexbody<>(SB),NOSPLIT,$0
|
||||
CMPQ AX, DX
|
||||
JA fail
|
||||
CMPQ DX, $16
|
||||
JAE sse42
|
||||
no_sse42:
|
||||
CMPQ AX, $2
|
||||
JA _3_or_more
|
||||
MOVW (BP), BP
|
||||
LEAQ -1(DI)(DX*1), DX
|
||||
loop2:
|
||||
MOVW (DI), SI
|
||||
CMPW SI,BP
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop2
|
||||
JMP fail
|
||||
_3_or_more:
|
||||
CMPQ AX, $3
|
||||
JA _4_or_more
|
||||
MOVW 1(BP), BX
|
||||
MOVW (BP), BP
|
||||
LEAQ -2(DI)(DX*1), DX
|
||||
loop3:
|
||||
MOVW (DI), SI
|
||||
CMPW SI,BP
|
||||
JZ partial_success3
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop3
|
||||
JMP fail
|
||||
partial_success3:
|
||||
MOVW 1(DI), SI
|
||||
CMPW SI,BX
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop3
|
||||
JMP fail
|
||||
_4_or_more:
|
||||
CMPQ AX, $4
|
||||
JA _5_or_more
|
||||
MOVL (BP), BP
|
||||
LEAQ -3(DI)(DX*1), DX
|
||||
loop4:
|
||||
MOVL (DI), SI
|
||||
CMPL SI,BP
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop4
|
||||
JMP fail
|
||||
_5_or_more:
|
||||
CMPQ AX, $7
|
||||
JA _8_or_more
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
MOVL -4(BP)(AX*1), BX
|
||||
MOVL (BP), BP
|
||||
loop5to7:
|
||||
MOVL (DI), SI
|
||||
CMPL SI,BP
|
||||
JZ partial_success5to7
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop5to7
|
||||
JMP fail
|
||||
partial_success5to7:
|
||||
MOVL -4(AX)(DI*1), SI
|
||||
CMPL SI,BX
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop5to7
|
||||
JMP fail
|
||||
_8_or_more:
|
||||
CMPQ AX, $8
|
||||
JA _9_or_more
|
||||
MOVQ (BP), BP
|
||||
LEAQ -7(DI)(DX*1), DX
|
||||
loop8:
|
||||
MOVQ (DI), SI
|
||||
CMPQ SI,BP
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop8
|
||||
JMP fail
|
||||
_9_or_more:
|
||||
CMPQ AX, $15
|
||||
JA _16_or_more
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
MOVQ -8(BP)(AX*1), BX
|
||||
MOVQ (BP), BP
|
||||
loop9to15:
|
||||
MOVQ (DI), SI
|
||||
CMPQ SI,BP
|
||||
JZ partial_success9to15
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop9to15
|
||||
JMP fail
|
||||
partial_success9to15:
|
||||
MOVQ -8(AX)(DI*1), SI
|
||||
CMPQ SI,BX
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop9to15
|
||||
JMP fail
|
||||
_16_or_more:
|
||||
CMPQ AX, $16
|
||||
JA _17_or_more
|
||||
MOVOU (BP), X1
|
||||
LEAQ -15(DI)(DX*1), DX
|
||||
loop16:
|
||||
MOVOU (DI), X2
|
||||
PCMPEQB X1, X2
|
||||
PMOVMSKB X2, SI
|
||||
CMPQ SI, $0xffff
|
||||
JE success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop16
|
||||
JMP fail
|
||||
_17_or_more:
|
||||
CMPQ AX, $31
|
||||
JA _32_or_more
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
MOVOU -16(BP)(AX*1), X0
|
||||
MOVOU (BP), X1
|
||||
loop17to31:
|
||||
MOVOU (DI), X2
|
||||
PCMPEQB X1,X2
|
||||
PMOVMSKB X2, SI
|
||||
CMPQ SI, $0xffff
|
||||
JE partial_success17to31
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop17to31
|
||||
JMP fail
|
||||
partial_success17to31:
|
||||
MOVOU -16(AX)(DI*1), X3
|
||||
PCMPEQB X0, X3
|
||||
PMOVMSKB X3, SI
|
||||
CMPQ SI, $0xffff
|
||||
JE success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop17to31
|
||||
JMP fail
|
||||
// We can get here only when AVX2 is enabled and the cutoff (MaxLen) is set to 63
|
||||
// So no need to check cpuid
|
||||
_32_or_more:
|
||||
CMPQ AX, $32
|
||||
JA _33_to_63
|
||||
VMOVDQU (BP), Y1
|
||||
LEAQ -31(DI)(DX*1), DX
|
||||
loop32:
|
||||
VMOVDQU (DI), Y2
|
||||
VPCMPEQB Y1, Y2, Y3
|
||||
VPMOVMSKB Y3, SI
|
||||
CMPL SI, $0xffffffff
|
||||
JE success_avx2
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop32
|
||||
JMP fail_avx2
|
||||
_33_to_63:
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
VMOVDQU -32(BP)(AX*1), Y0
|
||||
VMOVDQU (BP), Y1
|
||||
loop33to63:
|
||||
VMOVDQU (DI), Y2
|
||||
VPCMPEQB Y1, Y2, Y3
|
||||
VPMOVMSKB Y3, SI
|
||||
CMPL SI, $0xffffffff
|
||||
JE partial_success33to63
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop33to63
|
||||
JMP fail_avx2
|
||||
partial_success33to63:
|
||||
VMOVDQU -32(AX)(DI*1), Y3
|
||||
VPCMPEQB Y0, Y3, Y4
|
||||
VPMOVMSKB Y4, SI
|
||||
CMPL SI, $0xffffffff
|
||||
JE success_avx2
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop33to63
|
||||
fail_avx2:
|
||||
VZEROUPPER
|
||||
fail:
|
||||
MOVQ $-1, (R11)
|
||||
RET
|
||||
success_avx2:
|
||||
VZEROUPPER
|
||||
JMP success
|
||||
sse42:
|
||||
CMPB internal∕cpu·X86+const_x86_HasSSE42(SB), $1
|
||||
JNE no_sse42
|
||||
CMPQ AX, $12
|
||||
// PCMPESTRI is slower than normal compare,
|
||||
// so using it makes sense only if we advance 4+ bytes per compare
|
||||
// This value was determined experimentally and is the ~same
|
||||
// on Nehalem (first with SSE42) and Haswell.
|
||||
JAE _9_or_more
|
||||
LEAQ 16(BP), SI
|
||||
TESTW $0xff0, SI
|
||||
JEQ no_sse42
|
||||
MOVOU (BP), X1
|
||||
LEAQ -15(DI)(DX*1), SI
|
||||
MOVQ $16, R9
|
||||
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
|
||||
loop_sse42:
|
||||
// 0x0c means: unsigned byte compare (bits 0,1 are 00)
|
||||
// for equality (bits 2,3 are 11)
|
||||
// result is not masked or inverted (bits 4,5 are 00)
|
||||
// and corresponds to first matching byte (bit 6 is 0)
|
||||
PCMPESTRI $0x0c, (DI), X1
|
||||
// CX == 16 means no match,
|
||||
// CX > R9 means partial match at the end of the string,
|
||||
// otherwise sep is at offset CX from X1 start
|
||||
CMPQ CX, R9
|
||||
JBE sse42_success
|
||||
ADDQ R9, DI
|
||||
CMPQ DI, SI
|
||||
JB loop_sse42
|
||||
PCMPESTRI $0x0c, -1(SI), X1
|
||||
CMPQ CX, R9
|
||||
JA fail
|
||||
LEAQ -1(SI), DI
|
||||
sse42_success:
|
||||
ADDQ CX, DI
|
||||
success:
|
||||
SUBQ R10, DI
|
||||
MOVQ DI, (R11)
|
||||
RET
|
23
src/internal/bytealg/index_arm64.go
Normal file
23
src/internal/bytealg/index_arm64.go
Normal file
@ -0,0 +1,23 @@
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bytealg
|
||||
|
||||
// Empirical data shows that using Index can get better
|
||||
// performance when len(s) <= 16.
|
||||
const MaxBruteForce = 16
|
||||
|
||||
func init() {
|
||||
// 8 bytes can be completely loaded into 1 register.
|
||||
MaxLen = 8
|
||||
}
|
||||
|
||||
// Cutover reports the number of failures of IndexByte we should tolerate
|
||||
// before switching over to Index.
|
||||
// n is the number of bytes processed so far.
|
||||
// See the bytes.Index implementation for details.
|
||||
func Cutover(n int) int {
|
||||
// 1 error per 16 characters, plus a little slop to start.
|
||||
return 4 + n>>4
|
||||
}
|
@ -1,88 +1,40 @@
|
||||
// Copyright 2017 The Go Authors. All rights reserved.
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
// countByte(s []byte, c byte) int
|
||||
TEXT bytes·countByte(SB),NOSPLIT,$0-40
|
||||
MOVD s_base+0(FP), R0
|
||||
MOVD s_len+8(FP), R2
|
||||
MOVBU c+24(FP), R1
|
||||
// R11 = count of byte to search
|
||||
MOVD $0, R11
|
||||
// short path to handle 0-byte case
|
||||
CBZ R2, done
|
||||
CMP $0x20, R2
|
||||
// jump directly to tail if length < 32
|
||||
BLO tail
|
||||
ANDS $0x1f, R0, R9
|
||||
BEQ chunk
|
||||
// Work with not 32-byte aligned head
|
||||
BIC $0x1f, R0, R3
|
||||
ADD $0x20, R3
|
||||
head_loop:
|
||||
MOVBU.P 1(R0), R5
|
||||
CMP R5, R1
|
||||
CINC EQ, R11, R11
|
||||
SUB $1, R2, R2
|
||||
CMP R0, R3
|
||||
BNE head_loop
|
||||
// Work with 32-byte aligned chunks
|
||||
chunk:
|
||||
BIC $0x1f, R2, R9
|
||||
// The first chunk can also be the last
|
||||
CBZ R9, tail
|
||||
// R3 = end of 32-byte chunks
|
||||
ADD R0, R9, R3
|
||||
MOVD $1, R5
|
||||
VMOV R5, V5.B16
|
||||
// R2 = length of tail
|
||||
SUB R9, R2, R2
|
||||
// Duplicate R1 (byte to search) to 16 1-byte elements of V0
|
||||
VMOV R1, V0.B16
|
||||
// Clear the low 64-bit element of V7 and V8
|
||||
VEOR V7.B8, V7.B8, V7.B8
|
||||
VEOR V8.B8, V8.B8, V8.B8
|
||||
// Count the target byte in 32-byte chunk
|
||||
chunk_loop:
|
||||
VLD1.P (R0), [V1.B16, V2.B16]
|
||||
CMP R0, R3
|
||||
VCMEQ V0.B16, V1.B16, V3.B16
|
||||
VCMEQ V0.B16, V2.B16, V4.B16
|
||||
// Clear the higher 7 bits
|
||||
VAND V5.B16, V3.B16, V3.B16
|
||||
VAND V5.B16, V4.B16, V4.B16
|
||||
// Count lanes match the requested byte
|
||||
VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
|
||||
VUADDLV V6.B16, V7
|
||||
// Accumulate the count in low 64-bit element of V8 when inside the loop
|
||||
VADD V7, V8
|
||||
BNE chunk_loop
|
||||
VMOV V8.D[0], R6
|
||||
ADD R6, R11, R11
|
||||
CBZ R2, done
|
||||
tail:
|
||||
// Work with tail shorter than 32 bytes
|
||||
MOVBU.P 1(R0), R5
|
||||
SUB $1, R2, R2
|
||||
CMP R5, R1
|
||||
CINC EQ, R11, R11
|
||||
CBNZ R2, tail
|
||||
done:
|
||||
MOVD R11, ret+32(FP)
|
||||
RET
|
||||
TEXT ·Index(SB),NOSPLIT,$0-56
|
||||
MOVD a_base+0(FP), R0
|
||||
MOVD a_len+8(FP), R1
|
||||
MOVD b_base+24(FP), R2
|
||||
MOVD b_len+32(FP), R3
|
||||
MOVD $ret+48(FP), R9
|
||||
B indexbody<>(SB)
|
||||
|
||||
// indexShortStr(s, sep []byte) int
|
||||
// precondition: 2 <= len(sep) <= 8
|
||||
TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
|
||||
TEXT ·IndexString(SB),NOSPLIT,$0-40
|
||||
MOVD a_base+0(FP), R0
|
||||
MOVD a_len+8(FP), R1
|
||||
MOVD b_base+16(FP), R2
|
||||
MOVD b_len+24(FP), R3
|
||||
MOVD $ret+32(FP), R9
|
||||
B indexbody<>(SB)
|
||||
|
||||
// input:
|
||||
// R0: haystack
|
||||
// R1: length of haystack
|
||||
// R2: needle
|
||||
// R3: length of needle (2 <= len <= 8)
|
||||
// R9: address to put result
|
||||
TEXT indexbody<>(SB),NOSPLIT,$0-56
|
||||
// main idea is to load 'sep' into separate register(s)
|
||||
// to avoid reloading it again and again
|
||||
// for subsequent substring comparisons
|
||||
MOVD s+0(FP), R0
|
||||
MOVD s_len+8(FP), R1
|
||||
MOVD sep+24(FP), R2
|
||||
MOVD sep_len+32(FP), R3
|
||||
MOVD a_base+0(FP), R0
|
||||
MOVD a_len+8(FP), R1
|
||||
MOVD b_base+24(FP), R2
|
||||
MOVD b_len+32(FP), R3
|
||||
SUB R3, R1, R4
|
||||
// R4 contains the start of the last substring for comparison
|
||||
ADD R0, R4, R4
|
||||
@ -189,9 +141,9 @@ loop_2:
|
||||
BLS loop_2
|
||||
not_found:
|
||||
MOVD $-1, R0
|
||||
MOVD R0, ret+48(FP)
|
||||
MOVD R0, (R9)
|
||||
RET
|
||||
found:
|
||||
SUB R8, R0, R0
|
||||
MOVD R0, ret+48(FP)
|
||||
MOVD R0, (R9)
|
||||
RET
|
29
src/internal/bytealg/index_generic.go
Normal file
29
src/internal/bytealg/index_generic.go
Normal file
@ -0,0 +1,29 @@
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!arm64,!s390x
|
||||
|
||||
package bytealg
|
||||
|
||||
const MaxBruteForce = 0
|
||||
|
||||
// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
|
||||
// Requires 2 <= len(b) <= MaxLen.
|
||||
func Index(a, b []byte) int {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
|
||||
// Requires 2 <= len(b) <= MaxLen.
|
||||
func IndexString(a, b string) int {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Cutover reports the number of failures of IndexByte we should tolerate
|
||||
// before switching over to Index.
|
||||
// n is the number of bytes processed so far.
|
||||
// See the bytes.Index implementation for details.
|
||||
func Cutover(n int) int {
|
||||
panic("unimplemented")
|
||||
}
|
19
src/internal/bytealg/index_native.go
Normal file
19
src/internal/bytealg/index_native.go
Normal file
@ -0,0 +1,19 @@
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build amd64 arm64 s390x
|
||||
|
||||
package bytealg
|
||||
|
||||
//go:noescape
|
||||
|
||||
// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
|
||||
// Requires 2 <= len(b) <= MaxLen.
|
||||
func Index(a, b []byte) int
|
||||
|
||||
//go:noescape
|
||||
|
||||
// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
|
||||
// Requires 2 <= len(b) <= MaxLen.
|
||||
func IndexString(a, b string) int
|
31
src/internal/bytealg/index_s390x.go
Normal file
31
src/internal/bytealg/index_s390x.go
Normal file
@ -0,0 +1,31 @@
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bytealg
|
||||
|
||||
import "internal/cpu"
|
||||
|
||||
const MaxBruteForce = 64
|
||||
|
||||
func init() {
|
||||
// Note: we're kind of lucky that this flag is available at this point.
|
||||
// The runtime sets HasVX when processing auxv records, and that happens
|
||||
// to happen *before* running the init functions of packages that
|
||||
// the runtime depends on.
|
||||
// TODO: it would really be nicer for internal/cpu to figure out this
|
||||
// flag by itself. Then we wouldn't need to depend on quirks of
|
||||
// early startup initialization order.
|
||||
if cpu.S390X.HasVX {
|
||||
MaxLen = 64
|
||||
}
|
||||
}
|
||||
|
||||
// Cutover reports the number of failures of IndexByte we should tolerate
|
||||
// before switching over to Index.
|
||||
// n is the number of bytes processed so far.
|
||||
// See the bytes.Index implementation for details.
|
||||
func Cutover(n int) int {
|
||||
// 1 error per 8 characters, plus a little slop to start.
|
||||
return (n + 16) / 8
|
||||
}
|
216
src/internal/bytealg/index_s390x.s
Normal file
216
src/internal/bytealg/index_s390x.s
Normal file
@ -0,0 +1,216 @@
|
||||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
// Caller must confirm availability of vx facility before calling.
|
||||
TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
|
||||
LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
|
||||
LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
|
||||
MOVD $ret+48(FP), R5
|
||||
BR indexbody<>(SB)
|
||||
|
||||
// Caller must confirm availability of vx facility before calling.
|
||||
TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
|
||||
LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
|
||||
LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
|
||||
MOVD $ret+32(FP), R5
|
||||
BR indexbody<>(SB)
|
||||
|
||||
// s: string we are searching
|
||||
// sep: string to search for
|
||||
// R1=&s[0], R2=len(s)
|
||||
// R3=&sep[0], R4=len(sep)
|
||||
// R5=&ret (int)
|
||||
// Caller must confirm availability of vx facility before calling.
|
||||
TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
|
||||
CMPBGT R4, R2, notfound
|
||||
ADD R1, R2
|
||||
SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
|
||||
CMPBEQ R4, $0, notfound
|
||||
SUB $1, R4 // R4=len(sep)-1 for use as VLL index
|
||||
VLL R4, (R3), V0 // contains first 16 bytes of sep
|
||||
MOVD R1, R7
|
||||
index2plus:
|
||||
CMPBNE R4, $1, index3plus
|
||||
MOVD $15(R7), R9
|
||||
CMPBGE R9, R2, index2to16
|
||||
VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
|
||||
VONE V16
|
||||
VREPH $0, V0, V1
|
||||
CMPBGE R9, R2, index2to16
|
||||
index2loop:
|
||||
VL 0(R7), V2 // 16 bytes, even indices
|
||||
VL 1(R7), V4 // 16 bytes, odd indices
|
||||
VCEQH V1, V2, V5 // compare even indices
|
||||
VCEQH V1, V4, V6 // compare odd indices
|
||||
VSEL V5, V6, V31, V7 // merge even and odd indices
|
||||
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
|
||||
BLT foundV17
|
||||
MOVD $16(R7), R7 // R7+=16
|
||||
ADD $15, R7, R9
|
||||
CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
|
||||
CMPBLE R7, R2, index2to16
|
||||
BR notfound
|
||||
|
||||
index3plus:
|
||||
CMPBNE R4, $2, index4plus
|
||||
ADD $15, R7, R9
|
||||
CMPBGE R9, R2, index2to16
|
||||
MOVD $1, R0
|
||||
VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
|
||||
VONE V16
|
||||
VREPH $0, V0, V1
|
||||
VREPB $2, V0, V8
|
||||
index3loop:
|
||||
VL (R7), V2 // load 16-bytes into V2
|
||||
VLL R0, 16(R7), V3 // load 2-bytes into V3
|
||||
VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
|
||||
VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
|
||||
VCEQH V1, V2, V5 // compare 2-byte even indices
|
||||
VCEQH V1, V4, V6 // compare 2-byte odd indices
|
||||
VCEQB V8, V9, V10 // compare last bytes
|
||||
VSEL V5, V6, V31, V7 // merge even and odd indices
|
||||
VN V7, V10, V7 // AND indices with last byte
|
||||
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
|
||||
BLT foundV17
|
||||
MOVD $16(R7), R7 // R7+=16
|
||||
ADD $15, R7, R9
|
||||
CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
|
||||
CMPBLE R7, R2, index2to16
|
||||
BR notfound
|
||||
|
||||
index4plus:
|
||||
CMPBNE R4, $3, index5plus
|
||||
ADD $15, R7, R9
|
||||
CMPBGE R9, R2, index2to16
|
||||
MOVD $2, R0
|
||||
VGBM $0x8888, V29 // 0xff000000ff000000...
|
||||
VGBM $0x2222, V30 // 0x0000ff000000ff00...
|
||||
VGBM $0xcccc, V31 // 0xffff0000ffff0000...
|
||||
VONE V16
|
||||
VREPF $0, V0, V1
|
||||
index4loop:
|
||||
VL (R7), V2 // load 16-bytes into V2
|
||||
VLL R0, 16(R7), V3 // load 3-bytes into V3
|
||||
VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
|
||||
VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
|
||||
VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<3
|
||||
VCEQF V1, V2, V5 // compare index 0, 4, ...
|
||||
VCEQF V1, V4, V6 // compare index 1, 5, ...
|
||||
VCEQF V1, V9, V11 // compare index 2, 6, ...
|
||||
VCEQF V1, V10, V12 // compare index 3, 7, ...
|
||||
VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
|
||||
VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
|
||||
VSEL V13, V14, V31, V7 // final merge
|
||||
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
|
||||
BLT foundV17
|
||||
MOVD $16(R7), R7 // R7+=16
|
||||
ADD $15, R7, R9
|
||||
CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
|
||||
CMPBLE R7, R2, index2to16
|
||||
BR notfound
|
||||
|
||||
index5plus:
|
||||
CMPBGT R4, $15, index17plus
|
||||
index2to16:
|
||||
CMPBGT R7, R2, notfound
|
||||
MOVD $1(R7), R8
|
||||
CMPBGT R8, R2, index2to16tail
|
||||
index2to16loop:
|
||||
// unrolled 2x
|
||||
VLL R4, (R7), V1
|
||||
VLL R4, 1(R7), V2
|
||||
VCEQGS V0, V1, V3
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
VCEQGS V0, V2, V4
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
CMPBLT R7, R2, index2to16loop
|
||||
CMPBGT R7, R2, notfound
|
||||
index2to16tail:
|
||||
VLL R4, (R7), V1
|
||||
VCEQGS V0, V1, V2
|
||||
BEQ found
|
||||
BR notfound
|
||||
|
||||
index17plus:
|
||||
CMPBGT R4, $31, index33plus
|
||||
SUB $16, R4, R0
|
||||
VLL R0, 16(R3), V1
|
||||
VONE V7
|
||||
index17to32loop:
|
||||
VL (R7), V2
|
||||
VLL R0, 16(R7), V3
|
||||
VCEQG V0, V2, V4
|
||||
VCEQG V1, V3, V5
|
||||
VN V4, V5, V6
|
||||
VCEQGS V6, V7, V8
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
CMPBLE R7, R2, index17to32loop
|
||||
BR notfound
|
||||
|
||||
index33plus:
|
||||
CMPBGT R4, $47, index49plus
|
||||
SUB $32, R4, R0
|
||||
VL 16(R3), V1
|
||||
VLL R0, 32(R3), V2
|
||||
VONE V11
|
||||
index33to48loop:
|
||||
VL (R7), V3
|
||||
VL 16(R7), V4
|
||||
VLL R0, 32(R7), V5
|
||||
VCEQG V0, V3, V6
|
||||
VCEQG V1, V4, V7
|
||||
VCEQG V2, V5, V8
|
||||
VN V6, V7, V9
|
||||
VN V8, V9, V10
|
||||
VCEQGS V10, V11, V12
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
CMPBLE R7, R2, index33to48loop
|
||||
BR notfound
|
||||
|
||||
index49plus:
|
||||
CMPBGT R4, $63, index65plus
|
||||
SUB $48, R4, R0
|
||||
VL 16(R3), V1
|
||||
VL 32(R3), V2
|
||||
VLL R0, 48(R3), V3
|
||||
VONE V15
|
||||
index49to64loop:
|
||||
VL (R7), V4
|
||||
VL 16(R7), V5
|
||||
VL 32(R7), V6
|
||||
VLL R0, 48(R7), V7
|
||||
VCEQG V0, V4, V8
|
||||
VCEQG V1, V5, V9
|
||||
VCEQG V2, V6, V10
|
||||
VCEQG V3, V7, V11
|
||||
VN V8, V9, V12
|
||||
VN V10, V11, V13
|
||||
VN V12, V13, V14
|
||||
VCEQGS V14, V15, V16
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
CMPBLE R7, R2, index49to64loop
|
||||
notfound:
|
||||
MOVD $-1, (R5)
|
||||
RET
|
||||
|
||||
index65plus:
|
||||
// not implemented
|
||||
MOVD $0, (R0)
|
||||
RET
|
||||
|
||||
foundV17: // index is in doubleword V17[0]
|
||||
VLGVG $0, V17, R8
|
||||
ADD R8, R7
|
||||
found:
|
||||
SUB R1, R7
|
||||
MOVD R7, (R5)
|
||||
RET
|
@ -1358,274 +1358,6 @@ DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
|
||||
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
|
||||
GLOBL shifts<>(SB),RODATA,$256
|
||||
|
||||
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
|
||||
MOVQ s+0(FP), DI
|
||||
// We want len in DX and AX, because PCMPESTRI implicitly consumes them
|
||||
MOVQ s_len+8(FP), DX
|
||||
MOVQ c+16(FP), BP
|
||||
MOVQ c_len+24(FP), AX
|
||||
MOVQ DI, R10
|
||||
LEAQ ret+32(FP), R11
|
||||
JMP runtime·indexShortStr(SB)
|
||||
|
||||
TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
|
||||
MOVQ s+0(FP), DI
|
||||
MOVQ s_len+8(FP), DX
|
||||
MOVQ c+24(FP), BP
|
||||
MOVQ c_len+32(FP), AX
|
||||
MOVQ DI, R10
|
||||
LEAQ ret+48(FP), R11
|
||||
JMP runtime·indexShortStr(SB)
|
||||
|
||||
// AX: length of string, that we are searching for
|
||||
// DX: length of string, in which we are searching
|
||||
// DI: pointer to string, in which we are searching
|
||||
// BP: pointer to string, that we are searching for
|
||||
// R11: address, where to put return value
|
||||
TEXT runtime·indexShortStr(SB),NOSPLIT,$0
|
||||
CMPQ AX, DX
|
||||
JA fail
|
||||
CMPQ DX, $16
|
||||
JAE sse42
|
||||
no_sse42:
|
||||
CMPQ AX, $2
|
||||
JA _3_or_more
|
||||
MOVW (BP), BP
|
||||
LEAQ -1(DI)(DX*1), DX
|
||||
loop2:
|
||||
MOVW (DI), SI
|
||||
CMPW SI,BP
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop2
|
||||
JMP fail
|
||||
_3_or_more:
|
||||
CMPQ AX, $3
|
||||
JA _4_or_more
|
||||
MOVW 1(BP), BX
|
||||
MOVW (BP), BP
|
||||
LEAQ -2(DI)(DX*1), DX
|
||||
loop3:
|
||||
MOVW (DI), SI
|
||||
CMPW SI,BP
|
||||
JZ partial_success3
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop3
|
||||
JMP fail
|
||||
partial_success3:
|
||||
MOVW 1(DI), SI
|
||||
CMPW SI,BX
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop3
|
||||
JMP fail
|
||||
_4_or_more:
|
||||
CMPQ AX, $4
|
||||
JA _5_or_more
|
||||
MOVL (BP), BP
|
||||
LEAQ -3(DI)(DX*1), DX
|
||||
loop4:
|
||||
MOVL (DI), SI
|
||||
CMPL SI,BP
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop4
|
||||
JMP fail
|
||||
_5_or_more:
|
||||
CMPQ AX, $7
|
||||
JA _8_or_more
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
MOVL -4(BP)(AX*1), BX
|
||||
MOVL (BP), BP
|
||||
loop5to7:
|
||||
MOVL (DI), SI
|
||||
CMPL SI,BP
|
||||
JZ partial_success5to7
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop5to7
|
||||
JMP fail
|
||||
partial_success5to7:
|
||||
MOVL -4(AX)(DI*1), SI
|
||||
CMPL SI,BX
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop5to7
|
||||
JMP fail
|
||||
_8_or_more:
|
||||
CMPQ AX, $8
|
||||
JA _9_or_more
|
||||
MOVQ (BP), BP
|
||||
LEAQ -7(DI)(DX*1), DX
|
||||
loop8:
|
||||
MOVQ (DI), SI
|
||||
CMPQ SI,BP
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop8
|
||||
JMP fail
|
||||
_9_or_more:
|
||||
CMPQ AX, $15
|
||||
JA _16_or_more
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
MOVQ -8(BP)(AX*1), BX
|
||||
MOVQ (BP), BP
|
||||
loop9to15:
|
||||
MOVQ (DI), SI
|
||||
CMPQ SI,BP
|
||||
JZ partial_success9to15
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop9to15
|
||||
JMP fail
|
||||
partial_success9to15:
|
||||
MOVQ -8(AX)(DI*1), SI
|
||||
CMPQ SI,BX
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop9to15
|
||||
JMP fail
|
||||
_16_or_more:
|
||||
CMPQ AX, $16
|
||||
JA _17_or_more
|
||||
MOVOU (BP), X1
|
||||
LEAQ -15(DI)(DX*1), DX
|
||||
loop16:
|
||||
MOVOU (DI), X2
|
||||
PCMPEQB X1, X2
|
||||
PMOVMSKB X2, SI
|
||||
CMPQ SI, $0xffff
|
||||
JE success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop16
|
||||
JMP fail
|
||||
_17_or_more:
|
||||
CMPQ AX, $31
|
||||
JA _32_or_more
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
MOVOU -16(BP)(AX*1), X0
|
||||
MOVOU (BP), X1
|
||||
loop17to31:
|
||||
MOVOU (DI), X2
|
||||
PCMPEQB X1,X2
|
||||
PMOVMSKB X2, SI
|
||||
CMPQ SI, $0xffff
|
||||
JE partial_success17to31
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop17to31
|
||||
JMP fail
|
||||
partial_success17to31:
|
||||
MOVOU -16(AX)(DI*1), X3
|
||||
PCMPEQB X0, X3
|
||||
PMOVMSKB X3, SI
|
||||
CMPQ SI, $0xffff
|
||||
JE success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop17to31
|
||||
JMP fail
|
||||
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
|
||||
// So no need to check cpuid
|
||||
_32_or_more:
|
||||
CMPQ AX, $32
|
||||
JA _33_to_63
|
||||
VMOVDQU (BP), Y1
|
||||
LEAQ -31(DI)(DX*1), DX
|
||||
loop32:
|
||||
VMOVDQU (DI), Y2
|
||||
VPCMPEQB Y1, Y2, Y3
|
||||
VPMOVMSKB Y3, SI
|
||||
CMPL SI, $0xffffffff
|
||||
JE success_avx2
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop32
|
||||
JMP fail_avx2
|
||||
_33_to_63:
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
VMOVDQU -32(BP)(AX*1), Y0
|
||||
VMOVDQU (BP), Y1
|
||||
loop33to63:
|
||||
VMOVDQU (DI), Y2
|
||||
VPCMPEQB Y1, Y2, Y3
|
||||
VPMOVMSKB Y3, SI
|
||||
CMPL SI, $0xffffffff
|
||||
JE partial_success33to63
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop33to63
|
||||
JMP fail_avx2
|
||||
partial_success33to63:
|
||||
VMOVDQU -32(AX)(DI*1), Y3
|
||||
VPCMPEQB Y0, Y3, Y4
|
||||
VPMOVMSKB Y4, SI
|
||||
CMPL SI, $0xffffffff
|
||||
JE success_avx2
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,DX
|
||||
JB loop33to63
|
||||
fail_avx2:
|
||||
VZEROUPPER
|
||||
fail:
|
||||
MOVQ $-1, (R11)
|
||||
RET
|
||||
success_avx2:
|
||||
VZEROUPPER
|
||||
JMP success
|
||||
sse42:
|
||||
CMPB runtime·support_sse42(SB), $1
|
||||
JNE no_sse42
|
||||
CMPQ AX, $12
|
||||
// PCMPESTRI is slower than normal compare,
|
||||
// so using it makes sense only if we advance 4+ bytes per compare
|
||||
// This value was determined experimentally and is the ~same
|
||||
// on Nehalem (first with SSE42) and Haswell.
|
||||
JAE _9_or_more
|
||||
LEAQ 16(BP), SI
|
||||
TESTW $0xff0, SI
|
||||
JEQ no_sse42
|
||||
MOVOU (BP), X1
|
||||
LEAQ -15(DI)(DX*1), SI
|
||||
MOVQ $16, R9
|
||||
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
|
||||
loop_sse42:
|
||||
// 0x0c means: unsigned byte compare (bits 0,1 are 00)
|
||||
// for equality (bits 2,3 are 11)
|
||||
// result is not masked or inverted (bits 4,5 are 00)
|
||||
// and corresponds to first matching byte (bit 6 is 0)
|
||||
PCMPESTRI $0x0c, (DI), X1
|
||||
// CX == 16 means no match,
|
||||
// CX > R9 means partial match at the end of the string,
|
||||
// otherwise sep is at offset CX from X1 start
|
||||
CMPQ CX, R9
|
||||
JBE sse42_success
|
||||
ADDQ R9, DI
|
||||
CMPQ DI, SI
|
||||
JB loop_sse42
|
||||
PCMPESTRI $0x0c, -1(SI), X1
|
||||
CMPQ CX, R9
|
||||
JA fail
|
||||
LEAQ -1(SI), DI
|
||||
sse42_success:
|
||||
ADDQ CX, DI
|
||||
success:
|
||||
SUBQ R10, DI
|
||||
MOVQ DI, (R11)
|
||||
RET
|
||||
|
||||
TEXT runtime·return0(SB), NOSPLIT, $0
|
||||
MOVL $0, AX
|
||||
RET
|
||||
|
@ -796,230 +796,6 @@ TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
|
||||
// compile barrier.
|
||||
RET
|
||||
|
||||
// func supportsVX() bool
|
||||
TEXT strings·supportsVX(SB),NOSPLIT,$0-1
|
||||
MOVBZ runtime·cpu+facilities_hasVX(SB), R0
|
||||
MOVB R0, ret+0(FP)
|
||||
RET
|
||||
|
||||
// func supportsVX() bool
|
||||
TEXT bytes·supportsVX(SB),NOSPLIT,$0-1
|
||||
MOVBZ runtime·cpu+facilities_hasVX(SB), R0
|
||||
MOVB R0, ret+0(FP)
|
||||
RET
|
||||
|
||||
// func indexShortStr(s, sep string) int
|
||||
// Caller must confirm availability of vx facility before calling.
|
||||
TEXT strings·indexShortStr(SB),NOSPLIT|NOFRAME,$0-40
|
||||
LMG s+0(FP), R1, R2 // R1=&s[0], R2=len(s)
|
||||
LMG sep+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
|
||||
MOVD $ret+32(FP), R5
|
||||
BR runtime·indexShortStr(SB)
|
||||
|
||||
// func indexShortStr(s, sep []byte) int
|
||||
// Caller must confirm availability of vx facility before calling.
|
||||
TEXT bytes·indexShortStr(SB),NOSPLIT|NOFRAME,$0-56
|
||||
LMG s+0(FP), R1, R2 // R1=&s[0], R2=len(s)
|
||||
LMG sep+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
|
||||
MOVD $ret+48(FP), R5
|
||||
BR runtime·indexShortStr(SB)
|
||||
|
||||
// s: string we are searching
|
||||
// sep: string to search for
|
||||
// R1=&s[0], R2=len(s)
|
||||
// R3=&sep[0], R4=len(sep)
|
||||
// R5=&ret (int)
|
||||
// Caller must confirm availability of vx facility before calling.
|
||||
TEXT runtime·indexShortStr(SB),NOSPLIT|NOFRAME,$0
|
||||
CMPBGT R4, R2, notfound
|
||||
ADD R1, R2
|
||||
SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
|
||||
CMPBEQ R4, $0, notfound
|
||||
SUB $1, R4 // R4=len(sep)-1 for use as VLL index
|
||||
VLL R4, (R3), V0 // contains first 16 bytes of sep
|
||||
MOVD R1, R7
|
||||
index2plus:
|
||||
CMPBNE R4, $1, index3plus
|
||||
MOVD $15(R7), R9
|
||||
CMPBGE R9, R2, index2to16
|
||||
VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
|
||||
VONE V16
|
||||
VREPH $0, V0, V1
|
||||
CMPBGE R9, R2, index2to16
|
||||
index2loop:
|
||||
VL 0(R7), V2 // 16 bytes, even indices
|
||||
VL 1(R7), V4 // 16 bytes, odd indices
|
||||
VCEQH V1, V2, V5 // compare even indices
|
||||
VCEQH V1, V4, V6 // compare odd indices
|
||||
VSEL V5, V6, V31, V7 // merge even and odd indices
|
||||
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
|
||||
BLT foundV17
|
||||
MOVD $16(R7), R7 // R7+=16
|
||||
ADD $15, R7, R9
|
||||
CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
|
||||
CMPBLE R7, R2, index2to16
|
||||
BR notfound
|
||||
|
||||
index3plus:
|
||||
CMPBNE R4, $2, index4plus
|
||||
ADD $15, R7, R9
|
||||
CMPBGE R9, R2, index2to16
|
||||
MOVD $1, R0
|
||||
VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
|
||||
VONE V16
|
||||
VREPH $0, V0, V1
|
||||
VREPB $2, V0, V8
|
||||
index3loop:
|
||||
VL (R7), V2 // load 16-bytes into V2
|
||||
VLL R0, 16(R7), V3 // load 2-bytes into V3
|
||||
VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
|
||||
VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
|
||||
VCEQH V1, V2, V5 // compare 2-byte even indices
|
||||
VCEQH V1, V4, V6 // compare 2-byte odd indices
|
||||
VCEQB V8, V9, V10 // compare last bytes
|
||||
VSEL V5, V6, V31, V7 // merge even and odd indices
|
||||
VN V7, V10, V7 // AND indices with last byte
|
||||
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
|
||||
BLT foundV17
|
||||
MOVD $16(R7), R7 // R7+=16
|
||||
ADD $15, R7, R9
|
||||
CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
|
||||
CMPBLE R7, R2, index2to16
|
||||
BR notfound
|
||||
|
||||
index4plus:
|
||||
CMPBNE R4, $3, index5plus
|
||||
ADD $15, R7, R9
|
||||
CMPBGE R9, R2, index2to16
|
||||
MOVD $2, R0
|
||||
VGBM $0x8888, V29 // 0xff000000ff000000...
|
||||
VGBM $0x2222, V30 // 0x0000ff000000ff00...
|
||||
VGBM $0xcccc, V31 // 0xffff0000ffff0000...
|
||||
VONE V16
|
||||
VREPF $0, V0, V1
|
||||
index4loop:
|
||||
VL (R7), V2 // load 16-bytes into V2
|
||||
VLL R0, 16(R7), V3 // load 3-bytes into V3
|
||||
VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
|
||||
VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
|
||||
VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<3
|
||||
VCEQF V1, V2, V5 // compare index 0, 4, ...
|
||||
VCEQF V1, V4, V6 // compare index 1, 5, ...
|
||||
VCEQF V1, V9, V11 // compare index 2, 6, ...
|
||||
VCEQF V1, V10, V12 // compare index 3, 7, ...
|
||||
VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
|
||||
VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
|
||||
VSEL V13, V14, V31, V7 // final merge
|
||||
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
|
||||
BLT foundV17
|
||||
MOVD $16(R7), R7 // R7+=16
|
||||
ADD $15, R7, R9
|
||||
CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
|
||||
CMPBLE R7, R2, index2to16
|
||||
BR notfound
|
||||
|
||||
index5plus:
|
||||
CMPBGT R4, $15, index17plus
|
||||
index2to16:
|
||||
CMPBGT R7, R2, notfound
|
||||
MOVD $1(R7), R8
|
||||
CMPBGT R8, R2, index2to16tail
|
||||
index2to16loop:
|
||||
// unrolled 2x
|
||||
VLL R4, (R7), V1
|
||||
VLL R4, 1(R7), V2
|
||||
VCEQGS V0, V1, V3
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
VCEQGS V0, V2, V4
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
CMPBLT R7, R2, index2to16loop
|
||||
CMPBGT R7, R2, notfound
|
||||
index2to16tail:
|
||||
VLL R4, (R7), V1
|
||||
VCEQGS V0, V1, V2
|
||||
BEQ found
|
||||
BR notfound
|
||||
|
||||
index17plus:
|
||||
CMPBGT R4, $31, index33plus
|
||||
SUB $16, R4, R0
|
||||
VLL R0, 16(R3), V1
|
||||
VONE V7
|
||||
index17to32loop:
|
||||
VL (R7), V2
|
||||
VLL R0, 16(R7), V3
|
||||
VCEQG V0, V2, V4
|
||||
VCEQG V1, V3, V5
|
||||
VN V4, V5, V6
|
||||
VCEQGS V6, V7, V8
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
CMPBLE R7, R2, index17to32loop
|
||||
BR notfound
|
||||
|
||||
index33plus:
|
||||
CMPBGT R4, $47, index49plus
|
||||
SUB $32, R4, R0
|
||||
VL 16(R3), V1
|
||||
VLL R0, 32(R3), V2
|
||||
VONE V11
|
||||
index33to48loop:
|
||||
VL (R7), V3
|
||||
VL 16(R7), V4
|
||||
VLL R0, 32(R7), V5
|
||||
VCEQG V0, V3, V6
|
||||
VCEQG V1, V4, V7
|
||||
VCEQG V2, V5, V8
|
||||
VN V6, V7, V9
|
||||
VN V8, V9, V10
|
||||
VCEQGS V10, V11, V12
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
CMPBLE R7, R2, index33to48loop
|
||||
BR notfound
|
||||
|
||||
index49plus:
|
||||
CMPBGT R4, $63, index65plus
|
||||
SUB $48, R4, R0
|
||||
VL 16(R3), V1
|
||||
VL 32(R3), V2
|
||||
VLL R0, 48(R3), V3
|
||||
VONE V15
|
||||
index49to64loop:
|
||||
VL (R7), V4
|
||||
VL 16(R7), V5
|
||||
VL 32(R7), V6
|
||||
VLL R0, 48(R7), V7
|
||||
VCEQG V0, V4, V8
|
||||
VCEQG V1, V5, V9
|
||||
VCEQG V2, V6, V10
|
||||
VCEQG V3, V7, V11
|
||||
VN V8, V9, V12
|
||||
VN V10, V11, V13
|
||||
VN V12, V13, V14
|
||||
VCEQGS V14, V15, V16
|
||||
BEQ found
|
||||
MOVD $1(R7), R7
|
||||
CMPBLE R7, R2, index49to64loop
|
||||
notfound:
|
||||
MOVD $-1, (R5)
|
||||
RET
|
||||
|
||||
index65plus:
|
||||
// not implemented
|
||||
MOVD $0, (R0)
|
||||
RET
|
||||
|
||||
foundV17: // index is in doubleword V17[0]
|
||||
VLGVG $0, V17, R8
|
||||
ADD R8, R7
|
||||
found:
|
||||
SUB R1, R7
|
||||
MOVD R7, (R5)
|
||||
RET
|
||||
|
||||
// This is called from .init_array and follows the platform, not Go, ABI.
|
||||
// We are overly conservative. We could only save the registers we use.
|
||||
// However, since this function is only called once per loaded module
|
||||
|
@ -4,32 +4,16 @@
|
||||
|
||||
package runtime
|
||||
|
||||
import (
|
||||
internalcpu "internal/cpu"
|
||||
"runtime/internal/sys"
|
||||
)
|
||||
import "internal/cpu"
|
||||
|
||||
const (
|
||||
// bit masks taken from bits/hwcap.h
|
||||
_HWCAP_S390_VX = 2048 // vector facility
|
||||
)
|
||||
|
||||
// facilities is padded to avoid false sharing.
|
||||
type facilities struct {
|
||||
_ [sys.CacheLineSize]byte
|
||||
hasVX bool // vector facility
|
||||
_ [sys.CacheLineSize]byte
|
||||
}
|
||||
|
||||
// cpu indicates the availability of s390x facilities that can be used in
|
||||
// Go assembly but are optional on models supported by Go.
|
||||
// TODO: remove this once we're only using internal/cpu.
|
||||
var cpu facilities
|
||||
|
||||
func archauxv(tag, val uintptr) {
|
||||
switch tag {
|
||||
case _AT_HWCAP: // CPU capability bit flags
|
||||
internalcpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
|
||||
cpu.hasVX = val&_HWCAP_S390_VX != 0
|
||||
cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
|
||||
}
|
||||
}
|
||||
|
@ -932,6 +932,85 @@ func EqualFold(s, t string) bool {
|
||||
return s == t
|
||||
}
|
||||
|
||||
// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
|
||||
func Index(s, substr string) int {
|
||||
n := len(substr)
|
||||
switch {
|
||||
case n == 0:
|
||||
return 0
|
||||
case n == 1:
|
||||
return IndexByte(s, substr[0])
|
||||
case n == len(s):
|
||||
if substr == s {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
case n > len(s):
|
||||
return -1
|
||||
case n <= bytealg.MaxLen:
|
||||
// Use brute force when s and substr both are small
|
||||
if len(s) <= bytealg.MaxBruteForce {
|
||||
return bytealg.IndexString(s, substr)
|
||||
}
|
||||
c := substr[0]
|
||||
i := 0
|
||||
t := s[:len(s)-n+1]
|
||||
fails := 0
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
// IndexByte is faster than bytealg.IndexString, so use it as long as
|
||||
// we're not getting lots of false positives.
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
return -1
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if s[i:i+n] == substr {
|
||||
return i
|
||||
}
|
||||
fails++
|
||||
i++
|
||||
// Switch to bytealg.IndexString when IndexByte produces too many false positives.
|
||||
if fails > bytealg.Cutover(i) {
|
||||
r := bytealg.IndexString(s[i:], substr)
|
||||
if r >= 0 {
|
||||
return r + i
|
||||
}
|
||||
return -1
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
c := substr[0]
|
||||
i := 0
|
||||
t := s[:len(s)-n+1]
|
||||
fails := 0
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
return -1
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if s[i:i+n] == substr {
|
||||
return i
|
||||
}
|
||||
i++
|
||||
fails++
|
||||
if fails >= 4+i>>4 && i < len(t) {
|
||||
// See comment in ../bytes/bytes_generic.go.
|
||||
j := indexRabinKarp(s[i:], substr)
|
||||
if j < 0 {
|
||||
return -1
|
||||
}
|
||||
return i + j
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func indexRabinKarp(s, substr string) int {
|
||||
// Rabin-Karp search
|
||||
hashss, pow := hashStr(substr)
|
||||
|
@ -1,79 +0,0 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package strings
|
||||
|
||||
import "internal/cpu"
|
||||
|
||||
//go:noescape
|
||||
|
||||
// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
|
||||
// indexShortStr requires 2 <= len(c) <= shortStringLen
|
||||
func indexShortStr(s, c string) int // ../runtime/asm_amd64.s
|
||||
func countByte(s string, c byte) int // ../runtime/asm_amd64.s
|
||||
|
||||
var shortStringLen int
|
||||
|
||||
func init() {
|
||||
if cpu.X86.HasAVX2 {
|
||||
shortStringLen = 63
|
||||
} else {
|
||||
shortStringLen = 31
|
||||
}
|
||||
}
|
||||
|
||||
// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
|
||||
func Index(s, substr string) int {
|
||||
n := len(substr)
|
||||
switch {
|
||||
case n == 0:
|
||||
return 0
|
||||
case n == 1:
|
||||
return IndexByte(s, substr[0])
|
||||
case n == len(s):
|
||||
if substr == s {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
case n > len(s):
|
||||
return -1
|
||||
case n <= shortStringLen:
|
||||
// Use brute force when s and substr both are small
|
||||
if len(s) <= 64 {
|
||||
return indexShortStr(s, substr)
|
||||
}
|
||||
c := substr[0]
|
||||
i := 0
|
||||
t := s[:len(s)-n+1]
|
||||
fails := 0
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
// IndexByte skips 16/32 bytes per iteration,
|
||||
// so it's faster than indexShortStr.
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
return -1
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if s[i:i+n] == substr {
|
||||
return i
|
||||
}
|
||||
fails++
|
||||
i++
|
||||
// Switch to indexShortStr when IndexByte produces too many false positives.
|
||||
// Too many means more than 1 error per 8 characters.
|
||||
// Allow some errors in the beginning.
|
||||
if fails > (i+16)/8 {
|
||||
r := indexShortStr(s[i:], substr)
|
||||
if r >= 0 {
|
||||
return r + i
|
||||
}
|
||||
return -1
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
return indexRabinKarp(s, substr)
|
||||
}
|
@ -1,55 +0,0 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!s390x
|
||||
|
||||
package strings
|
||||
|
||||
// TODO: implements short string optimization on non amd64 platforms
|
||||
// and get rid of strings_amd64.go
|
||||
|
||||
// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
|
||||
func Index(s, substr string) int {
|
||||
n := len(substr)
|
||||
switch {
|
||||
case n == 0:
|
||||
return 0
|
||||
case n == 1:
|
||||
return IndexByte(s, substr[0])
|
||||
case n == len(s):
|
||||
if substr == s {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
case n > len(s):
|
||||
return -1
|
||||
}
|
||||
c := substr[0]
|
||||
i := 0
|
||||
t := s[:len(s)-n+1]
|
||||
fails := 0
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
return -1
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if s[i:i+n] == substr {
|
||||
return i
|
||||
}
|
||||
i++
|
||||
fails++
|
||||
if fails >= 4+i>>4 && i < len(t) {
|
||||
// See comment in ../bytes/bytes_generic.go.
|
||||
j := indexRabinKarp(s[i:], substr)
|
||||
if j < 0 {
|
||||
return -1
|
||||
}
|
||||
return i + j
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
@ -1,80 +0,0 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package strings
|
||||
|
||||
//go:noescape
|
||||
|
||||
// indexShortStr returns the index of the first instance of sep in s,
|
||||
// or -1 if sep is not present in s.
|
||||
// indexShortStr requires 2 <= len(sep) <= shortStringLen
|
||||
func indexShortStr(s, sep string) int // ../runtime/asm_$GOARCH.s
|
||||
|
||||
// supportsVX reports whether the vector facility is available.
|
||||
// indexShortStr must not be called if the vector facility is not
|
||||
// available.
|
||||
func supportsVX() bool // ../runtime/asm_s390x.s
|
||||
|
||||
var shortStringLen = -1
|
||||
|
||||
func init() {
|
||||
if supportsVX() {
|
||||
shortStringLen = 64
|
||||
}
|
||||
}
|
||||
|
||||
// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
|
||||
func Index(s, substr string) int {
|
||||
n := len(substr)
|
||||
switch {
|
||||
case n == 0:
|
||||
return 0
|
||||
case n == 1:
|
||||
return IndexByte(s, substr[0])
|
||||
case n == len(s):
|
||||
if substr == s {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
case n > len(s):
|
||||
return -1
|
||||
case n <= shortStringLen:
|
||||
// Use brute force when s and substr both are small
|
||||
if len(s) <= 64 {
|
||||
return indexShortStr(s, substr)
|
||||
}
|
||||
c := substr[0]
|
||||
i := 0
|
||||
t := s[:len(s)-n+1]
|
||||
fails := 0
|
||||
for i < len(t) {
|
||||
if t[i] != c {
|
||||
// IndexByte skips 16/32 bytes per iteration,
|
||||
// so it's faster than indexShortStr.
|
||||
o := IndexByte(t[i:], c)
|
||||
if o < 0 {
|
||||
return -1
|
||||
}
|
||||
i += o
|
||||
}
|
||||
if s[i:i+n] == substr {
|
||||
return i
|
||||
}
|
||||
fails++
|
||||
i++
|
||||
// Switch to indexShortStr when IndexByte produces too many false positives.
|
||||
// Too many means more than 1 error per 8 characters.
|
||||
// Allow some errors in the beginning.
|
||||
if fails > (i+16)/8 {
|
||||
r := indexShortStr(s[i:], substr)
|
||||
if r >= 0 {
|
||||
return r + i
|
||||
}
|
||||
return -1
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
return indexRabinKarp(s, substr)
|
||||
}
|
Loading…
Reference in New Issue
Block a user