From ee58eccc565c0871d3f16fd702fd8649a3fb61ea Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Sun, 4 Mar 2018 09:47:47 -0800 Subject: [PATCH] internal/bytealg: move short string Index implementations into bytealg Also move the arm64 CountByte implementation while we're here. Fixes #19792 Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e Reviewed-on: https://go-review.googlesource.com/98518 Run-TryBot: Keith Randall TryBot-Result: Gobot Gobot Reviewed-by: Brad Fitzpatrick --- src/bytes/bytes.go | 86 ++++++ src/bytes/bytes_amd64.go | 79 ----- src/bytes/bytes_arm64.go | 72 ----- src/bytes/bytes_generic.go | 59 ---- src/bytes/bytes_s390x.go | 80 ----- src/cmd/vet/all/whitelist/amd64.txt | 11 +- src/cmd/vet/all/whitelist/s390x.txt | 5 - src/internal/bytealg/bytealg.go | 22 ++ src/internal/bytealg/count_arm64.s | 90 ++++++ src/internal/bytealg/count_generic.go | 2 +- src/internal/bytealg/count_native.go | 2 +- src/internal/bytealg/equal_native.go | 16 - src/internal/bytealg/index_amd64.go | 26 ++ src/internal/bytealg/index_amd64.s | 274 ++++++++++++++++++ src/internal/bytealg/index_arm64.go | 23 ++ .../bytealg/index_arm64.s} | 108 ++----- src/internal/bytealg/index_generic.go | 29 ++ src/internal/bytealg/index_native.go | 19 ++ src/internal/bytealg/index_s390x.go | 31 ++ src/internal/bytealg/index_s390x.s | 216 ++++++++++++++ src/runtime/asm_amd64.s | 268 ----------------- src/runtime/asm_s390x.s | 224 -------------- src/runtime/os_linux_s390x.go | 20 +- src/strings/strings.go | 79 +++++ src/strings/strings_amd64.go | 79 ----- src/strings/strings_generic.go | 55 ---- src/strings/strings_s390x.go | 80 ----- 27 files changed, 932 insertions(+), 1123 deletions(-) delete mode 100644 src/bytes/bytes_amd64.go delete mode 100644 src/bytes/bytes_arm64.go delete mode 100644 src/bytes/bytes_generic.go delete mode 100644 src/bytes/bytes_s390x.go create mode 100644 src/internal/bytealg/bytealg.go create mode 100644 src/internal/bytealg/count_arm64.s create mode 100644 
src/internal/bytealg/index_amd64.go create mode 100644 src/internal/bytealg/index_amd64.s create mode 100644 src/internal/bytealg/index_arm64.go rename src/{bytes/bytes_arm64.s => internal/bytealg/index_arm64.s} (50%) create mode 100644 src/internal/bytealg/index_generic.go create mode 100644 src/internal/bytealg/index_native.go create mode 100644 src/internal/bytealg/index_s390x.go create mode 100644 src/internal/bytealg/index_s390x.s delete mode 100644 src/strings/strings_amd64.go delete mode 100644 src/strings/strings_generic.go delete mode 100644 src/strings/strings_s390x.go diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go index 08d8260e9ee..32bf6ab30d3 100644 --- a/src/bytes/bytes.go +++ b/src/bytes/bytes.go @@ -829,6 +829,92 @@ func EqualFold(s, t []byte) bool { return len(s) == len(t) } +// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. +func Index(s, sep []byte) int { + n := len(sep) + switch { + case n == 0: + return 0 + case n == 1: + return IndexByte(s, sep[0]) + case n == len(s): + if Equal(sep, s) { + return 0 + } + return -1 + case n > len(s): + return -1 + case n <= bytealg.MaxLen: + // Use brute force when s and sep both are small + if len(s) <= bytealg.MaxBruteForce { + return bytealg.Index(s, sep) + } + c := sep[0] + i := 0 + t := s[:len(s)-n+1] + fails := 0 + for i < len(t) { + if t[i] != c { + // IndexByte is faster than bytealg.Index, so use it as long as + // we're not getting lots of false positives. + o := IndexByte(t[i:], c) + if o < 0 { + return -1 + } + i += o + } + if Equal(s[i:i+n], sep) { + return i + } + fails++ + i++ + // Switch to bytealg.Index when IndexByte produces too many false positives. 
+ if fails > bytealg.Cutover(i) { + r := bytealg.Index(s[i:], sep) + if r >= 0 { + return r + i + } + return -1 + } + } + return -1 + } + c := sep[0] + i := 0 + fails := 0 + t := s[:len(s)-n+1] + for i < len(t) { + if t[i] != c { + o := IndexByte(t[i:], c) + if o < 0 { + break + } + i += o + } + if Equal(s[i:i+n], sep) { + return i + } + i++ + fails++ + if fails >= 4+i>>4 && i < len(t) { + // Give up on IndexByte, it isn't skipping ahead + // far enough to be better than Rabin-Karp. + // Experiments (using IndexPeriodic) suggest + // the cutover is about 16 byte skips. + // TODO: if large prefixes of sep are matching + // we should cutover at even larger average skips, + // because Equal becomes that much more expensive. + // This code does not take that effect into account. + j := indexRabinKarp(s[i:], sep) + if j < 0 { + return -1 + } + return i + j + } + } + return -1 +} + func indexRabinKarp(s, sep []byte) int { // Rabin-Karp search hashsep, pow := hashStr(sep) diff --git a/src/bytes/bytes_amd64.go b/src/bytes/bytes_amd64.go deleted file mode 100644 index 2fc88c68fc8..00000000000 --- a/src/bytes/bytes_amd64.go +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package bytes - -import "internal/cpu" - -//go:noescape - -// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s. -// indexShortStr requires 2 <= len(c) <= shortStringLen -func indexShortStr(s, c []byte) int // ../runtime/asm_amd64.s -func countByte(s []byte, c byte) int // ../runtime/asm_amd64.s - -var shortStringLen int - -func init() { - if cpu.X86.HasAVX2 { - shortStringLen = 63 - } else { - shortStringLen = 31 - } -} - -// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. 
-func Index(s, sep []byte) int { - n := len(sep) - switch { - case n == 0: - return 0 - case n == 1: - return IndexByte(s, sep[0]) - case n == len(s): - if Equal(sep, s) { - return 0 - } - return -1 - case n > len(s): - return -1 - case n <= shortStringLen: - // Use brute force when s and sep both are small - if len(s) <= 64 { - return indexShortStr(s, sep) - } - c := sep[0] - i := 0 - t := s[:len(s)-n+1] - fails := 0 - for i < len(t) { - if t[i] != c { - // IndexByte skips 16/32 bytes per iteration, - // so it's faster than indexShortStr. - o := IndexByte(t[i:], c) - if o < 0 { - return -1 - } - i += o - } - if Equal(s[i:i+n], sep) { - return i - } - fails++ - i++ - // Switch to indexShortStr when IndexByte produces too many false positives. - // Too many means more that 1 error per 8 characters. - // Allow some errors in the beginning. - if fails > (i+16)/8 { - r := indexShortStr(s[i:], sep) - if r >= 0 { - return r + i - } - return -1 - } - } - return -1 - } - return indexRabinKarp(s, sep) -} diff --git a/src/bytes/bytes_arm64.go b/src/bytes/bytes_arm64.go deleted file mode 100644 index 39e9562db18..00000000000 --- a/src/bytes/bytes_arm64.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package bytes - -func countByte(s []byte, c byte) int // bytes_arm64.s - -// 8 bytes can be completely loaded into 1 register. -const shortStringLen = 8 - -//go:noescape -func indexShortStr(s, sep []byte) int - -// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. -func Index(s, sep []byte) int { - n := len(sep) - switch { - case n == 0: - return 0 - case n == 1: - return IndexByte(s, sep[0]) - case n == len(s): - if Equal(sep, s) { - return 0 - } - return -1 - case n > len(s): - return -1 - case n <= shortStringLen: - // Use brute force when both s and sep are small. 
- // Empirical data shows that it can get better - // performance when len(s) <= 16. - if len(s) <= 16 { - return indexShortStr(s, sep) - } - } - c := sep[0] - i := 0 - fails := 0 - t := s[:len(s)-n+1] - for i < len(t) { - if t[i] != c { - o := IndexByte(t[i:], c) - if o < 0 { - break - } - i += o - } - if Equal(s[i:i+n], sep) { - return i - } - i++ - fails++ - if fails >= 4+i>>4 && i < len(t) { - // Give up on IndexByte, it isn't skipping ahead - // far enough to be better than Rabin-Karp. - // Experiments (using IndexPeriodic) suggest - // the cutover is about 16 byte skips. - // TODO: if large prefixes of sep are matching - // we should cutover at even larger average skips, - // because Equal becomes that much more expensive. - // This code does not take that effect into account. - j := indexRabinKarp(s[i:], sep) - if j < 0 { - return -1 - } - return i + j - } - } - return -1 -} diff --git a/src/bytes/bytes_generic.go b/src/bytes/bytes_generic.go deleted file mode 100644 index 347d28473ff..00000000000 --- a/src/bytes/bytes_generic.go +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !amd64,!s390x,!arm64 - -package bytes - -// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. -func Index(s, sep []byte) int { - n := len(sep) - switch { - case n == 0: - return 0 - case n == 1: - return IndexByte(s, sep[0]) - case n == len(s): - if Equal(sep, s) { - return 0 - } - return -1 - case n > len(s): - return -1 - } - c := sep[0] - i := 0 - fails := 0 - t := s[:len(s)-n+1] - for i < len(t) { - if t[i] != c { - o := IndexByte(t[i:], c) - if o < 0 { - break - } - i += o - } - if Equal(s[i:i+n], sep) { - return i - } - i++ - fails++ - if fails >= 4+i>>4 && i < len(t) { - // Give up on IndexByte, it isn't skipping ahead - // far enough to be better than Rabin-Karp. 
- // Experiments (using IndexPeriodic) suggest - // the cutover is about 16 byte skips. - // TODO: if large prefixes of sep are matching - // we should cutover at even larger average skips, - // because Equal becomes that much more expensive. - // This code does not take that effect into account. - j := indexRabinKarp(s[i:], sep) - if j < 0 { - return -1 - } - return i + j - } - } - return -1 -} diff --git a/src/bytes/bytes_s390x.go b/src/bytes/bytes_s390x.go deleted file mode 100644 index 84f040d43d3..00000000000 --- a/src/bytes/bytes_s390x.go +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package bytes - -//go:noescape - -// indexShortStr returns the index of the first instance of sep in s, -// or -1 if sep is not present in s. -// indexShortStr requires 2 <= len(sep) <= shortStringLen -func indexShortStr(s, c []byte) int // ../runtime/asm_s390x.s - -// supportsVX reports whether the vector facility is available. -// indexShortStr must not be called if the vector facility is not -// available. -func supportsVX() bool // ../runtime/asm_s390x.s - -var shortStringLen = -1 - -func init() { - if supportsVX() { - shortStringLen = 64 - } -} - -// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. -func Index(s, sep []byte) int { - n := len(sep) - switch { - case n == 0: - return 0 - case n == 1: - return IndexByte(s, sep[0]) - case n == len(s): - if Equal(sep, s) { - return 0 - } - return -1 - case n > len(s): - return -1 - case n <= shortStringLen: - // Use brute force when s and sep both are small - if len(s) <= 64 { - return indexShortStr(s, sep) - } - c := sep[0] - i := 0 - t := s[:len(s)-n+1] - fails := 0 - for i < len(t) { - if t[i] != c { - // IndexByte skips 16/32 bytes per iteration, - // so it's faster than indexShortStr. 
- o := IndexByte(t[i:], c) - if o < 0 { - return -1 - } - i += o - } - if Equal(s[i:i+n], sep) { - return i - } - fails++ - i++ - // Switch to indexShortStr when IndexByte produces too many false positives. - // Too many means more that 1 error per 8 characters. - // Allow some errors in the beginning. - if fails > (i+16)/8 { - r := indexShortStr(s[i:], sep) - if r >= 0 { - return r + i - } - return -1 - } - } - return -1 - } - return indexRabinKarp(s, sep) -} diff --git a/src/cmd/vet/all/whitelist/amd64.txt b/src/cmd/vet/all/whitelist/amd64.txt index 0e8ff741945..2268b393539 100644 --- a/src/cmd/vet/all/whitelist/amd64.txt +++ b/src/cmd/vet/all/whitelist/amd64.txt @@ -1,20 +1,16 @@ // amd64-specific vet whitelist. See readme.txt for details. -internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes -internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime - // False positives. +// Nothing much to do about cross-package assembly. Unfortunate. +internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes +internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime // reflect trampolines intentionally omit arg size. Same for morestack. runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame runtime/asm_amd64.s: [amd64] morestack: use of 16(SP) points beyond argument frame runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame -// Nothing much to do about cross-package assembly. Unfortunate. -runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package strings -runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes - // Intentionally missing declarations. 
These are special assembly routines. // Some are jumped into from other routines, with values in specific registers. // duff* have direct calls from the compiler. @@ -25,4 +21,3 @@ runtime/asm_amd64.s: [amd64] addmoduledata: function addmoduledata missing Go de runtime/duff_amd64.s: [amd64] duffzero: function duffzero missing Go declaration runtime/duff_amd64.s: [amd64] duffcopy: function duffcopy missing Go declaration runtime/asm_amd64.s: [amd64] stackcheck: function stackcheck missing Go declaration -runtime/asm_amd64.s: [amd64] indexShortStr: function indexShortStr missing Go declaration diff --git a/src/cmd/vet/all/whitelist/s390x.txt b/src/cmd/vet/all/whitelist/s390x.txt index 025c9dce52b..57ff51f360a 100644 --- a/src/cmd/vet/all/whitelist/s390x.txt +++ b/src/cmd/vet/all/whitelist/s390x.txt @@ -1,11 +1,6 @@ runtime/asm_s390x.s: [s390x] abort: function abort missing Go declaration internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: Compare is in package bytes internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: cmpstring is in package runtime -runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package strings -runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package bytes -runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package strings -runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package bytes -runtime/asm_s390x.s: [s390x] indexShortStr: function indexShortStr missing Go declaration runtime/asm_s390x.s: [s390x] addmoduledata: function addmoduledata missing Go declaration runtime/memclr_s390x.s: [s390x] memclr_s390x_exrl_xc: function memclr_s390x_exrl_xc missing Go declaration runtime/memmove_s390x.s: [s390x] memmove_s390x_exrl_mvc: function memmove_s390x_exrl_mvc missing Go declaration diff --git 
a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go new file mode 100644 index 00000000000..1ab7c30f4ea --- /dev/null +++ b/src/internal/bytealg/bytealg.go @@ -0,0 +1,22 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +import ( + "internal/cpu" + "unsafe" +) + +// Offsets into internal/cpu records for use in assembly. +const ( + x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2) + x86_HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42) + x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) + x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) + s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX) +) + +// MaxLen is the maximum length of the string to be searched for (argument b) in Index. +var MaxLen int diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s new file mode 100644 index 00000000000..8cd703d943c --- /dev/null +++ b/src/internal/bytealg/count_arm64.s @@ -0,0 +1,90 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Count(SB),NOSPLIT,$0-40 + MOVD b_base+0(FP), R0 + MOVD b_len+8(FP), R2 + MOVBU c+24(FP), R1 + MOVD $ret+32(FP), R8 + B countbytebody<>(SB) + +TEXT ·CountString(SB),NOSPLIT,$0-32 + MOVD s_base+0(FP), R0 + MOVD s_len+8(FP), R2 + MOVBU c+16(FP), R1 + MOVD $ret+24(FP), R8 + B countbytebody<>(SB) + +// input: +// R0: data +// R2: data len +// R1: byte to find +// R8: address to put result +TEXT countbytebody<>(SB),NOSPLIT,$0 + // R11 = count of byte to search + MOVD $0, R11 + // short path to handle 0-byte case + CBZ R2, done + CMP $0x20, R2 + // jump directly to tail if length < 32 + BLO tail + ANDS $0x1f, R0, R9 + BEQ chunk + // Work with not 32-byte aligned head + BIC $0x1f, R0, R3 + ADD $0x20, R3 +head_loop: + MOVBU.P 1(R0), R5 + CMP R5, R1 + CINC EQ, R11, R11 + SUB $1, R2, R2 + CMP R0, R3 + BNE head_loop + // Work with 32-byte aligned chunks +chunk: + BIC $0x1f, R2, R9 + // The first chunk can also be the last + CBZ R9, tail + // R3 = end of 32-byte chunks + ADD R0, R9, R3 + MOVD $1, R5 + VMOV R5, V5.B16 + // R2 = length of tail + SUB R9, R2, R2 + // Duplicate R1 (byte to search) to 16 1-byte elements of V0 + VMOV R1, V0.B16 + // Clear the low 64-bit element of V7 and V8 + VEOR V7.B8, V7.B8, V7.B8 + VEOR V8.B8, V8.B8, V8.B8 + // Count the target byte in 32-byte chunk +chunk_loop: + VLD1.P (R0), [V1.B16, V2.B16] + CMP R0, R3 + VCMEQ V0.B16, V1.B16, V3.B16 + VCMEQ V0.B16, V2.B16, V4.B16 + // Clear the higher 7 bits + VAND V5.B16, V3.B16, V3.B16 + VAND V5.B16, V4.B16, V4.B16 + // Count lanes match the requested byte + VADDP V4.B16, V3.B16, V6.B16 // 32B->16B + VUADDLV V6.B16, V7 + // Accumulate the count in low 64-bit element of V8 when inside the loop + VADD V7, V8 + BNE chunk_loop + VMOV V8.D[0], R6 + ADD R6, R11, R11 + CBZ R2, done +tail: + // Work with tail shorter than 32 bytes + MOVBU.P 1(R0), R5 + SUB $1, R2, R2 + CMP R5, R1 + CINC EQ, R11, R11 + CBNZ R2, tail +done: + MOVD R11, (R8) + RET diff --git 
a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go index acc5a79827f..a763b3bc616 100644 --- a/src/internal/bytealg/count_generic.go +++ b/src/internal/bytealg/count_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64 +// +build !amd64,!arm64 package bytealg diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go index e6d3b066aa0..a62c4cb5c09 100644 --- a/src/internal/bytealg/count_native.go +++ b/src/internal/bytealg/count_native.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 +// +build amd64 arm64 package bytealg diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go index 55d184a58ba..b5c453086c9 100644 --- a/src/internal/bytealg/equal_native.go +++ b/src/internal/bytealg/equal_native.go @@ -4,24 +4,8 @@ package bytealg -import ( - "internal/cpu" - "unsafe" -) - // Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly. -// Because equal_native.go is unconditional, it's a good place to compute asm constants. -// TODO: find a better way to do this? - -// Offsets into internal/cpu records for use in assembly. -const ( - x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2) - x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) - x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) - s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX) -) - //go:noescape func Equal(a, b []byte) bool diff --git a/src/internal/bytealg/index_amd64.go b/src/internal/bytealg/index_amd64.go new file mode 100644 index 00000000000..c7a1941e5f0 --- /dev/null +++ b/src/internal/bytealg/index_amd64.go @@ -0,0 +1,26 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package bytealg + +import "internal/cpu" + +const MaxBruteForce = 64 + +func init() { + if cpu.X86.HasAVX2 { + MaxLen = 63 + } else { + MaxLen = 31 + } +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 8 characters, plus a few slop to start. + return (n + 16) / 8 +} diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s new file mode 100644 index 00000000000..f7297c0cab4 --- /dev/null +++ b/src/internal/bytealg/index_amd64.s @@ -0,0 +1,274 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Index(SB),NOSPLIT,$0-56 + MOVQ a_base+0(FP), DI + MOVQ a_len+8(FP), DX + MOVQ b_base+24(FP), BP + MOVQ b_len+32(FP), AX + MOVQ DI, R10 + LEAQ ret+48(FP), R11 + JMP indexbody<>(SB) + +TEXT ·IndexString(SB),NOSPLIT,$0-40 + MOVQ a_base+0(FP), DI + MOVQ a_len+8(FP), DX + MOVQ b_base+16(FP), BP + MOVQ b_len+24(FP), AX + MOVQ DI, R10 + LEAQ ret+32(FP), R11 + JMP indexbody<>(SB) + +// AX: length of string, that we are searching for +// DX: length of string, in which we are searching +// DI: pointer to string, in which we are searching +// BP: pointer to string, that we are searching for +// R11: address, where to put return value +// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them +TEXT indexbody<>(SB),NOSPLIT,$0 + CMPQ AX, DX + JA fail + CMPQ DX, $16 + JAE sse42 +no_sse42: + CMPQ AX, $2 + JA _3_or_more + MOVW (BP), BP + LEAQ -1(DI)(DX*1), DX +loop2: + MOVW (DI), SI + CMPW SI,BP + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop2 + JMP fail +_3_or_more: + CMPQ AX, $3 + JA _4_or_more + MOVW 1(BP), BX + MOVW (BP), BP + LEAQ -2(DI)(DX*1), DX +loop3: + MOVW (DI), 
SI + CMPW SI,BP + JZ partial_success3 + ADDQ $1,DI + CMPQ DI,DX + JB loop3 + JMP fail +partial_success3: + MOVW 1(DI), SI + CMPW SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop3 + JMP fail +_4_or_more: + CMPQ AX, $4 + JA _5_or_more + MOVL (BP), BP + LEAQ -3(DI)(DX*1), DX +loop4: + MOVL (DI), SI + CMPL SI,BP + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop4 + JMP fail +_5_or_more: + CMPQ AX, $7 + JA _8_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVL -4(BP)(AX*1), BX + MOVL (BP), BP +loop5to7: + MOVL (DI), SI + CMPL SI,BP + JZ partial_success5to7 + ADDQ $1,DI + CMPQ DI,DX + JB loop5to7 + JMP fail +partial_success5to7: + MOVL -4(AX)(DI*1), SI + CMPL SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop5to7 + JMP fail +_8_or_more: + CMPQ AX, $8 + JA _9_or_more + MOVQ (BP), BP + LEAQ -7(DI)(DX*1), DX +loop8: + MOVQ (DI), SI + CMPQ SI,BP + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop8 + JMP fail +_9_or_more: + CMPQ AX, $15 + JA _16_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVQ -8(BP)(AX*1), BX + MOVQ (BP), BP +loop9to15: + MOVQ (DI), SI + CMPQ SI,BP + JZ partial_success9to15 + ADDQ $1,DI + CMPQ DI,DX + JB loop9to15 + JMP fail +partial_success9to15: + MOVQ -8(AX)(DI*1), SI + CMPQ SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop9to15 + JMP fail +_16_or_more: + CMPQ AX, $16 + JA _17_or_more + MOVOU (BP), X1 + LEAQ -15(DI)(DX*1), DX +loop16: + MOVOU (DI), X2 + PCMPEQB X1, X2 + PMOVMSKB X2, SI + CMPQ SI, $0xffff + JE success + ADDQ $1,DI + CMPQ DI,DX + JB loop16 + JMP fail +_17_or_more: + CMPQ AX, $31 + JA _32_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVOU -16(BP)(AX*1), X0 + MOVOU (BP), X1 +loop17to31: + MOVOU (DI), X2 + PCMPEQB X1,X2 + PMOVMSKB X2, SI + CMPQ SI, $0xffff + JE partial_success17to31 + ADDQ $1,DI + CMPQ DI,DX + JB loop17to31 + JMP fail +partial_success17to31: + MOVOU -16(AX)(DI*1), X3 + PCMPEQB X0, X3 + PMOVMSKB X3, SI + CMPQ SI, $0xffff + JE success + ADDQ $1,DI + CMPQ DI,DX + JB loop17to31 + JMP fail +// We can get here only 
when AVX2 is enabled and cutoff for indexShortStr is set to 63 +// So no need to check cpuid +_32_or_more: + CMPQ AX, $32 + JA _33_to_63 + VMOVDQU (BP), Y1 + LEAQ -31(DI)(DX*1), DX +loop32: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, SI + CMPL SI, $0xffffffff + JE success_avx2 + ADDQ $1,DI + CMPQ DI,DX + JB loop32 + JMP fail_avx2 +_33_to_63: + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + VMOVDQU -32(BP)(AX*1), Y0 + VMOVDQU (BP), Y1 +loop33to63: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, SI + CMPL SI, $0xffffffff + JE partial_success33to63 + ADDQ $1,DI + CMPQ DI,DX + JB loop33to63 + JMP fail_avx2 +partial_success33to63: + VMOVDQU -32(AX)(DI*1), Y3 + VPCMPEQB Y0, Y3, Y4 + VPMOVMSKB Y4, SI + CMPL SI, $0xffffffff + JE success_avx2 + ADDQ $1,DI + CMPQ DI,DX + JB loop33to63 +fail_avx2: + VZEROUPPER +fail: + MOVQ $-1, (R11) + RET +success_avx2: + VZEROUPPER + JMP success +sse42: + CMPB internal∕cpu·X86+const_x86_HasSSE42(SB), $1 + JNE no_sse42 + CMPQ AX, $12 + // PCMPESTRI is slower than normal compare, + // so using it makes sense only if we advance 4+ bytes per compare + // This value was determined experimentally and is the ~same + // on Nehalem (first with SSE42) and Haswell. 
+ JAE _9_or_more + LEAQ 16(BP), SI + TESTW $0xff0, SI + JEQ no_sse42 + MOVOU (BP), X1 + LEAQ -15(DI)(DX*1), SI + MOVQ $16, R9 + SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 +loop_sse42: + // 0x0c means: unsigned byte compare (bits 0,1 are 00) + // for equality (bits 2,3 are 11) + // result is not masked or inverted (bits 4,5 are 00) + // and corresponds to first matching byte (bit 6 is 0) + PCMPESTRI $0x0c, (DI), X1 + // CX == 16 means no match, + // CX > R9 means partial match at the end of the string, + // otherwise sep is at offset CX from X1 start + CMPQ CX, R9 + JBE sse42_success + ADDQ R9, DI + CMPQ DI, SI + JB loop_sse42 + PCMPESTRI $0x0c, -1(SI), X1 + CMPQ CX, R9 + JA fail + LEAQ -1(SI), DI +sse42_success: + ADDQ CX, DI +success: + SUBQ R10, DI + MOVQ DI, (R11) + RET diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go new file mode 100644 index 00000000000..0f87ae106c9 --- /dev/null +++ b/src/internal/bytealg/index_arm64.go @@ -0,0 +1,23 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +// Empirical data shows that using IndexShortStr can get better +// performance when len(s) <= 16. +const MaxBruteForce = 16 + +func init() { + // 8 bytes can be completely loaded into 1 register. + MaxLen = 8 +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to IndexShortStr. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 16 characters, plus a few slop to start. 
+ return 4 + n>>4 +} diff --git a/src/bytes/bytes_arm64.s b/src/internal/bytealg/index_arm64.s similarity index 50% rename from src/bytes/bytes_arm64.s rename to src/internal/bytealg/index_arm64.s index 84e96d52cec..8cffcd10b5d 100644 --- a/src/bytes/bytes_arm64.s +++ b/src/internal/bytealg/index_arm64.s @@ -1,88 +1,40 @@ -// Copyright 2017 The Go Authors. All rights reserved. +// Copyright 2018 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#include "go_asm.h" #include "textflag.h" -// countByte(s []byte, c byte) int -TEXT bytes·countByte(SB),NOSPLIT,$0-40 - MOVD s_base+0(FP), R0 - MOVD s_len+8(FP), R2 - MOVBU c+24(FP), R1 - // R11 = count of byte to search - MOVD $0, R11 - // short path to handle 0-byte case - CBZ R2, done - CMP $0x20, R2 - // jump directly to tail if length < 32 - BLO tail - ANDS $0x1f, R0, R9 - BEQ chunk - // Work with not 32-byte aligned head - BIC $0x1f, R0, R3 - ADD $0x20, R3 -head_loop: - MOVBU.P 1(R0), R5 - CMP R5, R1 - CINC EQ, R11, R11 - SUB $1, R2, R2 - CMP R0, R3 - BNE head_loop - // Work with 32-byte aligned chunks -chunk: - BIC $0x1f, R2, R9 - // The first chunk can also be the last - CBZ R9, tail - // R3 = end of 32-byte chunks - ADD R0, R9, R3 - MOVD $1, R5 - VMOV R5, V5.B16 - // R2 = length of tail - SUB R9, R2, R2 - // Duplicate R1 (byte to search) to 16 1-byte elements of V0 - VMOV R1, V0.B16 - // Clear the low 64-bit element of V7 and V8 - VEOR V7.B8, V7.B8, V7.B8 - VEOR V8.B8, V8.B8, V8.B8 - // Count the target byte in 32-byte chunk -chunk_loop: - VLD1.P (R0), [V1.B16, V2.B16] - CMP R0, R3 - VCMEQ V0.B16, V1.B16, V3.B16 - VCMEQ V0.B16, V2.B16, V4.B16 - // Clear the higher 7 bits - VAND V5.B16, V3.B16, V3.B16 - VAND V5.B16, V4.B16, V4.B16 - // Count lanes match the requested byte - VADDP V4.B16, V3.B16, V6.B16 // 32B->16B - VUADDLV V6.B16, V7 - // Accumulate the count in low 64-bit element of V8 when inside the loop - VADD V7, V8 - 
BNE chunk_loop - VMOV V8.D[0], R6 - ADD R6, R11, R11 - CBZ R2, done -tail: - // Work with tail shorter than 32 bytes - MOVBU.P 1(R0), R5 - SUB $1, R2, R2 - CMP R5, R1 - CINC EQ, R11, R11 - CBNZ R2, tail -done: - MOVD R11, ret+32(FP) - RET +TEXT ·Index(SB),NOSPLIT,$0-56 + MOVD a_base+0(FP), R0 + MOVD a_len+8(FP), R1 + MOVD b_base+24(FP), R2 + MOVD b_len+32(FP), R3 + MOVD $ret+48(FP), R9 + B indexbody<>(SB) -// indexShortStr(s, sep []byte) int -// precondition: 2 <= len(sep) <= 8 -TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56 +TEXT ·IndexString(SB),NOSPLIT,$0-40 + MOVD a_base+0(FP), R0 + MOVD a_len+8(FP), R1 + MOVD b_base+16(FP), R2 + MOVD b_len+24(FP), R3 + MOVD $ret+32(FP), R9 + B indexbody<>(SB) + +// input: +// R0: haystack +// R1: length of haystack +// R2: needle +// R3: length of needle (2 <= len <= 8) +// R9: address to put result +TEXT indexbody<>(SB),NOSPLIT,$0-56 // main idea is to load 'sep' into separate register(s) // to avoid repeatedly re-load it again and again // for sebsequent substring comparisons - MOVD s+0(FP), R0 - MOVD s_len+8(FP), R1 - MOVD sep+24(FP), R2 - MOVD sep_len+32(FP), R3 + MOVD a_base+0(FP), R0 + MOVD a_len+8(FP), R1 + MOVD b_base+24(FP), R2 + MOVD b_len+32(FP), R3 SUB R3, R1, R4 // R4 contains the start of last substring for comparsion ADD R0, R4, R4 @@ -189,9 +141,9 @@ loop_2: BLS loop_2 not_found: MOVD $-1, R0 - MOVD R0, ret+48(FP) + MOVD R0, (R9) RET found: SUB R8, R0, R0 - MOVD R0, ret+48(FP) + MOVD R0, (R9) RET diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go new file mode 100644 index 00000000000..98e859f9255 --- /dev/null +++ b/src/internal/bytealg/index_generic.go @@ -0,0 +1,29 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !amd64,!arm64,!s390x + +package bytealg + +const MaxBruteForce = 0 + +// Index returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func Index(a, b []byte) int { + panic("unimplemented") +} + +// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func IndexString(a, b string) int { + panic("unimplemented") +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + panic("unimplemented") +} diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go new file mode 100644 index 00000000000..fde42142458 --- /dev/null +++ b/src/internal/bytealg/index_native.go @@ -0,0 +1,19 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 arm64 s390x + +package bytealg + +//go:noescape + +// Index returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func Index(a, b []byte) int + +//go:noescape + +// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func IndexString(a, b string) int diff --git a/src/internal/bytealg/index_s390x.go b/src/internal/bytealg/index_s390x.go new file mode 100644 index 00000000000..9340cf11354 --- /dev/null +++ b/src/internal/bytealg/index_s390x.go @@ -0,0 +1,31 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package bytealg + +import "internal/cpu" + +const MaxBruteForce = 64 + +func init() { + // Note: we're kind of lucky that this flag is available at this point. + // The runtime sets HasVX when processing auxv records, and that happens + // to happen *before* running the init functions of packages that + // the runtime depends on. + // TODO: it would really be nicer for internal/cpu to figure out this + // flag by itself. Then we wouldn't need to depend on quirks of + // early startup initialization order. + if cpu.S390X.HasVX { + MaxLen = 64 + } +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 8 characters, plus a few slop to start. + return (n + 16) / 8 +} diff --git a/src/internal/bytealg/index_s390x.s b/src/internal/bytealg/index_s390x.s new file mode 100644 index 00000000000..491d5bcfd25 --- /dev/null +++ b/src/internal/bytealg/index_s390x.s @@ -0,0 +1,216 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// Caller must confirm availability of vx facility before calling. +TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56 + LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s) + LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep) + MOVD $ret+48(FP), R5 + BR indexbody<>(SB) + +// Caller must confirm availability of vx facility before calling. 
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40 + LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s) + LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep) + MOVD $ret+32(FP), R5 + BR indexbody<>(SB) + +// s: string we are searching +// sep: string to search for +// R1=&s[0], R2=len(s) +// R3=&sep[0], R4=len(sep) +// R5=&ret (int) +// Caller must confirm availability of vx facility before calling. +TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0 + CMPBGT R4, R2, notfound + ADD R1, R2 + SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index) + CMPBEQ R4, $0, notfound + SUB $1, R4 // R4=len(sep)-1 for use as VLL index + VLL R4, (R3), V0 // contains first 16 bytes of sep + MOVD R1, R7 +index2plus: + CMPBNE R4, $1, index3plus + MOVD $15(R7), R9 + CMPBGE R9, R2, index2to16 + VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00... + VONE V16 + VREPH $0, V0, V1 + CMPBGE R9, R2, index2to16 +index2loop: + VL 0(R7), V2 // 16 bytes, even indices + VL 1(R7), V4 // 16 bytes, odd indices + VCEQH V1, V2, V5 // compare even indices + VCEQH V1, V4, V6 // compare odd indices + VSEL V5, V6, V31, V7 // merge even and odd indices + VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found + BLT foundV17 + MOVD $16(R7), R7 // R7+=16 + ADD $15, R7, R9 + CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search) + CMPBLE R7, R2, index2to16 + BR notfound + +index3plus: + CMPBNE R4, $2, index4plus + ADD $15, R7, R9 + CMPBGE R9, R2, index2to16 + MOVD $1, R0 + VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00... 
+	VONE	V16
+	VREPH	$0, V0, V1
+	VREPB	$2, V0, V8
+index3loop:
+	VL	(R7), V2	// load 16-bytes into V2
+	VLL	R0, 16(R7), V3	// load 2-bytes into V3
+	VSLDB	$1, V2, V3, V4	// V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9	// V9=(V2:V3)<<2
+	VCEQH	V1, V2, V5	// compare 2-byte even indices
+	VCEQH	V1, V4, V6	// compare 2-byte odd indices
+	VCEQB	V8, V9, V10	// compare last bytes
+	VSEL	V5, V6, V31, V7	// merge even and odd indices
+	VN	V7, V10, V7	// AND indices with last byte
+	VFEEBS	V16, V7, V17	// find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7	// R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index4plus:
+	CMPBNE	R4, $3, index5plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$2, R0
+	VGBM	$0x8888, V29	// 0xff000000ff000000...
+	VGBM	$0x2222, V30	// 0x0000ff000000ff00...
+	VGBM	$0xcccc, V31	// 0xffff0000ffff0000...
+	VONE	V16
+	VREPF	$0, V0, V1
+index4loop:
+	VL	(R7), V2	// load 16-bytes into V2
+	VLL	R0, 16(R7), V3	// load 3-bytes into V3
+	VSLDB	$1, V2, V3, V4	// V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9	// V9=(V2:V3)<<2
+	VSLDB	$3, V2, V3, V10	// V10=(V2:V3)<<3
+	VCEQF	V1, V2, V5	// compare index 0, 4, ...
+	VCEQF	V1, V4, V6	// compare index 1, 5, ...
+	VCEQF	V1, V9, V11	// compare index 2, 6, ...
+	VCEQF	V1, V10, V12	// compare index 3, 7, ...
+	VSEL	V5, V6, V29, V13	// merge index 0, 1, 4, 5, ...
+	VSEL	V11, V12, V30, V14	// merge index 2, 3, 6, 7, ...
+ VSEL V13, V14, V31, V7 // final merge + VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found + BLT foundV17 + MOVD $16(R7), R7 // R7+=16 + ADD $15, R7, R9 + CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search) + CMPBLE R7, R2, index2to16 + BR notfound + +index5plus: + CMPBGT R4, $15, index17plus +index2to16: + CMPBGT R7, R2, notfound + MOVD $1(R7), R8 + CMPBGT R8, R2, index2to16tail +index2to16loop: + // unrolled 2x + VLL R4, (R7), V1 + VLL R4, 1(R7), V2 + VCEQGS V0, V1, V3 + BEQ found + MOVD $1(R7), R7 + VCEQGS V0, V2, V4 + BEQ found + MOVD $1(R7), R7 + CMPBLT R7, R2, index2to16loop + CMPBGT R7, R2, notfound +index2to16tail: + VLL R4, (R7), V1 + VCEQGS V0, V1, V2 + BEQ found + BR notfound + +index17plus: + CMPBGT R4, $31, index33plus + SUB $16, R4, R0 + VLL R0, 16(R3), V1 + VONE V7 +index17to32loop: + VL (R7), V2 + VLL R0, 16(R7), V3 + VCEQG V0, V2, V4 + VCEQG V1, V3, V5 + VN V4, V5, V6 + VCEQGS V6, V7, V8 + BEQ found + MOVD $1(R7), R7 + CMPBLE R7, R2, index17to32loop + BR notfound + +index33plus: + CMPBGT R4, $47, index49plus + SUB $32, R4, R0 + VL 16(R3), V1 + VLL R0, 32(R3), V2 + VONE V11 +index33to48loop: + VL (R7), V3 + VL 16(R7), V4 + VLL R0, 32(R7), V5 + VCEQG V0, V3, V6 + VCEQG V1, V4, V7 + VCEQG V2, V5, V8 + VN V6, V7, V9 + VN V8, V9, V10 + VCEQGS V10, V11, V12 + BEQ found + MOVD $1(R7), R7 + CMPBLE R7, R2, index33to48loop + BR notfound + +index49plus: + CMPBGT R4, $63, index65plus + SUB $48, R4, R0 + VL 16(R3), V1 + VL 32(R3), V2 + VLL R0, 48(R3), V3 + VONE V15 +index49to64loop: + VL (R7), V4 + VL 16(R7), V5 + VL 32(R7), V6 + VLL R0, 48(R7), V7 + VCEQG V0, V4, V8 + VCEQG V1, V5, V9 + VCEQG V2, V6, V10 + VCEQG V3, V7, V11 + VN V8, V9, V12 + VN V10, V11, V13 + VN V12, V13, V14 + VCEQGS V14, V15, V16 + BEQ found + MOVD $1(R7), R7 + CMPBLE R7, R2, index49to64loop +notfound: + MOVD $-1, (R5) + RET + +index65plus: + // not implemented + MOVD $0, (R0) + RET + +foundV17: // index is in doubleword V17[0] + VLGVG $0, 
V17, R8 + ADD R8, R7 +found: + SUB R1, R7 + MOVD R7, (R5) + RET diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index f91a01da72f..ab5407bbcd0 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1358,274 +1358,6 @@ DATA shifts<>+0xf0(SB)/8, $0x0807060504030201 DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09 GLOBL shifts<>(SB),RODATA,$256 -TEXT strings·indexShortStr(SB),NOSPLIT,$0-40 - MOVQ s+0(FP), DI - // We want len in DX and AX, because PCMPESTRI implicitly consumes them - MOVQ s_len+8(FP), DX - MOVQ c+16(FP), BP - MOVQ c_len+24(FP), AX - MOVQ DI, R10 - LEAQ ret+32(FP), R11 - JMP runtime·indexShortStr(SB) - -TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56 - MOVQ s+0(FP), DI - MOVQ s_len+8(FP), DX - MOVQ c+24(FP), BP - MOVQ c_len+32(FP), AX - MOVQ DI, R10 - LEAQ ret+48(FP), R11 - JMP runtime·indexShortStr(SB) - -// AX: length of string, that we are searching for -// DX: length of string, in which we are searching -// DI: pointer to string, in which we are searching -// BP: pointer to string, that we are searching for -// R11: address, where to put return value -TEXT runtime·indexShortStr(SB),NOSPLIT,$0 - CMPQ AX, DX - JA fail - CMPQ DX, $16 - JAE sse42 -no_sse42: - CMPQ AX, $2 - JA _3_or_more - MOVW (BP), BP - LEAQ -1(DI)(DX*1), DX -loop2: - MOVW (DI), SI - CMPW SI,BP - JZ success - ADDQ $1,DI - CMPQ DI,DX - JB loop2 - JMP fail -_3_or_more: - CMPQ AX, $3 - JA _4_or_more - MOVW 1(BP), BX - MOVW (BP), BP - LEAQ -2(DI)(DX*1), DX -loop3: - MOVW (DI), SI - CMPW SI,BP - JZ partial_success3 - ADDQ $1,DI - CMPQ DI,DX - JB loop3 - JMP fail -partial_success3: - MOVW 1(DI), SI - CMPW SI,BX - JZ success - ADDQ $1,DI - CMPQ DI,DX - JB loop3 - JMP fail -_4_or_more: - CMPQ AX, $4 - JA _5_or_more - MOVL (BP), BP - LEAQ -3(DI)(DX*1), DX -loop4: - MOVL (DI), SI - CMPL SI,BP - JZ success - ADDQ $1,DI - CMPQ DI,DX - JB loop4 - JMP fail -_5_or_more: - CMPQ AX, $7 - JA _8_or_more - LEAQ 1(DI)(DX*1), DX - SUBQ AX, DX - MOVL -4(BP)(AX*1), BX - MOVL (BP), BP 
-loop5to7: - MOVL (DI), SI - CMPL SI,BP - JZ partial_success5to7 - ADDQ $1,DI - CMPQ DI,DX - JB loop5to7 - JMP fail -partial_success5to7: - MOVL -4(AX)(DI*1), SI - CMPL SI,BX - JZ success - ADDQ $1,DI - CMPQ DI,DX - JB loop5to7 - JMP fail -_8_or_more: - CMPQ AX, $8 - JA _9_or_more - MOVQ (BP), BP - LEAQ -7(DI)(DX*1), DX -loop8: - MOVQ (DI), SI - CMPQ SI,BP - JZ success - ADDQ $1,DI - CMPQ DI,DX - JB loop8 - JMP fail -_9_or_more: - CMPQ AX, $15 - JA _16_or_more - LEAQ 1(DI)(DX*1), DX - SUBQ AX, DX - MOVQ -8(BP)(AX*1), BX - MOVQ (BP), BP -loop9to15: - MOVQ (DI), SI - CMPQ SI,BP - JZ partial_success9to15 - ADDQ $1,DI - CMPQ DI,DX - JB loop9to15 - JMP fail -partial_success9to15: - MOVQ -8(AX)(DI*1), SI - CMPQ SI,BX - JZ success - ADDQ $1,DI - CMPQ DI,DX - JB loop9to15 - JMP fail -_16_or_more: - CMPQ AX, $16 - JA _17_or_more - MOVOU (BP), X1 - LEAQ -15(DI)(DX*1), DX -loop16: - MOVOU (DI), X2 - PCMPEQB X1, X2 - PMOVMSKB X2, SI - CMPQ SI, $0xffff - JE success - ADDQ $1,DI - CMPQ DI,DX - JB loop16 - JMP fail -_17_or_more: - CMPQ AX, $31 - JA _32_or_more - LEAQ 1(DI)(DX*1), DX - SUBQ AX, DX - MOVOU -16(BP)(AX*1), X0 - MOVOU (BP), X1 -loop17to31: - MOVOU (DI), X2 - PCMPEQB X1,X2 - PMOVMSKB X2, SI - CMPQ SI, $0xffff - JE partial_success17to31 - ADDQ $1,DI - CMPQ DI,DX - JB loop17to31 - JMP fail -partial_success17to31: - MOVOU -16(AX)(DI*1), X3 - PCMPEQB X0, X3 - PMOVMSKB X3, SI - CMPQ SI, $0xffff - JE success - ADDQ $1,DI - CMPQ DI,DX - JB loop17to31 - JMP fail -// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63 -// So no need to check cpuid -_32_or_more: - CMPQ AX, $32 - JA _33_to_63 - VMOVDQU (BP), Y1 - LEAQ -31(DI)(DX*1), DX -loop32: - VMOVDQU (DI), Y2 - VPCMPEQB Y1, Y2, Y3 - VPMOVMSKB Y3, SI - CMPL SI, $0xffffffff - JE success_avx2 - ADDQ $1,DI - CMPQ DI,DX - JB loop32 - JMP fail_avx2 -_33_to_63: - LEAQ 1(DI)(DX*1), DX - SUBQ AX, DX - VMOVDQU -32(BP)(AX*1), Y0 - VMOVDQU (BP), Y1 -loop33to63: - VMOVDQU (DI), Y2 - VPCMPEQB Y1, Y2, Y3 - 
VPMOVMSKB Y3, SI - CMPL SI, $0xffffffff - JE partial_success33to63 - ADDQ $1,DI - CMPQ DI,DX - JB loop33to63 - JMP fail_avx2 -partial_success33to63: - VMOVDQU -32(AX)(DI*1), Y3 - VPCMPEQB Y0, Y3, Y4 - VPMOVMSKB Y4, SI - CMPL SI, $0xffffffff - JE success_avx2 - ADDQ $1,DI - CMPQ DI,DX - JB loop33to63 -fail_avx2: - VZEROUPPER -fail: - MOVQ $-1, (R11) - RET -success_avx2: - VZEROUPPER - JMP success -sse42: - CMPB runtime·support_sse42(SB), $1 - JNE no_sse42 - CMPQ AX, $12 - // PCMPESTRI is slower than normal compare, - // so using it makes sense only if we advance 4+ bytes per compare - // This value was determined experimentally and is the ~same - // on Nehalem (first with SSE42) and Haswell. - JAE _9_or_more - LEAQ 16(BP), SI - TESTW $0xff0, SI - JEQ no_sse42 - MOVOU (BP), X1 - LEAQ -15(DI)(DX*1), SI - MOVQ $16, R9 - SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 -loop_sse42: - // 0x0c means: unsigned byte compare (bits 0,1 are 00) - // for equality (bits 2,3 are 11) - // result is not masked or inverted (bits 4,5 are 00) - // and corresponds to first matching byte (bit 6 is 0) - PCMPESTRI $0x0c, (DI), X1 - // CX == 16 means no match, - // CX > R9 means partial match at the end of the string, - // otherwise sep is at offset CX from X1 start - CMPQ CX, R9 - JBE sse42_success - ADDQ R9, DI - CMPQ DI, SI - JB loop_sse42 - PCMPESTRI $0x0c, -1(SI), X1 - CMPQ CX, R9 - JA fail - LEAQ -1(SI), DI -sse42_success: - ADDQ CX, DI -success: - SUBQ R10, DI - MOVQ DI, (R11) - RET - TEXT runtime·return0(SB), NOSPLIT, $0 MOVL $0, AX RET diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s index ed4cd6b3d34..1c7e44cdae4 100644 --- a/src/runtime/asm_s390x.s +++ b/src/runtime/asm_s390x.s @@ -796,230 +796,6 @@ TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0 // compile barrier. 
RET -// func supportsVX() bool -TEXT strings·supportsVX(SB),NOSPLIT,$0-1 - MOVBZ runtime·cpu+facilities_hasVX(SB), R0 - MOVB R0, ret+0(FP) - RET - -// func supportsVX() bool -TEXT bytes·supportsVX(SB),NOSPLIT,$0-1 - MOVBZ runtime·cpu+facilities_hasVX(SB), R0 - MOVB R0, ret+0(FP) - RET - -// func indexShortStr(s, sep string) int -// Caller must confirm availability of vx facility before calling. -TEXT strings·indexShortStr(SB),NOSPLIT|NOFRAME,$0-40 - LMG s+0(FP), R1, R2 // R1=&s[0], R2=len(s) - LMG sep+16(FP), R3, R4 // R3=&sep[0], R4=len(sep) - MOVD $ret+32(FP), R5 - BR runtime·indexShortStr(SB) - -// func indexShortStr(s, sep []byte) int -// Caller must confirm availability of vx facility before calling. -TEXT bytes·indexShortStr(SB),NOSPLIT|NOFRAME,$0-56 - LMG s+0(FP), R1, R2 // R1=&s[0], R2=len(s) - LMG sep+24(FP), R3, R4 // R3=&sep[0], R4=len(sep) - MOVD $ret+48(FP), R5 - BR runtime·indexShortStr(SB) - -// s: string we are searching -// sep: string to search for -// R1=&s[0], R2=len(s) -// R3=&sep[0], R4=len(sep) -// R5=&ret (int) -// Caller must confirm availability of vx facility before calling. -TEXT runtime·indexShortStr(SB),NOSPLIT|NOFRAME,$0 - CMPBGT R4, R2, notfound - ADD R1, R2 - SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index) - CMPBEQ R4, $0, notfound - SUB $1, R4 // R4=len(sep)-1 for use as VLL index - VLL R4, (R3), V0 // contains first 16 bytes of sep - MOVD R1, R7 -index2plus: - CMPBNE R4, $1, index3plus - MOVD $15(R7), R9 - CMPBGE R9, R2, index2to16 - VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00... 
- VONE V16 - VREPH $0, V0, V1 - CMPBGE R9, R2, index2to16 -index2loop: - VL 0(R7), V2 // 16 bytes, even indices - VL 1(R7), V4 // 16 bytes, odd indices - VCEQH V1, V2, V5 // compare even indices - VCEQH V1, V4, V6 // compare odd indices - VSEL V5, V6, V31, V7 // merge even and odd indices - VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found - BLT foundV17 - MOVD $16(R7), R7 // R7+=16 - ADD $15, R7, R9 - CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search) - CMPBLE R7, R2, index2to16 - BR notfound - -index3plus: - CMPBNE R4, $2, index4plus - ADD $15, R7, R9 - CMPBGE R9, R2, index2to16 - MOVD $1, R0 - VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00... - VONE V16 - VREPH $0, V0, V1 - VREPB $2, V0, V8 -index3loop: - VL (R7), V2 // load 16-bytes into V2 - VLL R0, 16(R7), V3 // load 2-bytes into V3 - VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1 - VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2 - VCEQH V1, V2, V5 // compare 2-byte even indices - VCEQH V1, V4, V6 // compare 2-byte odd indices - VCEQB V8, V9, V10 // compare last bytes - VSEL V5, V6, V31, V7 // merge even and odd indices - VN V7, V10, V7 // AND indices with last byte - VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found - BLT foundV17 - MOVD $16(R7), R7 // R7+=16 - ADD $15, R7, R9 - CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search) - CMPBLE R7, R2, index2to16 - BR notfound - -index4plus: - CMPBNE R4, $3, index5plus - ADD $15, R7, R9 - CMPBGE R9, R2, index2to16 - MOVD $2, R0 - VGBM $0x8888, V29 // 0xff000000ff000000... - VGBM $0x2222, V30 // 0x0000ff000000ff00... - VGBM $0xcccc, V31 // 0xffff0000ffff0000... - VONE V16 - VREPF $0, V0, V1 -index4loop: - VL (R7), V2 // load 16-bytes into V2 - VLL R0, 16(R7), V3 // load 3-bytes into V3 - VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1 - VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<1 - VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<1 - VCEQF V1, V2, V5 // compare index 0, 4, ... - VCEQF V1, V4, V6 // compare index 1, 5, ... 
- VCEQF V1, V9, V11 // compare index 2, 6, ... - VCEQF V1, V10, V12 // compare index 3, 7, ... - VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ... - VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ... - VSEL V13, V14, V31, V7 // final merge - VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found - BLT foundV17 - MOVD $16(R7), R7 // R7+=16 - ADD $15, R7, R9 - CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search) - CMPBLE R7, R2, index2to16 - BR notfound - -index5plus: - CMPBGT R4, $15, index17plus -index2to16: - CMPBGT R7, R2, notfound - MOVD $1(R7), R8 - CMPBGT R8, R2, index2to16tail -index2to16loop: - // unrolled 2x - VLL R4, (R7), V1 - VLL R4, 1(R7), V2 - VCEQGS V0, V1, V3 - BEQ found - MOVD $1(R7), R7 - VCEQGS V0, V2, V4 - BEQ found - MOVD $1(R7), R7 - CMPBLT R7, R2, index2to16loop - CMPBGT R7, R2, notfound -index2to16tail: - VLL R4, (R7), V1 - VCEQGS V0, V1, V2 - BEQ found - BR notfound - -index17plus: - CMPBGT R4, $31, index33plus - SUB $16, R4, R0 - VLL R0, 16(R3), V1 - VONE V7 -index17to32loop: - VL (R7), V2 - VLL R0, 16(R7), V3 - VCEQG V0, V2, V4 - VCEQG V1, V3, V5 - VN V4, V5, V6 - VCEQGS V6, V7, V8 - BEQ found - MOVD $1(R7), R7 - CMPBLE R7, R2, index17to32loop - BR notfound - -index33plus: - CMPBGT R4, $47, index49plus - SUB $32, R4, R0 - VL 16(R3), V1 - VLL R0, 32(R3), V2 - VONE V11 -index33to48loop: - VL (R7), V3 - VL 16(R7), V4 - VLL R0, 32(R7), V5 - VCEQG V0, V3, V6 - VCEQG V1, V4, V7 - VCEQG V2, V5, V8 - VN V6, V7, V9 - VN V8, V9, V10 - VCEQGS V10, V11, V12 - BEQ found - MOVD $1(R7), R7 - CMPBLE R7, R2, index33to48loop - BR notfound - -index49plus: - CMPBGT R4, $63, index65plus - SUB $48, R4, R0 - VL 16(R3), V1 - VL 32(R3), V2 - VLL R0, 48(R3), V3 - VONE V15 -index49to64loop: - VL (R7), V4 - VL 16(R7), V5 - VL 32(R7), V6 - VLL R0, 48(R7), V7 - VCEQG V0, V4, V8 - VCEQG V1, V5, V9 - VCEQG V2, V6, V10 - VCEQG V3, V7, V11 - VN V8, V9, V12 - VN V10, V11, V13 - VN V12, V13, V14 - VCEQGS V14, V15, V16 - BEQ 
found - MOVD $1(R7), R7 - CMPBLE R7, R2, index49to64loop -notfound: - MOVD $-1, (R5) - RET - -index65plus: - // not implemented - MOVD $0, (R0) - RET - -foundV17: // index is in doubleword V17[0] - VLGVG $0, V17, R8 - ADD R8, R7 -found: - SUB R1, R7 - MOVD R7, (R5) - RET - // This is called from .init_array and follows the platform, not Go, ABI. // We are overly conservative. We could only save the registers we use. // However, since this function is only called once per loaded module diff --git a/src/runtime/os_linux_s390x.go b/src/runtime/os_linux_s390x.go index 21290528363..55d35c7cff1 100644 --- a/src/runtime/os_linux_s390x.go +++ b/src/runtime/os_linux_s390x.go @@ -4,32 +4,16 @@ package runtime -import ( - internalcpu "internal/cpu" - "runtime/internal/sys" -) +import "internal/cpu" const ( // bit masks taken from bits/hwcap.h _HWCAP_S390_VX = 2048 // vector facility ) -// facilities is padded to avoid false sharing. -type facilities struct { - _ [sys.CacheLineSize]byte - hasVX bool // vector facility - _ [sys.CacheLineSize]byte -} - -// cpu indicates the availability of s390x facilities that can be used in -// Go assembly but are optional on models supported by Go. -// TODO: remove this once we're only using internal/cpu. -var cpu facilities - func archauxv(tag, val uintptr) { switch tag { case _AT_HWCAP: // CPU capability bit flags - internalcpu.S390X.HasVX = val&_HWCAP_S390_VX != 0 - cpu.hasVX = val&_HWCAP_S390_VX != 0 + cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0 } } diff --git a/src/strings/strings.go b/src/strings/strings.go index 7d3ed37edd7..b0a53fdefdf 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -932,6 +932,85 @@ func EqualFold(s, t string) bool { return s == t } +// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s. 
+func Index(s, substr string) int {
+	n := len(substr)
+	switch {
+	case n == 0:
+		return 0
+	case n == 1:
+		return IndexByte(s, substr[0])
+	case n == len(s):
+		if substr == s {
+			return 0
+		}
+		return -1
+	case n > len(s):
+		return -1
+	case n <= bytealg.MaxLen:
+		// Use brute force when s and substr both are small
+		if len(s) <= bytealg.MaxBruteForce {
+			return bytealg.IndexString(s, substr)
+		}
+		c := substr[0]
+		i := 0
+		t := s[:len(s)-n+1]
+		fails := 0
+		for i < len(t) {
+			if t[i] != c {
+				// IndexByte is faster than bytealg.IndexString, so use it as long as
+				// we're not getting lots of false positives.
+				o := IndexByte(t[i:], c)
+				if o < 0 {
+					return -1
+				}
+				i += o
+			}
+			if s[i:i+n] == substr {
+				return i
+			}
+			fails++
+			i++
+			// Switch to bytealg.IndexString when IndexByte produces too many false positives.
+			if fails > bytealg.Cutover(i) {
+				r := bytealg.IndexString(s[i:], substr)
+				if r >= 0 {
+					return r + i
+				}
+				return -1
+			}
+		}
+		return -1
+	}
+	c := substr[0]
+	i := 0
+	t := s[:len(s)-n+1]
+	fails := 0
+	for i < len(t) {
+		if t[i] != c {
+			o := IndexByte(t[i:], c)
+			if o < 0 {
+				return -1
+			}
+			i += o
+		}
+		if s[i:i+n] == substr {
+			return i
+		}
+		i++
+		fails++
+		if fails >= 4+i>>4 && i < len(t) {
+			// See comment in ../bytes/bytes.go.
+			j := indexRabinKarp(s[i:], substr)
+			if j < 0 {
+				return -1
+			}
+			return i + j
+		}
+	}
+	return -1
+}
+
 func indexRabinKarp(s, substr string) int {
 	// Rabin-Karp search
 	hashss, pow := hashStr(substr)
diff --git a/src/strings/strings_amd64.go b/src/strings/strings_amd64.go
deleted file mode 100644
index 75e7d0c1391..00000000000
--- a/src/strings/strings_amd64.go
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
- -package strings - -import "internal/cpu" - -//go:noescape - -// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s. -// indexShortStr requires 2 <= len(c) <= shortStringLen -func indexShortStr(s, c string) int // ../runtime/asm_amd64.s -func countByte(s string, c byte) int // ../runtime/asm_amd64.s - -var shortStringLen int - -func init() { - if cpu.X86.HasAVX2 { - shortStringLen = 63 - } else { - shortStringLen = 31 - } -} - -// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s. -func Index(s, substr string) int { - n := len(substr) - switch { - case n == 0: - return 0 - case n == 1: - return IndexByte(s, substr[0]) - case n == len(s): - if substr == s { - return 0 - } - return -1 - case n > len(s): - return -1 - case n <= shortStringLen: - // Use brute force when s and substr both are small - if len(s) <= 64 { - return indexShortStr(s, substr) - } - c := substr[0] - i := 0 - t := s[:len(s)-n+1] - fails := 0 - for i < len(t) { - if t[i] != c { - // IndexByte skips 16/32 bytes per iteration, - // so it's faster than indexShortStr. - o := IndexByte(t[i:], c) - if o < 0 { - return -1 - } - i += o - } - if s[i:i+n] == substr { - return i - } - fails++ - i++ - // Switch to indexShortStr when IndexByte produces too many false positives. - // Too many means more that 1 error per 8 characters. - // Allow some errors in the beginning. - if fails > (i+16)/8 { - r := indexShortStr(s[i:], substr) - if r >= 0 { - return r + i - } - return -1 - } - } - return -1 - } - return indexRabinKarp(s, substr) -} diff --git a/src/strings/strings_generic.go b/src/strings/strings_generic.go deleted file mode 100644 index ac3b8dce851..00000000000 --- a/src/strings/strings_generic.go +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -// +build !amd64,!s390x - -package strings - -// TODO: implements short string optimization on non amd64 platforms -// and get rid of strings_amd64.go - -// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s. -func Index(s, substr string) int { - n := len(substr) - switch { - case n == 0: - return 0 - case n == 1: - return IndexByte(s, substr[0]) - case n == len(s): - if substr == s { - return 0 - } - return -1 - case n > len(s): - return -1 - } - c := substr[0] - i := 0 - t := s[:len(s)-n+1] - fails := 0 - for i < len(t) { - if t[i] != c { - o := IndexByte(t[i:], c) - if o < 0 { - return -1 - } - i += o - } - if s[i:i+n] == substr { - return i - } - i++ - fails++ - if fails >= 4+i>>4 && i < len(t) { - // See comment in ../bytes/bytes_generic.go. - j := indexRabinKarp(s[i:], substr) - if j < 0 { - return -1 - } - return i + j - } - } - return -1 -} diff --git a/src/strings/strings_s390x.go b/src/strings/strings_s390x.go deleted file mode 100644 index b2e459b04ec..00000000000 --- a/src/strings/strings_s390x.go +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package strings - -//go:noescape - -// indexShortStr returns the index of the first instance of sep in s, -// or -1 if sep is not present in s. -// indexShortStr requires 2 <= len(sep) <= shortStringLen -func indexShortStr(s, sep string) int // ../runtime/asm_$GOARCH.s - -// supportsVX reports whether the vector facility is available. -// indexShortStr must not be called if the vector facility is not -// available. -func supportsVX() bool // ../runtime/asm_s390x.s - -var shortStringLen = -1 - -func init() { - if supportsVX() { - shortStringLen = 64 - } -} - -// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s. 
-func Index(s, substr string) int { - n := len(substr) - switch { - case n == 0: - return 0 - case n == 1: - return IndexByte(s, substr[0]) - case n == len(s): - if substr == s { - return 0 - } - return -1 - case n > len(s): - return -1 - case n <= shortStringLen: - // Use brute force when s and substr both are small - if len(s) <= 64 { - return indexShortStr(s, substr) - } - c := substr[0] - i := 0 - t := s[:len(s)-n+1] - fails := 0 - for i < len(t) { - if t[i] != c { - // IndexByte skips 16/32 bytes per iteration, - // so it's faster than indexShortStr. - o := IndexByte(t[i:], c) - if o < 0 { - return -1 - } - i += o - } - if s[i:i+n] == substr { - return i - } - fails++ - i++ - // Switch to indexShortStr when IndexByte produces too many false positives. - // Too many means more that 1 error per 8 characters. - // Allow some errors in the beginning. - if fails > (i+16)/8 { - r := indexShortStr(s[i:], substr) - if r >= 0 { - return r + i - } - return -1 - } - } - return -1 - } - return indexRabinKarp(s, substr) -}