internal/bytealg: move short string Index implementations into bytealg

Also move the arm64 CountByte implementation while we're here. Fixes #19792 Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e Reviewed-on: https://go-review.googlesource.com/98518 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
2024-11-23 07:40:04 -07:00 · 2018-03-04 09:47:47 -08:00 · 2018-03-04 09:47:47 -08:00 · ee58eccc56
commit ee58eccc56
parent f6332bb84a
27 changed files with 932 additions and 1123 deletions
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@ -829,6 +829,92 @@ func EqualFold(s, t []byte) bool {
 	return len(s) == len(t)
 }

+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep []byte) int {
+	n := len(sep)
+	switch {
+	case n == 0:
+		return 0
+	case n == 1:
+		return IndexByte(s, sep[0])
+	case n == len(s):
+		if Equal(sep, s) {
+			return 0
+		}
+		return -1
+	case n > len(s):
+		return -1
+	case n <= bytealg.MaxLen:
+		// Use brute force when s and sep both are small
+		if len(s) <= bytealg.MaxBruteForce {
+			return bytealg.Index(s, sep)
+		}
+		c := sep[0]
+		i := 0
+		t := s[:len(s)-n+1]
+		fails := 0
+		for i < len(t) {
+			if t[i] != c {
+				// IndexByte is faster than bytealg.Index, so use it as long as
+				// we're not getting lots of false positives.
+				o := IndexByte(t[i:], c)
+				if o < 0 {
+					return -1
+				}
+				i += o
+			}
+			if Equal(s[i:i+n], sep) {
+				return i
+			}
+			fails++
+			i++
+			// Switch to bytealg.Index when IndexByte produces too many false positives.
+			if fails > bytealg.Cutover(i) {
+				r := bytealg.Index(s[i:], sep)
+				if r >= 0 {
+					return r + i
+				}
+				return -1
+			}
+		}
+		return -1
+	}
+	c := sep[0]
+	i := 0
+	fails := 0
+	t := s[:len(s)-n+1]
+	for i < len(t) {
+		if t[i] != c {
+			o := IndexByte(t[i:], c)
+			if o < 0 {
+				break
+			}
+			i += o
+		}
+		if Equal(s[i:i+n], sep) {
+			return i
+		}
+		i++
+		fails++
+		if fails >= 4+i>>4 && i < len(t) {
+			// Give up on IndexByte, it isn't skipping ahead
+			// far enough to be better than Rabin-Karp.
+			// Experiments (using IndexPeriodic) suggest
+			// the cutover is about 16 byte skips.
+			// TODO: if large prefixes of sep are matching
+			// we should cutover at even larger average skips,
+			// because Equal becomes that much more expensive.
+			// This code does not take that effect into account.
+			j := indexRabinKarp(s[i:], sep)
+			if j < 0 {
+				return -1
+			}
+			return i + j
+		}
+	}
+	return -1
+}
+
 func indexRabinKarp(s, sep []byte) int {
 	// Rabin-Karp search
 	hashsep, pow := hashStr(sep)
--- a/src/bytes/bytes_amd64.go
+++ b/src/bytes/bytes_amd64.go
@ -1,79 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package bytes
-
-import "internal/cpu"
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
-// indexShortStr requires 2 <= len(c) <= shortStringLen
-func indexShortStr(s, c []byte) int  // ../runtime/asm_amd64.s
-func countByte(s []byte, c byte) int // ../runtime/asm_amd64.s
-
-var shortStringLen int
-
-func init() {
-	if cpu.X86.HasAVX2 {
-		shortStringLen = 63
-	} else {
-		shortStringLen = 31
-	}
-}
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
-	n := len(sep)
-	switch {
-	case n == 0:
-		return 0
-	case n == 1:
-		return IndexByte(s, sep[0])
-	case n == len(s):
-		if Equal(sep, s) {
-			return 0
-		}
-		return -1
-	case n > len(s):
-		return -1
-	case n <= shortStringLen:
-		// Use brute force when s and sep both are small
-		if len(s) <= 64 {
-			return indexShortStr(s, sep)
-		}
-		c := sep[0]
-		i := 0
-		t := s[:len(s)-n+1]
-		fails := 0
-		for i < len(t) {
-			if t[i] != c {
-				// IndexByte skips 16/32 bytes per iteration,
-				// so it's faster than indexShortStr.
-				o := IndexByte(t[i:], c)
-				if o < 0 {
-					return -1
-				}
-				i += o
-			}
-			if Equal(s[i:i+n], sep) {
-				return i
-			}
-			fails++
-			i++
-			// Switch to indexShortStr when IndexByte produces too many false positives.
-			// Too many means more that 1 error per 8 characters.
-			// Allow some errors in the beginning.
-			if fails > (i+16)/8 {
-				r := indexShortStr(s[i:], sep)
-				if r >= 0 {
-					return r + i
-				}
-				return -1
-			}
-		}
-		return -1
-	}
-	return indexRabinKarp(s, sep)
-}
--- a/src/bytes/bytes_arm64.go
+++ b/src/bytes/bytes_arm64.go
@ -1,72 +0,0 @@
-// Copyright 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package bytes
-
-func countByte(s []byte, c byte) int // bytes_arm64.s
-
-// 8 bytes can be completely loaded into 1 register.
-const shortStringLen = 8
-
-//go:noescape
-func indexShortStr(s, sep []byte) int
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
-	n := len(sep)
-	switch {
-	case n == 0:
-		return 0
-	case n == 1:
-		return IndexByte(s, sep[0])
-	case n == len(s):
-		if Equal(sep, s) {
-			return 0
-		}
-		return -1
-	case n > len(s):
-		return -1
-	case n <= shortStringLen:
-		// Use brute force when both s and sep are small.
-		// Empirical data shows that it can get better
-		// performance when len(s) <= 16.
-		if len(s) <= 16 {
-			return indexShortStr(s, sep)
-		}
-	}
-	c := sep[0]
-	i := 0
-	fails := 0
-	t := s[:len(s)-n+1]
-	for i < len(t) {
-		if t[i] != c {
-			o := IndexByte(t[i:], c)
-			if o < 0 {
-				break
-			}
-			i += o
-		}
-		if Equal(s[i:i+n], sep) {
-			return i
-		}
-		i++
-		fails++
-		if fails >= 4+i>>4 && i < len(t) {
-			// Give up on IndexByte, it isn't skipping ahead
-			// far enough to be better than Rabin-Karp.
-			// Experiments (using IndexPeriodic) suggest
-			// the cutover is about 16 byte skips.
-			// TODO: if large prefixes of sep are matching
-			// we should cutover at even larger average skips,
-			// because Equal becomes that much more expensive.
-			// This code does not take that effect into account.
-			j := indexRabinKarp(s[i:], sep)
-			if j < 0 {
-				return -1
-			}
-			return i + j
-		}
-	}
-	return -1
-}
--- a/src/bytes/bytes_generic.go
+++ b/src/bytes/bytes_generic.go
@ -1,59 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64,!s390x,!arm64
-
-package bytes
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
-	n := len(sep)
-	switch {
-	case n == 0:
-		return 0
-	case n == 1:
-		return IndexByte(s, sep[0])
-	case n == len(s):
-		if Equal(sep, s) {
-			return 0
-		}
-		return -1
-	case n > len(s):
-		return -1
-	}
-	c := sep[0]
-	i := 0
-	fails := 0
-	t := s[:len(s)-n+1]
-	for i < len(t) {
-		if t[i] != c {
-			o := IndexByte(t[i:], c)
-			if o < 0 {
-				break
-			}
-			i += o
-		}
-		if Equal(s[i:i+n], sep) {
-			return i
-		}
-		i++
-		fails++
-		if fails >= 4+i>>4 && i < len(t) {
-			// Give up on IndexByte, it isn't skipping ahead
-			// far enough to be better than Rabin-Karp.
-			// Experiments (using IndexPeriodic) suggest
-			// the cutover is about 16 byte skips.
-			// TODO: if large prefixes of sep are matching
-			// we should cutover at even larger average skips,
-			// because Equal becomes that much more expensive.
-			// This code does not take that effect into account.
-			j := indexRabinKarp(s[i:], sep)
-			if j < 0 {
-				return -1
-			}
-			return i + j
-		}
-	}
-	return -1
-}
--- a/src/bytes/bytes_s390x.go
+++ b/src/bytes/bytes_s390x.go
@ -1,80 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package bytes
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of sep in s,
-// or -1 if sep is not present in s.
-// indexShortStr requires 2 <= len(sep) <= shortStringLen
-func indexShortStr(s, c []byte) int // ../runtime/asm_s390x.s
-
-// supportsVX reports whether the vector facility is available.
-// indexShortStr must not be called if the vector facility is not
-// available.
-func supportsVX() bool // ../runtime/asm_s390x.s
-
-var shortStringLen = -1
-
-func init() {
-	if supportsVX() {
-		shortStringLen = 64
-	}
-}
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
-	n := len(sep)
-	switch {
-	case n == 0:
-		return 0
-	case n == 1:
-		return IndexByte(s, sep[0])
-	case n == len(s):
-		if Equal(sep, s) {
-			return 0
-		}
-		return -1
-	case n > len(s):
-		return -1
-	case n <= shortStringLen:
-		// Use brute force when s and sep both are small
-		if len(s) <= 64 {
-			return indexShortStr(s, sep)
-		}
-		c := sep[0]
-		i := 0
-		t := s[:len(s)-n+1]
-		fails := 0
-		for i < len(t) {
-			if t[i] != c {
-				// IndexByte skips 16/32 bytes per iteration,
-				// so it's faster than indexShortStr.
-				o := IndexByte(t[i:], c)
-				if o < 0 {
-					return -1
-				}
-				i += o
-			}
-			if Equal(s[i:i+n], sep) {
-				return i
-			}
-			fails++
-			i++
-			// Switch to indexShortStr when IndexByte produces too many false positives.
-			// Too many means more that 1 error per 8 characters.
-			// Allow some errors in the beginning.
-			if fails > (i+16)/8 {
-				r := indexShortStr(s[i:], sep)
-				if r >= 0 {
-					return r + i
-				}
-				return -1
-			}
-		}
-		return -1
-	}
-	return indexRabinKarp(s, sep)
-}
--- a/src/cmd/vet/all/whitelist/amd64.txt
+++ b/src/cmd/vet/all/whitelist/amd64.txt
@ -1,20 +1,16 @@
 // amd64-specific vet whitelist. See readme.txt for details.

-internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
-internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
-
 // False positives.

+// Nothing much to do about cross-package assembly. Unfortunate.
+internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
+internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime

 // reflect trampolines intentionally omit arg size. Same for morestack.
 runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
 runtime/asm_amd64.s: [amd64] morestack: use of 16(SP) points beyond argument frame
 runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame

-// Nothing much to do about cross-package assembly. Unfortunate.
-runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package strings
-runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes
-
 // Intentionally missing declarations. These are special assembly routines.
 // Some are jumped into from other routines, with values in specific registers.
 // duff* have direct calls from the compiler.
@ -25,4 +21,3 @@ runtime/asm_amd64.s: [amd64] addmoduledata: function addmoduledata missing Go de
 runtime/duff_amd64.s: [amd64] duffzero: function duffzero missing Go declaration
 runtime/duff_amd64.s: [amd64] duffcopy: function duffcopy missing Go declaration
 runtime/asm_amd64.s: [amd64] stackcheck: function stackcheck missing Go declaration
-runtime/asm_amd64.s: [amd64] indexShortStr: function indexShortStr missing Go declaration
--- a/src/cmd/vet/all/whitelist/s390x.txt
+++ b/src/cmd/vet/all/whitelist/s390x.txt
@ -1,11 +1,6 @@
 runtime/asm_s390x.s: [s390x] abort: function abort missing Go declaration
 internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: Compare is in package bytes
 internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: cmpstring is in package runtime
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package strings
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package bytes
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package strings
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package bytes
-runtime/asm_s390x.s: [s390x] indexShortStr: function indexShortStr missing Go declaration
 runtime/asm_s390x.s: [s390x] addmoduledata: function addmoduledata missing Go declaration
 runtime/memclr_s390x.s: [s390x] memclr_s390x_exrl_xc: function memclr_s390x_exrl_xc missing Go declaration
 runtime/memmove_s390x.s: [s390x] memmove_s390x_exrl_mvc: function memmove_s390x_exrl_mvc missing Go declaration
--- a/src/internal/bytealg/bytealg.go
+++ b/src/internal/bytealg/bytealg.go
@ -0,0 +1,22 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import (
+	"internal/cpu"
+	"unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+	x86_HasSSE2   = unsafe.Offsetof(cpu.X86.HasSSE2)
+	x86_HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
+	x86_HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
+	x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+	s390x_HasVX   = unsafe.Offsetof(cpu.S390X.HasVX)
+)
+
+// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
+var MaxLen int
--- a/src/internal/bytealg/count_arm64.s
+++ b/src/internal/bytealg/count_arm64.s
@ -0,0 +1,90 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+	MOVD	b_base+0(FP), R0
+	MOVD	b_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	MOVD	$ret+32(FP), R8
+	B	countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+16(FP), R1
+	MOVD	$ret+24(FP), R8
+	B	countbytebody<>(SB)
+
+// input:
+//   R0: data
+//   R2: data len
+//   R1: byte to find
+//   R8: address to put result
+TEXT countbytebody<>(SB),NOSPLIT,$0
+	// R11 = count of byte to search
+	MOVD	$0, R11
+	// short path to handle 0-byte case
+	CBZ	R2, done
+	CMP	$0x20, R2
+	// jump directly to tail if length < 32
+	BLO	tail
+	ANDS	$0x1f, R0, R9
+	BEQ	chunk
+	// Work with not 32-byte aligned head
+	BIC	$0x1f, R0, R3
+	ADD	$0x20, R3
+head_loop:
+	MOVBU.P	1(R0), R5
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	SUB	$1, R2, R2
+	CMP	R0, R3
+	BNE	head_loop
+	// Work with 32-byte aligned chunks
+chunk:
+	BIC	$0x1f, R2, R9
+	// The first chunk can also be the last
+	CBZ	R9, tail
+	// R3 = end of 32-byte chunks
+	ADD	R0, R9, R3
+	MOVD	$1, R5
+	VMOV	R5, V5.B16
+	// R2 = length of tail
+	SUB	R9, R2, R2
+	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
+	VMOV	R1, V0.B16
+	// Clear the low 64-bit element of V7 and V8
+	VEOR	V7.B8, V7.B8, V7.B8
+	VEOR	V8.B8, V8.B8, V8.B8
+	// Count the target byte in 32-byte chunk
+chunk_loop:
+	VLD1.P	(R0), [V1.B16, V2.B16]
+	CMP	R0, R3
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// Clear the higher 7 bits
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	// Count the lanes that match the requested byte
+	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
+	VUADDLV	V6.B16, V7
+	// Accumulate the count in low 64-bit element of V8 when inside the loop
+	VADD	V7, V8
+	BNE	chunk_loop
+	VMOV	V8.D[0], R6
+	ADD	R6, R11, R11
+	CBZ	R2, done
+tail:
+	// Work with tail shorter than 32 bytes
+	MOVBU.P	1(R0), R5
+	SUB	$1, R2, R2
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	CBNZ	R2, tail
+done:
+	MOVD	R11, (R8)
+	RET
--- a/src/internal/bytealg/count_generic.go
+++ b/src/internal/bytealg/count_generic.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build !amd64
+// +build !amd64,!arm64

 package bytealg

--- a/src/internal/bytealg/count_native.go
+++ b/src/internal/bytealg/count_native.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build amd64
+// +build amd64 arm64

 package bytealg

--- a/src/internal/bytealg/equal_native.go
+++ b/src/internal/bytealg/equal_native.go
@ -4,24 +4,8 @@

 package bytealg

-import (
-	"internal/cpu"
-	"unsafe"
-)
-
 // Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly.

-// Because equal_native.go is unconditional, it's a good place to compute asm constants.
-// TODO: find a better way to do this?
-
-// Offsets into internal/cpu records for use in assembly.
-const (
-	x86_HasSSE2   = unsafe.Offsetof(cpu.X86.HasSSE2)
-	x86_HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
-	x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
-	s390x_HasVX   = unsafe.Offsetof(cpu.S390X.HasVX)
-)
-
 //go:noescape
 func Equal(a, b []byte) bool

--- a/src/internal/bytealg/index_amd64.go
+++ b/src/internal/bytealg/index_amd64.go
@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	if cpu.X86.HasAVX2 {
+		MaxLen = 63
+	} else {
+		MaxLen = 31
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a little slop to start.
+	return (n + 16) / 8
+}
--- a/src/internal/bytealg/index_amd64.s
+++ b/src/internal/bytealg/index_amd64.s
@ -0,0 +1,274 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+24(FP), BP
+	MOVQ b_len+32(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+48(FP), R11
+	JMP  indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+16(FP), BP
+	MOVQ b_len+24(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+32(FP), R11
+	JMP  indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// BP: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+	CMPQ AX, DX
+	JA fail
+	CMPQ DX, $16
+	JAE sse42
+no_sse42:
+	CMPQ AX, $2
+	JA   _3_or_more
+	MOVW (BP), BP
+	LEAQ -1(DI)(DX*1), DX
+loop2:
+	MOVW (DI), SI
+	CMPW SI,BP
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop2
+	JMP fail
+_3_or_more:
+	CMPQ AX, $3
+	JA   _4_or_more
+	MOVW 1(BP), BX
+	MOVW (BP), BP
+	LEAQ -2(DI)(DX*1), DX
+loop3:
+	MOVW (DI), SI
+	CMPW SI,BP
+	JZ   partial_success3
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+partial_success3:
+	MOVW 1(DI), SI
+	CMPW SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+_4_or_more:
+	CMPQ AX, $4
+	JA   _5_or_more
+	MOVL (BP), BP
+	LEAQ -3(DI)(DX*1), DX
+loop4:
+	MOVL (DI), SI
+	CMPL SI,BP
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop4
+	JMP fail
+_5_or_more:
+	CMPQ AX, $7
+	JA   _8_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVL -4(BP)(AX*1), BX
+	MOVL (BP), BP
+loop5to7:
+	MOVL (DI), SI
+	CMPL SI,BP
+	JZ   partial_success5to7
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+partial_success5to7:
+	MOVL -4(AX)(DI*1), SI
+	CMPL SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+_8_or_more:
+	CMPQ AX, $8
+	JA   _9_or_more
+	MOVQ (BP), BP
+	LEAQ -7(DI)(DX*1), DX
+loop8:
+	MOVQ (DI), SI
+	CMPQ SI,BP
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop8
+	JMP fail
+_9_or_more:
+	CMPQ AX, $15
+	JA   _16_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVQ -8(BP)(AX*1), BX
+	MOVQ (BP), BP
+loop9to15:
+	MOVQ (DI), SI
+	CMPQ SI,BP
+	JZ   partial_success9to15
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+partial_success9to15:
+	MOVQ -8(AX)(DI*1), SI
+	CMPQ SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+_16_or_more:
+	CMPQ AX, $16
+	JA   _17_or_more
+	MOVOU (BP), X1
+	LEAQ -15(DI)(DX*1), DX
+loop16:
+	MOVOU (DI), X2
+	PCMPEQB X1, X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop16
+	JMP fail
+_17_or_more:
+	CMPQ AX, $31
+	JA   _32_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVOU -16(BP)(AX*1), X0
+	MOVOU (BP), X1
+loop17to31:
+	MOVOU (DI), X2
+	PCMPEQB X1,X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   partial_success17to31
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+partial_success17to31:
+	MOVOU -16(AX)(DI*1), X3
+	PCMPEQB X0, X3
+	PMOVMSKB X3, SI
+	CMPQ  SI, $0xffff
+	JE success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+// We can get here only when AVX2 is enabled and MaxLen is set to 63
+// So no need to check cpuid
+_32_or_more:
+	CMPQ AX, $32
+	JA   _33_to_63
+	VMOVDQU (BP), Y1
+	LEAQ -31(DI)(DX*1), DX
+loop32:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop32
+	JMP fail_avx2
+_33_to_63:
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	VMOVDQU -32(BP)(AX*1), Y0
+	VMOVDQU (BP), Y1
+loop33to63:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   partial_success33to63
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+	JMP fail_avx2
+partial_success33to63:
+	VMOVDQU -32(AX)(DI*1), Y3
+	VPCMPEQB Y0, Y3, Y4
+	VPMOVMSKB Y4, SI
+	CMPL  SI, $0xffffffff
+	JE success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+fail_avx2:
+	VZEROUPPER
+fail:
+	MOVQ $-1, (R11)
+	RET
+success_avx2:
+	VZEROUPPER
+	JMP success
+sse42:
+	CMPB internal∕cpu·X86+const_x86_HasSSE42(SB), $1
+	JNE no_sse42
+	CMPQ AX, $12
+	// PCMPESTRI is slower than normal compare,
+	// so using it makes sense only if we advance 4+ bytes per compare
+	// This value was determined experimentally and is the ~same
+	// on Nehalem (first with SSE42) and Haswell.
+	JAE _9_or_more
+	LEAQ 16(BP), SI
+	TESTW $0xff0, SI
+	JEQ no_sse42
+	MOVOU (BP), X1
+	LEAQ -15(DI)(DX*1), SI
+	MOVQ $16, R9
+	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
+	// for equality (bits 2,3 are 11)
+	// result is not masked or inverted (bits 4,5 are 00)
+	// and corresponds to first matching byte (bit 6 is 0)
+	PCMPESTRI $0x0c, (DI), X1
+	// CX == 16 means no match,
+	// CX > R9 means partial match at the end of the string,
+	// otherwise sep is at offset CX from X1 start
+	CMPQ CX, R9
+	JBE sse42_success
+	ADDQ R9, DI
+	CMPQ DI, SI
+	JB loop_sse42
+	PCMPESTRI $0x0c, -1(SI), X1
+	CMPQ CX, R9
+	JA fail
+	LEAQ -1(SI), DI
+sse42_success:
+	ADDQ CX, DI
+success:
+	SUBQ R10, DI
+	MOVQ DI, (R11)
+	RET
--- a/src/internal/bytealg/index_arm64.go
+++ b/src/internal/bytealg/index_arm64.go
@ -0,0 +1,23 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Empirical data shows that using Index can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+	// 8 bytes can be completely loaded into 1 register.
+	MaxLen = 8
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 16 characters, plus a little slop to start.
+	return 4 + n>>4
+}
--- a/src/internal/bytealg/index_arm64.s
+++ b/src/internal/bytealg/index_arm64.s
@ -1,88 +1,40 @@
-// Copyright 2017 The Go Authors. All rights reserved.
+// Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

+#include "go_asm.h"
 #include "textflag.h"

-// countByte(s []byte, c byte) int
-TEXT bytes·countByte(SB),NOSPLIT,$0-40
-	MOVD	s_base+0(FP), R0
-	MOVD	s_len+8(FP), R2
-	MOVBU	c+24(FP), R1
-	// R11 = count of byte to search
-	MOVD	$0, R11
-	// short path to handle 0-byte case
-	CBZ	R2, done
-	CMP	$0x20, R2
-	// jump directly to tail if length < 32
-	BLO	tail
-	ANDS	$0x1f, R0, R9
-	BEQ	chunk
-	// Work with not 32-byte aligned head
-	BIC	$0x1f, R0, R3
-	ADD	$0x20, R3
-head_loop:
-	MOVBU.P	1(R0), R5
-	CMP	R5, R1
-	CINC	EQ, R11, R11
-	SUB	$1, R2, R2
-	CMP	R0, R3
-	BNE	head_loop
-	// Work with 32-byte aligned chunks
-chunk:
-	BIC	$0x1f, R2, R9
-	// The first chunk can also be the last
-	CBZ	R9, tail
-	// R3 = end of 32-byte chunks
-	ADD	R0, R9, R3
-	MOVD	$1, R5
-	VMOV	R5, V5.B16
-	// R2 = length of tail
-	SUB	R9, R2, R2
-	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
-	VMOV	R1, V0.B16
-	// Clear the low 64-bit element of V7 and V8
-	VEOR	V7.B8, V7.B8, V7.B8
-	VEOR	V8.B8, V8.B8, V8.B8
-	// Count the target byte in 32-byte chunk
-chunk_loop:
-	VLD1.P	(R0), [V1.B16, V2.B16]
-	CMP	R0, R3
-	VCMEQ	V0.B16, V1.B16, V3.B16
-	VCMEQ	V0.B16, V2.B16, V4.B16
-	// Clear the higher 7 bits
-	VAND	V5.B16, V3.B16, V3.B16
-	VAND	V5.B16, V4.B16, V4.B16
-	// Count lanes match the requested byte
-	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
-	VUADDLV	V6.B16, V7
-	// Accumulate the count in low 64-bit element of V8 when inside the loop
-	VADD	V7, V8
-	BNE	chunk_loop
-	VMOV	V8.D[0], R6
-	ADD	R6, R11, R11
-	CBZ	R2, done
-tail:
-	// Work with tail shorter than 32 bytes
-	MOVBU.P	1(R0), R5
-	SUB	$1, R2, R2
-	CMP	R5, R1
-	CINC	EQ, R11, R11
-	CBNZ	R2, tail
-done:
-	MOVD	R11, ret+32(FP)
-	RET
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+24(FP), R2
+	MOVD	b_len+32(FP), R3
+	MOVD	$ret+48(FP), R9
+	B	indexbody<>(SB)

-// indexShortStr(s, sep []byte) int
-// precondition: 2 <= len(sep) <= 8
-TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+16(FP), R2
+	MOVD	b_len+24(FP), R3
+	MOVD	$ret+32(FP), R9
+	B	indexbody<>(SB)
+
+// input:
+//   R0: haystack
+//   R1: length of haystack
+//   R2: needle
+//   R3: length of needle (2 <= len <= 8)
+//   R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
 	// main idea is to load 'sep' into separate register(s)
 	// to avoid repeatedly re-load it again and again
 	// for sebsequent substring comparisons
-	MOVD	s+0(FP), R0
-	MOVD	s_len+8(FP), R1
-	MOVD	sep+24(FP), R2
-	MOVD	sep_len+32(FP), R3
+	// The haystack, needle and result address arrive in R0-R3 and R9,
+	// already loaded by ·Index / ·IndexString. Do not reload them from FP
+	// here: the two callers lay out b and ret at different frame offsets
+	// (b at +24/+32 for []byte, +16/+24 for string), so []byte offsets
+	// would read the wrong words when entered from ·IndexString.
 	SUB	R3, R1, R4
 	// R4 contains the start of last substring for comparsion
 	ADD	R0, R4, R4
@ -189,9 +141,9 @@ loop_2:
 	BLS	loop_2
 not_found:
 	MOVD	$-1, R0
-	MOVD	R0, ret+48(FP)
+	MOVD	R0, (R9)
 	RET
 found:
 	SUB	R8, R0, R0
-	MOVD	R0, ret+48(FP)
+	MOVD	R0, (R9)
 	RET
--- a/src/internal/bytealg/index_generic.go
+++ b/src/internal/bytealg/index_generic.go
@ -0,0 +1,29 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm64,!s390x
+
+package bytealg
+
+const MaxBruteForce = 0
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int {
+	panic("unimplemented")
+}
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int {
+	panic("unimplemented")
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	panic("unimplemented")
+}
--- a/src/internal/bytealg/index_native.go
+++ b/src/internal/bytealg/index_native.go
@ -0,0 +1,19 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 arm64 s390x
+
+package bytealg
+
+//go:noescape
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int
+
+//go:noescape
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int
--- a/src/internal/bytealg/index_s390x.go
+++ b/src/internal/bytealg/index_s390x.go
@ -0,0 +1,31 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	// Note: we're kind of lucky that this flag is available at this point.
+	// The runtime sets HasVX when processing auxv records, and that happens
+	// to happen *before* running the init functions of packages that
+	// the runtime depends on.
+	// TODO: it would really be nicer for internal/cpu to figure out this
+	// flag by itself. Then we wouldn't need to depend on quirks of
+	// early startup initialization order.
+	if cpu.S390X.HasVX {
+		MaxLen = 64
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a little slop to start.
+	return (n + 16) / 8
+}
--- a/src/internal/bytealg/index_s390x.s
+++ b/src/internal/bytealg/index_s390x.s
@ -0,0 +1,216 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// func Index(a, b []byte) int. Caller must confirm availability of vx facility before calling.
+TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)    (s is the argument a)
+	LMG	b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)  (sep is the argument b)
+	MOVD	$ret+48(FP), R5 // R5=&ret; indexbody stores the result through it
+	BR	indexbody<>(SB) // branch, not a call: indexbody executes the RET
+
+// func IndexString(a, b string) int. Caller must confirm availability of vx facility before calling.
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)    (s is the argument a)
+	LMG	b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)  (sep is the argument b)
+	MOVD	$ret+32(FP), R5 // R5=&ret; indexbody stores the result through it
+	BR	indexbody<>(SB) // branch, not a call: indexbody executes the RET
+
+// indexbody is the search routine that Index and IndexString both branch to.
+// s: string we are searching; sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int); receives the offset of the match in s, or -1
+// Caller must confirm availability of vx facility before calling.
+TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
+	CMPBGT	R4, R2, notfound // len(sep) > len(s)
+	ADD	R1, R2
+	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid start position)
+	CMPBEQ	R4, $0, notfound // an empty sep is reported as not found
+	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
+	VLL	R4, (R3), V0 // contains first 16 bytes of sep
+	MOVD	R1, R7 // R7=current search position in s
+index2plus:
+	CMPBNE	R4, $1, index3plus // len(sep) != 2
+	MOVD	$15(R7), R9
+	CMPBGE	R9, R2, index2to16 // R7+15 >= R2: use the one-position-at-a-time loop
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	CMPBGE	R9, R2, index2to16 // repeats the test above (R9 and R2 are unchanged)
+index2loop:
+	VL	0(R7), V2          // 16 bytes, even indices
+	VL	1(R7), V4          // 16 bytes, odd indices
+	VCEQH	V1, V2, V5         // compare even indices
+	VCEQH	V1, V4, V6         // compare odd indices
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16 // start positions remain: finish them in index2to16
+	BR	notfound
+
+index3plus:
+	CMPBNE	R4, $2, index4plus // len(sep) != 3
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16 // R7+15 >= R2: use the one-position-at-a-time loop
+	MOVD	$1, R0
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	VREPB	$2, V0, V8
+index3loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 2-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
+	VCEQH	V1, V2, V5         // compare 2-byte even indices
+	VCEQH	V1, V4, V6         // compare 2-byte odd indices
+	VCEQB	V8, V9, V10        // compare last bytes
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VN	V7, V10, V7        // AND indices with last byte
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16 // start positions remain: finish them in index2to16
+	BR	notfound
+
+index4plus:
+	CMPBNE	R4, $3, index5plus // len(sep) != 4
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16 // R7+15 >= R2: use the one-position-at-a-time loop
+	MOVD	$2, R0
+	VGBM	$0x8888, V29       // 0xff000000ff000000...
+	VGBM	$0x2222, V30       // 0x0000ff000000ff00...
+	VGBM	$0xcccc, V31       // 0xffff0000ffff0000...
+	VONE	V16
+	VREPF	$0, V0, V1
+index4loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 3-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
+	VSLDB	$3, V2, V3, V10    // V10=(V2:V3)<<3
+	VCEQF	V1, V2, V5         // compare index 0, 4, ...
+	VCEQF	V1, V4, V6         // compare index 1, 5, ...
+	VCEQF	V1, V9, V11        // compare index 2, 6, ...
+	VCEQF	V1, V10, V12       // compare index 3, 7, ...
+	VSEL	V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
+	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+	VSEL	V13, V14, V31, V7  // final merge
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16 // start positions remain: finish them in index2to16
+	BR	notfound
+
+index5plus:
+	CMPBGT	R4, $15, index17plus // len(sep) > 16
+index2to16:
+	CMPBGT	R7, R2, notfound // no start position left
+	MOVD	$1(R7), R8
+	CMPBGT	R8, R2, index2to16tail // R7==R2: exactly one start position left
+index2to16loop:
+	// unrolled 2x: each pass tests start positions R7 and R7+1
+	VLL	R4, (R7), V1
+	VLL	R4, 1(R7), V2
+	VCEQGS	V0, V1, V3
+	BEQ	found
+	MOVD	$1(R7), R7
+	VCEQGS	V0, V2, V4
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLT	R7, R2, index2to16loop // at least two start positions left
+	CMPBGT	R7, R2, notfound // none left; otherwise R7==R2 and we fall into the tail
+index2to16tail:
+	VLL	R4, (R7), V1
+	VCEQGS	V0, V1, V2
+	BEQ	found
+	BR	notfound
+
+index17plus:
+	CMPBGT	R4, $31, index33plus // len(sep) > 32
+	SUB	$16, R4, R0 // R0=len(sep)-17, VLL index for the bytes after the first 16
+	VLL	R0, 16(R3), V1
+	VONE	V7
+index17to32loop:
+	VL	(R7), V2
+	VLL	R0, 16(R7), V3
+	VCEQG	V0, V2, V4
+	VCEQG	V1, V3, V5
+	VN	V4, V5, V6
+	VCEQGS	V6, V7, V8
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index17to32loop
+	BR	notfound
+
+index33plus:
+	CMPBGT	R4, $47, index49plus // len(sep) > 48
+	SUB	$32, R4, R0 // R0=len(sep)-33, VLL index for the bytes after the first 32
+	VL	16(R3), V1
+	VLL	R0, 32(R3), V2
+	VONE	V11
+index33to48loop:
+	VL	(R7), V3
+	VL	16(R7), V4
+	VLL	R0, 32(R7), V5
+	VCEQG	V0, V3, V6
+	VCEQG	V1, V4, V7
+	VCEQG	V2, V5, V8
+	VN	V6, V7, V9
+	VN	V8, V9, V10
+	VCEQGS	V10, V11, V12
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index33to48loop
+	BR	notfound
+
+index49plus:
+	CMPBGT	R4, $63, index65plus // len(sep) > 64
+	SUB	$48, R4, R0 // R0=len(sep)-49, VLL index for the bytes after the first 48
+	VL	16(R3), V1
+	VL	32(R3), V2
+	VLL	R0, 48(R3), V3
+	VONE	V15
+index49to64loop:
+	VL	(R7), V4
+	VL	16(R7), V5
+	VL	32(R7), V6
+	VLL	R0, 48(R7), V7
+	VCEQG	V0, V4, V8
+	VCEQG	V1, V5, V9
+	VCEQG	V2, V6, V10
+	VCEQG	V3, V7, V11
+	VN	V8, V9, V12
+	VN	V10, V11, V13
+	VN	V12, V13, V14
+	VCEQGS	V14, V15, V16
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index49to64loop
+notfound:
+	MOVD	$-1, (R5) // *ret = -1
+	RET
+
+index65plus:
+	// not implemented
+	MOVD	$0, (R0) // NOTE(review): looks like a deliberate faulting store for len(sep) > 64 — confirm
+	RET
+
+foundV17: // index is in doubleword V17[0]
+	VLGVG	$0, V17, R8
+	ADD	R8, R7 // advance R7 by that index
+found:
+	SUB	R1, R7 // turn the address in R7 into an offset from &s[0]
+	MOVD	R7, (R5) // *ret = offset
+	RET
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@ -1358,274 +1358,6 @@ DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
 DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
 GLOBL shifts<>(SB),RODATA,$256

-TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
-	MOVQ s+0(FP), DI
-	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
-	MOVQ s_len+8(FP), DX
-	MOVQ c+16(FP), BP
-	MOVQ c_len+24(FP), AX
-	MOVQ DI, R10
-	LEAQ ret+32(FP), R11
-	JMP  runtime·indexShortStr(SB)
-
-TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
-	MOVQ s+0(FP), DI
-	MOVQ s_len+8(FP), DX
-	MOVQ c+24(FP), BP
-	MOVQ c_len+32(FP), AX
-	MOVQ DI, R10
-	LEAQ ret+48(FP), R11
-	JMP  runtime·indexShortStr(SB)
-
-// AX: length of string, that we are searching for
-// DX: length of string, in which we are searching
-// DI: pointer to string, in which we are searching
-// BP: pointer to string, that we are searching for
-// R11: address, where to put return value
-TEXT runtime·indexShortStr(SB),NOSPLIT,$0
-	CMPQ AX, DX
-	JA fail
-	CMPQ DX, $16
-	JAE sse42
-no_sse42:
-	CMPQ AX, $2
-	JA   _3_or_more
-	MOVW (BP), BP
-	LEAQ -1(DI)(DX*1), DX
-loop2:
-	MOVW (DI), SI
-	CMPW SI,BP
-	JZ success
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop2
-	JMP fail
-_3_or_more:
-	CMPQ AX, $3
-	JA   _4_or_more
-	MOVW 1(BP), BX
-	MOVW (BP), BP
-	LEAQ -2(DI)(DX*1), DX
-loop3:
-	MOVW (DI), SI
-	CMPW SI,BP
-	JZ   partial_success3
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop3
-	JMP fail
-partial_success3:
-	MOVW 1(DI), SI
-	CMPW SI,BX
-	JZ success
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop3
-	JMP fail
-_4_or_more:
-	CMPQ AX, $4
-	JA   _5_or_more
-	MOVL (BP), BP
-	LEAQ -3(DI)(DX*1), DX
-loop4:
-	MOVL (DI), SI
-	CMPL SI,BP
-	JZ   success
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop4
-	JMP fail
-_5_or_more:
-	CMPQ AX, $7
-	JA   _8_or_more
-	LEAQ 1(DI)(DX*1), DX
-	SUBQ AX, DX
-	MOVL -4(BP)(AX*1), BX
-	MOVL (BP), BP
-loop5to7:
-	MOVL (DI), SI
-	CMPL SI,BP
-	JZ   partial_success5to7
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop5to7
-	JMP fail
-partial_success5to7:
-	MOVL -4(AX)(DI*1), SI
-	CMPL SI,BX
-	JZ success
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop5to7
-	JMP fail
-_8_or_more:
-	CMPQ AX, $8
-	JA   _9_or_more
-	MOVQ (BP), BP
-	LEAQ -7(DI)(DX*1), DX
-loop8:
-	MOVQ (DI), SI
-	CMPQ SI,BP
-	JZ   success
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop8
-	JMP fail
-_9_or_more:
-	CMPQ AX, $15
-	JA   _16_or_more
-	LEAQ 1(DI)(DX*1), DX
-	SUBQ AX, DX
-	MOVQ -8(BP)(AX*1), BX
-	MOVQ (BP), BP
-loop9to15:
-	MOVQ (DI), SI
-	CMPQ SI,BP
-	JZ   partial_success9to15
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop9to15
-	JMP fail
-partial_success9to15:
-	MOVQ -8(AX)(DI*1), SI
-	CMPQ SI,BX
-	JZ success
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop9to15
-	JMP fail
-_16_or_more:
-	CMPQ AX, $16
-	JA   _17_or_more
-	MOVOU (BP), X1
-	LEAQ -15(DI)(DX*1), DX
-loop16:
-	MOVOU (DI), X2
-	PCMPEQB X1, X2
-	PMOVMSKB X2, SI
-	CMPQ  SI, $0xffff
-	JE   success
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop16
-	JMP fail
-_17_or_more:
-	CMPQ AX, $31
-	JA   _32_or_more
-	LEAQ 1(DI)(DX*1), DX
-	SUBQ AX, DX
-	MOVOU -16(BP)(AX*1), X0
-	MOVOU (BP), X1
-loop17to31:
-	MOVOU (DI), X2
-	PCMPEQB X1,X2
-	PMOVMSKB X2, SI
-	CMPQ  SI, $0xffff
-	JE   partial_success17to31
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop17to31
-	JMP fail
-partial_success17to31:
-	MOVOU -16(AX)(DI*1), X3
-	PCMPEQB X0, X3
-	PMOVMSKB X3, SI
-	CMPQ  SI, $0xffff
-	JE success
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop17to31
-	JMP fail
-// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
-// So no need to check cpuid
-_32_or_more:
-	CMPQ AX, $32
-	JA   _33_to_63
-	VMOVDQU (BP), Y1
-	LEAQ -31(DI)(DX*1), DX
-loop32:
-	VMOVDQU (DI), Y2
-	VPCMPEQB Y1, Y2, Y3
-	VPMOVMSKB Y3, SI
-	CMPL  SI, $0xffffffff
-	JE   success_avx2
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop32
-	JMP fail_avx2
-_33_to_63:
-	LEAQ 1(DI)(DX*1), DX
-	SUBQ AX, DX
-	VMOVDQU -32(BP)(AX*1), Y0
-	VMOVDQU (BP), Y1
-loop33to63:
-	VMOVDQU (DI), Y2
-	VPCMPEQB Y1, Y2, Y3
-	VPMOVMSKB Y3, SI
-	CMPL  SI, $0xffffffff
-	JE   partial_success33to63
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop33to63
-	JMP fail_avx2
-partial_success33to63:
-	VMOVDQU -32(AX)(DI*1), Y3
-	VPCMPEQB Y0, Y3, Y4
-	VPMOVMSKB Y4, SI
-	CMPL  SI, $0xffffffff
-	JE success_avx2
-	ADDQ $1,DI
-	CMPQ DI,DX
-	JB loop33to63
-fail_avx2:
-	VZEROUPPER
-fail:
-	MOVQ $-1, (R11)
-	RET
-success_avx2:
-	VZEROUPPER
-	JMP success
-sse42:
-	CMPB runtime·support_sse42(SB), $1
-	JNE no_sse42
-	CMPQ AX, $12
-	// PCMPESTRI is slower than normal compare,
-	// so using it makes sense only if we advance 4+ bytes per compare
-	// This value was determined experimentally and is the ~same
-	// on Nehalem (first with SSE42) and Haswell.
-	JAE _9_or_more
-	LEAQ 16(BP), SI
-	TESTW $0xff0, SI
-	JEQ no_sse42
-	MOVOU (BP), X1
-	LEAQ -15(DI)(DX*1), SI
-	MOVQ $16, R9
-	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
-loop_sse42:
-	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
-	// for equality (bits 2,3 are 11)
-	// result is not masked or inverted (bits 4,5 are 00)
-	// and corresponds to first matching byte (bit 6 is 0)
-	PCMPESTRI $0x0c, (DI), X1
-	// CX == 16 means no match,
-	// CX > R9 means partial match at the end of the string,
-	// otherwise sep is at offset CX from X1 start
-	CMPQ CX, R9
-	JBE sse42_success
-	ADDQ R9, DI
-	CMPQ DI, SI
-	JB loop_sse42
-	PCMPESTRI $0x0c, -1(SI), X1
-	CMPQ CX, R9
-	JA fail
-	LEAQ -1(SI), DI
-sse42_success:
-	ADDQ CX, DI
-success:
-	SUBQ R10, DI
-	MOVQ DI, (R11)
-	RET
-
 TEXT runtime·return0(SB), NOSPLIT, $0
 	MOVL	$0, AX
 	RET
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@ -796,230 +796,6 @@ TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
        // compile barrier.
 	RET

-// func supportsVX() bool
-TEXT strings·supportsVX(SB),NOSPLIT,$0-1
-	MOVBZ	runtime·cpu+facilities_hasVX(SB), R0
-	MOVB	R0, ret+0(FP)
-	RET
-
-// func supportsVX() bool
-TEXT bytes·supportsVX(SB),NOSPLIT,$0-1
-	MOVBZ	runtime·cpu+facilities_hasVX(SB), R0
-	MOVB	R0, ret+0(FP)
-	RET
-
-// func indexShortStr(s, sep string) int
-// Caller must confirm availability of vx facility before calling.
-TEXT strings·indexShortStr(SB),NOSPLIT|NOFRAME,$0-40
-	LMG	s+0(FP), R1, R2   // R1=&s[0],   R2=len(s)
-	LMG	sep+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
-	MOVD	$ret+32(FP), R5
-	BR	runtime·indexShortStr(SB)
-
-// func indexShortStr(s, sep []byte) int
-// Caller must confirm availability of vx facility before calling.
-TEXT bytes·indexShortStr(SB),NOSPLIT|NOFRAME,$0-56
-	LMG	s+0(FP), R1, R2    // R1=&s[0],   R2=len(s)
-	LMG	sep+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
-	MOVD	$ret+48(FP), R5
-	BR	runtime·indexShortStr(SB)
-
-// s: string we are searching
-// sep: string to search for
-// R1=&s[0], R2=len(s)
-// R3=&sep[0], R4=len(sep)
-// R5=&ret (int)
-// Caller must confirm availability of vx facility before calling.
-TEXT runtime·indexShortStr(SB),NOSPLIT|NOFRAME,$0
-	CMPBGT	R4, R2, notfound
-	ADD	R1, R2
-	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
-	CMPBEQ	R4, $0, notfound
-	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
-	VLL	R4, (R3), V0 // contains first 16 bytes of sep
-	MOVD	R1, R7
-index2plus:
-	CMPBNE	R4, $1, index3plus
-	MOVD	$15(R7), R9
-	CMPBGE	R9, R2, index2to16
-	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
-	VONE	V16
-	VREPH	$0, V0, V1
-	CMPBGE	R9, R2, index2to16
-index2loop:
-	VL	0(R7), V2          // 16 bytes, even indices
-	VL	1(R7), V4          // 16 bytes, odd indices
-	VCEQH	V1, V2, V5         // compare even indices
-	VCEQH	V1, V4, V6         // compare odd indices
-	VSEL	V5, V6, V31, V7    // merge even and odd indices
-	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
-	BLT	foundV17
-	MOVD	$16(R7), R7        // R7+=16
-	ADD	$15, R7, R9
-	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
-	CMPBLE	R7, R2, index2to16
-	BR	notfound
-
-index3plus:
-	CMPBNE	R4, $2, index4plus
-	ADD	$15, R7, R9
-	CMPBGE	R9, R2, index2to16
-	MOVD	$1, R0
-	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
-	VONE	V16
-	VREPH	$0, V0, V1
-	VREPB	$2, V0, V8
-index3loop:
-	VL	(R7), V2           // load 16-bytes into V2
-	VLL	R0, 16(R7), V3     // load 2-bytes into V3
-	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
-	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
-	VCEQH	V1, V2, V5         // compare 2-byte even indices
-	VCEQH	V1, V4, V6         // compare 2-byte odd indices
-	VCEQB	V8, V9, V10        // compare last bytes
-	VSEL	V5, V6, V31, V7    // merge even and odd indices
-	VN	V7, V10, V7        // AND indices with last byte
-	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
-	BLT	foundV17
-	MOVD	$16(R7), R7        // R7+=16
-	ADD	$15, R7, R9
-	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
-	CMPBLE	R7, R2, index2to16
-	BR	notfound
-
-index4plus:
-	CMPBNE	R4, $3, index5plus
-	ADD	$15, R7, R9
-	CMPBGE	R9, R2, index2to16
-	MOVD	$2, R0
-	VGBM	$0x8888, V29       // 0xff000000ff000000...
-	VGBM	$0x2222, V30       // 0x0000ff000000ff00...
-	VGBM	$0xcccc, V31       // 0xffff0000ffff0000...
-	VONE	V16
-	VREPF	$0, V0, V1
-index4loop:
-	VL	(R7), V2           // load 16-bytes into V2
-	VLL	R0, 16(R7), V3     // load 3-bytes into V3
-	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
-	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<1
-	VSLDB	$3, V2, V3, V10    // V10=(V2:V3)<<1
-	VCEQF	V1, V2, V5         // compare index 0, 4, ...
-	VCEQF	V1, V4, V6         // compare index 1, 5, ...
-	VCEQF	V1, V9, V11        // compare index 2, 6, ...
-	VCEQF	V1, V10, V12       // compare index 3, 7, ...
-	VSEL	V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
-	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
-	VSEL	V13, V14, V31, V7  // final merge
-	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
-	BLT	foundV17
-	MOVD	$16(R7), R7        // R7+=16
-	ADD	$15, R7, R9
-	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
-	CMPBLE	R7, R2, index2to16
-	BR	notfound
-
-index5plus:
-	CMPBGT	R4, $15, index17plus
-index2to16:
-	CMPBGT	R7, R2, notfound
-	MOVD	$1(R7), R8
-	CMPBGT	R8, R2, index2to16tail
-index2to16loop:
-	// unrolled 2x
-	VLL	R4, (R7), V1
-	VLL	R4, 1(R7), V2
-	VCEQGS	V0, V1, V3
-	BEQ	found
-	MOVD	$1(R7), R7
-	VCEQGS	V0, V2, V4
-	BEQ	found
-	MOVD	$1(R7), R7
-	CMPBLT	R7, R2, index2to16loop
-	CMPBGT	R7, R2, notfound
-index2to16tail:
-	VLL	R4, (R7), V1
-	VCEQGS	V0, V1, V2
-	BEQ	found
-	BR	notfound
-
-index17plus:
-	CMPBGT	R4, $31, index33plus
-	SUB	$16, R4, R0
-	VLL	R0, 16(R3), V1
-	VONE	V7
-index17to32loop:
-	VL	(R7), V2
-	VLL	R0, 16(R7), V3
-	VCEQG	V0, V2, V4
-	VCEQG	V1, V3, V5
-	VN	V4, V5, V6
-	VCEQGS	V6, V7, V8
-	BEQ	found
-	MOVD	$1(R7), R7
-	CMPBLE  R7, R2, index17to32loop
-	BR	notfound
-
-index33plus:
-	CMPBGT	R4, $47, index49plus
-	SUB	$32, R4, R0
-	VL	16(R3), V1
-	VLL	R0, 32(R3), V2
-	VONE	V11
-index33to48loop:
-	VL	(R7), V3
-	VL	16(R7), V4
-	VLL	R0, 32(R7), V5
-	VCEQG	V0, V3, V6
-	VCEQG	V1, V4, V7
-	VCEQG	V2, V5, V8
-	VN	V6, V7, V9
-	VN	V8, V9, V10
-	VCEQGS	V10, V11, V12
-	BEQ	found
-	MOVD	$1(R7), R7
-	CMPBLE  R7, R2, index33to48loop
-	BR	notfound
-
-index49plus:
-	CMPBGT	R4, $63, index65plus
-	SUB	$48, R4, R0
-	VL	16(R3), V1
-	VL	32(R3), V2
-	VLL	R0, 48(R3), V3
-	VONE	V15
-index49to64loop:
-	VL	(R7), V4
-	VL	16(R7), V5
-	VL	32(R7), V6
-	VLL	R0, 48(R7), V7
-	VCEQG	V0, V4, V8
-	VCEQG	V1, V5, V9
-	VCEQG	V2, V6, V10
-	VCEQG	V3, V7, V11
-	VN	V8, V9, V12
-	VN	V10, V11, V13
-	VN	V12, V13, V14
-	VCEQGS	V14, V15, V16
-	BEQ	found
-	MOVD	$1(R7), R7
-	CMPBLE  R7, R2, index49to64loop
-notfound:
-	MOVD	$-1, (R5)
-	RET
-
-index65plus:
-	// not implemented
-	MOVD	$0, (R0)
-	RET
-
-foundV17: // index is in doubleword V17[0]
-	VLGVG	$0, V17, R8
-	ADD	R8, R7
-found:
-	SUB	R1, R7
-	MOVD	R7, (R5)
-	RET
-
 // This is called from .init_array and follows the platform, not Go, ABI.
 // We are overly conservative. We could only save the registers we use.
 // However, since this function is only called once per loaded module
--- a/src/runtime/os_linux_s390x.go
+++ b/src/runtime/os_linux_s390x.go
@ -4,32 +4,16 @@

 package runtime

-import (
-	internalcpu "internal/cpu"
-	"runtime/internal/sys"
-)
+import "internal/cpu"

 const (
 	// bit masks taken from bits/hwcap.h
 	_HWCAP_S390_VX = 2048 // vector facility
 )

-// facilities is padded to avoid false sharing.
-type facilities struct {
-	_     [sys.CacheLineSize]byte
-	hasVX bool // vector facility
-	_     [sys.CacheLineSize]byte
-}
-
-// cpu indicates the availability of s390x facilities that can be used in
-// Go assembly but are optional on models supported by Go.
-// TODO: remove this once we're only using internal/cpu.
-var cpu facilities
-
 func archauxv(tag, val uintptr) {
 	switch tag {
 	case _AT_HWCAP: // CPU capability bit flags
-		internalcpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
-		cpu.hasVX = val&_HWCAP_S390_VX != 0
+		cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0 // later read by internal/bytealg's init to set MaxLen
 	}
 }
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@ -932,6 +932,85 @@ func EqualFold(s, t string) bool {
 	return s == t
 }

+// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
+func Index(s, substr string) int {
+	n := len(substr)
+	switch {
+	case n == 0: // the empty string matches at offset 0
+		return 0
+	case n == 1:
+		return IndexByte(s, substr[0])
+	case n == len(s):
+		if substr == s {
+			return 0
+		}
+		return -1
+	case n > len(s):
+		return -1
+	case n <= bytealg.MaxLen: // here 2 <= n < len(s), so bytealg.IndexString may be used
+		// Use brute force when s and substr are both small.
+		if len(s) <= bytealg.MaxBruteForce {
+			return bytealg.IndexString(s, substr)
+		}
+		c := substr[0]
+		i := 0
+		t := s[:len(s)-n+1] // t covers every position at which a match can start
+		fails := 0
+		for i < len(t) {
+			if t[i] != c {
+				// IndexByte is faster than bytealg.IndexString, so use it as long as
+				// we're not getting lots of false positives.
+				o := IndexByte(t[i:], c)
+				if o < 0 {
+					return -1
+				}
+				i += o
+			}
+			if s[i:i+n] == substr {
+				return i
+			}
+			fails++
+			i++
+			// Switch to bytealg.IndexString when IndexByte produces too many false positives.
+			if fails > bytealg.Cutover(i) {
+				r := bytealg.IndexString(s[i:], substr)
+				if r >= 0 {
+					return r + i // r is relative to s[i:]
+				}
+				return -1
+			}
+		}
+		return -1
+	}
+	c := substr[0] // generic path: substr is longer than bytealg.MaxLen
+	i := 0
+	t := s[:len(s)-n+1] // t covers every position at which a match can start
+	fails := 0
+	for i < len(t) {
+		if t[i] != c {
+			o := IndexByte(t[i:], c)
+			if o < 0 {
+				return -1
+			}
+			i += o
+		}
+		if s[i:i+n] == substr {
+			return i
+		}
+		i++
+		fails++
+		if fails >= 4+i>>4 && i < len(t) { // threshold is 4 + (i >> 4): >> binds tighter than +
+			// Too many false positives: switch to Rabin-Karp. See comment in ../bytes/bytes.go.
+			j := indexRabinKarp(s[i:], substr)
+			if j < 0 {
+				return -1
+			}
+			return i + j // j is relative to s[i:]
+		}
+	}
+	return -1
+}
+
 func indexRabinKarp(s, substr string) int {
 	// Rabin-Karp search
 	hashss, pow := hashStr(substr)
--- a/src/strings/strings_amd64.go
+++ b/src/strings/strings_amd64.go
@ -1,79 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package strings
-
-import "internal/cpu"
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
-// indexShortStr requires 2 <= len(c) <= shortStringLen
-func indexShortStr(s, c string) int  // ../runtime/asm_amd64.s
-func countByte(s string, c byte) int // ../runtime/asm_amd64.s
-
-var shortStringLen int
-
-func init() {
-	if cpu.X86.HasAVX2 {
-		shortStringLen = 63
-	} else {
-		shortStringLen = 31
-	}
-}
-
-// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
-func Index(s, substr string) int {
-	n := len(substr)
-	switch {
-	case n == 0:
-		return 0
-	case n == 1:
-		return IndexByte(s, substr[0])
-	case n == len(s):
-		if substr == s {
-			return 0
-		}
-		return -1
-	case n > len(s):
-		return -1
-	case n <= shortStringLen:
-		// Use brute force when s and substr both are small
-		if len(s) <= 64 {
-			return indexShortStr(s, substr)
-		}
-		c := substr[0]
-		i := 0
-		t := s[:len(s)-n+1]
-		fails := 0
-		for i < len(t) {
-			if t[i] != c {
-				// IndexByte skips 16/32 bytes per iteration,
-				// so it's faster than indexShortStr.
-				o := IndexByte(t[i:], c)
-				if o < 0 {
-					return -1
-				}
-				i += o
-			}
-			if s[i:i+n] == substr {
-				return i
-			}
-			fails++
-			i++
-			// Switch to indexShortStr when IndexByte produces too many false positives.
-			// Too many means more that 1 error per 8 characters.
-			// Allow some errors in the beginning.
-			if fails > (i+16)/8 {
-				r := indexShortStr(s[i:], substr)
-				if r >= 0 {
-					return r + i
-				}
-				return -1
-			}
-		}
-		return -1
-	}
-	return indexRabinKarp(s, substr)
-}
--- a/src/strings/strings_generic.go
+++ b/src/strings/strings_generic.go
@ -1,55 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64,!s390x
-
-package strings
-
-// TODO: implements short string optimization on non amd64 platforms
-// and get rid of strings_amd64.go
-
-// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
-func Index(s, substr string) int {
-	n := len(substr)
-	switch {
-	case n == 0:
-		return 0
-	case n == 1:
-		return IndexByte(s, substr[0])
-	case n == len(s):
-		if substr == s {
-			return 0
-		}
-		return -1
-	case n > len(s):
-		return -1
-	}
-	c := substr[0]
-	i := 0
-	t := s[:len(s)-n+1]
-	fails := 0
-	for i < len(t) {
-		if t[i] != c {
-			o := IndexByte(t[i:], c)
-			if o < 0 {
-				return -1
-			}
-			i += o
-		}
-		if s[i:i+n] == substr {
-			return i
-		}
-		i++
-		fails++
-		if fails >= 4+i>>4 && i < len(t) {
-			// See comment in ../bytes/bytes_generic.go.
-			j := indexRabinKarp(s[i:], substr)
-			if j < 0 {
-				return -1
-			}
-			return i + j
-		}
-	}
-	return -1
-}
--- a/src/strings/strings_s390x.go
+++ b/src/strings/strings_s390x.go
@ -1,80 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package strings
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of sep in s,
-// or -1 if sep is not present in s.
-// indexShortStr requires 2 <= len(sep) <= shortStringLen
-func indexShortStr(s, sep string) int // ../runtime/asm_$GOARCH.s
-
-// supportsVX reports whether the vector facility is available.
-// indexShortStr must not be called if the vector facility is not
-// available.
-func supportsVX() bool // ../runtime/asm_s390x.s
-
-var shortStringLen = -1
-
-func init() {
-	if supportsVX() {
-		shortStringLen = 64
-	}
-}
-
-// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
-func Index(s, substr string) int {
-	n := len(substr)
-	switch {
-	case n == 0:
-		return 0
-	case n == 1:
-		return IndexByte(s, substr[0])
-	case n == len(s):
-		if substr == s {
-			return 0
-		}
-		return -1
-	case n > len(s):
-		return -1
-	case n <= shortStringLen:
-		// Use brute force when s and substr both are small
-		if len(s) <= 64 {
-			return indexShortStr(s, substr)
-		}
-		c := substr[0]
-		i := 0
-		t := s[:len(s)-n+1]
-		fails := 0
-		for i < len(t) {
-			if t[i] != c {
-				// IndexByte skips 16/32 bytes per iteration,
-				// so it's faster than indexShortStr.
-				o := IndexByte(t[i:], c)
-				if o < 0 {
-					return -1
-				}
-				i += o
-			}
-			if s[i:i+n] == substr {
-				return i
-			}
-			fails++
-			i++
-			// Switch to indexShortStr when IndexByte produces too many false positives.
-			// Too many means more that 1 error per 8 characters.
-			// Allow some errors in the beginning.
-			if fails > (i+16)/8 {
-				r := indexShortStr(s[i:], substr)
-				if r >= 0 {
-					return r + i
-				}
-				return -1
-			}
-		}
-		return -1
-	}
-	return indexRabinKarp(s, substr)
-}