mirror of
https://github.com/golang/go
synced 2024-11-23 19:30:05 -07:00
strings: speed up Fields
- use a table lookup to detect if a single byte is a space character
- determine the exact number of fields for ASCII and a possibly underestimated number of fields for non-ASCII strings by doing a separate byte-by-byte scan of the input string before collecting the fields in an extra pass
- provide a fast path for ASCII-only strings when collecting the fields
- avoid utf8.DecodeRuneInString and unicode.IsSpace for ASCII characters

Used golang.org/cl/33108 from Joe Tsai as starting point.

name                  old time/op    new time/op    delta
Fields/ASCII/16       284ns ± 1%     116ns ± 2%     -59.30% (p=0.000 n=9+10)
Fields/ASCII/256      3.81µs ± 1%    0.80µs ± 1%    -79.10% (p=0.000 n=10+10)
Fields/ASCII/4096     61.4µs ± 1%    12.3µs ± 1%    -79.96% (p=0.000 n=10+9)
Fields/ASCII/65536    982µs ± 1%     235µs ± 0%     -76.04% (p=0.000 n=10+9)
Fields/ASCII/1048576  16.7ms ± 2%    5.4ms ± 1%     -67.52% (p=0.000 n=10+10)
Fields/Mixed/16       314ns ± 1%     168ns ± 1%     -46.33% (p=0.000 n=9+10)
Fields/Mixed/256      3.92µs ± 1%    1.17µs ± 1%    -70.19% (p=0.000 n=10+10)
Fields/Mixed/4096     69.1µs ± 1%    19.0µs ± 1%    -72.53% (p=0.000 n=10+10)
Fields/Mixed/65536    1.12ms ± 1%    0.39ms ± 0%    -65.37% (p=0.000 n=10+9)
Fields/Mixed/1048576  19.0ms ± 2%    7.3ms ± 4%     -61.75% (p=0.000 n=10+9)

name                  old speed      new speed      delta
Fields/ASCII/16       56.3MB/s ± 1%  138.1MB/s ± 2%  +145.31% (p=0.000 n=9+10)
Fields/ASCII/256      67.1MB/s ± 1%  321.0MB/s ± 1%  +378.26% (p=0.000 n=10+10)
Fields/ASCII/4096     66.7MB/s ± 1%  333.0MB/s ± 1%  +398.97% (p=0.000 n=10+9)
Fields/ASCII/65536    66.7MB/s ± 1%  278.4MB/s ± 0%  +317.39% (p=0.000 n=10+9)
Fields/ASCII/1048576  62.7MB/s ± 2%  192.9MB/s ± 1%  +207.82% (p=0.000 n=10+10)
Fields/Mixed/16       51.0MB/s ± 2%  94.9MB/s ± 1%   +85.87% (p=0.000 n=10+10)
Fields/Mixed/256      65.4MB/s ± 1%  219.2MB/s ± 1%  +235.33% (p=0.000 n=10+10)
Fields/Mixed/4096     59.3MB/s ± 1%  215.7MB/s ± 1%  +263.98% (p=0.000 n=10+10)
Fields/Mixed/65536    58.6MB/s ± 1%  169.1MB/s ± 0%  +188.73% (p=0.000 n=10+9)
Fields/Mixed/1048576  55.1MB/s ± 2%  144.0MB/s ± 4%  +161.44% (p=0.000 n=10+9)

Updates #19789
Updates #17856
Change-Id: If2ce1479542702e9cd65a82a462ba55ac8eb3876
Reviewed-on: https://go-review.googlesource.com/37959
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
This commit is contained in:
parent
5cadc91b3c
commit
bebfd4ba41
@ -290,11 +290,118 @@ func SplitAfter(s, sep string) []string {
|
||||
return genSplit(s, sep, len(sep), -1)
|
||||
}
|
||||
|
||||
// asciiSpace is a lookup table indexed by byte value. The entry is 1 for the
// ASCII white space bytes '\t', '\n', '\v', '\f', '\r' and ' ', and 0 for
// every other byte value.
var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}

// Fields splits the string s around each instance of one or more consecutive white space
// characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an
// empty slice if s contains only white space.
func Fields(s string) []string {
	// First count the fields.
	// This is an exact count if s is ASCII, otherwise it is an approximation
	// that is only used as a capacity hint.
	n := 0
	wasSpace := 1
	// setBits is used to track which bits are set in the bytes of s.
	setBits := uint8(0)
	for i := 0; i < len(s); i++ {
		r := s[i]
		setBits |= r
		isSpace := int(asciiSpace[r])
		// Branch-free "a field starts here": the previous byte was a space
		// (or this is the first byte) and the current byte is not.
		n += wasSpace & ^isSpace
		wasSpace = isSpace
	}

	if setBits < utf8.RuneSelf { // ASCII fast path
		// No byte has its high bit set, so every byte is a complete rune and
		// n is exact: the result can be indexed directly instead of appended to.
		a := make([]string, n)
		na := 0
		i := 0
		// Skip spaces in the front of the input.
		for i < len(s) && asciiSpace[s[i]] != 0 {
			i++
		}
		fieldStart := i
		for i < len(s) {
			if asciiSpace[s[i]] == 0 {
				i++
				continue
			}
			a[na] = s[fieldStart:i]
			na++
			i++
			// Skip spaces in between fields.
			for i < len(s) && asciiSpace[s[i]] != 0 {
				i++
			}
			fieldStart = i
		}
		if fieldStart < len(s) { // Last field might end at EOF.
			a[na] = s[fieldStart:]
		}
		return a
	}

	// Some runes in the input string are not ASCII.
	// Same general approach as in the ASCII path but
	// uses DecodeRuneInString and unicode.IsSpace if
	// a non-ASCII rune needs to be decoded and checked
	// if it corresponds to a space.
	// n is only an estimate here, so the result is grown with append.
	a := make([]string, 0, n)
	i := 0
	// Skip spaces in the front of the input.
	for i < len(s) {
		if c := s[i]; c < utf8.RuneSelf {
			if asciiSpace[c] == 0 {
				break
			}
			i++
		} else {
			r, w := utf8.DecodeRuneInString(s[i:])
			if !unicode.IsSpace(r) {
				break
			}
			i += w
		}
	}
	fieldStart := i
	for i < len(s) {
		if c := s[i]; c < utf8.RuneSelf {
			if asciiSpace[c] == 0 {
				i++
				continue
			}
			a = append(a, s[fieldStart:i])
			i++
		} else {
			r, w := utf8.DecodeRuneInString(s[i:])
			if !unicode.IsSpace(r) {
				i += w
				continue
			}
			a = append(a, s[fieldStart:i])
			i += w
		}
		// Skip spaces in between fields.
		for i < len(s) {
			if c := s[i]; c < utf8.RuneSelf {
				if asciiSpace[c] == 0 {
					break
				}
				i++
			} else {
				r, w := utf8.DecodeRuneInString(s[i:])
				if !unicode.IsSpace(r) {
					break
				}
				i += w
			}
		}
		fieldStart = i
	}
	if fieldStart < len(s) { // Last field might end at EOF.
		a = append(a, s[fieldStart:])
	}
	return a
}
|
||||
|
||||
// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
|
||||
|
@ -452,6 +452,7 @@ var fieldstests = []FieldsTest{
|
||||
{"", []string{}},
|
||||
{" ", []string{}},
|
||||
{" \t ", []string{}},
|
||||
{"\u2000", []string{}},
|
||||
{" abc ", []string{"abc"}},
|
||||
{"1 2 3 4", []string{"1", "2", "3", "4"}},
|
||||
{"1 2 3 4", []string{"1", "2", "3", "4"}},
|
||||
@ -459,6 +460,9 @@ var fieldstests = []FieldsTest{
|
||||
{"1\u20002\u20013\u20024", []string{"1", "2", "3", "4"}},
|
||||
{"\u2000\u2001\u2002", []string{}},
|
||||
{"\n™\t™\n", []string{"™", "™"}},
|
||||
{"\n\u20001™2\u2000 \u2001 ™", []string{"1™2", "™"}},
|
||||
{"\n1\uFFFD \uFFFD2\u20003\uFFFD4", []string{"1\uFFFD", "\uFFFD2", "3\uFFFD4"}},
|
||||
{"1\xFF\u2000\xFF2\xFF \xFF", []string{"1\xFF", "\xFF2\xFF", "\xFF"}},
|
||||
{faces, []string{faces}},
|
||||
}
|
||||
|
||||
@ -1473,19 +1477,55 @@ var makeFieldsInput = func() string {
|
||||
return string(x)
|
||||
}
|
||||
|
||||
var fieldsInput = makeFieldsInput()
|
||||
// makeFieldsInputASCII builds a 1 MiB pure-ASCII benchmark input in which
// each byte is a space with probability 1/10 and the letter 'x' otherwise.
var makeFieldsInputASCII = func() string {
	const size = 1 << 20
	buf := make([]byte, size)
	for k := 0; k < size; k++ {
		// One draw per byte: a zero out of ten outcomes selects a space.
		switch rand.Intn(10) {
		case 0:
			buf[k] = ' '
		default:
			buf[k] = 'x'
		}
	}
	return string(buf)
}
|
||||
|
||||
var stringdata = []struct{ name, data string }{
|
||||
{"ASCII", makeFieldsInputASCII()},
|
||||
{"Mixed", makeFieldsInput()},
|
||||
}
|
||||
|
||||
func BenchmarkFields(b *testing.B) {
|
||||
b.SetBytes(int64(len(fieldsInput)))
|
||||
for i := 0; i < b.N; i++ {
|
||||
Fields(fieldsInput)
|
||||
for _, sd := range stringdata {
|
||||
b.Run(sd.name, func(b *testing.B) {
|
||||
for j := 1 << 4; j <= 1<<20; j <<= 4 {
|
||||
b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
b.SetBytes(int64(j))
|
||||
data := sd.data[:j]
|
||||
for i := 0; i < b.N; i++ {
|
||||
Fields(data)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkFieldsFunc(b *testing.B) {
|
||||
b.SetBytes(int64(len(fieldsInput)))
|
||||
for i := 0; i < b.N; i++ {
|
||||
FieldsFunc(fieldsInput, unicode.IsSpace)
|
||||
for _, sd := range stringdata {
|
||||
b.Run(sd.name, func(b *testing.B) {
|
||||
for j := 1 << 4; j <= 1<<20; j <<= 4 {
|
||||
b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
b.SetBytes(int64(j))
|
||||
data := sd.data[:j]
|
||||
for i := 0; i < b.N; i++ {
|
||||
FieldsFunc(data, unicode.IsSpace)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user