mirror of
https://github.com/golang/go
synced 2024-11-12 06:30:21 -07:00
unicode: improve SimpleFold performance for ascii
This change significantly speeds up case-insensitive regexp matching. benchmark old ns/op new ns/op delta BenchmarkMatchEasy0i_32-8 2690 1473 -45.24% BenchmarkMatchEasy0i_1K-8 80404 42269 -47.43% BenchmarkMatchEasy0i_32K-8 3272187 2076118 -36.55% BenchmarkMatchEasy0i_1M-8 104805990 66503805 -36.55% BenchmarkMatchEasy0i_32M-8 3360192200 2126121600 -36.73% benchmark old MB/s new MB/s speedup BenchmarkMatchEasy0i_32-8 11.90 21.72 1.83x BenchmarkMatchEasy0i_1K-8 12.74 24.23 1.90x BenchmarkMatchEasy0i_32K-8 10.01 15.78 1.58x BenchmarkMatchEasy0i_1M-8 10.00 15.77 1.58x BenchmarkMatchEasy0i_32M-8 9.99 15.78 1.58x Issue #13288 Change-Id: I94af7bb29e75d60b4f6ee760124867ab271b9642 Reviewed-on: https://go-review.googlesource.com/16943 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
6e4a8615f6
commit
e607abbfd6
@ -672,6 +672,7 @@ func benchmark(b *testing.B, re string, n int) {
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
|
easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
|
||||||
|
easy0i = "(?i)ABCDEFGHIJklmnopqrstuvwxyz$"
|
||||||
easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
|
easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
|
||||||
medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
|
medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
|
||||||
hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
|
hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
|
||||||
@ -682,6 +683,11 @@ func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) }
|
|||||||
func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) }
|
func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) }
|
||||||
func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) }
|
func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) }
|
||||||
func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) }
|
func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) }
|
||||||
|
func BenchmarkMatchEasy0i_32(b *testing.B) { benchmark(b, easy0i, 32<<0) }
|
||||||
|
func BenchmarkMatchEasy0i_1K(b *testing.B) { benchmark(b, easy0i, 1<<10) }
|
||||||
|
func BenchmarkMatchEasy0i_32K(b *testing.B) { benchmark(b, easy0i, 32<<10) }
|
||||||
|
func BenchmarkMatchEasy0i_1M(b *testing.B) { benchmark(b, easy0i, 1<<20) }
|
||||||
|
func BenchmarkMatchEasy0i_32M(b *testing.B) { benchmark(b, easy0i, 32<<20) }
|
||||||
func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) }
|
func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) }
|
||||||
func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) }
|
func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) }
|
||||||
func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) }
|
func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) }
|
||||||
|
@ -332,6 +332,10 @@ type foldPair struct {
|
|||||||
// SimpleFold('1') = '1'
|
// SimpleFold('1') = '1'
|
||||||
//
|
//
|
||||||
func SimpleFold(r rune) rune {
|
func SimpleFold(r rune) rune {
|
||||||
|
if int(r) < len(asciiFold) {
|
||||||
|
return rune(asciiFold[r])
|
||||||
|
}
|
||||||
|
|
||||||
// Consult caseOrbit table for special cases.
|
// Consult caseOrbit table for special cases.
|
||||||
lo := 0
|
lo := 0
|
||||||
hi := len(caseOrbit)
|
hi := len(caseOrbit)
|
||||||
|
@ -1172,6 +1172,7 @@ func printCasefold() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
printAsciiFold()
|
||||||
printCaseOrbit()
|
printCaseOrbit()
|
||||||
|
|
||||||
// Tables of category and script folding exceptions: code points
|
// Tables of category and script folding exceptions: code points
|
||||||
@ -1269,6 +1270,25 @@ var comment = map[string]string{
|
|||||||
"// If there is no entry for a script name, there are no such points.\n",
|
"// If there is no entry for a script name, there are no such points.\n",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func printAsciiFold() {
|
||||||
|
printf("var asciiFold = [MaxASCII + 1]uint16{\n")
|
||||||
|
for i := rune(0); i <= unicode.MaxASCII; i++ {
|
||||||
|
c := chars[i]
|
||||||
|
f := c.caseOrbit
|
||||||
|
if f == 0 {
|
||||||
|
if c.lowerCase != i && c.lowerCase != 0 {
|
||||||
|
f = c.lowerCase
|
||||||
|
} else if c.upperCase != i && c.upperCase != 0 {
|
||||||
|
f = c.upperCase
|
||||||
|
} else {
|
||||||
|
f = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("\t0x%04X,\n", f)
|
||||||
|
}
|
||||||
|
printf("}\n\n")
|
||||||
|
}
|
||||||
|
|
||||||
func printCaseOrbit() {
|
func printCaseOrbit() {
|
||||||
if *test {
|
if *test {
|
||||||
for j := range chars {
|
for j := range chars {
|
||||||
|
@ -6834,6 +6834,137 @@ var properties = [MaxLatin1 + 1]uint8{
|
|||||||
0xFF: pLl | pp, // 'ÿ'
|
0xFF: pLl | pp, // 'ÿ'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var asciiFold = [MaxASCII + 1]uint16{
|
||||||
|
0x0000,
|
||||||
|
0x0001,
|
||||||
|
0x0002,
|
||||||
|
0x0003,
|
||||||
|
0x0004,
|
||||||
|
0x0005,
|
||||||
|
0x0006,
|
||||||
|
0x0007,
|
||||||
|
0x0008,
|
||||||
|
0x0009,
|
||||||
|
0x000A,
|
||||||
|
0x000B,
|
||||||
|
0x000C,
|
||||||
|
0x000D,
|
||||||
|
0x000E,
|
||||||
|
0x000F,
|
||||||
|
0x0010,
|
||||||
|
0x0011,
|
||||||
|
0x0012,
|
||||||
|
0x0013,
|
||||||
|
0x0014,
|
||||||
|
0x0015,
|
||||||
|
0x0016,
|
||||||
|
0x0017,
|
||||||
|
0x0018,
|
||||||
|
0x0019,
|
||||||
|
0x001A,
|
||||||
|
0x001B,
|
||||||
|
0x001C,
|
||||||
|
0x001D,
|
||||||
|
0x001E,
|
||||||
|
0x001F,
|
||||||
|
0x0020,
|
||||||
|
0x0021,
|
||||||
|
0x0022,
|
||||||
|
0x0023,
|
||||||
|
0x0024,
|
||||||
|
0x0025,
|
||||||
|
0x0026,
|
||||||
|
0x0027,
|
||||||
|
0x0028,
|
||||||
|
0x0029,
|
||||||
|
0x002A,
|
||||||
|
0x002B,
|
||||||
|
0x002C,
|
||||||
|
0x002D,
|
||||||
|
0x002E,
|
||||||
|
0x002F,
|
||||||
|
0x0030,
|
||||||
|
0x0031,
|
||||||
|
0x0032,
|
||||||
|
0x0033,
|
||||||
|
0x0034,
|
||||||
|
0x0035,
|
||||||
|
0x0036,
|
||||||
|
0x0037,
|
||||||
|
0x0038,
|
||||||
|
0x0039,
|
||||||
|
0x003A,
|
||||||
|
0x003B,
|
||||||
|
0x003C,
|
||||||
|
0x003D,
|
||||||
|
0x003E,
|
||||||
|
0x003F,
|
||||||
|
0x0040,
|
||||||
|
0x0061,
|
||||||
|
0x0062,
|
||||||
|
0x0063,
|
||||||
|
0x0064,
|
||||||
|
0x0065,
|
||||||
|
0x0066,
|
||||||
|
0x0067,
|
||||||
|
0x0068,
|
||||||
|
0x0069,
|
||||||
|
0x006A,
|
||||||
|
0x006B,
|
||||||
|
0x006C,
|
||||||
|
0x006D,
|
||||||
|
0x006E,
|
||||||
|
0x006F,
|
||||||
|
0x0070,
|
||||||
|
0x0071,
|
||||||
|
0x0072,
|
||||||
|
0x0073,
|
||||||
|
0x0074,
|
||||||
|
0x0075,
|
||||||
|
0x0076,
|
||||||
|
0x0077,
|
||||||
|
0x0078,
|
||||||
|
0x0079,
|
||||||
|
0x007A,
|
||||||
|
0x005B,
|
||||||
|
0x005C,
|
||||||
|
0x005D,
|
||||||
|
0x005E,
|
||||||
|
0x005F,
|
||||||
|
0x0060,
|
||||||
|
0x0041,
|
||||||
|
0x0042,
|
||||||
|
0x0043,
|
||||||
|
0x0044,
|
||||||
|
0x0045,
|
||||||
|
0x0046,
|
||||||
|
0x0047,
|
||||||
|
0x0048,
|
||||||
|
0x0049,
|
||||||
|
0x004A,
|
||||||
|
0x212A,
|
||||||
|
0x004C,
|
||||||
|
0x004D,
|
||||||
|
0x004E,
|
||||||
|
0x004F,
|
||||||
|
0x0050,
|
||||||
|
0x0051,
|
||||||
|
0x0052,
|
||||||
|
0x017F,
|
||||||
|
0x0054,
|
||||||
|
0x0055,
|
||||||
|
0x0056,
|
||||||
|
0x0057,
|
||||||
|
0x0058,
|
||||||
|
0x0059,
|
||||||
|
0x005A,
|
||||||
|
0x007B,
|
||||||
|
0x007C,
|
||||||
|
0x007D,
|
||||||
|
0x007E,
|
||||||
|
0x007F,
|
||||||
|
}
|
||||||
|
|
||||||
var caseOrbit = []foldPair{
|
var caseOrbit = []foldPair{
|
||||||
{0x004B, 0x006B},
|
{0x004B, 0x006B},
|
||||||
{0x0053, 0x0073},
|
{0x0053, 0x0073},
|
||||||
|
Loading…
Reference in New Issue
Block a user