unicode/utf8: split EncodeRune into an ASCII fast path plus encodeRuneNonASCII.

EncodeRune keeps only the single-byte case and delegates everything else to
the new encodeRuneNonASCII helper. Invalid runes (negative, surrogate, or
above MaxRune) now get their own default case that writes the precomputed
runeErrorByte0..2 constants, replacing the old "r = RuneError; fallthrough"
in both encodeRuneNonASCII and appendRuneNonASCII. The test file gains
Encode/Append benchmarks for 2-byte, 4-byte and invalid runes.

diff --git a/src/unicode/utf8/utf8.go b/src/unicode/utf8/utf8.go
index c7389d4d6f5..1c1391d55b5 100644
--- a/src/unicode/utf8/utf8.go
+++ b/src/unicode/utf8/utf8.go
@@ -61,6 +61,12 @@ const (
 	s7 = 0x44 // accept 4, size 4
 )
 
+const (
+	runeErrorByte0 = t3 | (RuneError >> 12)
+	runeErrorByte1 = tx | (RuneError>>6)&maskx
+	runeErrorByte2 = tx | RuneError&maskx
+)
+
 // first is information about the first byte in a UTF-8 sequence.
 var first = [256]uint8{
 	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
@@ -340,32 +346,41 @@ func RuneLen(r rune) int {
 // If the rune is out of range, it writes the encoding of [RuneError].
 // It returns the number of bytes written.
 func EncodeRune(p []byte, r rune) int {
-	// Negative values are erroneous. Making it unsigned addresses the problem.
-	switch i := uint32(r); {
-	case i <= rune1Max:
+	// This function is inlineable for fast handling of ASCII.
+	if uint32(r) <= rune1Max {
 		p[0] = byte(r)
 		return 1
+	}
+	return encodeRuneNonASCII(p, r)
+}
+
+func encodeRuneNonASCII(p []byte, r rune) int {
+	// Negative values are erroneous. Making it unsigned addresses the problem.
+	switch i := uint32(r); {
 	case i <= rune2Max:
 		_ = p[1] // eliminate bounds checks
 		p[0] = t2 | byte(r>>6)
 		p[1] = tx | byte(r)&maskx
 		return 2
-	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
-		r = RuneError
-		fallthrough
-	case i <= rune3Max:
+	case i < surrogateMin, surrogateMax < i && i <= rune3Max:
 		_ = p[2] // eliminate bounds checks
 		p[0] = t3 | byte(r>>12)
 		p[1] = tx | byte(r>>6)&maskx
 		p[2] = tx | byte(r)&maskx
 		return 3
-	default:
+	case i > rune3Max && i <= MaxRune:
 		_ = p[3] // eliminate bounds checks
 		p[0] = t4 | byte(r>>18)
 		p[1] = tx | byte(r>>12)&maskx
 		p[2] = tx | byte(r>>6)&maskx
 		p[3] = tx | byte(r)&maskx
 		return 4
+	default:
+		_ = p[2] // eliminate bounds checks
+		p[0] = runeErrorByte0
+		p[1] = runeErrorByte1
+		p[2] = runeErrorByte2
+		return 3
 	}
 }
 
@@ -385,13 +400,12 @@ func appendRuneNonASCII(p []byte, r rune) []byte {
 	switch i := uint32(r); {
 	case i <= rune2Max:
 		return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
-	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
-		r = RuneError
-		fallthrough
-	case i <= rune3Max:
+	case i < surrogateMin, surrogateMax < i && i <= rune3Max:
 		return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
-	default:
+	case i > rune3Max && i <= MaxRune:
 		return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
+	default:
+		return append(p, runeErrorByte0, runeErrorByte1, runeErrorByte2)
 	}
 }
 
diff --git a/src/unicode/utf8/utf8_test.go b/src/unicode/utf8/utf8_test.go
index 19a04dc92e7..fa23419b364 100644
--- a/src/unicode/utf8/utf8_test.go
+++ b/src/unicode/utf8/utf8_test.go
@@ -641,28 +641,98 @@ func init() {
 func BenchmarkEncodeASCIIRune(b *testing.B) {
 	buf := make([]byte, UTFMax)
 	for i := 0; i < b.N; i++ {
-		EncodeRune(buf, 'a')
+		EncodeRune(buf, 'a') // 1 byte
+	}
+}
+
+func BenchmarkEncodeSpanishRune(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		EncodeRune(buf, 'Ñ') // 2 bytes
 	}
 }
 
 func BenchmarkEncodeJapaneseRune(b *testing.B) {
 	buf := make([]byte, UTFMax)
 	for i := 0; i < b.N; i++ {
-		EncodeRune(buf, '本')
+		EncodeRune(buf, '本') // 3 bytes
+	}
+}
+
+func BenchmarkEncodeMaxRune(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		EncodeRune(buf, MaxRune) // 4 bytes
+	}
+}
+
+func BenchmarkEncodeInvalidRuneMaxPlusOne(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		EncodeRune(buf, MaxRune+1) // 3 bytes: RuneError
+	}
+}
+
+func BenchmarkEncodeInvalidRuneSurrogate(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		EncodeRune(buf, 0xD800) // 3 bytes: RuneError
+	}
+}
+
+func BenchmarkEncodeInvalidRuneNegative(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		EncodeRune(buf, -1) // 3 bytes: RuneError
 	}
 }
 
 func BenchmarkAppendASCIIRune(b *testing.B) {
 	buf := make([]byte, UTFMax)
 	for i := 0; i < b.N; i++ {
-		AppendRune(buf[:0], 'a')
+		AppendRune(buf[:0], 'a') // 1 byte
+	}
+}
+
+func BenchmarkAppendSpanishRune(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		AppendRune(buf[:0], 'Ñ') // 2 bytes
 	}
 }
 
 func BenchmarkAppendJapaneseRune(b *testing.B) {
 	buf := make([]byte, UTFMax)
 	for i := 0; i < b.N; i++ {
-		AppendRune(buf[:0], '本')
+		AppendRune(buf[:0], '本') // 3 bytes
+	}
+}
+
+func BenchmarkAppendMaxRune(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		AppendRune(buf[:0], MaxRune) // 4 bytes
+	}
+}
+
+func BenchmarkAppendInvalidRuneMaxPlusOne(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		AppendRune(buf[:0], MaxRune+1) // 3 bytes: RuneError
+	}
+}
+
+func BenchmarkAppendInvalidRuneSurrogate(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		AppendRune(buf[:0], 0xD800) // 3 bytes: RuneError
+	}
+}
+
+func BenchmarkAppendInvalidRuneNegative(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		AppendRune(buf[:0], -1) // 3 bytes: RuneError
 	}
 }
 