mirror of
https://github.com/golang/go
synced 2024-11-12 07:30:25 -07:00
utf8: add DecodeLastRune and DecodeLastRuneInString to enable traversing rune-by-rune backwards in strings

R=r, rsc
CC=golang-dev
https://golang.org/cl/2192050
This commit is contained in:
parent
1959c3ac5b
commit
f11271b82e
@ -209,6 +209,73 @@ func DecodeRuneInString(s string) (rune, size int) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DecodeLastRune unpacks the last UTF-8 encoding in p
|
||||||
|
// and returns the rune and its width in bytes.
|
||||||
|
func DecodeLastRune(p []byte) (rune, size int) {
|
||||||
|
end := len(p)
|
||||||
|
if end == 0 {
|
||||||
|
return RuneError, 0
|
||||||
|
}
|
||||||
|
start := end - 1
|
||||||
|
rune = int(p[start])
|
||||||
|
if rune < RuneSelf {
|
||||||
|
return rune, 1
|
||||||
|
}
|
||||||
|
// guard against O(n^2) behavior when traversing
|
||||||
|
// backwards through strings with long sequences of
|
||||||
|
// invalid UTF-8.
|
||||||
|
lim := end - UTFMax
|
||||||
|
if lim < 0 {
|
||||||
|
lim = 0
|
||||||
|
}
|
||||||
|
for start--; start >= lim; start-- {
|
||||||
|
if RuneStart(p[start]) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if start < 0 {
|
||||||
|
start = 0
|
||||||
|
}
|
||||||
|
rune, size = DecodeRune(p[start:end])
|
||||||
|
if start+size != end {
|
||||||
|
return RuneError, 1
|
||||||
|
}
|
||||||
|
return rune, size
|
||||||
|
}
|
||||||
|
|
||||||
|
// DecodeLastRuneInString is like DecodeLastRune but its input is a string.
|
||||||
|
func DecodeLastRuneInString(s string) (rune, size int) {
|
||||||
|
end := len(s)
|
||||||
|
if end == 0 {
|
||||||
|
return RuneError, 0
|
||||||
|
}
|
||||||
|
start := end - 1
|
||||||
|
rune = int(s[start])
|
||||||
|
if rune < RuneSelf {
|
||||||
|
return rune, 1
|
||||||
|
}
|
||||||
|
// guard against O(n^2) behavior when traversing
|
||||||
|
// backwards through strings with long sequences of
|
||||||
|
// invalid UTF-8.
|
||||||
|
lim := end - UTFMax
|
||||||
|
if lim < 0 {
|
||||||
|
lim = 0
|
||||||
|
}
|
||||||
|
for start--; start >= lim; start-- {
|
||||||
|
if RuneStart(s[start]) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if start < 0 {
|
||||||
|
start = 0
|
||||||
|
}
|
||||||
|
rune, size = DecodeRuneInString(s[start:end])
|
||||||
|
if start+size != end {
|
||||||
|
return RuneError, 1
|
||||||
|
}
|
||||||
|
return rune, size
|
||||||
|
}
|
||||||
|
|
||||||
// RuneLen returns the number of bytes required to encode the rune.
|
// RuneLen returns the number of bytes required to encode the rune.
|
||||||
func RuneLen(rune int) int {
|
func RuneLen(rune int) int {
|
||||||
switch {
|
switch {
|
||||||
|
@ -44,6 +44,12 @@ var utf8map = []Utf8Map{
|
|||||||
Utf8Map{0xFFFD, "\xef\xbf\xbd"},
|
Utf8Map{0xFFFD, "\xef\xbf\xbd"},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// testStrings lists the strings that TestSequencing places before,
// after, and around each utf8map entry.
var testStrings = []string{
	"",                 // empty string
	"abcd",             // plain ASCII
	"\x80\x80\x80\x80", // four 0x80 bytes, none of which can begin a UTF-8 sequence
}
|
||||||
|
|
||||||
// strings.Bytes with one extra byte at end
|
// strings.Bytes with one extra byte at end
|
||||||
func makeBytes(s string) []byte {
|
func makeBytes(s string) []byte {
|
||||||
s += "\x00"
|
s += "\x00"
|
||||||
@ -141,6 +147,79 @@ func TestDecodeRune(t *testing.T) {
|
|||||||
if rune != RuneError || size != 1 {
|
if rune != RuneError || size != 1 {
|
||||||
t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, rune, size, RuneError, 1)
|
t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, rune, size, RuneError, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that DecodeRune and DecodeLastRune correspond to
|
||||||
|
// the equivalent range loop.
|
||||||
|
func TestSequencing(t *testing.T) {
|
||||||
|
for _, ts := range testStrings {
|
||||||
|
for _, m := range utf8map {
|
||||||
|
for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
|
||||||
|
testSequence(t, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func testSequence(t *testing.T, s string) {
|
||||||
|
type info struct {
|
||||||
|
index int
|
||||||
|
rune int
|
||||||
|
}
|
||||||
|
index := make([]info, len(s))
|
||||||
|
b := []byte(s)
|
||||||
|
si := 0
|
||||||
|
j := 0
|
||||||
|
for i, r := range s {
|
||||||
|
if si != i {
|
||||||
|
t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
index[j] = info{i, r}
|
||||||
|
j++
|
||||||
|
rune1, size1 := DecodeRune(b[i:])
|
||||||
|
if r != rune1 {
|
||||||
|
t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], rune1, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
rune2, size2 := DecodeRuneInString(s[i:])
|
||||||
|
if r != rune2 {
|
||||||
|
t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], rune2, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if size1 != size2 {
|
||||||
|
t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
si += size1
|
||||||
|
}
|
||||||
|
j--
|
||||||
|
for si = len(s); si > 0; {
|
||||||
|
rune1, size1 := DecodeLastRune(b[0:si])
|
||||||
|
rune2, size2 := DecodeLastRuneInString(s[0:si])
|
||||||
|
if size1 != size2 {
|
||||||
|
t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if rune1 != index[j].rune {
|
||||||
|
t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, rune1, index[j].rune)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if rune2 != index[j].rune {
|
||||||
|
t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, rune2, index[j].rune)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
si -= size1
|
||||||
|
if si != index[j].index {
|
||||||
|
t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
j--
|
||||||
|
}
|
||||||
|
if si != 0 {
|
||||||
|
t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user