mirror of
https://github.com/golang/go
synced 2024-11-12 07:30:25 -07:00
utf8: Add new type String to automate string indexing by code point.
R=rsc, rog CC=golang-dev https://golang.org/cl/2275041
This commit is contained in:
parent
d47266558d
commit
6f32c82953
@ -6,6 +6,7 @@ include ../../Make.inc
|
|||||||
|
|
||||||
TARG=utf8
|
TARG=utf8
|
||||||
GOFILES=\
|
GOFILES=\
|
||||||
|
string.go\
|
||||||
utf8.go\
|
utf8.go\
|
||||||
|
|
||||||
include ../../Make.pkg
|
include ../../Make.pkg
|
||||||
|
166
src/pkg/utf8/string.go
Normal file
166
src/pkg/utf8/string.go
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
// Copyright 2009 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package utf8
|
||||||
|
|
||||||
|
// String wraps a regular string with a small structure that provides more
|
||||||
|
// efficient indexing by code point index, as opposed to byte index.
|
||||||
|
// Scanning incrementally forwards or backwards is O(1) per index operation
|
||||||
|
// (although not as fast as a range clause going forwards). Random access is
|
||||||
|
// O(N) in the length of the string, but the overhead is less than always
|
||||||
|
// scanning from the beginning.
|
||||||
|
// If the string is ASCII, random access is O(1).
|
||||||
|
type String struct {
|
||||||
|
str string // the wrapped contents, returned unchanged by the String method
|
||||||
|
numRunes int // number of runes in str, as reported by RuneCount
|
||||||
|
// If width > 0, the rune at runePos starts at bytePos and has the specified width.
|
||||||
|
// NewString leaves width at zero for all-ASCII contents, which is what IsASCII tests.
|
||||||
|
width int
|
||||||
|
|
||||||
|
bytePos int // byte offset in str of the rune most recently located by At
|
||||||
|
||||||
|
runePos int // rune index corresponding to bytePos
|
||||||
|
||||||
|
nonASCII int // byte index of the first non-ASCII rune; len(str) if there is none.
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewString returns a new UTF-8 string with the provided contents.
|
||||||
|
func NewString(contents string) *String {
|
||||||
|
for i := 0; i < len(contents); i++ {
|
||||||
|
if contents[i] >= RuneSelf {
|
||||||
|
// Not ASCII.
|
||||||
|
_, wid := DecodeRuneInString(contents)
|
||||||
|
return &String{
|
||||||
|
str: contents,
|
||||||
|
numRunes: RuneCountInString(contents),
|
||||||
|
width: wid,
|
||||||
|
nonASCII: i,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// ASCII is simple. Also, the empty string is ASCII.
|
||||||
|
return &String{str: contents, numRunes: len(contents), nonASCII: len(contents)}
|
||||||
|
}
|
||||||
|
|
||||||
|
// String returns the contents of the String. This method also means the
|
||||||
|
// String is directly printable by fmt.Print.
|
||||||
|
func (s *String) String() string {
|
||||||
|
return s.str
|
||||||
|
}
|
||||||
|
|
||||||
|
// RuneCount returns the number of runes (Unicode code points) in the String.
|
||||||
|
func (s *String) RuneCount() int {
|
||||||
|
return s.numRunes
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
|
||||||
|
func (s *String) IsASCII() bool {
|
||||||
|
return s.width == 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// At returns the rune with index i in the String. The sequence of runes is the same
|
||||||
|
// as iterating over the contents with a "for range" clause.
|
||||||
|
func (s *String) At(i int) int {
|
||||||
|
// ASCII is easy. Let the compiler catch the indexing error if there is one.
|
||||||
|
if i < s.nonASCII {
|
||||||
|
return int(s.str[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now we do need to know the index is valid.
|
||||||
|
if i < 0 || i >= s.numRunes {
|
||||||
|
panic(outOfRange)
|
||||||
|
}
|
||||||
|
|
||||||
|
var rune int
|
||||||
|
|
||||||
|
// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
|
||||||
|
// With these cases, all scans from beginning or end work in O(1) time per rune.
|
||||||
|
switch {
|
||||||
|
|
||||||
|
case i == s.runePos-1: // backing up one rune
|
||||||
|
rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos])
|
||||||
|
s.runePos = i
|
||||||
|
s.bytePos -= s.width
|
||||||
|
return rune
|
||||||
|
case i == s.runePos+1: // moving ahead one rune
|
||||||
|
s.runePos = i
|
||||||
|
s.bytePos += s.width
|
||||||
|
fallthrough
|
||||||
|
case i == s.runePos:
|
||||||
|
rune, s.width = DecodeRuneInString(s.str[s.bytePos:])
|
||||||
|
return rune
|
||||||
|
case i == 0: // start of string
|
||||||
|
rune, s.width = DecodeRuneInString(s.str)
|
||||||
|
s.runePos = 0
|
||||||
|
s.bytePos = 0
|
||||||
|
return rune
|
||||||
|
|
||||||
|
case i == s.numRunes-1: // last rune in string
|
||||||
|
rune, s.width = DecodeLastRuneInString(s.str)
|
||||||
|
s.runePos = i
|
||||||
|
s.bytePos = len(s.str) - s.width
|
||||||
|
return rune
|
||||||
|
}
|
||||||
|
|
||||||
|
// We need to do a linear scan. There are three places to start from:
|
||||||
|
// 1) The beginning
|
||||||
|
// 2) bytePos/runePos.
|
||||||
|
// 3) The end
|
||||||
|
// Choose the closest in rune count, scanning backwards if necessary.
|
||||||
|
forward := true
|
||||||
|
if i < s.runePos {
|
||||||
|
// Between beginning and pos. Which is closer?
|
||||||
|
// Since both i and runePos are guaranteed >= nonASCII, that's the
|
||||||
|
// lowest location we need to start from.
|
||||||
|
if i < (s.runePos-s.nonASCII)/2 {
|
||||||
|
// Scan forward from beginning
|
||||||
|
s.bytePos, s.runePos = s.nonASCII, s.nonASCII
|
||||||
|
} else {
|
||||||
|
// Scan backwards from where we are
|
||||||
|
forward = false
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Between pos and end. Which is closer?
|
||||||
|
if i-s.runePos < (s.numRunes-s.runePos)/2 {
|
||||||
|
// Scan forward from pos
|
||||||
|
} else {
|
||||||
|
// Scan backwards from end
|
||||||
|
s.bytePos, s.runePos = len(s.str), s.numRunes
|
||||||
|
forward = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if forward {
|
||||||
|
// TODO: Is it much faster to use a range loop for this scan?
|
||||||
|
for {
|
||||||
|
rune, s.width = DecodeRuneInString(s.str[s.bytePos:])
|
||||||
|
if s.runePos == i {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
s.runePos++
|
||||||
|
s.bytePos += s.width
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for {
|
||||||
|
rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos])
|
||||||
|
s.runePos--
|
||||||
|
s.bytePos -= s.width
|
||||||
|
if s.runePos == i {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rune
|
||||||
|
}
|
||||||
|
|
||||||
|
// We want the panic in At(i) to satisfy os.Error, because that's what
// runtime panics satisfy, but we can't import os. This is our solution.

// error is the type of the value String.At(i) panics with when i is out of range.
// It satisfies os.Error (via String) and runtime.Error (via String and RuntimeError).
type error string

// String returns the error text.
func (err error) String() string {
	return string(err)
}

// RuntimeError is the no-op marker method that runtime.Error requires.
func (err error) RuntimeError() {
}

// RunTimeError is the earlier, misspelled form of the marker method. It does
// not match runtime.Error and is kept only so existing references keep working.
func (err error) RunTimeError() {
}

var outOfRange = error("utf8.String: index out of Range")
|
70
src/pkg/utf8/string_test.go
Normal file
70
src/pkg/utf8/string_test.go
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
// Copyright 2009 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package utf8_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"rand"
|
||||||
|
"testing"
|
||||||
|
. "utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestScanForwards(t *testing.T) {
|
||||||
|
for _, s := range testStrings {
|
||||||
|
runes := []int(s)
|
||||||
|
str := NewString(s)
|
||||||
|
if str.RuneCount() != len(runes) {
|
||||||
|
t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
|
||||||
|
break
|
||||||
|
}
|
||||||
|
for i, expect := range runes {
|
||||||
|
got := str.At(i)
|
||||||
|
if got != expect {
|
||||||
|
t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestScanBackwards(t *testing.T) {
|
||||||
|
for _, s := range testStrings {
|
||||||
|
runes := []int(s)
|
||||||
|
str := NewString(s)
|
||||||
|
if str.RuneCount() != len(runes) {
|
||||||
|
t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
|
||||||
|
break
|
||||||
|
}
|
||||||
|
for i := len(runes) - 1; i >= 0; i-- {
|
||||||
|
expect := runes[i]
|
||||||
|
got := str.At(i)
|
||||||
|
if got != expect {
|
||||||
|
t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const randCount = 100000
|
||||||
|
|
||||||
|
func TestRandomAccess(t *testing.T) {
|
||||||
|
for _, s := range testStrings {
|
||||||
|
if len(s) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
runes := []int(s)
|
||||||
|
str := NewString(s)
|
||||||
|
if str.RuneCount() != len(runes) {
|
||||||
|
t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
|
||||||
|
break
|
||||||
|
}
|
||||||
|
for j := 0; j < randCount; j++ {
|
||||||
|
i := rand.Intn(len(runes))
|
||||||
|
expect := runes[i]
|
||||||
|
got := str.At(i)
|
||||||
|
if got != expect {
|
||||||
|
t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -47,20 +47,16 @@ var utf8map = []Utf8Map{
|
|||||||
var testStrings = []string{
|
var testStrings = []string{
|
||||||
"",
|
"",
|
||||||
"abcd",
|
"abcd",
|
||||||
|
"☺☻☹",
|
||||||
|
"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
|
||||||
|
"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
|
||||||
"\x80\x80\x80\x80",
|
"\x80\x80\x80\x80",
|
||||||
}
|
}
|
||||||
|
|
||||||
// strings.Bytes with one extra byte at end
|
|
||||||
func makeBytes(s string) []byte {
|
|
||||||
s += "\x00"
|
|
||||||
b := []byte(s)
|
|
||||||
return b[0 : len(s)-1]
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFullRune(t *testing.T) {
|
func TestFullRune(t *testing.T) {
|
||||||
for i := 0; i < len(utf8map); i++ {
|
for i := 0; i < len(utf8map); i++ {
|
||||||
m := utf8map[i]
|
m := utf8map[i]
|
||||||
b := makeBytes(m.str)
|
b := []byte(m.str)
|
||||||
if !FullRune(b) {
|
if !FullRune(b) {
|
||||||
t.Errorf("FullRune(%q) (rune %04x) = false, want true", b, m.rune)
|
t.Errorf("FullRune(%q) (rune %04x) = false, want true", b, m.rune)
|
||||||
}
|
}
|
||||||
@ -82,7 +78,7 @@ func TestFullRune(t *testing.T) {
|
|||||||
func TestEncodeRune(t *testing.T) {
|
func TestEncodeRune(t *testing.T) {
|
||||||
for i := 0; i < len(utf8map); i++ {
|
for i := 0; i < len(utf8map); i++ {
|
||||||
m := utf8map[i]
|
m := utf8map[i]
|
||||||
b := makeBytes(m.str)
|
b := []byte(m.str)
|
||||||
var buf [10]byte
|
var buf [10]byte
|
||||||
n := EncodeRune(m.rune, buf[0:])
|
n := EncodeRune(m.rune, buf[0:])
|
||||||
b1 := buf[0:n]
|
b1 := buf[0:n]
|
||||||
@ -95,7 +91,7 @@ func TestEncodeRune(t *testing.T) {
|
|||||||
func TestDecodeRune(t *testing.T) {
|
func TestDecodeRune(t *testing.T) {
|
||||||
for i := 0; i < len(utf8map); i++ {
|
for i := 0; i < len(utf8map); i++ {
|
||||||
m := utf8map[i]
|
m := utf8map[i]
|
||||||
b := makeBytes(m.str)
|
b := []byte(m.str)
|
||||||
rune, size := DecodeRune(b)
|
rune, size := DecodeRune(b)
|
||||||
if rune != m.rune || size != len(b) {
|
if rune != m.rune || size != len(b) {
|
||||||
t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, rune, size, m.rune, len(b))
|
t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, rune, size, m.rune, len(b))
|
||||||
@ -163,6 +159,26 @@ func TestSequencing(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check that a range loop and a []int conversion visit the same runes.
|
||||||
|
// Not really a test of this package, but the assumption is used here and
|
||||||
|
// it's good to verify
|
||||||
|
func TestIntConversion(t *testing.T) {
|
||||||
|
for _, ts := range testStrings {
|
||||||
|
runes := []int(ts)
|
||||||
|
if RuneCountInString(ts) != len(runes) {
|
||||||
|
t.Error("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts))
|
||||||
|
break
|
||||||
|
}
|
||||||
|
i := 0
|
||||||
|
for _, r := range ts {
|
||||||
|
if r != runes[i] {
|
||||||
|
t.Errorf("%q[%d]: expected %c (U+%04x); got %c (U+%04x)", ts, i, runes[i], runes[i], r, r)
|
||||||
|
}
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func testSequence(t *testing.T, s string) {
|
func testSequence(t *testing.T, s string) {
|
||||||
type info struct {
|
type info struct {
|
||||||
index int
|
index int
|
||||||
@ -252,7 +268,7 @@ func TestRuneCount(t *testing.T) {
|
|||||||
if out := RuneCountInString(tt.in); out != tt.out {
|
if out := RuneCountInString(tt.in); out != tt.out {
|
||||||
t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
|
t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
|
||||||
}
|
}
|
||||||
if out := RuneCount(makeBytes(tt.in)); out != tt.out {
|
if out := RuneCount([]byte(tt.in)); out != tt.out {
|
||||||
t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
|
t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user