utf8: Add new type String to automate string indexing by code point.

R=rsc, rog CC=golang-dev https://golang.org/cl/2275041
2024-11-12 07:30:25 -07:00 · 2010-09-25 06:58:34 +10:00 · 2010-09-25 06:58:34 +10:00 · 6f32c82953
commit 6f32c82953
parent d47266558d
4 changed files with 264 additions and 11 deletions
--- a/src/pkg/utf8/Makefile
+++ b/src/pkg/utf8/Makefile
@ -6,6 +6,7 @@ include ../../Make.inc
 TARG=utf8
 GOFILES=\
 	string.go\
 	utf8.go\
 include ../../Make.pkg
--- a/src/pkg/utf8/string.go
+++ b/src/pkg/utf8/string.go
@ -0,0 +1,166 @@
 // Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package utf8
 // String wraps a regular string with a small structure that provides more
 // efficient indexing by code point index, as opposed to byte index.
 // Scanning incrementally forwards or backwards is O(1) per index operation
 // (although not as fast a range clause going forwards).  Random access is
 // O(N) in the length of the string, but the overhead is less than always
 // scanning from the beginning.
 // If the string is ASCII, random access is O(1).
 type String struct {
 	str      string
 	numRunes int
 	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
 	width    int
 	bytePos  int
 	runePos  int
 	nonASCII int // byte index of the first non-ASCII rune.
 }
 // NewString returns a new UTF-8 string with the provided contents.
 func NewString(contents string) *String {
 	for i := 0; i < len(contents); i++ {
 		if contents[i] >= RuneSelf {
 			// Not ASCII.
 			_, wid := DecodeRuneInString(contents)
 			return &String{
 				str:      contents,
 				numRunes: RuneCountInString(contents),
 				width:    wid,
 				nonASCII: i,
 			}
 		}
 	}
 	// ASCII is simple.  Also, the empty string is ASCII.
 	return &String{str: contents, numRunes: len(contents), nonASCII: len(contents)}
 }
 // String returns the contents of the String.  This method also means the
 // String is directly printable by fmt.Print.
 func (s *String) String() string {
 	return s.str
 }
 // RuneCount returns the number of runes (Unicode code points) in the String.
 func (s *String) RuneCount() int {
 	return s.numRunes
 }
 // IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
 func (s *String) IsASCII() bool {
 	return s.width == 0
 }
 // At returns the rune with index i in the String.  The sequence of runes is the same
 // as iterating over the contents with a "for range" clause.
 func (s *String) At(i int) int {
 	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
 	if i < s.nonASCII {
 		return int(s.str[i])
 	}
 	// Now we do need to know the index is valid.
 	if i < 0 || i >= s.numRunes {
 		panic(outOfRange)
 	}
 	var rune int
 	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
 	// With these cases, all scans from beginning or end work in O(1) time per rune.
 	switch {
 	case i == s.runePos-1: // backing up one rune
 		rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos])
 		s.runePos = i
 		s.bytePos -= s.width
 		return rune
 	case i == s.runePos+1: // moving ahead one rune
 		s.runePos = i
 		s.bytePos += s.width
 		fallthrough
 	case i == s.runePos:
 		rune, s.width = DecodeRuneInString(s.str[s.bytePos:])
 		return rune
 	case i == 0: // start of string
 		rune, s.width = DecodeRuneInString(s.str)
 		s.runePos = 0
 		s.bytePos = 0
 		return rune
 	case i == s.numRunes-1: // last rune in string
 		rune, s.width = DecodeLastRuneInString(s.str)
 		s.runePos = i
 		s.bytePos = len(s.str) - s.width
 		return rune
 	}
 	// We need to do a linear scan.  There are three places to start from:
 	// 1) The beginning
 	// 2) bytePos/runePos.
 	// 3) The end
 	// Choose the closest in rune count, scanning backwards if necessary.
 	forward := true
 	if i < s.runePos {
 		// Between beginning and pos.  Which is closer?
 		// Since both i and runePos are guaranteed >= nonASCII, that's the
 		// lowest location we need to start from.
 		if i < (s.runePos-s.nonASCII)/2 {
 			// Scan forward from beginning
 			s.bytePos, s.runePos = s.nonASCII, s.nonASCII
 		} else {
 			// Scan backwards from where we are
 			forward = false
 		}
 	} else {
 		// Between pos and end.  Which is closer?
 		if i-s.runePos < (s.numRunes-s.runePos)/2 {
 			// Scan forward from pos
 		} else {
 			// Scan backwards from end
 			s.bytePos, s.runePos = len(s.str), s.numRunes
 			forward = false
 		}
 	}
 	if forward {
 		// TODO: Is it much faster to use a range loop for this scan?
 		for {
 			rune, s.width = DecodeRuneInString(s.str[s.bytePos:])
 			if s.runePos == i {
 				break
 			}
 			s.runePos++
 			s.bytePos += s.width
 		}
 	} else {
 		for {
 			rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos])
 			s.runePos--
 			s.bytePos -= s.width
 			if s.runePos == i {
 				break
 			}
 		}
 	}
 	return rune
 }
 // We want the panic in At(i) to satisfy os.Error, because that's what
 // runtime panics satisfy, but we can't import os.  This is our solution.
 // error is the type of the error returned if a user calls String.At(i) with i out of range.
 // It satisfies os.Error and runtime.Error.
 type error string
 func (err error) String() string {
 	return string(err)
 }
 func (err error) RunTimeError() {
 }
 var outOfRange = error("utf8.String: index out of Range")
--- a/src/pkg/utf8/string_test.go
+++ b/src/pkg/utf8/string_test.go
@ -0,0 +1,70 @@
 // Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package utf8_test
 import (
 	"rand"
 	"testing"
 	. "utf8"
 )
 func TestScanForwards(t *testing.T) {
 	for _, s := range testStrings {
 		runes := []int(s)
 		str := NewString(s)
 		if str.RuneCount() != len(runes) {
 			t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
 			break
 		}
 		for i, expect := range runes {
 			got := str.At(i)
 			if got != expect {
 				t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
 			}
 		}
 	}
 }
 func TestScanBackwards(t *testing.T) {
 	for _, s := range testStrings {
 		runes := []int(s)
 		str := NewString(s)
 		if str.RuneCount() != len(runes) {
 			t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
 			break
 		}
 		for i := len(runes) - 1; i >= 0; i-- {
 			expect := runes[i]
 			got := str.At(i)
 			if got != expect {
 				t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
 			}
 		}
 	}
 }
 const randCount = 100000
 func TestRandomAccess(t *testing.T) {
 	for _, s := range testStrings {
 		if len(s) == 0 {
 			continue
 		}
 		runes := []int(s)
 		str := NewString(s)
 		if str.RuneCount() != len(runes) {
 			t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
 			break
 		}
 		for j := 0; j < randCount; j++ {
 			i := rand.Intn(len(runes))
 			expect := runes[i]
 			got := str.At(i)
 			if got != expect {
 				t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
 			}
 		}
 	}
 }
--- a/src/pkg/utf8/utf8_test.go
+++ b/src/pkg/utf8/utf8_test.go
@ -47,20 +47,16 @@ var utf8map = []Utf8Map{
 var testStrings = []string{
 	"",
 	"abcd",
 	"☺☻☹",
 	"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
 	"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
 	"\x80\x80\x80\x80",
 }
 // strings.Bytes with one extra byte at end
 func makeBytes(s string) []byte {
 	s += "\x00"
 	b := []byte(s)
 	return b[0 : len(s)-1]
 }
 func TestFullRune(t *testing.T) {
 	for i := 0; i < len(utf8map); i++ {
 		m := utf8map[i]
-		b := makeBytes(m.str)
+		b := []byte(m.str)
 		if !FullRune(b) {
 			t.Errorf("FullRune(%q) (rune %04x) = false, want true", b, m.rune)
 		}
@ -82,7 +78,7 @@ func TestFullRune(t *testing.T) {
 func TestEncodeRune(t *testing.T) {
 	for i := 0; i < len(utf8map); i++ {
 		m := utf8map[i]
-		b := makeBytes(m.str)
+		b := []byte(m.str)
 		var buf [10]byte
 		n := EncodeRune(m.rune, buf[0:])
 		b1 := buf[0:n]
@ -95,7 +91,7 @@ func TestEncodeRune(t *testing.T) {
 func TestDecodeRune(t *testing.T) {
 	for i := 0; i < len(utf8map); i++ {
 		m := utf8map[i]
-		b := makeBytes(m.str)
+		b := []byte(m.str)
 		rune, size := DecodeRune(b)
 		if rune != m.rune || size != len(b) {
 			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, rune, size, m.rune, len(b))
@ -163,6 +159,26 @@ func TestSequencing(t *testing.T) {
 	}
 }
 // Check that a range loop and a []int conversion visit the same runes.
 // Not really a test of this package, but the assumption is used here and
 // it's good to verify
 func TestIntConversion(t *testing.T) {
 	for _, ts := range testStrings {
 		runes := []int(ts)
 		if RuneCountInString(ts) != len(runes) {
 			t.Error("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts))
 			break
 		}
 		i := 0
 		for _, r := range ts {
 			if r != runes[i] {
 				t.Errorf("%q[%d]: expected %c (U+%04x); got %c (U+%04x)", ts, i, runes[i], runes[i], r, r)
 			}
 			i++
 		}
 	}
 }
 func testSequence(t *testing.T, s string) {
 	type info struct {
 		index int
@ -252,7 +268,7 @@ func TestRuneCount(t *testing.T) {
 		if out := RuneCountInString(tt.in); out != tt.out {
 			t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
 		}
-		if out := RuneCount(makeBytes(tt.in)); out != tt.out {
+		if out := RuneCount([]byte(tt.in)); out != tt.out {
 			t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
 		}
 	}