utf8: Add new type String to automate string indexing by code point.

R=rsc, rog CC=golang-dev https://golang.org/cl/2275041
2024-11-25 02:07:58 -07:00 · 2010-09-25 06:58:34 +10:00 · 2010-09-25 06:58:34 +10:00 · 6f32c82953
commit 6f32c82953
parent d47266558d
4 changed files with 264 additions and 11 deletions
--- a/src/pkg/utf8/Makefile
+++ b/src/pkg/utf8/Makefile
@ -6,6 +6,7 @@ include ../../Make.inc

 TARG=utf8
 GOFILES=\
+	string.go\
 	utf8.go\

 include ../../Make.pkg
--- a/src/pkg/utf8/string.go
+++ b/src/pkg/utf8/string.go
@ -0,0 +1,166 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8
+
+// String wraps a regular string with a small structure that provides more
+// efficient indexing by code point index, as opposed to byte index.
+// Scanning incrementally forwards or backwards is O(1) per index operation
+// (although not as fast as a range clause going forwards).  Random access is
+// O(N) in the length of the string, but the overhead is less than always
+// scanning from the beginning.
+// If the string is ASCII, random access is O(1).
+type String struct {
+	str      string // the wrapped contents
+	numRunes int    // number of runes in str, fixed by NewString
+	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
+	width    int // byte width of the rune at the cached position; 0 for all-ASCII contents
+	bytePos  int // byte offset of the cached position
+	runePos  int // rune index of the cached position
+	nonASCII int // byte index of the first non-ASCII rune; len(str) if there is none.
+}
+
+// NewString returns a String wrapping contents, ready for indexing with At.
+func NewString(contents string) *String {
+	first := 0 // byte index of the first non-ASCII byte, if any
+	for first < len(contents) && contents[first] < RuneSelf {
+		first++
+	}
+	result := &String{str: contents, nonASCII: first}
+	if first == len(contents) {
+		// Pure ASCII (or empty): one byte per rune, and width stays zero.
+		result.numRunes = len(contents)
+		return result
+	}
+	// bytePos and runePos start at zero, so width describes the first rune.
+	_, result.width = DecodeRuneInString(contents)
+	result.numRunes = RuneCountInString(contents)
+	return result
+}
+
+// String returns the wrapped contents unchanged.  Having this method also
+// means a *String is directly printable by fmt.Print.
+func (s *String) String() string {
+	return s.str
+}
+
+// RuneCount returns the number of runes (Unicode code points) in the String, as counted by NewString.
+func (s *String) RuneCount() int {
+	return s.numRunes
+}
+
+// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
+func (s *String) IsASCII() bool {
+	return s.width == 0 // NewString leaves width at zero for all-ASCII (or empty) contents
+}
+
+// At returns the rune with index i, in the order a "for range" clause visits them.  It
+// panics with outOfRange (or a runtime index error if i < 0) unless 0 <= i < RuneCount().
+func (s *String) At(i int) int {
+	// ASCII prefix: rune index equals byte index.  A negative i is left to the runtime's index check.
+	if i < s.nonASCII {
+		return int(s.str[i])
+	}
+
+	// Past the ASCII prefix we must validate i; every path below leaves bytePos/runePos/width on rune i.
+	if i < 0 || i >= s.numRunes {
+		panic(outOfRange)
+	}
+
+	var rune int
+
+	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
+	// With these cases, all scans from beginning or end work in O(1) time per rune.
+	switch {
+
+	case i == s.runePos-1: // backing up one rune
+		rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos])
+		s.runePos = i
+		s.bytePos -= s.width
+		return rune
+	case i == s.runePos+1: // moving ahead one rune
+		s.runePos = i
+		s.bytePos += s.width
+		fallthrough // decode at the new position
+	case i == s.runePos: // same rune as the cached position
+		rune, s.width = DecodeRuneInString(s.str[s.bytePos:])
+		return rune
+	case i == 0: // start of string
+		rune, s.width = DecodeRuneInString(s.str)
+		s.runePos = 0
+		s.bytePos = 0
+		return rune
+
+	case i == s.numRunes-1: // last rune in string
+		rune, s.width = DecodeLastRuneInString(s.str)
+		s.runePos = i
+		s.bytePos = len(s.str) - s.width
+		return rune
+	}
+
+	// We need to do a linear scan.  There are three places to start from:
+	// 1) The beginning
+	// 2) bytePos/runePos.
+	// 3) The end
+	// Choose the closest in rune count, scanning backwards if necessary.
+	forward := true
+	if i < s.runePos {
+		// Between beginning and pos.  Which is closer?
+		// Since both i and runePos are guaranteed >= nonASCII, that's the
+		// lowest location we need to start from, so measure both distances from it.
+		if i-s.nonASCII < (s.runePos-s.nonASCII)/2 {
+			// Scan forward from beginning (rune and byte index coincide at nonASCII)
+			s.bytePos, s.runePos = s.nonASCII, s.nonASCII
+		} else {
+			// Scan backwards from where we are
+			forward = false
+		}
+	} else {
+		// Between pos and end.  Which is closer?
+		if i-s.runePos < (s.numRunes-s.runePos)/2 {
+			// Scan forward from pos
+		} else {
+			// Scan backwards from end
+			s.bytePos, s.runePos = len(s.str), s.numRunes
+			forward = false
+		}
+	}
+	if forward {
+		// TODO: Is it much faster to use a range loop for this scan?
+		for {
+			rune, s.width = DecodeRuneInString(s.str[s.bytePos:])
+			if s.runePos == i {
+				break
+			}
+			s.runePos++
+			s.bytePos += s.width
+		}
+	} else {
+		for {
+			rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos]) // rune just before the position
+			s.runePos--
+			s.bytePos -= s.width
+			if s.runePos == i {
+				break
+			}
+		}
+	}
+	return rune
+}
+
+// We want the panic in At(i) to satisfy os.Error, because that's what
+// runtime panics satisfy, but we can't import os.  This is our solution.
+
+// error is the type of the value String.At(i) panics with when i is out of range.
+// It satisfies os.Error and runtime.Error.
+type error string
+
+func (err error) String() string { // the os.Error half of the contract described above
+	return string(err)
+}
+
+func (err error) RunTimeError() { // no-op; present so that error satisfies runtime.Error
+}
+
+var outOfRange = error("utf8.String: index out of Range") // panic value used by At
--- a/src/pkg/utf8/string_test.go
+++ b/src/pkg/utf8/string_test.go
@ -0,0 +1,70 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8_test
+
+import (
+	"rand"
+	"testing"
+	. "utf8"
+)
+
+// TestScanForwards checks that At(i) matches []int(s)[i] as i ascends from 0.
+func TestScanForwards(t *testing.T) {
+	for _, s := range testStrings {
+		runes := []int(s)
+		str := NewString(s)
+		if str.RuneCount() != len(runes) {
+			t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+			break
+		}
+		for i, expect := range runes {
+			if got := str.At(i); got != expect {
+				t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
+			}
+		}
+	}
+}
+
+// TestScanBackwards checks that At(i) matches []int(s)[i] as i descends from the last rune.
+func TestScanBackwards(t *testing.T) {
+	for _, s := range testStrings {
+		runes := []int(s)
+		str := NewString(s)
+		if str.RuneCount() != len(runes) {
+			t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+			break
+		}
+		for i := len(runes) - 1; i >= 0; i-- {
+			expect := runes[i]
+			if got := str.At(i); got != expect {
+				t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
+			}
+		}
+	}
+}
+
+const randCount = 100000 // number of At calls per test string in TestRandomAccess
+
+// TestRandomAccess checks At against []int(s) at randCount random indexes per string.
+func TestRandomAccess(t *testing.T) {
+	for _, s := range testStrings {
+		if len(s) == 0 {
+			continue // an empty string has no index to pick
+		}
+		runes := []int(s)
+		str := NewString(s)
+		if str.RuneCount() != len(runes) {
+			t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+			break
+		}
+		for j := 0; j < randCount; j++ {
+			i := rand.Intn(len(runes))
+			expect := runes[i]
+			if got := str.At(i); got != expect {
+				t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
+			}
+		}
+	}
+}
--- a/src/pkg/utf8/utf8_test.go
+++ b/src/pkg/utf8/utf8_test.go
@ -47,20 +47,16 @@ var utf8map = []Utf8Map{
 var testStrings = []string{
 	"",
 	"abcd",
+	"☺☻☹", // three-byte runes only
+	"日a本b語ç日ð本Ê語þ日¥本¼語i日©", // one-, two- and three-byte runes interleaved
+	"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©", // the previous string three times over
 	"\x80\x80\x80\x80",
 }

-// strings.Bytes with one extra byte at end
-func makeBytes(s string) []byte {
-	s += "\x00"
-	b := []byte(s)
-	return b[0 : len(s)-1]
-}
-
 func TestFullRune(t *testing.T) {
 	for i := 0; i < len(utf8map); i++ {
 		m := utf8map[i]
-		b := makeBytes(m.str)
+		b := []byte(m.str)
 		if !FullRune(b) {
 			t.Errorf("FullRune(%q) (rune %04x) = false, want true", b, m.rune)
 		}
@ -82,7 +78,7 @@ func TestFullRune(t *testing.T) {
 func TestEncodeRune(t *testing.T) {
 	for i := 0; i < len(utf8map); i++ {
 		m := utf8map[i]
-		b := makeBytes(m.str)
+		b := []byte(m.str)
 		var buf [10]byte
 		n := EncodeRune(m.rune, buf[0:])
 		b1 := buf[0:n]
@ -95,7 +91,7 @@ func TestEncodeRune(t *testing.T) {
 func TestDecodeRune(t *testing.T) {
 	for i := 0; i < len(utf8map); i++ {
 		m := utf8map[i]
-		b := makeBytes(m.str)
+		b := []byte(m.str)
 		rune, size := DecodeRune(b)
 		if rune != m.rune || size != len(b) {
 			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, rune, size, m.rune, len(b))
@ -163,6 +159,26 @@ func TestSequencing(t *testing.T) {
 	}
 }

+// TestIntConversion checks that a range loop and a []int conversion visit the
+// same runes.  Not really a test of this package, but the assumption is used
+// by the String tests and it's good to verify.
+func TestIntConversion(t *testing.T) {
+	for _, ts := range testStrings {
+		runes := []int(ts)
+		if RuneCountInString(ts) != len(runes) {
+			t.Errorf("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts))
+			break
+		}
+		i := 0
+		for _, r := range ts {
+			if r != runes[i] {
+				t.Errorf("%q[%d]: expected %c (U+%04x); got %c (U+%04x)", ts, i, runes[i], runes[i], r, r)
+			}
+			i++
+		}
+	}
+}
+
 func testSequence(t *testing.T, s string) {
 	type info struct {
 		index int
@ -252,7 +268,7 @@ func TestRuneCount(t *testing.T) {
 		if out := RuneCountInString(tt.in); out != tt.out {
 			t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
 		}
-		if out := RuneCount(makeBytes(tt.in)); out != tt.out {
+		if out := RuneCount([]byte(tt.in)); out != tt.out {
 			t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
 		}
 	}