From 6f32c8295353ba781a840366319dd7c312ba9a43 Mon Sep 17 00:00:00 2001 From: Rob Pike Date: Sat, 25 Sep 2010 06:58:34 +1000 Subject: [PATCH] utf8: Add new type String to automate string indexing by code point. R=rsc, rog CC=golang-dev https://golang.org/cl/2275041 --- src/pkg/utf8/Makefile | 1 + src/pkg/utf8/string.go | 166 ++++++++++++++++++++++++++++++++++++ src/pkg/utf8/string_test.go | 70 +++++++++++++++ src/pkg/utf8/utf8_test.go | 38 ++++++--- 4 files changed, 264 insertions(+), 11 deletions(-) create mode 100644 src/pkg/utf8/string.go create mode 100644 src/pkg/utf8/string_test.go diff --git a/src/pkg/utf8/Makefile b/src/pkg/utf8/Makefile index df69486dc3..b3574ba3b4 100644 --- a/src/pkg/utf8/Makefile +++ b/src/pkg/utf8/Makefile @@ -6,6 +6,7 @@ include ../../Make.inc TARG=utf8 GOFILES=\ + string.go\ utf8.go\ include ../../Make.pkg diff --git a/src/pkg/utf8/string.go b/src/pkg/utf8/string.go new file mode 100644 index 0000000000..ce74f720db --- /dev/null +++ b/src/pkg/utf8/string.go @@ -0,0 +1,166 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package utf8 + +// String wraps a regular string with a small structure that provides more +// efficient indexing by code point index, as opposed to byte index. +// Scanning incrementally forwards or backwards is O(1) per index operation +// (although not as fast a range clause going forwards). Random access is +// O(N) in the length of the string, but the overhead is less than always +// scanning from the beginning. +// If the string is ASCII, random access is O(1). +type String struct { + str string + numRunes int + // If width > 0, the rune at runePos starts at bytePos and has the specified width. + width int + bytePos int + runePos int + nonASCII int // byte index of the first non-ASCII rune. +} + +// NewString returns a new UTF-8 string with the provided contents. +func NewString(contents string) *String { + for i := 0; i < len(contents); i++ { + if contents[i] >= RuneSelf { + // Not ASCII. + _, wid := DecodeRuneInString(contents) + return &String{ + str: contents, + numRunes: RuneCountInString(contents), + width: wid, + nonASCII: i, + } + } + } + // ASCII is simple. Also, the empty string is ASCII. + return &String{str: contents, numRunes: len(contents), nonASCII: len(contents)} +} + +// String returns the contents of the String. This method also means the +// String is directly printable by fmt.Print. +func (s *String) String() string { + return s.str +} + +// RuneCount returns the number of runes (Unicode code points) in the String. +func (s *String) RuneCount() int { + return s.numRunes +} + +// IsASCII returns a boolean indicating whether the String contains only ASCII bytes. +func (s *String) IsASCII() bool { + return s.width == 0 +} + +// At returns the rune with index i in the String. The sequence of runes is the same +// as iterating over the contents with a "for range" clause. +func (s *String) At(i int) int { + // ASCII is easy. Let the compiler catch the indexing error if there is one. + if i < s.nonASCII { + return int(s.str[i]) + } + + // Now we do need to know the index is valid. + if i < 0 || i >= s.numRunes { + panic(outOfRange) + } + + var rune int + + // Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end. + // With these cases, all scans from beginning or end work in O(1) time per rune. + switch { + + case i == s.runePos-1: // backing up one rune + rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos]) + s.runePos = i + s.bytePos -= s.width + return rune + case i == s.runePos+1: // moving ahead one rune + s.runePos = i + s.bytePos += s.width + fallthrough + case i == s.runePos: + rune, s.width = DecodeRuneInString(s.str[s.bytePos:]) + return rune + case i == 0: // start of string + rune, s.width = DecodeRuneInString(s.str) + s.runePos = 0 + s.bytePos = 0 + return rune + + case i == s.numRunes-1: // last rune in string + rune, s.width = DecodeLastRuneInString(s.str) + s.runePos = i + s.bytePos = len(s.str) - s.width + return rune + } + + // We need to do a linear scan. There are three places to start from: + // 1) The beginning + // 2) bytePos/runePos. + // 3) The end + // Choose the closest in rune count, scanning backwards if necessary. + forward := true + if i < s.runePos { + // Between beginning and pos. Which is closer? + // Since both i and runePos are guaranteed >= nonASCII, that's the + // lowest location we need to start from. + if i < (s.runePos-s.nonASCII)/2 { + // Scan forward from beginning + s.bytePos, s.runePos = s.nonASCII, s.nonASCII + } else { + // Scan backwards from where we are + forward = false + } + } else { + // Between pos and end. Which is closer? + if i-s.runePos < (s.numRunes-s.runePos)/2 { + // Scan forward from pos + } else { + // Scan backwards from end + s.bytePos, s.runePos = len(s.str), s.numRunes + forward = false + } + } + if forward { + // TODO: Is it much faster to use a range loop for this scan? + for { + rune, s.width = DecodeRuneInString(s.str[s.bytePos:]) + if s.runePos == i { + break + } + s.runePos++ + s.bytePos += s.width + } + } else { + for { + rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos]) + s.runePos-- + s.bytePos -= s.width + if s.runePos == i { + break + } + } + } + return rune +} + +// We want the panic in At(i) to satisfy os.Error, because that's what +// runtime panics satisfy, but we can't import os. This is our solution. + +// error is the type of the error returned if a user calls String.At(i) with i out of range. +// It satisfies os.Error and runtime.Error. +type error string + +func (err error) String() string { + return string(err) +} + +func (err error) RunTimeError() { +} + +var outOfRange = error("utf8.String: index out of Range") diff --git a/src/pkg/utf8/string_test.go b/src/pkg/utf8/string_test.go new file mode 100644 index 0000000000..7ce8f8a7ea --- /dev/null +++ b/src/pkg/utf8/string_test.go @@ -0,0 +1,70 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package utf8_test + +import ( + "rand" + "testing" + . "utf8" +) + +func TestScanForwards(t *testing.T) { + for _, s := range testStrings { + runes := []int(s) + str := NewString(s) + if str.RuneCount() != len(runes) { + t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount()) + break + } + for i, expect := range runes { + got := str.At(i) + if got != expect { + t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got) + } + } + } +} + +func TestScanBackwards(t *testing.T) { + for _, s := range testStrings { + runes := []int(s) + str := NewString(s) + if str.RuneCount() != len(runes) { + t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount()) + break + } + for i := len(runes) - 1; i >= 0; i-- { + expect := runes[i] + got := str.At(i) + if got != expect { + t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got) + } + } + } +} + +const randCount = 100000 + +func TestRandomAccess(t *testing.T) { + for _, s := range testStrings { + if len(s) == 0 { + continue + } + runes := []int(s) + str := NewString(s) + if str.RuneCount() != len(runes) { + t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount()) + break + } + for j := 0; j < randCount; j++ { + i := rand.Intn(len(runes)) + expect := runes[i] + got := str.At(i) + if got != expect { + t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got) + } + } + } +} diff --git a/src/pkg/utf8/utf8_test.go b/src/pkg/utf8/utf8_test.go index 45c5ad3f8f..92e6bd3aab 100644 --- a/src/pkg/utf8/utf8_test.go +++ b/src/pkg/utf8/utf8_test.go @@ -47,20 +47,16 @@ var utf8map = []Utf8Map{ var testStrings = []string{ "", "abcd", + "☺☻☹", + "日a本b語ç日ð本Ê語þ日¥本¼語i日©", + "日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©", "\x80\x80\x80\x80", } -// strings.Bytes with one extra byte at end -func makeBytes(s string) []byte { - s += "\x00" - b := []byte(s) - return b[0 : len(s)-1] -} - func TestFullRune(t *testing.T) { for i := 0; i < len(utf8map); i++ { m := utf8map[i] - b := makeBytes(m.str) + b := []byte(m.str) if !FullRune(b) { t.Errorf("FullRune(%q) (rune %04x) = false, want true", b, m.rune) } @@ -82,7 +78,7 @@ func TestFullRune(t *testing.T) { func TestEncodeRune(t *testing.T) { for i := 0; i < len(utf8map); i++ { m := utf8map[i] - b := makeBytes(m.str) + b := []byte(m.str) var buf [10]byte n := EncodeRune(m.rune, buf[0:]) b1 := buf[0:n] @@ -95,7 +91,7 @@ func TestEncodeRune(t *testing.T) { func TestDecodeRune(t *testing.T) { for i := 0; i < len(utf8map); i++ { m := utf8map[i] - b := makeBytes(m.str) + b := []byte(m.str) rune, size := DecodeRune(b) if rune != m.rune || size != len(b) { t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, rune, size, m.rune, len(b)) @@ -163,6 +159,26 @@ func TestSequencing(t *testing.T) { } } +// Check that a range loop and a []int conversion visit the same runes. +// Not really a test of this package, but the assumption is used here and +// it's good to verify +func TestIntConversion(t *testing.T) { + for _, ts := range testStrings { + runes := []int(ts) + if RuneCountInString(ts) != len(runes) { + t.Error("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts)) + break + } + i := 0 + for _, r := range ts { + if r != runes[i] { + t.Errorf("%q[%d]: expected %c (U+%04x); got %c (U+%04x)", ts, i, runes[i], runes[i], r, r) + } + i++ + } + } +} + func testSequence(t *testing.T, s string) { type info struct { index int @@ -252,7 +268,7 @@ func TestRuneCount(t *testing.T) { if out := RuneCountInString(tt.in); out != tt.out { t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out) } - if out := RuneCount(makeBytes(tt.in)); out != tt.out { + if out := RuneCount([]byte(tt.in)); out != tt.out { t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out) } }