1
0
mirror of https://github.com/golang/go synced 2024-11-20 06:24:40 -07:00

exp/locale/collate: implementation of trie that is used for detecting contractions.

(See http://www.unicode.org/reports/tr10/#Contractions.)  Each rune that is at the
start of any contraction is associated a trie. This trie, in turn, may be shared
by other runes that have the same set of suffixes.

R=r, r
CC=golang-dev
https://golang.org/cl/5970066
This commit is contained in:
Marcel van Lohuizen 2012-04-25 13:15:48 +02:00
parent 733b51d996
commit e456d015fb
4 changed files with 778 additions and 0 deletions

View File

@ -0,0 +1,301 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package build
import (
"fmt"
"io"
"reflect"
"sort"
"strings"
)
// This file contains code for detecting contractions and generating
// the necessary tables.
// Any Unicode Collation Algorithm (UCA) table entry that has more than
// one rune one the left-hand side is called a contraction.
// See http://www.unicode.org/reports/tr10/#Contractions for more details.
//
// We define the following terms:
// initial: a rune that appears as the first rune in a contraction.
// suffix: a sequence of runes succeeding the initial rune
// in a given contraction.
// non-initial: a rune that appears in a suffix.
//
// A rune may be both a initial and a non-initial and may be so in
// many contractions. An initial may typically also appear by itself.
// In case of ambiguities, the UCA requires we match the longest
// contraction.
//
// Many contraction rules share the same set of possible suffixes.
// We store sets of suffixes in a trie that associates an index with
// each suffix in the set. This index can be used to look up a
// collation element associated with the (starter rune, suffix) pair.
//
// The trie is defined on a UTF-8 byte sequence.
// The overall trie is represented as an array of ctEntries. Each node of the trie
// is represented as a subsequence of ctEntries, where each entry corresponds to
// a possible match of a next character in the search string. An entry
// also includes the length and offset to the next sequence of entries
// to check in case of a match.
// ctEntry associates to a matching byte an offset and/or next sequence of
// bytes to check. A ctEntry c is called final if a match means that the
// longest suffix has been found. An entry c is final if c.n == 0.
// A single final entry can match a range of characters to an offset.
// A non-final entry always matches a single byte. Note that a non-final
// entry might still resemble a completed suffix.
// Examples:
// The suffix strings "ab" and "ac" can be represented as:
// []ctEntry{
// {'a', 1, 1, 0xFF}, // 'a' by itself does not match, so i is 0xFF.
// {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2
// }
//
// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
// []ctEntry{
// {'a', 1, 1, 0xFF}, // 'a' must be followed by 'b'.
// {'b', 2, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'.
// {'d', 'd', 0, 3}, // "abd" -> 3
// {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'.
// {'d', 'd', 0, 4}, // "abcd" -> 4
// }
// See genStateTests in contract_test.go for more examples.
type ctEntry struct {
l uint8 // non-final: byte value to match; final: lowest match in range.
h uint8 // non-final: relative index to next block; final: highest match in range.
n uint8 // non-final: length of next block; final: 0
i uint8 // result offset. Will be 0xFF if more bytes are needed to complete.
}
// contractTrieSet holds a set of contraction tries. The tries are stored
// consecutively in the entry field.
type contractTrieSet []struct{ l, h, n, i uint8 }
// ctHandle is used to identify a trie in the trie set, consisting in an offset
// in the array and the size of the first node.
type ctHandle struct {
index, n int
}
// appendTrie adds a new trie for the given suffixes to the trie set and returns
// a handle to it. The handle will be invalid on error.
func (ct *contractTrieSet) appendTrie(suffixes []string) (ctHandle, error) {
es := make([]stridx, len(suffixes))
for i, s := range suffixes {
es[i].str = s
}
sort.Sort(offsetSort(es))
for i := range es {
es[i].index = i + 1
}
sort.Sort(genidxSort(es))
i := len(*ct)
n, err := ct.genStates(es)
if err != nil {
*ct = (*ct)[:i]
return ctHandle{}, err
}
return ctHandle{i, n}, nil
}
// genStates generates ctEntries for a given suffix set and returns
// the number of entries for the first node.
func (ct *contractTrieSet) genStates(sis []stridx) (int, error) {
if len(sis) == 0 {
return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
}
start := len(*ct)
// create entries for differing first bytes.
for _, si := range sis {
s := si.str
if len(s) == 0 {
continue
}
added := false
c := s[0]
if len(s) > 1 {
for j := len(*ct) - 1; j >= start; j-- {
if (*ct)[j].l == c {
added = true
break
}
}
if !added {
*ct = append(*ct, ctEntry{l: c, i: 0xFF})
}
} else {
for j := len(*ct) - 1; j >= start; j-- {
// Update the offset for longer suffixes with the same byte.
if (*ct)[j].l == c {
(*ct)[j].i = uint8(si.index)
added = true
}
// Extend range of final ctEntry, if possible.
if (*ct)[j].h+1 == c {
(*ct)[j].h = c
added = true
}
}
if !added {
*ct = append(*ct, ctEntry{l: c, h: c, i: uint8(si.index)})
}
}
}
n := len(*ct) - start
// Append nodes for the remainder of the suffixes for each ctEntry.
sp := 0
for i, end := start, len(*ct); i < end; i++ {
fe := (*ct)[i]
if fe.h == 0 { // uninitialized non-final
ln := len(*ct) - start
if ln > 0xFF {
return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
}
fe.h = uint8(ln)
// Find first non-final strings with same byte as current entry.
for ; sis[sp].str[0] != fe.l; sp++ {
}
se := sp + 1
for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.l; se++ {
}
sl := sis[sp:se]
sp = se
for i, si := range sl {
sl[i].str = si.str[1:]
}
nn, err := ct.genStates(sl)
if err != nil {
return 0, err
}
fe.n = uint8(nn)
(*ct)[i] = fe
}
}
sort.Sort(entrySort((*ct)[start : start+n]))
return n, nil
}
// There may be both a final and non-final entry for a byte if the byte
// is implied in a range of matches in the final entry.
// We need to ensure that the non-final entry comes first in that case.
type entrySort contractTrieSet
func (fe entrySort) Len() int { return len(fe) }
func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] }
func (fe entrySort) Less(i, j int) bool {
return fe[i].l > fe[j].l
}
// stridx is used for sorting suffixes and their associated offsets.
type stridx struct {
str string
index int
}
// For computing the offsets, we first sort by size, and then by string.
// This ensures that strings that only differ in the last byte by 1
// are sorted consecutively in increasing order such that they can
// be packed as a range in a final ctEntry.
type offsetSort []stridx
func (si offsetSort) Len() int { return len(si) }
func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
func (si offsetSort) Less(i, j int) bool {
if len(si[i].str) != len(si[j].str) {
return len(si[i].str) > len(si[j].str)
}
return si[i].str < si[j].str
}
// For indexing, we want to ensure that strings are sorted in string order, where
// for strings with the same prefix, we put longer strings before shorter ones.
type genidxSort []stridx
func (si genidxSort) Len() int { return len(si) }
func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
func (si genidxSort) Less(i, j int) bool {
if strings.HasPrefix(si[j].str, si[i].str) {
return false
}
if strings.HasPrefix(si[i].str, si[j].str) {
return true
}
return si[i].str < si[j].str
}
// lookup matches the longest suffix in str and returns the associated offset
// and the number of bytes consumed.
func (ct *contractTrieSet) lookup(h ctHandle, str []byte) (index, ns int) {
states := (*ct)[h.index:]
p := 0
n := h.n
for i := 0; i < n && p < len(str); {
e := states[i]
c := str[p]
if c >= e.l {
p++
if e.l == c {
if e.i != 0xFF {
index, ns = int(e.i), p
}
if e.n != 0 {
// set to new state
i, states, n = 0, states[e.h:], int(e.n)
} else {
return
}
} else if e.n == 0 && c <= e.h {
return int(c-e.l) + int(e.i), p
}
} else {
i++
}
}
return
}
// print writes the contractTrieSet t as compilable Go code to w. It returns
// the total number of bytes written and the size of the resulting data structure in bytes.
func (t *contractTrieSet) print(w io.Writer, name string) (n, size int, err error) {
update3 := func(nn, sz int, e error) {
n += nn
if err == nil {
err = e
}
size += sz
}
update2 := func(nn int, e error) { update3(nn, 0, e) }
update3(t.printArray(w, name))
update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name))
update3(t.printStruct(w, name))
update2(fmt.Fprintln(w))
return
}
func (ct contractTrieSet) printArray(w io.Writer, name string) (n, size int, err error) {
p := func(f string, a ...interface{}) {
nn, e := fmt.Fprintf(w, f, a...)
n += nn
if err == nil {
err = e
}
}
size = len(ct) * 4
p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
p("var %sCTEntries = [%d]struct{l,h,n,i uint8}{\n", name, len(ct))
for _, fe := range ct {
p("\t{0x%X, 0x%X, %d, %d},\n", fe.l, fe.h, fe.n, fe.i)
}
p("}\n")
return
}
func (ct contractTrieSet) printStruct(w io.Writer, name string) (n, size int, err error) {
n, err = fmt.Fprintf(w, "contractTrieSet( %sCTEntries[:] )", name)
size = int(reflect.TypeOf(ct).Size())
return
}

View File

@ -0,0 +1,264 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package build
import (
"bytes"
"sort"
"testing"
)
var largetosmall = []stridx{
{"a", 5},
{"ab", 4},
{"abc", 3},
{"abcd", 2},
{"abcde", 1},
{"abcdef", 0},
}
var offsetSortTests = [][]stridx{
[]stridx{
{"bcde", 1},
{"bc", 5},
{"ab", 4},
{"bcd", 3},
{"abcd", 0},
{"abc", 2},
},
largetosmall,
}
func TestOffsetSort(t *testing.T) {
for i, st := range offsetSortTests {
sort.Sort(offsetSort(st))
for j, si := range st {
if j != si.index {
t.Errorf("%d: failed: %v", i, st)
}
}
}
for i, tt := range genStateTests {
// ensure input is well-formed
sort.Sort(offsetSort(tt.in))
for j, si := range tt.in {
if si.index != j+1 {
t.Errorf("%dth sort failed: %v", i, tt.in)
}
}
}
}
var genidxtest1 = []stridx{
{"bcde", 3},
{"bc", 6},
{"ab", 2},
{"bcd", 5},
{"abcd", 0},
{"abc", 1},
{"bcdf", 4},
}
var genidxSortTests = [][]stridx{
genidxtest1,
largetosmall,
}
func TestGenIdxSort(t *testing.T) {
for i, st := range genidxSortTests {
sort.Sort(genidxSort(st))
for j, si := range st {
if j != si.index {
t.Errorf("%dth sort failed %v", i, st)
break
}
}
}
}
var entrySortTests = []contractTrieSet{
contractTrieSet{
{10, 0, 1, 3},
{99, 0, 1, 0},
{20, 50, 0, 2},
{30, 0, 1, 1},
},
}
func TestEntrySort(t *testing.T) {
for i, et := range entrySortTests {
sort.Sort(entrySort(et))
for j, fe := range et {
if j != int(fe.i) {
t.Errorf("%dth sort failed %v", i, et)
break
}
}
}
}
type GenStateTest struct {
in []stridx
firstBlockLen int
out contractTrieSet
}
var genStateTests = []GenStateTest{
{[]stridx{
{"abc", 1},
},
1,
contractTrieSet{
{'a', 1, 1, 0xFF},
{'b', 1, 1, 0xFF},
{'c', 'c', 0, 1},
},
},
{[]stridx{
{"abc", 1},
{"abd", 2},
{"abe", 3},
},
1,
contractTrieSet{
{'a', 1, 1, 0xFF},
{'b', 1, 1, 0xFF},
{'c', 'e', 0, 1},
},
},
{[]stridx{
{"abc", 1},
{"ab", 2},
{"a", 3},
},
1,
contractTrieSet{
{'a', 1, 1, 3},
{'b', 1, 1, 2},
{'c', 'c', 0, 1},
},
},
{[]stridx{
{"abc", 1},
{"abd", 2},
{"ab", 3},
{"ac", 4},
{"a", 5},
{"b", 6},
},
2,
contractTrieSet{
{'b', 'b', 0, 6},
{'a', 2, 2, 5},
{'c', 'c', 0, 4},
{'b', 2, 1, 3},
{'c', 'd', 0, 1},
},
},
{[]stridx{
{"bcde", 2},
{"bc", 7},
{"ab", 6},
{"bcd", 5},
{"abcd", 1},
{"abc", 4},
{"bcdf", 3},
},
2,
contractTrieSet{
{'b', 5, 1, 0xFF},
{'a', 2, 1, 0xFF},
{'b', 1, 1, 6},
{'c', 1, 1, 4},
{'d', 'd', 0, 1},
{'c', 1, 1, 7},
{'d', 1, 1, 5},
{'e', 'f', 0, 2},
},
},
}
func TestGenStates(t *testing.T) {
for i, tt := range genStateTests {
si := []stridx{}
for _, e := range tt.in {
si = append(si, e)
}
// ensure input is well-formed
sort.Sort(genidxSort(si))
ct := contractTrieSet{}
n, _ := ct.genStates(si)
if nn := tt.firstBlockLen; nn != n {
t.Errorf("%d: block len %v; want %v", i, n, nn)
}
if lv, lw := len(ct), len(tt.out); lv != lw {
t.Errorf("%d: len %v; want %v", i, lv, lw)
continue
}
for j, fe := range tt.out {
const msg = "%d:%d: value %s=%v; want %v"
if fe.l != ct[j].l {
t.Errorf(msg, i, j, "l", ct[j].l, fe.l)
}
if fe.h != ct[j].h {
t.Errorf(msg, i, j, "h", ct[j].h, fe.h)
}
if fe.n != ct[j].n {
t.Errorf(msg, i, j, "n", ct[j].n, fe.n)
}
if fe.i != ct[j].i {
t.Errorf(msg, i, j, "i", ct[j].i, fe.i)
}
}
}
}
func TestLookupContraction(t *testing.T) {
for i, tt := range genStateTests {
input := []string{}
for _, e := range tt.in {
input = append(input, e.str)
}
cts := contractTrieSet{}
h, _ := cts.appendTrie(input)
for j, si := range tt.in {
str := si.str
for _, s := range []string{str, str + "X"} {
msg := "%d:%d: %s(%s) %v; want %v"
idx, sn := cts.lookup(h, []byte(s))
if idx != si.index {
t.Errorf(msg, i, j, "index", s, idx, si.index)
}
if sn != len(str) {
t.Errorf(msg, i, j, "sn", s, sn, len(str))
}
}
}
}
}
func TestPrintContractionTrieSet(t *testing.T) {
testdata := contractTrieSet(genStateTests[4].out)
buf := &bytes.Buffer{}
testdata.print(buf, "test")
if contractTrieOutput != buf.String() {
t.Errorf("output differs; found\n%s", buf.String())
println(string(buf.Bytes()))
}
}
const contractTrieOutput = `// testCTEntries: 8 entries, 32 bytes
var testCTEntries = [8]struct{l,h,n,i uint8}{
{0x62, 0x5, 1, 255},
{0x61, 0x2, 1, 255},
{0x62, 0x1, 1, 6},
{0x63, 0x1, 1, 4},
{0x64, 0x64, 0, 1},
{0x63, 0x1, 1, 7},
{0x64, 0x1, 1, 5},
{0x65, 0x66, 0, 2},
}
var testContractTrieSet = contractTrieSet( testCTEntries[:] )
`

View File

@ -0,0 +1,81 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package collate
import "unicode/utf8"
// For a description of contractTrieSet, see exp/locale/collate/build/contract.go.
type contractTrieSet []struct{ l, h, n, i uint8 }
// ctScanner is used to match a trie to an input sequence.
// A contraction may match a non-contiguous sequence of bytes in an input string.
// For example, if there is a contraction for <a, combining_ring>, it should match
// the sequence <a, combining_cedilla, combining_ring>, as combining_cedilla does
// not block combining_ring.
// ctScanner does not automatically skip over non-blocking non-starters, but rather
// retains the state of the last match and leaves it up to the user to continue
// the match at the appropriate points.
type ctScanner struct {
states contractTrieSet
s []byte
n int
index int
pindex int
done bool
}
func (t contractTrieSet) scanner(index, n int, b []byte) ctScanner {
return ctScanner{states: t[index:], s: b, n: n}
}
// result returns the offset i and bytes consumed p so far. If no suffix
// matched, i and p will be 0.
func (s *ctScanner) result() (i, p int) {
return s.index, s.pindex
}
// scan matches the longest suffix at the current location in the input
// and returns the number of bytes consumed.
func (s *ctScanner) scan(p int) int {
pr := p // the p at the rune start
str := s.s
states, n := s.states, s.n
for i := 0; i < n && p < len(str); {
e := states[i]
c := str[p]
// TODO: a significant number of contractions are of a form that
// cannot match discontiguous UTF-8 in a normalized string. We could let
// a negative value of e.n mean that we can set s.done = true and avoid
// the need for additional matches.
if c >= e.l {
if e.l == c {
p++
if e.i != 0xFF {
s.index = int(e.i)
s.pindex = p
}
if e.n != 0 {
i, states, n = 0, states[e.h:], int(e.n)
if p >= len(str) || utf8.RuneStart(str[p]) {
s.states, s.n, pr = states, n, p
}
} else {
s.done = true
return p
}
continue
} else if e.n == 0 && c <= e.h {
p++
s.done = true
s.index = int(c-e.l) + int(e.i)
s.pindex = p
return p
}
}
i++
}
return pr
}

View File

@ -0,0 +1,132 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package collate
import (
"testing"
)
type lookupStrings struct {
str string
offset int
n int // bytes consumed from input
}
type LookupTest struct {
lookup []lookupStrings
n int
tries contractTrieSet
}
var lookupTests = []LookupTest{
{[]lookupStrings{
{"abc", 1, 3},
{"a", 0, 0},
{"b", 0, 0},
{"c", 0, 0},
{"d", 0, 0},
},
1,
contractTrieSet{
{'a', 1, 1, 0xFF},
{'b', 1, 1, 0xFF},
{'c', 'c', 0, 1},
},
},
{[]lookupStrings{
{"abc", 1, 3},
{"abd", 2, 3},
{"abe", 3, 3},
{"a", 0, 0},
{"ab", 0, 0},
{"d", 0, 0},
{"f", 0, 0},
},
1,
contractTrieSet{
{'a', 1, 1, 0xFF},
{'b', 1, 1, 0xFF},
{'c', 'e', 0, 1},
},
},
{[]lookupStrings{
{"abc", 1, 3},
{"ab", 2, 2},
{"a", 3, 1},
{"abcd", 1, 3},
{"abe", 2, 2},
},
1,
contractTrieSet{
{'a', 1, 1, 3},
{'b', 1, 1, 2},
{'c', 'c', 0, 1},
},
},
{[]lookupStrings{
{"abc", 1, 3},
{"abd", 2, 3},
{"ab", 3, 2},
{"ac", 4, 2},
{"a", 5, 1},
{"b", 6, 1},
{"ba", 6, 1},
},
2,
contractTrieSet{
{'b', 'b', 0, 6},
{'a', 2, 2, 5},
{'c', 'c', 0, 4},
{'b', 2, 1, 3},
{'c', 'd', 0, 1},
},
},
{[]lookupStrings{
{"bcde", 2, 4},
{"bc", 7, 2},
{"ab", 6, 2},
{"bcd", 5, 3},
{"abcd", 1, 4},
{"abc", 4, 3},
{"bcdf", 3, 4},
},
2,
contractTrieSet{
{'b', 5, 1, 0xFF},
{'a', 2, 1, 0xFF},
{'b', 1, 1, 6},
{'c', 1, 1, 4},
{'d', 'd', 0, 1},
{'c', 1, 1, 7},
{'d', 1, 1, 5},
{'e', 'f', 0, 2},
},
},
}
func lookup(c *contractTrieSet, nnode int, s []uint8) (i, n int) {
scan := c.scanner(0, nnode, s)
scan.scan(0)
return scan.result()
}
func TestLookupContraction(t *testing.T) {
for i, tt := range lookupTests {
cts := contractTrieSet(tt.tries)
for j, lu := range tt.lookup {
str := lu.str
for _, s := range []string{str, str + "X"} {
const msg = `%d:%d: %s of "%s" %v; want %v`
offset, n := lookup(&cts, tt.n, []byte(s))
if offset != lu.offset {
t.Errorf(msg, i, j, "offset", s, offset, lu.offset)
}
if n != lu.n {
t.Errorf(msg, i, j, "bytes consumed", s, n, len(str))
}
}
}
}
}