From e456d015fb670b82554284d74c5b88ee278b6f08 Mon Sep 17 00:00:00 2001 From: Marcel van Lohuizen Date: Wed, 25 Apr 2012 13:15:48 +0200 Subject: [PATCH] exp/locale/collate: implementation of trie that is used for detecting contractions. (See http://www.unicode.org/reports/tr10/#Contractions.) Each rune that is at the start of any contraction is associated a trie. This trie, in turn, may be shared by other runes that have the same set of suffixes. R=r, r CC=golang-dev https://golang.org/cl/5970066 --- src/pkg/exp/locale/collate/build/contract.go | 301 ++++++++++++++++++ .../exp/locale/collate/build/contract_test.go | 264 +++++++++++++++ src/pkg/exp/locale/collate/contract.go | 81 +++++ src/pkg/exp/locale/collate/contract_test.go | 132 ++++++++ 4 files changed, 778 insertions(+) create mode 100644 src/pkg/exp/locale/collate/build/contract.go create mode 100644 src/pkg/exp/locale/collate/build/contract_test.go create mode 100644 src/pkg/exp/locale/collate/contract.go create mode 100644 src/pkg/exp/locale/collate/contract_test.go diff --git a/src/pkg/exp/locale/collate/build/contract.go b/src/pkg/exp/locale/collate/build/contract.go new file mode 100644 index 0000000000..1f8691ba04 --- /dev/null +++ b/src/pkg/exp/locale/collate/build/contract.go @@ -0,0 +1,301 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package build + +import ( + "fmt" + "io" + "reflect" + "sort" + "strings" +) + +// This file contains code for detecting contractions and generating +// the necessary tables. +// Any Unicode Collation Algorithm (UCA) table entry that has more than +// one rune one the left-hand side is called a contraction. +// See http://www.unicode.org/reports/tr10/#Contractions for more details. +// +// We define the following terms: +// initial: a rune that appears as the first rune in a contraction. +// suffix: a sequence of runes succeeding the initial rune +// in a given contraction. +// non-initial: a rune that appears in a suffix. +// +// A rune may be both a initial and a non-initial and may be so in +// many contractions. An initial may typically also appear by itself. +// In case of ambiguities, the UCA requires we match the longest +// contraction. +// +// Many contraction rules share the same set of possible suffixes. +// We store sets of suffixes in a trie that associates an index with +// each suffix in the set. This index can be used to look up a +// collation element associated with the (starter rune, suffix) pair. +// +// The trie is defined on a UTF-8 byte sequence. +// The overall trie is represented as an array of ctEntries. Each node of the trie +// is represented as a subsequence of ctEntries, where each entry corresponds to +// a possible match of a next character in the search string. An entry +// also includes the length and offset to the next sequence of entries +// to check in case of a match. + +// ctEntry associates to a matching byte an offset and/or next sequence of +// bytes to check. A ctEntry c is called final if a match means that the +// longest suffix has been found. An entry c is final if c.n == 0. +// A single final entry can match a range of characters to an offset. +// A non-final entry always matches a single byte. Note that a non-final +// entry might still resemble a completed suffix. +// Examples: +// The suffix strings "ab" and "ac" can be represented as: +// []ctEntry{ +// {'a', 1, 1, 0xFF}, // 'a' by itself does not match, so i is 0xFF. +// {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2 +// } +// +// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as: +// []ctEntry{ +// {'a', 1, 1, 0xFF}, // 'a' must be followed by 'b'. +// {'b', 2, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'. +// {'d', 'd', 0, 3}, // "abd" -> 3 +// {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'. +// {'d', 'd', 0, 4}, // "abcd" -> 4 +// } +// See genStateTests in contract_test.go for more examples. +type ctEntry struct { + l uint8 // non-final: byte value to match; final: lowest match in range. + h uint8 // non-final: relative index to next block; final: highest match in range. + n uint8 // non-final: length of next block; final: 0 + i uint8 // result offset. Will be 0xFF if more bytes are needed to complete. +} + +// contractTrieSet holds a set of contraction tries. The tries are stored +// consecutively in the entry field. +type contractTrieSet []struct{ l, h, n, i uint8 } + +// ctHandle is used to identify a trie in the trie set, consisting in an offset +// in the array and the size of the first node. +type ctHandle struct { + index, n int +} + +// appendTrie adds a new trie for the given suffixes to the trie set and returns +// a handle to it. The handle will be invalid on error. +func (ct *contractTrieSet) appendTrie(suffixes []string) (ctHandle, error) { + es := make([]stridx, len(suffixes)) + for i, s := range suffixes { + es[i].str = s + } + sort.Sort(offsetSort(es)) + for i := range es { + es[i].index = i + 1 + } + sort.Sort(genidxSort(es)) + i := len(*ct) + n, err := ct.genStates(es) + if err != nil { + *ct = (*ct)[:i] + return ctHandle{}, err + } + return ctHandle{i, n}, nil +} + +// genStates generates ctEntries for a given suffix set and returns +// the number of entries for the first node. +func (ct *contractTrieSet) genStates(sis []stridx) (int, error) { + if len(sis) == 0 { + return 0, fmt.Errorf("genStates: list of suffices must be non-empty") + } + start := len(*ct) + // create entries for differing first bytes. + for _, si := range sis { + s := si.str + if len(s) == 0 { + continue + } + added := false + c := s[0] + if len(s) > 1 { + for j := len(*ct) - 1; j >= start; j-- { + if (*ct)[j].l == c { + added = true + break + } + } + if !added { + *ct = append(*ct, ctEntry{l: c, i: 0xFF}) + } + } else { + for j := len(*ct) - 1; j >= start; j-- { + // Update the offset for longer suffixes with the same byte. + if (*ct)[j].l == c { + (*ct)[j].i = uint8(si.index) + added = true + } + // Extend range of final ctEntry, if possible. + if (*ct)[j].h+1 == c { + (*ct)[j].h = c + added = true + } + } + if !added { + *ct = append(*ct, ctEntry{l: c, h: c, i: uint8(si.index)}) + } + } + } + n := len(*ct) - start + // Append nodes for the remainder of the suffixes for each ctEntry. + sp := 0 + for i, end := start, len(*ct); i < end; i++ { + fe := (*ct)[i] + if fe.h == 0 { // uninitialized non-final + ln := len(*ct) - start + if ln > 0xFF { + return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln) + } + fe.h = uint8(ln) + // Find first non-final strings with same byte as current entry. + for ; sis[sp].str[0] != fe.l; sp++ { + } + se := sp + 1 + for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.l; se++ { + } + sl := sis[sp:se] + sp = se + for i, si := range sl { + sl[i].str = si.str[1:] + } + nn, err := ct.genStates(sl) + if err != nil { + return 0, err + } + fe.n = uint8(nn) + (*ct)[i] = fe + } + } + sort.Sort(entrySort((*ct)[start : start+n])) + return n, nil +} + +// There may be both a final and non-final entry for a byte if the byte +// is implied in a range of matches in the final entry. +// We need to ensure that the non-final entry comes first in that case. +type entrySort contractTrieSet + +func (fe entrySort) Len() int { return len(fe) } +func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] } +func (fe entrySort) Less(i, j int) bool { + return fe[i].l > fe[j].l +} + +// stridx is used for sorting suffixes and their associated offsets. +type stridx struct { + str string + index int +} + +// For computing the offsets, we first sort by size, and then by string. +// This ensures that strings that only differ in the last byte by 1 +// are sorted consecutively in increasing order such that they can +// be packed as a range in a final ctEntry. +type offsetSort []stridx + +func (si offsetSort) Len() int { return len(si) } +func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } +func (si offsetSort) Less(i, j int) bool { + if len(si[i].str) != len(si[j].str) { + return len(si[i].str) > len(si[j].str) + } + return si[i].str < si[j].str +} + +// For indexing, we want to ensure that strings are sorted in string order, where +// for strings with the same prefix, we put longer strings before shorter ones. +type genidxSort []stridx + +func (si genidxSort) Len() int { return len(si) } +func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } +func (si genidxSort) Less(i, j int) bool { + if strings.HasPrefix(si[j].str, si[i].str) { + return false + } + if strings.HasPrefix(si[i].str, si[j].str) { + return true + } + return si[i].str < si[j].str +} + +// lookup matches the longest suffix in str and returns the associated offset +// and the number of bytes consumed. +func (ct *contractTrieSet) lookup(h ctHandle, str []byte) (index, ns int) { + states := (*ct)[h.index:] + p := 0 + n := h.n + for i := 0; i < n && p < len(str); { + e := states[i] + c := str[p] + if c >= e.l { + p++ + if e.l == c { + if e.i != 0xFF { + index, ns = int(e.i), p + } + if e.n != 0 { + // set to new state + i, states, n = 0, states[e.h:], int(e.n) + } else { + return + } + } else if e.n == 0 && c <= e.h { + return int(c-e.l) + int(e.i), p + } + } else { + i++ + } + } + return +} + +// print writes the contractTrieSet t as compilable Go code to w. It returns +// the total number of bytes written and the size of the resulting data structure in bytes. +func (t *contractTrieSet) print(w io.Writer, name string) (n, size int, err error) { + update3 := func(nn, sz int, e error) { + n += nn + if err == nil { + err = e + } + size += sz + } + update2 := func(nn int, e error) { update3(nn, 0, e) } + + update3(t.printArray(w, name)) + update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name)) + update3(t.printStruct(w, name)) + update2(fmt.Fprintln(w)) + return +} + +func (ct contractTrieSet) printArray(w io.Writer, name string) (n, size int, err error) { + p := func(f string, a ...interface{}) { + nn, e := fmt.Fprintf(w, f, a...) + n += nn + if err == nil { + err = e + } + } + size = len(ct) * 4 + p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size) + p("var %sCTEntries = [%d]struct{l,h,n,i uint8}{\n", name, len(ct)) + for _, fe := range ct { + p("\t{0x%X, 0x%X, %d, %d},\n", fe.l, fe.h, fe.n, fe.i) + } + p("}\n") + return +} + +func (ct contractTrieSet) printStruct(w io.Writer, name string) (n, size int, err error) { + n, err = fmt.Fprintf(w, "contractTrieSet( %sCTEntries[:] )", name) + size = int(reflect.TypeOf(ct).Size()) + return +} diff --git a/src/pkg/exp/locale/collate/build/contract_test.go b/src/pkg/exp/locale/collate/build/contract_test.go new file mode 100644 index 0000000000..ea5f3c077a --- /dev/null +++ b/src/pkg/exp/locale/collate/build/contract_test.go @@ -0,0 +1,264 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package build + +import ( + "bytes" + "sort" + "testing" +) + +var largetosmall = []stridx{ + {"a", 5}, + {"ab", 4}, + {"abc", 3}, + {"abcd", 2}, + {"abcde", 1}, + {"abcdef", 0}, +} + +var offsetSortTests = [][]stridx{ + []stridx{ + {"bcde", 1}, + {"bc", 5}, + {"ab", 4}, + {"bcd", 3}, + {"abcd", 0}, + {"abc", 2}, + }, + largetosmall, +} + +func TestOffsetSort(t *testing.T) { + for i, st := range offsetSortTests { + sort.Sort(offsetSort(st)) + for j, si := range st { + if j != si.index { + t.Errorf("%d: failed: %v", i, st) + } + } + } + for i, tt := range genStateTests { + // ensure input is well-formed + sort.Sort(offsetSort(tt.in)) + for j, si := range tt.in { + if si.index != j+1 { + t.Errorf("%dth sort failed: %v", i, tt.in) + } + } + } +} + +var genidxtest1 = []stridx{ + {"bcde", 3}, + {"bc", 6}, + {"ab", 2}, + {"bcd", 5}, + {"abcd", 0}, + {"abc", 1}, + {"bcdf", 4}, +} + +var genidxSortTests = [][]stridx{ + genidxtest1, + largetosmall, +} + +func TestGenIdxSort(t *testing.T) { + for i, st := range genidxSortTests { + sort.Sort(genidxSort(st)) + for j, si := range st { + if j != si.index { + t.Errorf("%dth sort failed %v", i, st) + break + } + } + } +} + +var entrySortTests = []contractTrieSet{ + contractTrieSet{ + {10, 0, 1, 3}, + {99, 0, 1, 0}, + {20, 50, 0, 2}, + {30, 0, 1, 1}, + }, +} + +func TestEntrySort(t *testing.T) { + for i, et := range entrySortTests { + sort.Sort(entrySort(et)) + for j, fe := range et { + if j != int(fe.i) { + t.Errorf("%dth sort failed %v", i, et) + break + } + } + } +} + +type GenStateTest struct { + in []stridx + firstBlockLen int + out contractTrieSet +} + +var genStateTests = []GenStateTest{ + {[]stridx{ + {"abc", 1}, + }, + 1, + contractTrieSet{ + {'a', 1, 1, 0xFF}, + {'b', 1, 1, 0xFF}, + {'c', 'c', 0, 1}, + }, + }, + {[]stridx{ + {"abc", 1}, + {"abd", 2}, + {"abe", 3}, + }, + 1, + contractTrieSet{ + {'a', 1, 1, 0xFF}, + {'b', 1, 1, 0xFF}, + {'c', 'e', 0, 1}, + }, + }, + {[]stridx{ + {"abc", 1}, + {"ab", 2}, + {"a", 3}, + }, + 1, + contractTrieSet{ + {'a', 1, 1, 3}, + {'b', 1, 1, 2}, + {'c', 'c', 0, 1}, + }, + }, + {[]stridx{ + {"abc", 1}, + {"abd", 2}, + {"ab", 3}, + {"ac", 4}, + {"a", 5}, + {"b", 6}, + }, + 2, + contractTrieSet{ + {'b', 'b', 0, 6}, + {'a', 2, 2, 5}, + {'c', 'c', 0, 4}, + {'b', 2, 1, 3}, + {'c', 'd', 0, 1}, + }, + }, + {[]stridx{ + {"bcde", 2}, + {"bc", 7}, + {"ab", 6}, + {"bcd", 5}, + {"abcd", 1}, + {"abc", 4}, + {"bcdf", 3}, + }, + 2, + contractTrieSet{ + {'b', 5, 1, 0xFF}, + {'a', 2, 1, 0xFF}, + {'b', 1, 1, 6}, + {'c', 1, 1, 4}, + {'d', 'd', 0, 1}, + {'c', 1, 1, 7}, + {'d', 1, 1, 5}, + {'e', 'f', 0, 2}, + }, + }, +} + +func TestGenStates(t *testing.T) { + for i, tt := range genStateTests { + si := []stridx{} + for _, e := range tt.in { + si = append(si, e) + } + // ensure input is well-formed + sort.Sort(genidxSort(si)) + ct := contractTrieSet{} + n, _ := ct.genStates(si) + if nn := tt.firstBlockLen; nn != n { + t.Errorf("%d: block len %v; want %v", i, n, nn) + } + if lv, lw := len(ct), len(tt.out); lv != lw { + t.Errorf("%d: len %v; want %v", i, lv, lw) + continue + } + for j, fe := range tt.out { + const msg = "%d:%d: value %s=%v; want %v" + if fe.l != ct[j].l { + t.Errorf(msg, i, j, "l", ct[j].l, fe.l) + } + if fe.h != ct[j].h { + t.Errorf(msg, i, j, "h", ct[j].h, fe.h) + } + if fe.n != ct[j].n { + t.Errorf(msg, i, j, "n", ct[j].n, fe.n) + } + if fe.i != ct[j].i { + t.Errorf(msg, i, j, "i", ct[j].i, fe.i) + } + } + } +} + +func TestLookupContraction(t *testing.T) { + for i, tt := range genStateTests { + input := []string{} + for _, e := range tt.in { + input = append(input, e.str) + } + cts := contractTrieSet{} + h, _ := cts.appendTrie(input) + for j, si := range tt.in { + str := si.str + for _, s := range []string{str, str + "X"} { + msg := "%d:%d: %s(%s) %v; want %v" + idx, sn := cts.lookup(h, []byte(s)) + if idx != si.index { + t.Errorf(msg, i, j, "index", s, idx, si.index) + } + if sn != len(str) { + t.Errorf(msg, i, j, "sn", s, sn, len(str)) + } + } + } + } +} + +func TestPrintContractionTrieSet(t *testing.T) { + testdata := contractTrieSet(genStateTests[4].out) + buf := &bytes.Buffer{} + testdata.print(buf, "test") + if contractTrieOutput != buf.String() { + t.Errorf("output differs; found\n%s", buf.String()) + println(string(buf.Bytes())) + } +} + +const contractTrieOutput = `// testCTEntries: 8 entries, 32 bytes +var testCTEntries = [8]struct{l,h,n,i uint8}{ + {0x62, 0x5, 1, 255}, + {0x61, 0x2, 1, 255}, + {0x62, 0x1, 1, 6}, + {0x63, 0x1, 1, 4}, + {0x64, 0x64, 0, 1}, + {0x63, 0x1, 1, 7}, + {0x64, 0x1, 1, 5}, + {0x65, 0x66, 0, 2}, +} +var testContractTrieSet = contractTrieSet( testCTEntries[:] ) +` diff --git a/src/pkg/exp/locale/collate/contract.go b/src/pkg/exp/locale/collate/contract.go new file mode 100644 index 0000000000..28b9a04aca --- /dev/null +++ b/src/pkg/exp/locale/collate/contract.go @@ -0,0 +1,81 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package collate + +import "unicode/utf8" + +// For a description of contractTrieSet, see exp/locale/collate/build/contract.go. + +type contractTrieSet []struct{ l, h, n, i uint8 } + +// ctScanner is used to match a trie to an input sequence. +// A contraction may match a non-contiguous sequence of bytes in an input string. +// For example, if there is a contraction for , it should match +// the sequence , as combining_cedilla does +// not block combining_ring. +// ctScanner does not automatically skip over non-blocking non-starters, but rather +// retains the state of the last match and leaves it up to the user to continue +// the match at the appropriate points. +type ctScanner struct { + states contractTrieSet + s []byte + n int + index int + pindex int + done bool +} + +func (t contractTrieSet) scanner(index, n int, b []byte) ctScanner { + return ctScanner{states: t[index:], s: b, n: n} +} + +// result returns the offset i and bytes consumed p so far. If no suffix +// matched, i and p will be 0. +func (s *ctScanner) result() (i, p int) { + return s.index, s.pindex +} + +// scan matches the longest suffix at the current location in the input +// and returns the number of bytes consumed. +func (s *ctScanner) scan(p int) int { + pr := p // the p at the rune start + str := s.s + states, n := s.states, s.n + for i := 0; i < n && p < len(str); { + e := states[i] + c := str[p] + // TODO: a significant number of contractions are of a form that + // cannot match discontiguous UTF-8 in a normalized string. We could let + // a negative value of e.n mean that we can set s.done = true and avoid + // the need for additional matches. + if c >= e.l { + if e.l == c { + p++ + if e.i != 0xFF { + s.index = int(e.i) + s.pindex = p + } + if e.n != 0 { + i, states, n = 0, states[e.h:], int(e.n) + if p >= len(str) || utf8.RuneStart(str[p]) { + s.states, s.n, pr = states, n, p + } + } else { + s.done = true + return p + } + continue + } else if e.n == 0 && c <= e.h { + p++ + s.done = true + s.index = int(c-e.l) + int(e.i) + s.pindex = p + return p + } + } + i++ + } + return pr +} diff --git a/src/pkg/exp/locale/collate/contract_test.go b/src/pkg/exp/locale/collate/contract_test.go new file mode 100644 index 0000000000..fd94d9c5c0 --- /dev/null +++ b/src/pkg/exp/locale/collate/contract_test.go @@ -0,0 +1,132 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package collate + +import ( + "testing" +) + +type lookupStrings struct { + str string + offset int + n int // bytes consumed from input +} + +type LookupTest struct { + lookup []lookupStrings + n int + tries contractTrieSet +} + +var lookupTests = []LookupTest{ + {[]lookupStrings{ + {"abc", 1, 3}, + {"a", 0, 0}, + {"b", 0, 0}, + {"c", 0, 0}, + {"d", 0, 0}, + }, + 1, + contractTrieSet{ + {'a', 1, 1, 0xFF}, + {'b', 1, 1, 0xFF}, + {'c', 'c', 0, 1}, + }, + }, + {[]lookupStrings{ + {"abc", 1, 3}, + {"abd", 2, 3}, + {"abe", 3, 3}, + {"a", 0, 0}, + {"ab", 0, 0}, + {"d", 0, 0}, + {"f", 0, 0}, + }, + 1, + contractTrieSet{ + {'a', 1, 1, 0xFF}, + {'b', 1, 1, 0xFF}, + {'c', 'e', 0, 1}, + }, + }, + {[]lookupStrings{ + {"abc", 1, 3}, + {"ab", 2, 2}, + {"a", 3, 1}, + {"abcd", 1, 3}, + {"abe", 2, 2}, + }, + 1, + contractTrieSet{ + {'a', 1, 1, 3}, + {'b', 1, 1, 2}, + {'c', 'c', 0, 1}, + }, + }, + {[]lookupStrings{ + {"abc", 1, 3}, + {"abd", 2, 3}, + {"ab", 3, 2}, + {"ac", 4, 2}, + {"a", 5, 1}, + {"b", 6, 1}, + {"ba", 6, 1}, + }, + 2, + contractTrieSet{ + {'b', 'b', 0, 6}, + {'a', 2, 2, 5}, + {'c', 'c', 0, 4}, + {'b', 2, 1, 3}, + {'c', 'd', 0, 1}, + }, + }, + {[]lookupStrings{ + {"bcde", 2, 4}, + {"bc", 7, 2}, + {"ab", 6, 2}, + {"bcd", 5, 3}, + {"abcd", 1, 4}, + {"abc", 4, 3}, + {"bcdf", 3, 4}, + }, + 2, + contractTrieSet{ + {'b', 5, 1, 0xFF}, + {'a', 2, 1, 0xFF}, + {'b', 1, 1, 6}, + {'c', 1, 1, 4}, + {'d', 'd', 0, 1}, + {'c', 1, 1, 7}, + {'d', 1, 1, 5}, + {'e', 'f', 0, 2}, + }, + }, +} + +func lookup(c *contractTrieSet, nnode int, s []uint8) (i, n int) { + scan := c.scanner(0, nnode, s) + scan.scan(0) + return scan.result() +} + +func TestLookupContraction(t *testing.T) { + for i, tt := range lookupTests { + cts := contractTrieSet(tt.tries) + for j, lu := range tt.lookup { + str := lu.str + for _, s := range []string{str, str + "X"} { + const msg = `%d:%d: %s of "%s" %v; want %v` + offset, n := lookup(&cts, tt.n, []byte(s)) + if offset != lu.offset { + t.Errorf(msg, i, j, "offset", s, offset, lu.offset) + } + if n != lu.n { + t.Errorf(msg, i, j, "bytes consumed", s, n, len(str)) + } + } + } + } +}