diff --git a/src/pkg/exp/locale/collate/build/trie.go b/src/pkg/exp/locale/collate/build/trie.go new file mode 100644 index 0000000000..894d29305e --- /dev/null +++ b/src/pkg/exp/locale/collate/build/trie.go @@ -0,0 +1,265 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The trie in this file is used to associate the first full character +// in a UTF-8 string to a collation element. +// All but the last byte in a UTF-8 byte sequence are +// used to look up offsets in the index table to be used for the next byte. +// The last byte is used to index into a table of collation elements. +// This file contains the code for the generation of the trie. + +package build + +import ( + "fmt" + "hash/fnv" + "io" + "log" + "reflect" +) + +const blockSize = 64 + +type trie struct { + index []uint16 + values []uint32 +} + +// trieNode is the intermediate trie structure used for generating a trie. +type trieNode struct { + table [256]*trieNode + value int64 + b byte + leaf bool +} + +func newNode() *trieNode { + return new(trieNode) +} + +func (n *trieNode) isInternal() bool { + internal := true + for i := 0; i < 256; i++ { + if nn := n.table[i]; nn != nil { + if !internal && !nn.leaf { + log.Fatalf("trie:isInternal: node contains both leaf and non-leaf children (%v)", n) + } + internal = internal && !nn.leaf + } + } + return internal +} + +func (n *trieNode) insert(r rune, value uint32) { + for _, b := range []byte(string(r)) { + if n.leaf { + log.Fatalf("trie:insert: node (%#v) should not be a leaf", n) + } + nn := n.table[b] + if nn == nil { + nn = newNode() + nn.b = b + n.table[b] = nn + } + n = nn + } + n.value = int64(value) + n.leaf = true +} + +type nodeIndex struct { + lookupBlocks []*trieNode + valueBlocks []*trieNode + + lookupBlockIdx map[uint32]int64 + valueBlockIdx map[uint32]int64 +} + +func newIndex() *nodeIndex { + index := &nodeIndex{} + index.lookupBlocks = make([]*trieNode, 0) + index.valueBlocks = make([]*trieNode, 0) + index.lookupBlockIdx = make(map[uint32]int64) + index.valueBlockIdx = make(map[uint32]int64) + return index +} + +func computeOffsets(index *nodeIndex, n *trieNode) int64 { + if n.leaf { + return n.value + } + hasher := fnv.New32() + // We only index continuation bytes. + for i := 0; i < blockSize; i++ { + v := int64(0) + if nn := n.table[0x80+i]; nn != nil { + v = computeOffsets(index, nn) + } + hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}) + } + h := hasher.Sum32() + if n.isInternal() { + v, ok := index.lookupBlockIdx[h] + if !ok { + v = int64(len(index.lookupBlocks)) + index.lookupBlocks = append(index.lookupBlocks, n) + index.lookupBlockIdx[h] = v + } + n.value = v + } else { + v, ok := index.valueBlockIdx[h] + if !ok { + v = int64(len(index.valueBlocks)) + index.valueBlocks = append(index.valueBlocks, n) + index.valueBlockIdx[h] = v + } + n.value = v + } + return n.value +} + +func genValueBlock(t *trie, n *trieNode, offset int) error { + for i := 0; i < blockSize; i++ { + v := int64(0) + if nn := n.table[i+offset]; nn != nil { + v = nn.value + } + if v >= 1<<32 { + return fmt.Errorf("value %d at index %d does not fit in uint32", v, len(t.values)) + } + t.values = append(t.values, uint32(v)) + } + return nil +} + +func genLookupBlock(t *trie, n *trieNode, offset int) error { + for i := 0; i < blockSize; i++ { + v := int64(0) + if nn := n.table[i+offset]; nn != nil { + v = nn.value + } + if v >= 1<<16 { + return fmt.Errorf("value %d at index %d does not fit in uint16", v, len(t.index)) + } + t.index = append(t.index, uint16(v)) + } + return nil +} + +// generate generates and returns the trie for n. +func (n *trieNode) generate() (t *trie, err error) { + seterr := func(e error) { + if err == nil { + err = e + } + } + index := newIndex() + // Values for 7-bit ASCII are stored in the first of two blocks, followed by a nil block. + index.valueBlocks = append(index.valueBlocks, nil, nil, nil) + // First byte of multi-byte UTF-8 codepoints are indexed in 4th block. + index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil, nil) + // Index starter bytes of multi-byte UTF-8. + for i := 0xC0; i < 0x100; i++ { + if n.table[i] != nil { + computeOffsets(index, n.table[i]) + } + } + t = &trie{} + seterr(genValueBlock(t, n, 0)) + seterr(genValueBlock(t, n, 64)) + seterr(genValueBlock(t, newNode(), 0)) + for i := 3; i < len(index.valueBlocks); i++ { + seterr(genValueBlock(t, index.valueBlocks[i], 0x80)) + } + if len(index.valueBlocks) >= 1<<16 { + seterr(fmt.Errorf("maximum number of value blocks exceeded (%d > %d)", len(index.valueBlocks), 1<<16)) + return + } + seterr(genLookupBlock(t, newNode(), 0)) + seterr(genLookupBlock(t, newNode(), 0)) + seterr(genLookupBlock(t, newNode(), 0)) + seterr(genLookupBlock(t, n, 0xC0)) + for i := 4; i < len(index.lookupBlocks); i++ { + seterr(genLookupBlock(t, index.lookupBlocks[i], 0x80)) + } + return +} + +// print writes a compilable trie to w. It returns the number of characters +// printed and the size of the generated structure in bytes. +func (t *trie) print(w io.Writer, name string) (n, size int, err error) { + update3 := func(nn, sz int, e error) { + n += nn + if err == nil { + err = e + } + size += sz + } + update2 := func(nn int, e error) { update3(nn, 0, e) } + + update3(t.printArrays(w, name)) + update2(fmt.Fprintf(w, "var %sTrie = ", name)) + update3(t.printStruct(w, name)) + update2(fmt.Fprintln(w)) + return +} + +func (t *trie) printArrays(w io.Writer, name string) (n, size int, err error) { + p := func(f string, a ...interface{}) { + nn, e := fmt.Fprintf(w, f, a...) + n += nn + if err == nil { + err = e + } + } + nv := len(t.values) + p("// %sValues: %d entries, %d bytes\n", name, nv, nv*4) + p("// Block 2 is the null block.\n") + p("var %sValues = [%d]uint32 {", name, nv) + var printnewline bool + for i, v := range t.values { + if i%blockSize == 0 { + p("\n\t// Block %#x, offset %#x", i/blockSize, i) + } + if i%4 == 0 { + printnewline = true + } + if v != 0 { + if printnewline { + p("\n\t") + printnewline = false + } + p("%#04x:%#08x, ", i, v) + } + } + p("\n}\n\n") + ni := len(t.index) + p("// %sLookup: %d entries, %d bytes\n", name, ni, ni*2) + p("// Block 0 is the null block.\n") + p("var %sLookup = [%d]uint16 {", name, ni) + printnewline = false + for i, v := range t.index { + if i%blockSize == 0 { + p("\n\t// Block %#x, offset %#x", i/blockSize, i) + } + if i%8 == 0 { + printnewline = true + } + if v != 0 { + if printnewline { + p("\n\t") + printnewline = false + } + p("%#03x:%#02x, ", i, v) + } + } + p("\n}\n\n") + return n, nv*4 + ni*2, err +} + +func (t *trie) printStruct(w io.Writer, name string) (n, sz int, err error) { + n, err = fmt.Fprintf(w, "trie{ %sLookup[:], %sValues[:]}", name, name) + sz += int(reflect.TypeOf(trie{}).Size()) + return +} diff --git a/src/pkg/exp/locale/collate/build/trie_test.go b/src/pkg/exp/locale/collate/build/trie_test.go new file mode 100644 index 0000000000..c530bb92f9 --- /dev/null +++ b/src/pkg/exp/locale/collate/build/trie_test.go @@ -0,0 +1,103 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package build + +import ( + "bytes" + "testing" +) + +// We take the smallest, largest and an arbitrary value for each +// of the UTF-8 sequence lengths. +var testRunes = []rune{ + 0x01, 0x0C, 0x7F, // 1-byte sequences + 0x80, 0x100, 0x7FF, // 2-byte sequences + 0x800, 0x999, 0xFFFF, // 3-byte sequences + 0x10000, 0x10101, 0x10FFFF, // 4-byte sequences + 0x200, 0x201, 0x202, 0x210, 0x215, // five entries in one sparse block +} + +func makeTestTrie(t *testing.T) trie { + n := newNode() + for i, r := range testRunes { + n.insert(r, uint32(i)) + } + tr, err := n.generate() + if err != nil { + t.Errorf(err.Error()) + } + return *tr +} + +func TestGenerateTrie(t *testing.T) { + testdata := makeTestTrie(t) + buf := &bytes.Buffer{} + testdata.print(buf, "test") + if output != buf.String() { + t.Errorf("output differs") + } +} + +var output = `// testValues: 832 entries, 3328 bytes +// Block 2 is the null block. +var testValues = [832]uint32 { + // Block 0x0, offset 0x0 + 0x000c:0x00000001, + // Block 0x1, offset 0x40 + 0x007f:0x00000002, + // Block 0x2, offset 0x80 + // Block 0x3, offset 0xc0 + 0x00c0:0x00000003, + // Block 0x4, offset 0x100 + 0x0100:0x00000004, + // Block 0x5, offset 0x140 + 0x0140:0x0000000c, 0x0141:0x0000000d, 0x0142:0x0000000e, + 0x0150:0x0000000f, + 0x0155:0x00000010, + // Block 0x6, offset 0x180 + 0x01bf:0x00000005, + // Block 0x7, offset 0x1c0 + 0x01c0:0x00000006, + // Block 0x8, offset 0x200 + 0x0219:0x00000007, + // Block 0x9, offset 0x240 + 0x027f:0x00000008, + // Block 0xa, offset 0x280 + 0x0280:0x00000009, + // Block 0xb, offset 0x2c0 + 0x02c1:0x0000000a, + // Block 0xc, offset 0x300 + 0x033f:0x0000000b, +} + +// testLookup: 640 entries, 1280 bytes +// Block 0 is the null block. +var testLookup = [640]uint16 { + // Block 0x0, offset 0x0 + // Block 0x1, offset 0x40 + // Block 0x2, offset 0x80 + // Block 0x3, offset 0xc0 + 0x0c2:0x03, 0x0c4:0x04, + 0x0c8:0x05, + 0x0df:0x06, + 0x0e0:0x04, + 0x0ef:0x05, + 0x0f0:0x07, 0x0f4:0x09, + // Block 0x4, offset 0x100 + 0x120:0x07, 0x126:0x08, + // Block 0x5, offset 0x140 + 0x17f:0x09, + // Block 0x6, offset 0x180 + 0x180:0x0a, 0x184:0x0b, + // Block 0x7, offset 0x1c0 + 0x1d0:0x06, + // Block 0x8, offset 0x200 + 0x23f:0x0c, + // Block 0x9, offset 0x240 + 0x24f:0x08, +} + +var testTrie = trie{ testLookup[:], testValues[:]} +` diff --git a/src/pkg/exp/locale/collate/trie.go b/src/pkg/exp/locale/collate/trie.go new file mode 100644 index 0000000000..ea0396085d --- /dev/null +++ b/src/pkg/exp/locale/collate/trie.go @@ -0,0 +1,102 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The trie in this file is used to associate the first full character +// in an UTF-8 string to a collation element. +// All but the last byte in a UTF-8 byte sequence are +// used to lookup offsets in the index table to be used for the next byte. +// The last byte is used to index into a table of collation elements. +// For a full description, see exp/locale/collate/build/trie.go. + +package collate + +const blockSize = 64 + +type trie struct { + index []uint16 + values []uint32 +} + +const ( + t1 = 0x00 // 0000 0000 + tx = 0x80 // 1000 0000 + t2 = 0xC0 // 1100 0000 + t3 = 0xE0 // 1110 0000 + t4 = 0xF0 // 1111 0000 + t5 = 0xF8 // 1111 1000 + t6 = 0xFC // 1111 1100 + te = 0xFE // 1111 1110 + + maskx = 0x3F // 0011 1111 + mask2 = 0x1F // 0001 1111 + mask3 = 0x0F // 0000 1111 + mask4 = 0x07 // 0000 0111 +) + +func (t *trie) lookupValue(n uint16, b byte) colElem { + return colElem(t.values[int(n)<<6+int(b&maskx)]) +} + +// lookup returns the trie value for the first UTF-8 encoding in s and +// the width in bytes of this encoding. The size will be 0 if s does not +// hold enough bytes to complete the encoding. len(s) must be greater than 0. +func (t *trie) lookup(s []byte) (v colElem, sz int) { + c0 := s[0] + switch { + case c0 < tx: + return colElem(t.values[c0]), 1 + case c0 < t2: + return 0, 1 + case c0 < t3: + if len(s) < 2 { + return 0, 0 + } + i := t.index[c0] + c1 := s[1] + if c1 < tx || t2 <= c1 { + return 0, 1 + } + return t.lookupValue(i, c1), 2 + case c0 < t4: + if len(s) < 3 { + return 0, 0 + } + i := t.index[c0] + c1 := s[1] + if c1 < tx || t2 <= c1 { + return 0, 1 + } + o := int(i)<<6 + int(c1)&maskx + i = t.index[o] + c2 := s[2] + if c2 < tx || t2 <= c2 { + return 0, 2 + } + return t.lookupValue(i, c2), 3 + case c0 < t5: + if len(s) < 4 { + return 0, 0 + } + i := t.index[c0] + c1 := s[1] + if c1 < tx || t2 <= c1 { + return 0, 1 + } + o := int(i)<<6 + int(c1)&maskx + i = t.index[o] + c2 := s[2] + if c2 < tx || t2 <= c2 { + return 0, 2 + } + o = int(i)<<6 + int(c2)&maskx + i = t.index[o] + c3 := s[3] + if c3 < tx || t2 <= c3 { + return 0, 3 + } + return t.lookupValue(i, c3), 4 + } + // Illegal rune + return 0, 1 +} diff --git a/src/pkg/exp/locale/collate/trie_test.go b/src/pkg/exp/locale/collate/trie_test.go new file mode 100644 index 0000000000..b5868cad14 --- /dev/null +++ b/src/pkg/exp/locale/collate/trie_test.go @@ -0,0 +1,106 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package collate + +import ( + "testing" +) + +// We take the smallest, largest and an arbitrary value for each +// of the UTF-8 sequence lengths. +var testRunes = []rune{ + 0x01, 0x0C, 0x7F, // 1-byte sequences + 0x80, 0x100, 0x7FF, // 2-byte sequences + 0x800, 0x999, 0xFFFF, // 3-byte sequences + 0x10000, 0x10101, 0x10FFFF, // 4-byte sequences + 0x200, 0x201, 0x202, 0x210, 0x215, // five entries in one sparse block +} + +// Test cases for illegal runes. +type trietest struct { + size int + bytes []byte +} + +var tests = []trietest{ + // illegal runes + {1, []byte{0x80}}, + {1, []byte{0xFF}}, + {1, []byte{t2, tx - 1}}, + {1, []byte{t2, t2}}, + {2, []byte{t3, tx, tx - 1}}, + {2, []byte{t3, tx, t2}}, + {1, []byte{t3, tx - 1, tx}}, + {3, []byte{t4, tx, tx, tx - 1}}, + {3, []byte{t4, tx, tx, t2}}, + {1, []byte{t4, t2, tx, tx - 1}}, + {2, []byte{t4, tx, t2, tx - 1}}, + + // short runes + {0, []byte{t2}}, + {0, []byte{t3, tx}}, + {0, []byte{t4, tx, tx}}, + + // we only support UTF-8 up to utf8.UTFMax bytes (4 bytes) + {1, []byte{t5, tx, tx, tx, tx}}, + {1, []byte{t6, tx, tx, tx, tx, tx}}, +} + +func TestLookupTrie(t *testing.T) { + for i, r := range testRunes { + b := []byte(string(r)) + v, sz := testTrie.lookup(b) + if int(v) != i { + t.Errorf("lookup(%U): found value %#x, expected %#x", r, v, i) + } + if sz != len(b) { + t.Errorf("lookup(%U): found size %d, expected %d", r, sz, len(b)) + } + } + for i, tt := range tests { + v, sz := testTrie.lookup(tt.bytes) + if int(v) != 0 { + t.Errorf("lookup of illegal rune, case %d: found value %#x, expected 0", i, v) + } + if sz != tt.size { + t.Errorf("lookup of illegal rune, case %d: found size %d, expected %d", i, sz, tt.size) + } + } +} + +// test data is taken from exp/collate/locale/build/trie_test.go +var testValues = [832]uint32{ + 0x000c: 0x00000001, + 0x007f: 0x00000002, + 0x00c0: 0x00000003, + 0x0100: 0x00000004, + 0x0140: 0x0000000c, 0x0141: 0x0000000d, 0x0142: 0x0000000e, + 0x0150: 0x0000000f, + 0x0155: 0x00000010, + 0x01bf: 0x00000005, + 0x01c0: 0x00000006, + 0x0219: 0x00000007, + 0x027f: 0x00000008, + 0x0280: 0x00000009, + 0x02c1: 0x0000000a, + 0x033f: 0x0000000b, +} + +var testLookup = [640]uint16{ + 0x0c2: 0x03, 0x0c4: 0x04, + 0x0c8: 0x05, + 0x0df: 0x06, + 0x0e0: 0x04, + 0x0ef: 0x05, + 0x0f0: 0x07, 0x0f4: 0x09, + 0x120: 0x07, 0x126: 0x08, + 0x17f: 0x09, + 0x180: 0x0a, 0x184: 0x0b, + 0x1d0: 0x06, + 0x23f: 0x0c, + 0x24f: 0x08, +} + +var testTrie = trie{testLookup[:], testValues[:]}