1
0
mirror of https://github.com/golang/go synced 2024-10-02 10:18:33 -06:00

exp/norm: merged charinfo and decomposition tables. As a result only

one trie lookup per rune is needed. See forminfo.go for a description
of the new format.  Also included leading and trailing canonical
combining class in decomposition information.  This will often avoid
additional trie lookups.

R=r, r
CC=golang-dev
https://golang.org/cl/5616071
This commit is contained in:
Marcel van Lohuizen 2012-02-13 14:54:46 +01:00
parent 7bd6ebb104
commit a52fb458df
6 changed files with 5527 additions and 5568 deletions

View File

@ -102,7 +102,7 @@ func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
} }
} }
if info.hasDecomposition() { if info.hasDecomposition() {
dcomp := rb.f.decompose(src, i) dcomp := info.decomposition()
rb.tmpBytes = inputBytes(dcomp) rb.tmpBytes = inputBytes(dcomp)
for i := 0; i < len(dcomp); { for i := 0; i < len(dcomp); {
info = rb.f.info(&rb.tmpBytes, i) info = rb.f.info(&rb.tmpBytes, i)

View File

@ -6,25 +6,50 @@ package norm
// This file contains Form-specific logic and wrappers for data in tables.go. // This file contains Form-specific logic and wrappers for data in tables.go.
// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
// bits
// 0..7: ccc
// 8..11: qcInfo (see below). isYesD is always true (no decomposition).
// 15: 1
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
// <header> <decomp_byte>* [<tccc> [<lccc>]]
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 2 and 3 of qcInfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
// there is an additional leading ccc.
const (
qcInfoMask = 0xF // to clear all but the relevant bits in a qcInfo
headerLenMask = 0x3F // extract the length value from the header byte
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
// runeInfo is a representation for the data stored in charinfoTrie.
type runeInfo struct { type runeInfo struct {
pos uint8 // start position in reorderBuffer; used in composition.go pos uint8 // start position in reorderBuffer; used in composition.go
size uint8 // length of UTF-8 encoding of this rune size uint8 // length of UTF-8 encoding of this rune
ccc uint8 // canonical combining class ccc uint8 // leading canonical combining class (ccc if not decomposition)
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
flags qcInfo // quick check flags flags qcInfo // quick check flags
index uint16
} }
// functions dispatchable per form // functions dispatchable per form
type lookupFunc func(b input, i int) runeInfo type lookupFunc func(b input, i int) runeInfo
type decompFunc func(b input, i int) []byte
// formInfo holds Form-specific functions and tables. // formInfo holds Form-specific functions and tables.
type formInfo struct { type formInfo struct {
form Form form Form
composing, compatibility bool // form type composing, compatibility bool // form type
info lookupFunc
decompose decompFunc
info lookupFunc
} }
var formTable []*formInfo var formTable []*formInfo
@ -38,10 +63,8 @@ func init() {
f.form = Form(i) f.form = Form(i)
if Form(i) == NFKD || Form(i) == NFKC { if Form(i) == NFKD || Form(i) == NFKC {
f.compatibility = true f.compatibility = true
f.decompose = decomposeNFKC
f.info = lookupInfoNFKC f.info = lookupInfoNFKC
} else { } else {
f.decompose = decomposeNFC
f.info = lookupInfoNFC f.info = lookupInfoNFC
} }
if Form(i) == NFC || Form(i) == NFKC { if Form(i) == NFC || Form(i) == NFKC {
@ -76,8 +99,6 @@ func (i runeInfo) boundaryAfter() bool {
// //
// When all 4 bits are zero, the character is inert, meaning it is never // When all 4 bits are zero, the character is inert, meaning it is never
// influenced by normalization. // influenced by normalization.
//
// We pack the bits for both NFC/D and NFKC/D in one byte.
type qcInfo uint8 type qcInfo uint8
func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 } func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 }
@ -91,22 +112,12 @@ func (r runeInfo) isInert() bool {
return r.flags&0xf == 0 && r.ccc == 0 return r.flags&0xf == 0 && r.ccc == 0
} }
// Wrappers for tables.go func (r runeInfo) decomposition() []byte {
if r.index == 0 {
// The 16-bit value of the decomposition tries is an index into a byte return nil
// array of UTF-8 decomposition sequences. The first byte is the number }
// of bytes in the decomposition (excluding this length byte). The actual p := r.index
// sequence starts at the offset+1. n := decomps[p] & 0x3F
func decomposeNFC(s input, i int) []byte {
p := s.decomposeNFC(i)
n := decomps[p]
p++
return decomps[p : p+uint16(n)]
}
func decomposeNFKC(s input, i int) []byte {
p := s.decomposeNFKC(i)
n := decomps[p]
p++ p++
return decomps[p : p+uint16(n)] return decomps[p : p+uint16(n)]
} }
@ -124,16 +135,40 @@ func combine(a, b rune) rune {
return recompMap[key] return recompMap[key]
} }
// The 16-bit character info has the following bit layout:
// 0..7 CCC value.
// 8..11 qcInfo for NFC/NFD
// 12..15 qcInfo for NFKC/NFKD
func lookupInfoNFC(b input, i int) runeInfo { func lookupInfoNFC(b input, i int) runeInfo {
v, sz := b.charinfo(i) v, sz := b.charinfoNFC(i)
return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 8)} return compInfo(v, sz)
} }
func lookupInfoNFKC(b input, i int) runeInfo { func lookupInfoNFKC(b input, i int) runeInfo {
v, sz := b.charinfo(i) v, sz := b.charinfoNFKC(i)
return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 12)} return compInfo(v, sz)
}
// compInfo converts the information contained in v and sz
// to a runeInfo. See the comment at the top of the file
// for more information on the format.
func compInfo(v uint16, sz int) runeInfo {
if v == 0 {
return runeInfo{size: uint8(sz)}
} else if v >= 0x8000 {
return runeInfo{
size: uint8(sz),
ccc: uint8(v),
tccc: uint8(v),
flags: qcInfo(v>>8) & qcInfoMask,
}
}
// has decomposition
h := decomps[v]
f := (qcInfo(h&headerFlagsMask) >> 4) | 0x1
ri := runeInfo{size: uint8(sz), flags: f, index: v}
if v >= firstCCC {
v += uint16(h&headerLenMask) + 1
ri.tccc = decomps[v]
if v >= firstLeadingCCC {
ri.ccc = decomps[v+1]
}
}
return ri
} }

View File

@ -11,9 +11,8 @@ type input interface {
skipNonStarter(p int) int skipNonStarter(p int) int
appendSlice(buf []byte, s, e int) []byte appendSlice(buf []byte, s, e int) []byte
copySlice(buf []byte, s, e int) copySlice(buf []byte, s, e int)
charinfo(p int) (uint16, int) charinfoNFC(p int) (uint16, int)
decomposeNFC(p int) uint16 charinfoNFKC(p int) (uint16, int)
decomposeNFKC(p int) uint16
hangul(p int) rune hangul(p int) rune
} }
@ -42,16 +41,12 @@ func (s inputString) copySlice(buf []byte, b, e int) {
copy(buf, s[b:e]) copy(buf, s[b:e])
} }
func (s inputString) charinfo(p int) (uint16, int) { func (s inputString) charinfoNFC(p int) (uint16, int) {
return charInfoTrie.lookupString(string(s[p:])) return nfcTrie.lookupString(string(s[p:]))
} }
func (s inputString) decomposeNFC(p int) uint16 { func (s inputString) charinfoNFKC(p int) (uint16, int) {
return nfcDecompTrie.lookupStringUnsafe(string(s[p:])) return nfkcTrie.lookupString(string(s[p:]))
}
func (s inputString) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupStringUnsafe(string(s[p:]))
} }
func (s inputString) hangul(p int) rune { func (s inputString) hangul(p int) rune {
@ -84,16 +79,12 @@ func (s inputBytes) copySlice(buf []byte, b, e int) {
copy(buf, s[b:e]) copy(buf, s[b:e])
} }
func (s inputBytes) charinfo(p int) (uint16, int) { func (s inputBytes) charinfoNFC(p int) (uint16, int) {
return charInfoTrie.lookup(s[p:]) return nfcTrie.lookup(s[p:])
} }
func (s inputBytes) decomposeNFC(p int) uint16 { func (s inputBytes) charinfoNFKC(p int) (uint16, int) {
return nfcDecompTrie.lookupUnsafe(s[p:]) return nfkcTrie.lookup(s[p:])
}
func (s inputBytes) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupUnsafe(s[p:])
} }
func (s inputBytes) hangul(p int) rune { func (s inputBytes) hangul(p int) rune {

View File

@ -4,6 +4,7 @@
// Normalization table generator. // Normalization table generator.
// Data read from the web. // Data read from the web.
// See forminfo.go for a description of the trie values associated with each rune.
package main package main
@ -17,6 +18,7 @@ import (
"net/http" "net/http"
"os" "os"
"regexp" "regexp"
"sort"
"strconv" "strconv"
"strings" "strings"
) )
@ -187,18 +189,14 @@ func (f FormInfo) String() string {
fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward) fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward)
fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay) fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay)
fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp) fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp)
fmt.Fprintf(buf, " decomposition: %v\n", f.decomp) fmt.Fprintf(buf, " decomposition: %X\n", f.decomp)
fmt.Fprintf(buf, " expandedDecomp: %v\n", f.expandedDecomp) fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp)
return buf.String() return buf.String()
} }
type Decomposition []rune type Decomposition []rune
func (d Decomposition) String() string {
return fmt.Sprintf("%.4X", d)
}
func openReader(file string) (input io.ReadCloser) { func openReader(file string) (input io.ReadCloser) {
if *localFiles { if *localFiles {
f, err := os.Open(file) f, err := os.Open(file)
@ -571,80 +569,121 @@ func makeEntry(f *FormInfo) uint16 {
return e return e
} }
// Bits // decompSet keeps track of unique decompositions, grouped by whether
// 0..8: CCC // the decomposition is followed by a trailing and/or leading CCC.
// 9..12: NF(C|D) qc bits. type decompSet [4]map[string]bool
// 13..16: NFK(C|D) qc bits.
func makeCharInfo(c Char) uint16 { func makeDecompSet() decompSet {
e := makeEntry(&c.forms[FCompatibility]) m := decompSet{}
e = e<<4 | makeEntry(&c.forms[FCanonical]) for i, _ := range m {
e = e<<8 | uint16(c.ccc) m[i] = make(map[string]bool)
return e }
return m
}
func (m *decompSet) insert(key int, s string) {
m[key][s] = true
} }
func printCharInfoTables() int { func printCharInfoTables() int {
// Quick Check + CCC trie. mkstr := func(r rune, f *FormInfo) (int, string) {
t := newNode() d := f.expandedDecomp
for i, char := range chars { s := string([]rune(d))
v := makeCharInfo(char) if max := 1 << 6; len(s) >= max {
if v != 0 { const msg = "%U: too many bytes in decomposition: %d >= %d"
t.insert(rune(i), v) logger.Fatalf(msg, r, len(s), max)
} }
head := uint8(len(s))
if f.quickCheck[MComposed] != QCYes {
head |= 0x40
}
if f.combinesForward {
head |= 0x80
}
s = string([]byte{head}) + s
lccc := ccc(d[0])
tccc := ccc(d[len(d)-1])
if tccc < lccc && lccc != 0 {
const msg = "%U: lccc (%d) must be <= tcc (%d)"
logger.Fatalf(msg, r, lccc, tccc)
}
index := 0
if tccc > 0 || lccc > 0 {
s += string([]byte{tccc})
index = 1
if lccc > 0 {
s += string([]byte{lccc})
index |= 2
}
}
return index, s
} }
return t.printTables("charInfo")
}
func printDecompositionTables() int { decompSet := makeDecompSet()
decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
size := 0
// Map decompositions
positionMap := make(map[string]uint16)
// Store the uniqued decompositions in a byte buffer, // Store the uniqued decompositions in a byte buffer,
// preceded by their byte length. // preceded by their byte length.
for _, c := range chars { for _, c := range chars {
for f := 0; f < 2; f++ { for _, f := range c.forms {
d := c.forms[f].expandedDecomp if len(f.expandedDecomp) == 0 {
s := string([]rune(d)) continue
if _, ok := positionMap[s]; !ok {
p := decompositions.Len()
decompositions.WriteByte(uint8(len(s)))
decompositions.WriteString(s)
positionMap[s] = uint16(p)
} }
if f.combinesBackward {
logger.Fatalf("%U: combinesBackward and decompose", c.codePoint)
}
index, s := mkstr(c.codePoint, &f)
decompSet.insert(index, s)
} }
} }
decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
size := 0
positionMap := make(map[string]uint16)
decompositions.WriteString("\000")
cname := []string{"firstCCC", "firstLeadingCCC", "", "lastDecomp"}
fmt.Println("const (")
for i, m := range decompSet {
sa := []string{}
for s, _ := range m {
sa = append(sa, s)
}
sort.Strings(sa)
for _, s := range sa {
p := decompositions.Len()
decompositions.WriteString(s)
positionMap[s] = uint16(p)
}
if cname[i] != "" {
fmt.Printf("%s = 0x%X\n", cname[i], decompositions.Len())
}
}
fmt.Println("maxDecomp = 0x8000")
fmt.Println(")")
b := decompositions.Bytes() b := decompositions.Bytes()
printBytes(b, "decomps") printBytes(b, "decomps")
size += len(b) size += len(b)
nfcT := newNode() varnames := []string{"nfc", "nfkc"}
nfkcT := newNode() for i := 0; i < FNumberOfFormTypes; i++ {
for i, c := range chars { trie := newNode()
d := c.forms[FCanonical].expandedDecomp for r, c := range chars {
if len(d) != 0 { f := c.forms[i]
nfcT.insert(rune(i), positionMap[string([]rune(d))]) d := f.expandedDecomp
if ccc(c.codePoint) != ccc(d[0]) { if len(d) != 0 {
// We assume the lead ccc of a decomposition is !=0 in this case. _, key := mkstr(c.codePoint, &f)
if ccc(d[0]) == 0 { trie.insert(rune(r), positionMap[key])
logger.Fatal("Expected differing CCC to be non-zero.") if c.ccc != ccc(d[0]) {
} // We assume the lead ccc of a decomposition !=0 in this case.
} if ccc(d[0]) == 0 {
} logger.Fatal("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
d = c.forms[FCompatibility].expandedDecomp }
if len(d) != 0 {
nfkcT.insert(rune(i), positionMap[string([]rune(d))])
if ccc(c.codePoint) != ccc(d[0]) {
// We assume the lead ccc of a decomposition is !=0 in this case.
if ccc(d[0]) == 0 {
logger.Fatal("Expected differing CCC to be non-zero.")
} }
} else if v := makeEntry(&f)<<8 | uint16(c.ccc); v != 0 {
trie.insert(c.codePoint, 0x8000|v)
} }
} }
size += trie.printTables(varnames[i])
} }
size += nfcT.printTables("nfcDecomp")
size += nfkcT.printTables("nfkcDecomp")
return size return size
} }
@ -687,15 +726,15 @@ func makeTables() {
} }
list := strings.Split(*tablelist, ",") list := strings.Split(*tablelist, ",")
if *tablelist == "all" { if *tablelist == "all" {
list = []string{"decomp", "recomp", "info"} list = []string{"recomp", "info"}
} }
fmt.Printf(fileHeader, *tablelist, *url) fmt.Printf(fileHeader, *tablelist, *url)
fmt.Println("// Version is the Unicode edition from which the tables are derived.") fmt.Println("// Version is the Unicode edition from which the tables are derived.")
fmt.Printf("const Version = %q\n\n", version()) fmt.Printf("const Version = %q\n\n", version())
if contains(list, "decomp") { if contains(list, "info") {
size += printDecompositionTables() size += printCharInfoTables()
} }
if contains(list, "recomp") { if contains(list, "recomp") {
@ -730,9 +769,6 @@ func makeTables() {
fmt.Printf("}\n\n") fmt.Printf("}\n\n")
} }
if contains(list, "info") {
size += printCharInfoTables()
}
fmt.Printf("// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size) fmt.Printf("// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
} }
@ -761,6 +797,11 @@ func verifyComputed() {
log.Fatalf("%U: NF*C must be maybe if combinesBackward", i) log.Fatalf("%U: NF*C must be maybe if combinesBackward", i)
} }
} }
nfc := c.forms[FCanonical]
nfkc := c.forms[FCompatibility]
if nfc.combinesBackward != nfkc.combinesBackward {
logger.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
}
} }
} }

View File

@ -448,7 +448,7 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
} }
// Check that decomposition doesn't result in overflow. // Check that decomposition doesn't result in overflow.
if info.hasDecomposition() { if info.hasDecomposition() {
dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size)) dcomp := info.decomposition()
for i := 0; i < len(dcomp); { for i := 0; i < len(dcomp); {
inf := rb.f.info(inputBytes(dcomp), i) inf := rb.f.info(inputBytes(dcomp), i)
i += int(inf.size) i += int(inf.size)

File diff suppressed because it is too large Load Diff