1
0
mirror of https://github.com/golang/go synced 2024-11-21 22:24:40 -07:00

exp/norm: merged charinfo and decomposition tables. As a result only

one trie lookup per rune is needed. See forminfo.go for a description
of the new format.  Also included leading and trailing canonical
combining class in decomposition information.  This will often avoid
additional trie lookups.

R=r, r
CC=golang-dev
https://golang.org/cl/5616071
This commit is contained in:
Marcel van Lohuizen 2012-02-13 14:54:46 +01:00
parent 7bd6ebb104
commit a52fb458df
6 changed files with 5527 additions and 5568 deletions

View File

@ -102,7 +102,7 @@ func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
}
}
if info.hasDecomposition() {
dcomp := rb.f.decompose(src, i)
dcomp := info.decomposition()
rb.tmpBytes = inputBytes(dcomp)
for i := 0; i < len(dcomp); {
info = rb.f.info(&rb.tmpBytes, i)

View File

@ -6,25 +6,50 @@ package norm
// This file contains Form-specific logic and wrappers for data in tables.go.
// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
// bits
// 0..7: ccc
// 8..11: qcInfo (see below). isYesD is always true (no decomposition).
// 15: 1
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
// <header> <decomp_byte>* [<tccc> [<lccc>]]
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 2 and 3 of qcInfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
// there is an additional leading ccc.
// Masks used to unpack trie values and the header byte that precedes each
// decomposition in the decomps byte array.
const (
qcInfoMask = 0xF // to clear all but the relevant bits in a qcInfo
headerLenMask = 0x3F // extract the length value from the header byte
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
// runeInfo is a representation for the data stored in the per-form tries
// for a single rune.
type runeInfo struct {
pos uint8 // start position in reorderBuffer; used in composition.go
size uint8 // length of UTF-8 encoding of this rune
ccc uint8 // canonical combining class
ccc uint8 // leading canonical combining class (ccc if not decomposition)
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
flags qcInfo // quick check flags
index uint16 // position of the decomposition header in decomps; 0 if there is no decomposition
}
// functions dispatchable per form
type lookupFunc func(b input, i int) runeInfo
type decompFunc func(b input, i int) []byte
// formInfo holds Form-specific functions and tables.
type formInfo struct {
form Form
form Form
composing, compatibility bool // form type
decompose decompFunc
info lookupFunc
info lookupFunc
}
var formTable []*formInfo
@ -38,10 +63,8 @@ func init() {
f.form = Form(i)
if Form(i) == NFKD || Form(i) == NFKC {
f.compatibility = true
f.decompose = decomposeNFKC
f.info = lookupInfoNFKC
} else {
f.decompose = decomposeNFC
f.info = lookupInfoNFC
}
if Form(i) == NFC || Form(i) == NFKC {
@ -76,8 +99,6 @@ func (i runeInfo) boundaryAfter() bool {
//
// When all 4 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
//
// We pack the bits for both NFC/D and NFKC/D in one byte.
type qcInfo uint8
func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 }
@ -91,22 +112,12 @@ func (r runeInfo) isInert() bool {
return r.flags&0xf == 0 && r.ccc == 0
}
// Wrappers for tables.go
// The 16-bit value of the decomposition tries is an index into a byte
// array of UTF-8 decomposition sequences. The first byte is the number
// of bytes in the decomposition (excluding this length byte). The actual
// sequence starts at the offset+1.
func decomposeNFC(s input, i int) []byte {
p := s.decomposeNFC(i)
n := decomps[p]
p++
return decomps[p : p+uint16(n)]
}
func decomposeNFKC(s input, i int) []byte {
p := s.decomposeNFKC(i)
n := decomps[p]
// decomposition returns the UTF-8 encoded decomposition stored for r, or
// nil when r.index is 0.
//
// The byte at decomps[r.index] is a header: its low six bits (0x3F, the
// same value as headerLenMask) hold the number of decomposition bytes,
// which start immediately after the header.
func (r runeInfo) decomposition() []byte {
	if r.index == 0 {
		return nil
	}
	length := uint16(decomps[r.index] & 0x3F)
	start := r.index + 1 // skip the header byte
	return decomps[start : start+length]
}
@ -124,16 +135,40 @@ func combine(a, b rune) rune {
return recompMap[key]
}
// The 16-bit character info has the following bit layout:
// 0..7 CCC value.
// 8..11 qcInfo for NFC/NFD
// 12..15 qcInfo for NFKC/NFKD
func lookupInfoNFC(b input, i int) runeInfo {
v, sz := b.charinfo(i)
return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 8)}
v, sz := b.charinfoNFC(i)
return compInfo(v, sz)
}
func lookupInfoNFKC(b input, i int) runeInfo {
v, sz := b.charinfo(i)
return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 12)}
v, sz := b.charinfoNFKC(i)
return compInfo(v, sz)
}
// compInfo builds a runeInfo from a trie value v and the UTF-8 size sz of
// the rune it was looked up for. See the comment at the top of the file
// for more information on the format.
//
// Three cases are distinguished:
//   - v == 0: only the size is filled in.
//   - v >= 0x8000: the low byte of v is used for both ccc and tccc, and
//     the flags come from the four bits above it.
//   - otherwise: v is an index into decomps. The flags come from the two
//     top bits of the header byte, and the trailing and leading ccc are
//     read from the bytes after the decomposition when present.
func compInfo(v uint16, sz int) runeInfo {
	size := uint8(sz)
	switch {
	case v == 0:
		return runeInfo{size: size}
	case v >= 0x8000:
		class := uint8(v)
		return runeInfo{
			size:  size,
			ccc:   class,
			tccc:  class,
			flags: qcInfo(v>>8) & qcInfoMask,
		}
	}

	// has decomposition
	header := decomps[v]
	info := runeInfo{
		size: size,
		// The two header flag bits land in bits 2 and 3 of the qcInfo;
		// bit 0 is always set for entries that have a decomposition.
		flags: (qcInfo(header&headerFlagsMask) >> 4) | 0x1,
		index: v,
	}
	if v < firstCCC {
		// No ccc bytes are stored after this decomposition.
		return info
	}

	// tail is the position of the trailing ccc, directly after the
	// decomposition bytes.
	tail := v + uint16(header&headerLenMask) + 1
	info.tccc = decomps[tail]
	if tail >= firstLeadingCCC {
		info.ccc = decomps[tail+1]
	}
	return info
}

View File

@ -11,9 +11,8 @@ type input interface {
skipNonStarter(p int) int
appendSlice(buf []byte, s, e int) []byte
copySlice(buf []byte, s, e int)
charinfo(p int) (uint16, int)
decomposeNFC(p int) uint16
decomposeNFKC(p int) uint16
charinfoNFC(p int) (uint16, int)
charinfoNFKC(p int) (uint16, int)
hangul(p int) rune
}
@ -42,16 +41,12 @@ func (s inputString) copySlice(buf []byte, b, e int) {
copy(buf, s[b:e])
}
func (s inputString) charinfo(p int) (uint16, int) {
return charInfoTrie.lookupString(string(s[p:]))
func (s inputString) charinfoNFC(p int) (uint16, int) {
return nfcTrie.lookupString(string(s[p:]))
}
func (s inputString) decomposeNFC(p int) uint16 {
return nfcDecompTrie.lookupStringUnsafe(string(s[p:]))
}
func (s inputString) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupStringUnsafe(string(s[p:]))
func (s inputString) charinfoNFKC(p int) (uint16, int) {
return nfkcTrie.lookupString(string(s[p:]))
}
func (s inputString) hangul(p int) rune {
@ -84,16 +79,12 @@ func (s inputBytes) copySlice(buf []byte, b, e int) {
copy(buf, s[b:e])
}
func (s inputBytes) charinfo(p int) (uint16, int) {
return charInfoTrie.lookup(s[p:])
func (s inputBytes) charinfoNFC(p int) (uint16, int) {
return nfcTrie.lookup(s[p:])
}
func (s inputBytes) decomposeNFC(p int) uint16 {
return nfcDecompTrie.lookupUnsafe(s[p:])
}
func (s inputBytes) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupUnsafe(s[p:])
func (s inputBytes) charinfoNFKC(p int) (uint16, int) {
return nfkcTrie.lookup(s[p:])
}
func (s inputBytes) hangul(p int) rune {

View File

@ -4,6 +4,7 @@
// Normalization table generator.
// Data read from the web.
// See forminfo.go for a description of the trie values associated with each rune.
package main
@ -17,6 +18,7 @@ import (
"net/http"
"os"
"regexp"
"sort"
"strconv"
"strings"
)
@ -187,18 +189,14 @@ func (f FormInfo) String() string {
fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward)
fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay)
fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp)
fmt.Fprintf(buf, " decomposition: %v\n", f.decomp)
fmt.Fprintf(buf, " expandedDecomp: %v\n", f.expandedDecomp)
fmt.Fprintf(buf, " decomposition: %X\n", f.decomp)
fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp)
return buf.String()
}
type Decomposition []rune
func (d Decomposition) String() string {
return fmt.Sprintf("%.4X", d)
}
func openReader(file string) (input io.ReadCloser) {
if *localFiles {
f, err := os.Open(file)
@ -571,80 +569,121 @@ func makeEntry(f *FormInfo) uint16 {
return e
}
// Bits
// 0..8: CCC
// 9..12: NF(C|D) qc bits.
// 13..16: NFK(C|D) qc bits.
func makeCharInfo(c Char) uint16 {
e := makeEntry(&c.forms[FCompatibility])
e = e<<4 | makeEntry(&c.forms[FCanonical])
e = e<<8 | uint16(c.ccc)
return e
// decompSet keeps track of unique decompositions, grouped by whether
// the decomposition is followed by a trailing and/or leading CCC.
// The array index is the group key passed to insert; each map is used
// as a set of encoded decomposition strings.
type decompSet [4]map[string]bool

// makeDecompSet returns a decompSet in which every group is an
// initialized, empty map, so insert can be called on it right away.
func makeDecompSet() decompSet {
	var m decompSet
	for i := range m {
		m[i] = make(map[string]bool)
	}
	return m
}

// insert records s as a member of the group selected by key.
// key must be a valid index into the set (0 through 3); any other value
// panics with an index-out-of-range error.
func (m *decompSet) insert(key int, s string) {
	m[key][s] = true
}
func printCharInfoTables() int {
// Quick Check + CCC trie.
t := newNode()
for i, char := range chars {
v := makeCharInfo(char)
if v != 0 {
t.insert(rune(i), v)
mkstr := func(r rune, f *FormInfo) (int, string) {
d := f.expandedDecomp
s := string([]rune(d))
if max := 1 << 6; len(s) >= max {
const msg = "%U: too many bytes in decomposition: %d >= %d"
logger.Fatalf(msg, r, len(s), max)
}
head := uint8(len(s))
if f.quickCheck[MComposed] != QCYes {
head |= 0x40
}
if f.combinesForward {
head |= 0x80
}
s = string([]byte{head}) + s
lccc := ccc(d[0])
tccc := ccc(d[len(d)-1])
if tccc < lccc && lccc != 0 {
const msg = "%U: lccc (%d) must be <= tcc (%d)"
logger.Fatalf(msg, r, lccc, tccc)
}
index := 0
if tccc > 0 || lccc > 0 {
s += string([]byte{tccc})
index = 1
if lccc > 0 {
s += string([]byte{lccc})
index |= 2
}
}
return index, s
}
return t.printTables("charInfo")
}
func printDecompositionTables() int {
decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
size := 0
// Map decompositions
positionMap := make(map[string]uint16)
decompSet := makeDecompSet()
// Store the uniqued decompositions in a byte buffer,
// preceded by their byte length.
for _, c := range chars {
for f := 0; f < 2; f++ {
d := c.forms[f].expandedDecomp
s := string([]rune(d))
if _, ok := positionMap[s]; !ok {
p := decompositions.Len()
decompositions.WriteByte(uint8(len(s)))
decompositions.WriteString(s)
positionMap[s] = uint16(p)
for _, f := range c.forms {
if len(f.expandedDecomp) == 0 {
continue
}
if f.combinesBackward {
logger.Fatalf("%U: combinesBackward and decompose", c.codePoint)
}
index, s := mkstr(c.codePoint, &f)
decompSet.insert(index, s)
}
}
decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
size := 0
positionMap := make(map[string]uint16)
decompositions.WriteString("\000")
cname := []string{"firstCCC", "firstLeadingCCC", "", "lastDecomp"}
fmt.Println("const (")
for i, m := range decompSet {
sa := []string{}
for s, _ := range m {
sa = append(sa, s)
}
sort.Strings(sa)
for _, s := range sa {
p := decompositions.Len()
decompositions.WriteString(s)
positionMap[s] = uint16(p)
}
if cname[i] != "" {
fmt.Printf("%s = 0x%X\n", cname[i], decompositions.Len())
}
}
fmt.Println("maxDecomp = 0x8000")
fmt.Println(")")
b := decompositions.Bytes()
printBytes(b, "decomps")
size += len(b)
nfcT := newNode()
nfkcT := newNode()
for i, c := range chars {
d := c.forms[FCanonical].expandedDecomp
if len(d) != 0 {
nfcT.insert(rune(i), positionMap[string([]rune(d))])
if ccc(c.codePoint) != ccc(d[0]) {
// We assume the lead ccc of a decomposition is !=0 in this case.
if ccc(d[0]) == 0 {
logger.Fatal("Expected differing CCC to be non-zero.")
}
}
}
d = c.forms[FCompatibility].expandedDecomp
if len(d) != 0 {
nfkcT.insert(rune(i), positionMap[string([]rune(d))])
if ccc(c.codePoint) != ccc(d[0]) {
// We assume the lead ccc of a decomposition is !=0 in this case.
if ccc(d[0]) == 0 {
logger.Fatal("Expected differing CCC to be non-zero.")
// Build and print one trie per form type; varnames[i] is the name under
// which the trie for form i is printed.
varnames := []string{"nfc", "nfkc"}
for i := 0; i < FNumberOfFormTypes; i++ {
	trie := newNode()
	for r, c := range chars {
		f := c.forms[i]
		d := f.expandedDecomp
		if len(d) != 0 {
			// The rune decomposes: store the position of its encoded
			// decomposition, as recorded in positionMap.
			_, key := mkstr(c.codePoint, &f)
			trie.insert(rune(r), positionMap[key])
			if c.ccc != ccc(d[0]) {
				// We assume the lead ccc of a decomposition !=0 in this case.
				if ccc(d[0]) == 0 {
					// Fatalf, not Fatal: the message contains a %d verb
					// that must be expanded with c.ccc.
					logger.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
				}
			}
		} else if v := makeEntry(&f)<<8 | uint16(c.ccc); v != 0 {
			// No decomposition: store the quick check bits and ccc
			// directly, with the top bit set to mark this value form.
			trie.insert(c.codePoint, 0x8000|v)
		}
	}
	size += trie.printTables(varnames[i])
}
size += nfcT.printTables("nfcDecomp")
size += nfkcT.printTables("nfkcDecomp")
return size
}
@ -687,15 +726,15 @@ func makeTables() {
}
list := strings.Split(*tablelist, ",")
if *tablelist == "all" {
list = []string{"decomp", "recomp", "info"}
list = []string{"recomp", "info"}
}
fmt.Printf(fileHeader, *tablelist, *url)
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
fmt.Printf("const Version = %q\n\n", version())
if contains(list, "decomp") {
size += printDecompositionTables()
if contains(list, "info") {
size += printCharInfoTables()
}
if contains(list, "recomp") {
@ -730,9 +769,6 @@ func makeTables() {
fmt.Printf("}\n\n")
}
if contains(list, "info") {
size += printCharInfoTables()
}
fmt.Printf("// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
}
@ -761,6 +797,11 @@ func verifyComputed() {
log.Fatalf("%U: NF*C must be maybe if combinesBackward", i)
}
}
nfc := c.forms[FCanonical]
nfkc := c.forms[FCompatibility]
if nfc.combinesBackward != nfkc.combinesBackward {
logger.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
}
}
}

View File

@ -448,7 +448,7 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
}
// Check that decomposition doesn't result in overflow.
if info.hasDecomposition() {
dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size))
dcomp := info.decomposition()
for i := 0; i < len(dcomp); {
inf := rb.f.info(inputBytes(dcomp), i)
i += int(inf.size)

File diff suppressed because it is too large Load Diff