mirror of
https://github.com/golang/go
synced 2024-11-22 04:44:39 -07:00
exp/norm: merged charinfo and decomposition tables. As a result only
one trie lookup per rune is needed. See forminfo.go for a description of the new format. Also included leading and trailing canonical combining class in decomposition information. This will often avoid additional trie lookups. R=r, r CC=golang-dev https://golang.org/cl/5616071
This commit is contained in:
parent
7bd6ebb104
commit
a52fb458df
@ -102,7 +102,7 @@ func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
|
||||
}
|
||||
}
|
||||
if info.hasDecomposition() {
|
||||
dcomp := rb.f.decompose(src, i)
|
||||
dcomp := info.decomposition()
|
||||
rb.tmpBytes = inputBytes(dcomp)
|
||||
for i := 0; i < len(dcomp); {
|
||||
info = rb.f.info(&rb.tmpBytes, i)
|
||||
|
@ -6,25 +6,50 @@ package norm
|
||||
|
||||
// This file contains Form-specific logic and wrappers for data in tables.go.
|
||||
|
||||
// Rune info is stored in a separate trie per composing form. A composing form
|
||||
// and its corresponding decomposing form share the same trie. Each trie maps
|
||||
// a rune to a uint16. The values take two forms. For v >= 0x8000:
|
||||
// bits
|
||||
// 0..7: ccc
|
||||
// 8..11: qcInfo (see below). isYesD is always true (no decomposition).
|
||||
// 15: 1
|
||||
// For v < 0x8000, the respective rune has a decomposition and v is an index
|
||||
// into a byte array of UTF-8 decomposition sequences and additional info and
|
||||
// has the form:
|
||||
// <header> <decomp_byte>* [<tccc> [<lccc>]]
|
||||
// The header contains the number of bytes in the decomposition (excluding this
|
||||
// length byte). The two most significant bits of this length byte correspond
|
||||
// to bit 2 and 3 of qcInfo (see below). The byte sequence itself starts at v+1.
|
||||
// The byte sequence is followed by a trailing and leading CCC if the values
|
||||
// for these are not zero. The value of v determines which ccc are appended
|
||||
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
|
||||
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
|
||||
// there is an additional leading ccc.
|
||||
|
||||
const (
|
||||
qcInfoMask = 0xF // to clear all but the relevant bits in a qcInfo
|
||||
headerLenMask = 0x3F // extract the lenght value from the header byte
|
||||
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
|
||||
)
|
||||
|
||||
// runeInfo is a representation for the data stored in charinfoTrie.
|
||||
type runeInfo struct {
|
||||
pos uint8 // start position in reorderBuffer; used in composition.go
|
||||
size uint8 // length of UTF-8 encoding of this rune
|
||||
ccc uint8 // canonical combining class
|
||||
ccc uint8 // leading canonical combining class (ccc if not decomposition)
|
||||
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
|
||||
flags qcInfo // quick check flags
|
||||
index uint16
|
||||
}
|
||||
|
||||
// functions dispatchable per form
|
||||
type lookupFunc func(b input, i int) runeInfo
|
||||
type decompFunc func(b input, i int) []byte
|
||||
|
||||
// formInfo holds Form-specific functions and tables.
|
||||
type formInfo struct {
|
||||
form Form
|
||||
|
||||
form Form
|
||||
composing, compatibility bool // form type
|
||||
|
||||
decompose decompFunc
|
||||
info lookupFunc
|
||||
info lookupFunc
|
||||
}
|
||||
|
||||
var formTable []*formInfo
|
||||
@ -38,10 +63,8 @@ func init() {
|
||||
f.form = Form(i)
|
||||
if Form(i) == NFKD || Form(i) == NFKC {
|
||||
f.compatibility = true
|
||||
f.decompose = decomposeNFKC
|
||||
f.info = lookupInfoNFKC
|
||||
} else {
|
||||
f.decompose = decomposeNFC
|
||||
f.info = lookupInfoNFC
|
||||
}
|
||||
if Form(i) == NFC || Form(i) == NFKC {
|
||||
@ -76,8 +99,6 @@ func (i runeInfo) boundaryAfter() bool {
|
||||
//
|
||||
// When all 4 bits are zero, the character is inert, meaning it is never
|
||||
// influenced by normalization.
|
||||
//
|
||||
// We pack the bits for both NFC/D and NFKC/D in one byte.
|
||||
type qcInfo uint8
|
||||
|
||||
func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 }
|
||||
@ -91,22 +112,12 @@ func (r runeInfo) isInert() bool {
|
||||
return r.flags&0xf == 0 && r.ccc == 0
|
||||
}
|
||||
|
||||
// Wrappers for tables.go
|
||||
|
||||
// The 16-bit value of the decomposition tries is an index into a byte
|
||||
// array of UTF-8 decomposition sequences. The first byte is the number
|
||||
// of bytes in the decomposition (excluding this length byte). The actual
|
||||
// sequence starts at the offset+1.
|
||||
func decomposeNFC(s input, i int) []byte {
|
||||
p := s.decomposeNFC(i)
|
||||
n := decomps[p]
|
||||
p++
|
||||
return decomps[p : p+uint16(n)]
|
||||
}
|
||||
|
||||
func decomposeNFKC(s input, i int) []byte {
|
||||
p := s.decomposeNFKC(i)
|
||||
n := decomps[p]
|
||||
func (r runeInfo) decomposition() []byte {
|
||||
if r.index == 0 {
|
||||
return nil
|
||||
}
|
||||
p := r.index
|
||||
n := decomps[p] & 0x3F
|
||||
p++
|
||||
return decomps[p : p+uint16(n)]
|
||||
}
|
||||
@ -124,16 +135,40 @@ func combine(a, b rune) rune {
|
||||
return recompMap[key]
|
||||
}
|
||||
|
||||
// The 16-bit character info has the following bit layout:
|
||||
// 0..7 CCC value.
|
||||
// 8..11 qcInfo for NFC/NFD
|
||||
// 12..15 qcInfo for NFKC/NFKD
|
||||
func lookupInfoNFC(b input, i int) runeInfo {
|
||||
v, sz := b.charinfo(i)
|
||||
return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 8)}
|
||||
v, sz := b.charinfoNFC(i)
|
||||
return compInfo(v, sz)
|
||||
}
|
||||
|
||||
func lookupInfoNFKC(b input, i int) runeInfo {
|
||||
v, sz := b.charinfo(i)
|
||||
return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 12)}
|
||||
v, sz := b.charinfoNFKC(i)
|
||||
return compInfo(v, sz)
|
||||
}
|
||||
|
||||
// compInfo converts the information contained in v and sz
|
||||
// to a runeInfo. See the comment at the top of the file
|
||||
// for more information on the format.
|
||||
func compInfo(v uint16, sz int) runeInfo {
|
||||
if v == 0 {
|
||||
return runeInfo{size: uint8(sz)}
|
||||
} else if v >= 0x8000 {
|
||||
return runeInfo{
|
||||
size: uint8(sz),
|
||||
ccc: uint8(v),
|
||||
tccc: uint8(v),
|
||||
flags: qcInfo(v>>8) & qcInfoMask,
|
||||
}
|
||||
}
|
||||
// has decomposition
|
||||
h := decomps[v]
|
||||
f := (qcInfo(h&headerFlagsMask) >> 4) | 0x1
|
||||
ri := runeInfo{size: uint8(sz), flags: f, index: v}
|
||||
if v >= firstCCC {
|
||||
v += uint16(h&headerLenMask) + 1
|
||||
ri.tccc = decomps[v]
|
||||
if v >= firstLeadingCCC {
|
||||
ri.ccc = decomps[v+1]
|
||||
}
|
||||
}
|
||||
return ri
|
||||
}
|
||||
|
@ -11,9 +11,8 @@ type input interface {
|
||||
skipNonStarter(p int) int
|
||||
appendSlice(buf []byte, s, e int) []byte
|
||||
copySlice(buf []byte, s, e int)
|
||||
charinfo(p int) (uint16, int)
|
||||
decomposeNFC(p int) uint16
|
||||
decomposeNFKC(p int) uint16
|
||||
charinfoNFC(p int) (uint16, int)
|
||||
charinfoNFKC(p int) (uint16, int)
|
||||
hangul(p int) rune
|
||||
}
|
||||
|
||||
@ -42,16 +41,12 @@ func (s inputString) copySlice(buf []byte, b, e int) {
|
||||
copy(buf, s[b:e])
|
||||
}
|
||||
|
||||
func (s inputString) charinfo(p int) (uint16, int) {
|
||||
return charInfoTrie.lookupString(string(s[p:]))
|
||||
func (s inputString) charinfoNFC(p int) (uint16, int) {
|
||||
return nfcTrie.lookupString(string(s[p:]))
|
||||
}
|
||||
|
||||
func (s inputString) decomposeNFC(p int) uint16 {
|
||||
return nfcDecompTrie.lookupStringUnsafe(string(s[p:]))
|
||||
}
|
||||
|
||||
func (s inputString) decomposeNFKC(p int) uint16 {
|
||||
return nfkcDecompTrie.lookupStringUnsafe(string(s[p:]))
|
||||
func (s inputString) charinfoNFKC(p int) (uint16, int) {
|
||||
return nfkcTrie.lookupString(string(s[p:]))
|
||||
}
|
||||
|
||||
func (s inputString) hangul(p int) rune {
|
||||
@ -84,16 +79,12 @@ func (s inputBytes) copySlice(buf []byte, b, e int) {
|
||||
copy(buf, s[b:e])
|
||||
}
|
||||
|
||||
func (s inputBytes) charinfo(p int) (uint16, int) {
|
||||
return charInfoTrie.lookup(s[p:])
|
||||
func (s inputBytes) charinfoNFC(p int) (uint16, int) {
|
||||
return nfcTrie.lookup(s[p:])
|
||||
}
|
||||
|
||||
func (s inputBytes) decomposeNFC(p int) uint16 {
|
||||
return nfcDecompTrie.lookupUnsafe(s[p:])
|
||||
}
|
||||
|
||||
func (s inputBytes) decomposeNFKC(p int) uint16 {
|
||||
return nfkcDecompTrie.lookupUnsafe(s[p:])
|
||||
func (s inputBytes) charinfoNFKC(p int) (uint16, int) {
|
||||
return nfkcTrie.lookup(s[p:])
|
||||
}
|
||||
|
||||
func (s inputBytes) hangul(p int) rune {
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
// Normalization table generator.
|
||||
// Data read from the web.
|
||||
// See forminfo.go for a description of the trie values associated with each rune.
|
||||
|
||||
package main
|
||||
|
||||
@ -17,6 +18,7 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@ -187,18 +189,14 @@ func (f FormInfo) String() string {
|
||||
fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward)
|
||||
fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay)
|
||||
fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp)
|
||||
fmt.Fprintf(buf, " decomposition: %v\n", f.decomp)
|
||||
fmt.Fprintf(buf, " expandedDecomp: %v\n", f.expandedDecomp)
|
||||
fmt.Fprintf(buf, " decomposition: %X\n", f.decomp)
|
||||
fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp)
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
type Decomposition []rune
|
||||
|
||||
func (d Decomposition) String() string {
|
||||
return fmt.Sprintf("%.4X", d)
|
||||
}
|
||||
|
||||
func openReader(file string) (input io.ReadCloser) {
|
||||
if *localFiles {
|
||||
f, err := os.Open(file)
|
||||
@ -571,80 +569,121 @@ func makeEntry(f *FormInfo) uint16 {
|
||||
return e
|
||||
}
|
||||
|
||||
// Bits
|
||||
// 0..7: CCC
|
||||
// 8..11: NF(C|D) qc bits.
|
||||
// 12..15: NFK(C|D) qc bits.
|
||||
func makeCharInfo(c Char) uint16 {
|
||||
e := makeEntry(&c.forms[FCompatibility])
|
||||
e = e<<4 | makeEntry(&c.forms[FCanonical])
|
||||
e = e<<8 | uint16(c.ccc)
|
||||
return e
|
||||
// decompSet keeps track of unique decompositions, grouped by whether
|
||||
// the decomposition is followed by a trailing and/or leading CCC.
|
||||
type decompSet [4]map[string]bool
|
||||
|
||||
func makeDecompSet() decompSet {
|
||||
m := decompSet{}
|
||||
for i, _ := range m {
|
||||
m[i] = make(map[string]bool)
|
||||
}
|
||||
return m
|
||||
}
|
||||
func (m *decompSet) insert(key int, s string) {
|
||||
m[key][s] = true
|
||||
}
|
||||
|
||||
func printCharInfoTables() int {
|
||||
// Quick Check + CCC trie.
|
||||
t := newNode()
|
||||
for i, char := range chars {
|
||||
v := makeCharInfo(char)
|
||||
if v != 0 {
|
||||
t.insert(rune(i), v)
|
||||
mkstr := func(r rune, f *FormInfo) (int, string) {
|
||||
d := f.expandedDecomp
|
||||
s := string([]rune(d))
|
||||
if max := 1 << 6; len(s) >= max {
|
||||
const msg = "%U: too many bytes in decomposition: %d >= %d"
|
||||
logger.Fatalf(msg, r, len(s), max)
|
||||
}
|
||||
head := uint8(len(s))
|
||||
if f.quickCheck[MComposed] != QCYes {
|
||||
head |= 0x40
|
||||
}
|
||||
if f.combinesForward {
|
||||
head |= 0x80
|
||||
}
|
||||
s = string([]byte{head}) + s
|
||||
|
||||
lccc := ccc(d[0])
|
||||
tccc := ccc(d[len(d)-1])
|
||||
if tccc < lccc && lccc != 0 {
|
||||
const msg = "%U: lccc (%d) must be <= tcc (%d)"
|
||||
logger.Fatalf(msg, r, lccc, tccc)
|
||||
}
|
||||
index := 0
|
||||
if tccc > 0 || lccc > 0 {
|
||||
s += string([]byte{tccc})
|
||||
index = 1
|
||||
if lccc > 0 {
|
||||
s += string([]byte{lccc})
|
||||
index |= 2
|
||||
}
|
||||
}
|
||||
return index, s
|
||||
}
|
||||
return t.printTables("charInfo")
|
||||
}
|
||||
|
||||
func printDecompositionTables() int {
|
||||
decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
|
||||
size := 0
|
||||
|
||||
// Map decompositions
|
||||
positionMap := make(map[string]uint16)
|
||||
decompSet := makeDecompSet()
|
||||
|
||||
// Store the uniqued decompositions in a byte buffer,
|
||||
// preceded by their byte length.
|
||||
for _, c := range chars {
|
||||
for f := 0; f < 2; f++ {
|
||||
d := c.forms[f].expandedDecomp
|
||||
s := string([]rune(d))
|
||||
if _, ok := positionMap[s]; !ok {
|
||||
p := decompositions.Len()
|
||||
decompositions.WriteByte(uint8(len(s)))
|
||||
decompositions.WriteString(s)
|
||||
positionMap[s] = uint16(p)
|
||||
for _, f := range c.forms {
|
||||
if len(f.expandedDecomp) == 0 {
|
||||
continue
|
||||
}
|
||||
if f.combinesBackward {
|
||||
logger.Fatalf("%U: combinesBackward and decompose", c.codePoint)
|
||||
}
|
||||
index, s := mkstr(c.codePoint, &f)
|
||||
decompSet.insert(index, s)
|
||||
}
|
||||
}
|
||||
|
||||
decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
|
||||
size := 0
|
||||
positionMap := make(map[string]uint16)
|
||||
decompositions.WriteString("\000")
|
||||
cname := []string{"firstCCC", "firstLeadingCCC", "", "lastDecomp"}
|
||||
fmt.Println("const (")
|
||||
for i, m := range decompSet {
|
||||
sa := []string{}
|
||||
for s, _ := range m {
|
||||
sa = append(sa, s)
|
||||
}
|
||||
sort.Strings(sa)
|
||||
for _, s := range sa {
|
||||
p := decompositions.Len()
|
||||
decompositions.WriteString(s)
|
||||
positionMap[s] = uint16(p)
|
||||
}
|
||||
if cname[i] != "" {
|
||||
fmt.Printf("%s = 0x%X\n", cname[i], decompositions.Len())
|
||||
}
|
||||
}
|
||||
fmt.Println("maxDecomp = 0x8000")
|
||||
fmt.Println(")")
|
||||
b := decompositions.Bytes()
|
||||
printBytes(b, "decomps")
|
||||
size += len(b)
|
||||
|
||||
nfcT := newNode()
|
||||
nfkcT := newNode()
|
||||
for i, c := range chars {
|
||||
d := c.forms[FCanonical].expandedDecomp
|
||||
if len(d) != 0 {
|
||||
nfcT.insert(rune(i), positionMap[string([]rune(d))])
|
||||
if ccc(c.codePoint) != ccc(d[0]) {
|
||||
// We assume the lead ccc of a decomposition is !=0 in this case.
|
||||
if ccc(d[0]) == 0 {
|
||||
logger.Fatal("Expected differing CCC to be non-zero.")
|
||||
}
|
||||
}
|
||||
}
|
||||
d = c.forms[FCompatibility].expandedDecomp
|
||||
if len(d) != 0 {
|
||||
nfkcT.insert(rune(i), positionMap[string([]rune(d))])
|
||||
if ccc(c.codePoint) != ccc(d[0]) {
|
||||
// We assume the lead ccc of a decomposition is !=0 in this case.
|
||||
if ccc(d[0]) == 0 {
|
||||
logger.Fatal("Expected differing CCC to be non-zero.")
|
||||
varnames := []string{"nfc", "nfkc"}
|
||||
for i := 0; i < FNumberOfFormTypes; i++ {
|
||||
trie := newNode()
|
||||
for r, c := range chars {
|
||||
f := c.forms[i]
|
||||
d := f.expandedDecomp
|
||||
if len(d) != 0 {
|
||||
_, key := mkstr(c.codePoint, &f)
|
||||
trie.insert(rune(r), positionMap[key])
|
||||
if c.ccc != ccc(d[0]) {
|
||||
// We assume the lead ccc of a decomposition !=0 in this case.
|
||||
if ccc(d[0]) == 0 {
|
||||
logger.Fatal("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
|
||||
}
|
||||
}
|
||||
} else if v := makeEntry(&f)<<8 | uint16(c.ccc); v != 0 {
|
||||
trie.insert(c.codePoint, 0x8000|v)
|
||||
}
|
||||
}
|
||||
size += trie.printTables(varnames[i])
|
||||
}
|
||||
size += nfcT.printTables("nfcDecomp")
|
||||
size += nfkcT.printTables("nfkcDecomp")
|
||||
return size
|
||||
}
|
||||
|
||||
@ -687,15 +726,15 @@ func makeTables() {
|
||||
}
|
||||
list := strings.Split(*tablelist, ",")
|
||||
if *tablelist == "all" {
|
||||
list = []string{"decomp", "recomp", "info"}
|
||||
list = []string{"recomp", "info"}
|
||||
}
|
||||
fmt.Printf(fileHeader, *tablelist, *url)
|
||||
|
||||
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
|
||||
fmt.Printf("const Version = %q\n\n", version())
|
||||
|
||||
if contains(list, "decomp") {
|
||||
size += printDecompositionTables()
|
||||
if contains(list, "info") {
|
||||
size += printCharInfoTables()
|
||||
}
|
||||
|
||||
if contains(list, "recomp") {
|
||||
@ -730,9 +769,6 @@ func makeTables() {
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
if contains(list, "info") {
|
||||
size += printCharInfoTables()
|
||||
}
|
||||
fmt.Printf("// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
|
||||
}
|
||||
|
||||
@ -761,6 +797,11 @@ func verifyComputed() {
|
||||
log.Fatalf("%U: NF*C must be maybe if combinesBackward", i)
|
||||
}
|
||||
}
|
||||
nfc := c.forms[FCanonical]
|
||||
nfkc := c.forms[FCompatibility]
|
||||
if nfc.combinesBackward != nfkc.combinesBackward {
|
||||
logger.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -448,7 +448,7 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
|
||||
}
|
||||
// Check that decomposition doesn't result in overflow.
|
||||
if info.hasDecomposition() {
|
||||
dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size))
|
||||
dcomp := info.decomposition()
|
||||
for i := 0; i < len(dcomp); {
|
||||
inf := rb.f.info(inputBytes(dcomp), i)
|
||||
i += int(inf.size)
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user