1
0
mirror of https://github.com/golang/go synced 2024-11-20 06:14:53 -07:00

exp/locale/collate: implementation of tailorings and table generation.

Tailorings are represented by removing and reinserting entries from a linked list.
After all tailorings are done, missing weights are computed and verified.
This implementation assumes that entries that are used in expansions are not
reinserted at a later point.  This considerably simplifies the implementation.

R=r
CC=golang-dev
https://golang.org/cl/6739052
This commit is contained in:
Marcel van Lohuizen 2012-10-31 14:28:44 +01:00
parent 4c1a6f84f8
commit b8b329451c
4 changed files with 46490 additions and 6343 deletions

View File

@ -58,7 +58,9 @@ type Tailoring struct {
id string
builder *Builder
index *ordering
// TODO: implement.
anchor *entry
before bool
}
// NewBuilder returns a new Builder.
@ -80,6 +82,7 @@ func (b *Builder) Tailoring(locale string) *Tailoring {
builder: b,
index: b.root.clone(),
}
t.index.id = t.id
b.locale = append(b.locale, t)
return t
}
@ -95,7 +98,6 @@ func (b *Builder) Tailoring(locale string) *Tailoring {
// a value for each colelem that is a variable. (See the reference above.)
func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
str := string(runes)
elems := make([][]int, len(colelems))
for i, ce := range colelems {
elems[i] = append(elems[i], ce...)
@ -144,6 +146,21 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
return nil
}
// setAnchor makes the entry for the NFD-normalized anchor string the current
// anchor of t. When the index has no entry for the string, a new implicit
// entry is created for it and the entry of every rune in the string is locked.
// It always returns nil.
func (t *Tailoring) setAnchor(anchor string) error {
	anchor = norm.NFD.String(anchor)
	target := t.index.find(anchor)
	if target == nil {
		// Unknown anchor: derive it from its runes, which may no longer move.
		target = t.index.newEntry(anchor, nil)
		target.implicit = true
		for _, r := range anchor {
			t.index.find(string(r)).lock = true
		}
	}
	t.anchor = target
	return nil
}
// SetAnchor sets the point after which elements passed in subsequent calls to
// Insert will be inserted. It is equivalent to the reset directive in an LDML
// specification. See Insert for an example.
@ -151,14 +168,20 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
// <first_tertiary_ignorable/>, <last_tertiary_ignorable/>, <first_primary_ignorable/>,
// and <last_non_ignorable/>.
func (t *Tailoring) SetAnchor(anchor string) error {
	err := t.setAnchor(anchor)
	if err == nil {
		// Subsequent inserts go after the anchor.
		t.before = false
	}
	return err
}
// SetAnchorBefore behaves like SetAnchor, but marks the tailoring so that
// the next call to Insert places its entry before the anchor.
func (t *Tailoring) SetAnchorBefore(anchor string) error {
	err := t.setAnchor(anchor)
	if err == nil {
		t.before = true
	}
	return err
}
@ -195,7 +218,112 @@ func (t *Tailoring) SetAnchorBefore(anchor string) error {
// t.SetAnchor("<last_primary_ignorable/>")
// t.Insert(collate.Primary, "0", "")
func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
// An anchor must have been set by SetAnchor or SetAnchorBefore first.
if t.anchor == nil {
return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
}
// Look up the NFD form of str, creating a fresh entry if it is unknown.
// Existing entries that are logical reset positions may not be reinserted.
str = norm.NFD.String(str)
e := t.index.find(str)
if e == nil {
e = t.index.newEntry(str, nil)
} else if e.logical != noAnchor {
return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str)
}
// Locked entries (see setAnchor) may not be moved either.
if e.lock {
return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str)
}
a := t.anchor
// Find the first element after the anchor which differs at a level smaller or
// equal to the given level. Then insert at this position.
// See http://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details.
e.before = t.before
if t.before {
// The "before" mode is cleared here, so it only applies to this insert.
t.before = false
if a.prev == nil {
// The anchor is the head of the list: e becomes the new head.
a.insertBefore(e)
} else {
// Walk backwards from the anchor's predecessor until an entry with
// a level no greater than the requested one is found.
// NOTE(review): the walk does not check a for nil; presumably an entry
// with a small enough level always exists before the anchor - confirm.
for a = a.prev; a.level > level; a = a.prev {
}
a.insertAfter(e)
}
e.level = level
} else {
// Walk forwards from the anchor until an entry with a level no greater
// than the requested one is found.
// NOTE(review): the walk does not check a for nil; presumably the list
// ends in an entry with a small enough level - confirm.
for ; a.level > level; a = a.next {
}
// e takes over the level of the entry it is placed after.
e.level = a.level
if a != e {
a.insertAfter(e)
a.level = level
} else {
// We don't set a to prev itself. This has the effect of the entry
// getting new collation elements that are an increment of itself.
// This is intentional.
a.prev.level = level
}
}
// Store the normalized extension and drop the old collation elements;
// getWeight only computes weights for entries whose elems are empty.
e.extend = norm.NFD.String(extend)
e.exclude = false
e.elems = nil
// The inserted entry becomes the anchor for the next Insert.
t.anchor = e
return nil
}
// getWeight returns the collation elements of e. If e has none and is not a
// logical anchor, they are computed first and stored in e.elems.
func (o *ordering) getWeight(e *entry) [][]int {
if len(e.elems) == 0 && e.logical == noAnchor {
if e.implicit {
// Implicit entry: concatenate the weights of the entries of its runes.
for _, r := range e.runes {
e.elems = append(e.elems, o.getWeight(o.find(string(r)))...)
}
} else if e.before {
// Entry inserted before its anchor: derive the weight from the first
// following entry that has elems (or is implicit), counting per level
// the entries without elems that lie in between, starting at e itself.
count := [collate.Identity + 1]int{}
a := e
for ; a.elems == nil && !a.implicit; a = a.next {
count[a.level]++
}
// Copy the first collation element of a so a's own weights are untouched.
// NOTE(review): if the loop stopped on an implicit entry whose elems are
// still nil, a.elems[0] would be out of range - confirm this cannot happen.
e.elems = append([][]int(nil), make([]int, len(a.elems[0])))
copy(e.elems[0], a.elems[0])
// Lower the weight at the first level (below Quaternary) that has a
// non-zero count, by that count.
for i := collate.Primary; i < collate.Quaternary; i++ {
if count[i] != 0 {
e.elems[0][i] -= count[i]
break
}
}
if e.prev != nil {
// NOTE(review): the error result of verifyWeights is discarded.
o.verifyWeights(e.prev, e, e.prev.level)
}
} else {
// Regular tailored entry: increment the (recursively computed) weights
// of the previous entry at the level stored on that previous entry.
prev := e.prev
e.elems = nextWeight(prev.level, o.getWeight(prev))
// NOTE(review): the error result of verifyWeights is discarded.
o.verifyWeights(e, e.next, e.level)
}
}
return e.elems
}
// addExtension appends the collation elements of e's extension string to
// e.elems and then clears e.extend. When the extension has an entry of its
// own, that entry's elements are used; otherwise the elements of the entries
// of the individual runes of the extension are appended in order.
func (o *ordering) addExtension(e *entry) {
	whole := o.find(e.extend)
	if whole == nil {
		// No entry for the whole string: fall back to rune-by-rune lookup.
		for _, r := range e.extend {
			part := o.find(string(r))
			e.elems = append(e.elems, part.elems...)
		}
	} else {
		e.elems = append(e.elems, whole.elems...)
	}
	e.extend = ""
}
// verifyWeights checks that the first collation element of a sorts before
// that of b when compared up to and including the given level. A violation
// is only logged; the function currently always returns nil. Nothing is
// checked for the Identity level, a nil b, or entries without elems.
func (o *ordering) verifyWeights(a, b *entry, level collate.Level) error {
	switch {
	case level == collate.Identity, b == nil:
		return nil
	case b.elems == nil, a.elems == nil:
		return nil
	}
	// A strictly smaller weight at any higher level settles the order.
	for l := collate.Primary; l < level; l++ {
		if a.elems[0][l] < b.elems[0][l] {
			return nil
		}
	}
	if a.elems[0][level] < b.elems[0][level] {
		return nil
	}
	err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
	log.Println(err)
	// TODO: return the error instead, or better, fix the conflicting entry by making room.
	return nil
}
@ -205,7 +333,19 @@ func (b *Builder) error(e error) {
}
}
// errorID records e on the builder, prefixed with the given locale
// identifier. A nil e leaves the builder untouched.
func (b *Builder) errorID(locale string, e error) {
	if e == nil {
		return
	}
	b.err = fmt.Errorf("%s:%v", locale, e)
}
func (b *Builder) buildOrdering(o *ordering) {
for _, e := range o.ordered {
o.getWeight(e)
}
for _, e := range o.ordered {
o.addExtension(e)
}
o.sort()
simplify(o)
b.processExpansions(o) // requires simplify
@ -215,7 +355,7 @@ func (b *Builder) buildOrdering(o *ordering) {
for e := o.front(); e != nil; e, _ = e.nextIndexed() {
if !e.skip() {
ce, err := e.encode()
b.error(err)
b.errorID(o.id, err)
t.insert(e.runes[0], ce)
}
}
@ -252,7 +392,11 @@ func (b *Builder) Build() (*collate.Collator, error) {
if err != nil {
return nil, err
}
return collate.Init(t), nil
c := collate.Init(t)
if c == nil {
panic("generated table of incompatible type")
}
return c, nil
}
// Build builds a Collator for Tailoring t.
@ -308,6 +452,10 @@ func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool {
if i >= 2 && ce[2] != maxTertiary {
return false
}
if _, err := makeCE(ce); err != nil {
// Simply return false. The error will be caught elsewhere.
return false
}
}
return true
}
@ -332,12 +480,11 @@ func simplify(o *ordering) {
e.remove()
}
}
// Tag entries for which the runes NFKD decompose to identical values.
for e := o.front(); e != nil; e, _ = e.nextIndexed() {
s := e.str
nfkd := norm.NFKD.String(s)
if len(e.runes) > 1 || keep[e.runes[0]] || nfkd == s {
if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == s {
continue
}
if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
@ -459,7 +606,7 @@ func (b *Builder) processContractions(o *ordering) {
elems := []uint32{}
for _, e := range es {
ce, err := e.encodeBase()
b.error(err)
b.errorID(o.id, err)
elems = append(elems, ce)
}
key = fmt.Sprintf("%v", elems)

View File

@ -26,16 +26,21 @@ const (
// Collation Element Table.
// See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
type entry struct {
runes []rune
elems [][]int // the collation elements for runes
str string // same as string(runes)
str string // same as string(runes)
runes []rune
elems [][]int // the collation elements
extend string // weights of extend to be appended to elems
before bool // weights relative to next instead of previous.
lock bool // entry is used in extension and can no longer be moved.
// prev, next, and level are used to keep track of tailorings.
prev, next *entry
level collate.Level // next differs at this level
skipRemove bool // do not unlink when removed
decompose bool // can use NFKD decomposition to generate elems
exclude bool // do not include in table
implicit bool // derived, is not included in the list
logical logicalAnchor
expansionIndex int // used to store index into expansion table
@ -44,8 +49,8 @@ type entry struct {
}
func (e *entry) String() string {
return fmt.Sprintf("%X -> %X (ch:%x; ci:%d, ei:%d)",
e.runes, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex)
return fmt.Sprintf("%X (%q) -> %X (ch:%x; ci:%d, ei:%d)",
e.runes, e.str, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex)
}
func (e *entry) skip() bool {
@ -71,7 +76,7 @@ func (e *entry) contractionStarter() bool {
// examples of entries that will not be indexed.
func (e *entry) nextIndexed() (*entry, collate.Level) {
level := e.level
for e = e.next; e != nil && e.exclude; e = e.next {
for e = e.next; e != nil && (e.exclude || len(e.elems) == 0); e = e.next {
if e.level < level {
level = e.level
}
@ -87,16 +92,20 @@ func (e *entry) remove() {
if e.logical != noAnchor {
log.Fatalf("may not remove anchor %q", e.str)
}
if e.prev != nil {
e.prev.next = e.next
}
if e.next != nil {
e.next.prev = e.prev
}
// TODO: need to set e.prev.level to e.level if e.level is smaller?
e.elems = nil
if !e.skipRemove {
if e.prev != nil {
e.prev.next = e.next
}
if e.next != nil {
e.next.prev = e.prev
}
}
e.skipRemove = false
}
// insertAfter inserts t after e.
// insertAfter inserts n after e.
func (e *entry) insertAfter(n *entry) {
if e == n {
panic("e == anchor")
@ -109,10 +118,31 @@ func (e *entry) insertAfter(n *entry) {
n.next = e.next
n.prev = e
e.next.prev = n
if e.next != nil {
e.next.prev = n
}
e.next = n
}
// insertBefore unlinks n from its current position (via remove) and links it
// into the list directly in front of e. It panics if n is e itself or if e
// is nil.
func (e *entry) insertBefore(n *entry) {
	switch {
	case e == n:
		panic("e == anchor")
	case e == nil:
		panic("unexpected nil anchor")
	}
	n.remove()
	n.decompose = false // redo decomposition test

	// Read e.prev only after n.remove(), which may have relinked it.
	pred := e.prev
	n.prev, n.next = pred, e
	if pred != nil {
		pred.next = n
	}
	e.prev = n
}
func (e *entry) encodeBase() (ce uint32, err error) {
switch {
case e.expansion():
@ -178,6 +208,7 @@ func (s sortedEntries) Less(i, j int) bool {
}
type ordering struct {
id string
entryMap map[string]*entry
ordered []*entry
handle *trieHandle
@ -187,7 +218,14 @@ type ordering struct {
// Note that insert simply appends e to ordered. To reattain a sorted
// order, o.sort() should be called.
func (o *ordering) insert(e *entry) {
o.entryMap[e.str] = e
if e.logical == noAnchor {
o.entryMap[e.str] = e
} else {
// Use key format as used in UCA rules.
o.entryMap[fmt.Sprintf("[%s]", e.str)] = e
// Also add index entry for XML format.
o.entryMap[fmt.Sprintf("<%s/>", strings.Replace(e.str, " ", "_", -1))] = e
}
o.ordered = append(o.ordered, e)
}
@ -236,13 +274,13 @@ func makeRootOrdering() ordering {
entryMap: make(map[string]*entry),
}
insert := func(typ logicalAnchor, s string, ce []int) {
// Use key format as used in UCA rules.
e := o.newEntry(fmt.Sprintf("[%s]", s), [][]int{ce})
// Also add index entry for XML format.
o.entryMap[fmt.Sprintf("<%s/>", strings.Replace(s, " ", "_", -1))] = e
e.runes = nil
e.exclude = true
e.logical = typ
e := &entry{
elems: [][]int{ce},
str: s,
exclude: true,
logical: typ,
}
o.insert(e)
}
insert(firstAnchor, "first tertiary ignorable", []int{0, 0, 0, 0})
insert(lastAnchor, "last tertiary ignorable", []int{0, 0, 0, max})
@ -252,6 +290,29 @@ func makeRootOrdering() ordering {
return o
}
// patchForInsert eliminates entries from the list with more than one collation element.
// The next and prev fields of the eliminated entries still point to appropriate
// values in the newly created list.
// It requires that sort has been called.
func (o *ordering) patchForInsert() {
// i is advanced by the inner loop below, not by the for statement itself.
for i := 0; i < len(o.ordered)-1; {
e := o.ordered[i]
lev := e.level
n := e.next
// Skip over the following entries that have more than one collation
// element, tracking the smallest level seen and flagging each skipped
// entry with skipRemove. n ends up as the first entry that is kept (or nil).
for ; n != nil && len(n.elems) > 1; n = n.next {
if n.level < lev {
lev = n.level
}
n.skipRemove = true
}
// For e and every skipped entry: take over the combined level and point
// next at n; the entry that follows in o.ordered gets e as its prev.
// NOTE(review): there is no bounds check here; if n is nil the loop looks
// like it runs past the end of o.ordered - presumably the last entry
// always has a single collation element; confirm.
for ; o.ordered[i] != n; i++ {
o.ordered[i].level = lev
o.ordered[i].next = n
o.ordered[i+1].prev = e
}
}
}
// clone copies all ordering of es into a new ordering value.
func (o *ordering) clone() *ordering {
o.sort()
@ -270,6 +331,7 @@ func (o *ordering) clone() *ordering {
oo.insert(ne)
}
oo.sort() // link all ordering.
oo.patchForInsert()
return &oo
}

View File

@ -128,6 +128,9 @@ func TestInsertAfter(t *testing.T) {
last.insertAfter(es[i])
last = es[i]
}
for _, e := range es {
e.elems = es[0].elems
}
e := es[0]
for _, i := range perm {
e, _ = e.nextIndexed()
@ -139,6 +142,34 @@ func TestInsertAfter(t *testing.T) {
}
}
func TestInsertBefore(t *testing.T) {
const n = 5
orig := makeList(n)
perm := make([]int, n)
for i := range perm {
perm[i] = i + 1
}
for ok := true; ok; ok = nextPerm(perm) {
es := makeList(n)
last := es[len(es)-1]
for _, i := range perm {
last.insertBefore(es[i])
last = es[i]
}
for _, e := range es {
e.elems = es[0].elems
}
e := es[0]
for i := n - 1; i >= 0; i-- {
e, _ = e.nextIndexed()
if e.runes[0] != rune(perm[i]) {
t.Errorf("%d:%d: expected entry %X; found %X", perm, i, orig[i].runes, e.runes)
break
}
}
}
}
type entryLessTest struct {
a, b *entry
res bool

File diff suppressed because it is too large Load Diff