1
0
mirror of https://github.com/golang/go synced 2024-11-19 01:54:39 -07:00

exp/locale/collate: include composed characters into the table. This eliminates

the need to decompose characters for the majority of cases.  This considerably
speeds up collation while increasing the table size minimally.

To detect non-normalized strings, rather than relying on exp/norm, the table
now includes CCC information. The inclusion of this information does not
increase table size.

DETAILS
 - Raw collation elements are now a struct that includes the CCC, rather
   than a slice of ints.
 - Builder now ensures that NFD and NFC counterparts are included in the table.
   This also fixes a bug for Korean which is responsible for most of the growth
   of the table size.
 - As there is no more normalization step, code should now handle both strings
   and byte slices as input. Introduced source type to facilitate this.

NOTES
 - This change does not yet handle normalization entirely correctly for contractions.
   This causes a few failures with the regtest. table_test.go contains a few
   commented-out tests that can be enabled once this is fixed.  The easiest is to
   fix this once we have the new norm.Iter.
 - Removed a test case in table_test that covers cases that are now guaranteed
   to not exist.

R=rsc, mpvl
CC=golang-dev
https://golang.org/cl/6971044
This commit is contained in:
Marcel van Lohuizen 2012-12-24 16:42:29 +01:00
parent 43f2fc308b
commit 9aa70984a9
16 changed files with 46794 additions and 42287 deletions

View File

@ -98,24 +98,24 @@ func (b *Builder) Tailoring(locale string) *Tailoring {
// a value for each colelem that is a variable. (See the reference above.) // a value for each colelem that is a variable. (See the reference above.)
func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error { func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
str := string(runes) str := string(runes)
elems := make([][]int, len(colelems)) elems := make([]rawCE, len(colelems))
for i, ce := range colelems { for i, ce := range colelems {
elems[i] = append(elems[i], ce...)
if len(ce) == 0 { if len(ce) == 0 {
elems[i] = append(elems[i], []int{0, 0, 0, 0}...)
break break
} }
elems[i] = makeRawCE(ce, 0)
if len(ce) == 1 { if len(ce) == 1 {
elems[i] = append(elems[i], defaultSecondary) elems[i].w[1] = defaultSecondary
} }
if len(ce) <= 2 { if len(ce) <= 2 {
elems[i] = append(elems[i], defaultTertiary) elems[i].w[2] = defaultTertiary
} }
if len(ce) <= 3 { if len(ce) <= 3 {
elems[i] = append(elems[i], ce[0]) elems[i].w[3] = ce[0]
} }
} }
for i, ce := range elems { for i, ce := range elems {
p := ce.w[0]
isvar := false isvar := false
for _, j := range variables { for _, j := range variables {
if i == j { if i == j {
@ -123,18 +123,18 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
} }
} }
if isvar { if isvar {
if ce[0] >= b.minNonVar && b.minNonVar > 0 { if p >= b.minNonVar && b.minNonVar > 0 {
return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", ce[0], b.minNonVar) return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
} }
if ce[0] > b.varTop { if p > b.varTop {
b.varTop = ce[0] b.varTop = p
} }
} else if ce[0] > 1 { // 1 is a special primary value reserved for FFFE } else if p > 1 { // 1 is a special primary value reserved for FFFE
if ce[0] <= b.varTop { if p <= b.varTop {
return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", ce[0], b.varTop) return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
} }
if b.minNonVar == 0 || ce[0] < b.minNonVar { if b.minNonVar == 0 || p < b.minNonVar {
b.minNonVar = ce[0] b.minNonVar = p
} }
} }
} }
@ -142,16 +142,42 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
if err != nil { if err != nil {
return err return err
} }
cccs := []uint8{}
nfd := norm.NFD.String(str)
for i := range nfd {
cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
}
if len(cccs) < len(elems) {
if len(cccs) > 2 {
return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
}
p := len(elems) - 1
for ; p > 0 && elems[p].w[0] == 0; p-- {
elems[p].ccc = cccs[len(cccs)-1]
}
for ; p >= 0; p-- {
elems[p].ccc = cccs[0]
}
} else {
for i := range elems {
elems[i].ccc = cccs[i]
}
}
// doNorm in collate.go assumes that the following conditions hold.
if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
}
b.root.newEntry(str, elems) b.root.newEntry(str, elems)
return nil return nil
} }
func (t *Tailoring) setAnchor(anchor string) error { func (t *Tailoring) setAnchor(anchor string) error {
anchor = norm.NFD.String(anchor) anchor = norm.NFC.String(anchor)
a := t.index.find(anchor) a := t.index.find(anchor)
if a == nil { if a == nil {
a = t.index.newEntry(anchor, nil) a = t.index.newEntry(anchor, nil)
a.implicit = true a.implicit = true
a.modified = true
for _, r := range []rune(anchor) { for _, r := range []rune(anchor) {
e := t.index.find(string(r)) e := t.index.find(string(r))
e.lock = true e.lock = true
@ -221,7 +247,7 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
if t.anchor == nil { if t.anchor == nil {
return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str) return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
} }
str = norm.NFD.String(str) str = norm.NFC.String(str)
e := t.index.find(str) e := t.index.find(str)
if e == nil { if e == nil {
e = t.index.newEntry(str, nil) e = t.index.newEntry(str, nil)
@ -262,12 +288,13 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
} }
e.extend = norm.NFD.String(extend) e.extend = norm.NFD.String(extend)
e.exclude = false e.exclude = false
e.modified = true
e.elems = nil e.elems = nil
t.anchor = e t.anchor = e
return nil return nil
} }
func (o *ordering) getWeight(e *entry) [][]int { func (o *ordering) getWeight(e *entry) []rawCE {
if len(e.elems) == 0 && e.logical == noAnchor { if len(e.elems) == 0 && e.logical == noAnchor {
if e.implicit { if e.implicit {
for _, r := range e.runes { for _, r := range e.runes {
@ -279,11 +306,10 @@ func (o *ordering) getWeight(e *entry) [][]int {
for ; a.elems == nil && !a.implicit; a = a.next { for ; a.elems == nil && !a.implicit; a = a.next {
count[a.level]++ count[a.level]++
} }
e.elems = append([][]int(nil), make([]int, len(a.elems[0]))) e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
copy(e.elems[0], a.elems[0])
for i := collate.Primary; i < collate.Quaternary; i++ { for i := collate.Primary; i < collate.Quaternary; i++ {
if count[i] != 0 { if count[i] != 0 {
e.elems[0][i] -= count[i] e.elems[0].w[i] -= count[i]
break break
} }
} }
@ -315,11 +341,11 @@ func (o *ordering) verifyWeights(a, b *entry, level collate.Level) error {
return nil return nil
} }
for i := collate.Primary; i < level; i++ { for i := collate.Primary; i < level; i++ {
if a.elems[0][i] < b.elems[0][i] { if a.elems[0].w[i] < b.elems[0].w[i] {
return nil return nil
} }
} }
if a.elems[0][level] >= b.elems[0][level] { if a.elems[0].w[level] >= b.elems[0].w[level] {
err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems) err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
log.Println(err) log.Println(err)
// TODO: return the error instead, or better, fix the conflicting entry by making room. // TODO: return the error instead, or better, fix the conflicting entry by making room.
@ -339,6 +365,54 @@ func (b *Builder) errorID(locale string, e error) {
} }
} }
// patchNorm ensures that NFC and NFD counterparts are consistent.
//
// It makes three passes over o.ordered:
//  1. entries whose string is not in NFD hand their collation elements to the
//     NFD counterpart (or create one when the entry was modified),
//  2. unmodified, non-NFD entries are refreshed from their decomposed form,
//  3. multi-rune entries whose runes already generate the same collation
//     elements are marked as excluded.
func (o *ordering) patchNorm() {
	// Pass 1: insert the NFD counterparts, if necessary.
	for _, cur := range o.ordered {
		decomposed := norm.NFD.String(cur.str)
		if decomposed == cur.str {
			continue
		}
		counterpart := o.find(decomposed)
		switch {
		case counterpart != nil && !counterpart.modified:
			// An untouched decomposed entry simply adopts the composed weights.
			counterpart.elems = cur.elems
		case cur.modified && !equalCEArrays(o.genColElems(decomposed), cur.elems):
			// The decomposed form would otherwise collate differently from the
			// modified composed form, so register it explicitly.
			added := o.newEntry(decomposed, cur.elems)
			added.modified = true
		}
	}

	// Pass 2: update unchanged composed forms if one of their parts changed.
	for _, cur := range o.ordered {
		decomposed := norm.NFD.String(cur.str)
		if cur.modified || decomposed == cur.str {
			continue
		}
		if counterpart := o.find(decomposed); counterpart != nil {
			cur.elems = counterpart.elems
			continue
		}
		cur.elems = o.genColElems(decomposed)
		if norm.NFD.LastBoundary([]byte(decomposed)) != 0 {
			continue
		}
		// LastBoundary reported position 0 for the decomposition. Fold each
		// following rune into head whenever the recomposed string is a
		// modified entry; all other runes are collected, in order, in tail.
		runes := []rune(decomposed)
		head, tail := string(runes[0]), ""
		for _, r := range runes[1:] {
			candidate := norm.NFC.String(head + string(r))
			if hit := o.find(candidate); hit != nil && hit.modified {
				head = candidate
			} else {
				tail += string(r)
			}
		}
		cur.elems = append(o.genColElems(head), o.genColElems(tail)...)
	}

	// Pass 3: exclude entries for which the individual runes generate the
	// same collation elements.
	for _, cur := range o.ordered {
		if len(cur.runes) <= 1 {
			continue
		}
		if equalCEArrays(o.genColElems(cur.str), cur.elems) {
			cur.exclude = true
		}
	}
}
func (b *Builder) buildOrdering(o *ordering) { func (b *Builder) buildOrdering(o *ordering) {
for _, e := range o.ordered { for _, e := range o.ordered {
o.getWeight(e) o.getWeight(e)
@ -346,6 +420,7 @@ func (b *Builder) buildOrdering(o *ordering) {
for _, e := range o.ordered { for _, e := range o.ordered {
o.addExtension(e) o.addExtension(e)
} }
o.patchNorm()
o.sort() o.sort()
simplify(o) simplify(o)
b.processExpansions(o) // requires simplify b.processExpansions(o) // requires simplify
@ -436,20 +511,20 @@ func (b *Builder) Print(w io.Writer) (n int, err error) {
// reproducibleFromNFKD checks whether the given expansion could be generated // reproducibleFromNFKD checks whether the given expansion could be generated
// from an NFKD expansion. // from an NFKD expansion.
func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool { func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
// Length must be equal. // Length must be equal.
if len(exp) != len(nfkd) { if len(exp) != len(nfkd) {
return false return false
} }
for i, ce := range exp { for i, ce := range exp {
// Primary and secondary values should be equal. // Primary and secondary values should be equal.
if ce[0] != nfkd[i][0] || ce[1] != nfkd[i][1] { if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
return false return false
} }
// Tertiary values should be equal to maxTertiary for third element onwards. // Tertiary values should be equal to maxTertiary for third element onwards.
// TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
// simply be dropped. Try this out by dropping the following code. // simply be dropped. Try this out by dropping the following code.
if i >= 2 && ce[2] != maxTertiary { if i >= 2 && ce.w[2] != maxTertiary {
return false return false
} }
if _, err := makeCE(ce); err != nil { if _, err := makeCE(ce); err != nil {
@ -469,22 +544,12 @@ func simplify(o *ordering) {
keep[e.runes[0]] = true keep[e.runes[0]] = true
} }
} }
// Remove entries for which the runes normalize (using NFD) to identical values.
for e := o.front(); e != nil; e, _ = e.nextIndexed() {
s := e.str
nfd := norm.NFD.String(s)
if len(e.runes) > 1 || keep[e.runes[0]] || nfd == s {
continue
}
if equalCEArrays(o.genColElems(nfd), e.elems) {
e.remove()
}
}
// Tag entries for which the runes NFKD decompose to identical values. // Tag entries for which the runes NFKD decompose to identical values.
for e := o.front(); e != nil; e, _ = e.nextIndexed() { for e := o.front(); e != nil; e, _ = e.nextIndexed() {
s := e.str s := e.str
nfkd := norm.NFKD.String(s) nfkd := norm.NFKD.String(s)
if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == s { nfd := norm.NFD.String(s)
if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
continue continue
} }
if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) { if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
@ -589,18 +654,18 @@ func (b *Builder) processContractions(o *ordering) {
// Bucket sort entries in index order. // Bucket sort entries in index order.
es := make([]*entry, len(l)) es := make([]*entry, len(l))
for _, e := range l { for _, e := range l {
var o, sn int var p, sn int
if len(e.runes) > 1 { if len(e.runes) > 1 {
str := []byte(string(e.runes[1:])) str := []byte(string(e.runes[1:]))
o, sn = t.contractTries.lookup(handle, str) p, sn = t.contractTries.lookup(handle, str)
if sn != len(str) { if sn != len(str) {
log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", e.runes, sn, len(str)) log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
} }
} }
if es[o] != nil { if es[p] != nil {
log.Fatalf("Multiple contractions for position %d for rune %U", o, e.runes[0]) log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
} }
es[o] = e es[p] = e
} }
// Create collation elements for contractions. // Create collation elements for contractions.
elems := []uint32{} elems := []uint32{}

View File

@ -7,48 +7,64 @@ package build
import "testing" import "testing"
// cjk returns an implicit collation element for a CJK rune. // cjk returns an implicit collation element for a CJK rune.
func cjk(r rune) [][]int { func cjk(r rune) []rawCE {
// A CJK character C is represented in the DUCET as // A CJK character C is represented in the DUCET as
// [.AAAA.0020.0002.C][.BBBB.0000.0000.C] // [.AAAA.0020.0002.C][.BBBB.0000.0000.C]
// Where AAAA is the most significant 15 bits plus a base value. // Where AAAA is the most significant 15 bits plus a base value.
// Any base value will work for the test, so we pick the common value of FB40. // Any base value will work for the test, so we pick the common value of FB40.
const base = 0xFB40 const base = 0xFB40
return [][]int{ return []rawCE{
{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)}, {w: []int{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)}},
{int(r&0x7FFF) | 0x8000, 0, 0, int(r)}, {w: []int{int(r&0x7FFF) | 0x8000, 0, 0, int(r)}},
} }
} }
func pCE(p int) [][]int { func pCE(p int) []rawCE {
return [][]int{{p, defaultSecondary, defaultTertiary, 0}} return mkCE([]int{p, defaultSecondary, defaultTertiary, 0}, 0)
} }
func pqCE(p, q int) [][]int { func pqCE(p, q int) []rawCE {
return [][]int{{p, defaultSecondary, defaultTertiary, q}} return mkCE([]int{p, defaultSecondary, defaultTertiary, q}, 0)
} }
func ptCE(p, t int) [][]int { func ptCE(p, t int) []rawCE {
return [][]int{{p, defaultSecondary, t, 0}} return mkCE([]int{p, defaultSecondary, t, 0}, 0)
} }
func sCE(s int) [][]int { func ptcCE(p, t int, ccc uint8) []rawCE {
return [][]int{{0, s, defaultTertiary, 0}} return mkCE([]int{p, defaultSecondary, t, 0}, ccc)
} }
func stCE(s, t int) [][]int { func sCE(s int) []rawCE {
return [][]int{{0, s, t, 0}} return mkCE([]int{0, s, defaultTertiary, 0}, 0)
}
// stCE returns a single raw collation element with a zero primary weight,
// secondary weight s, tertiary weight t and a CCC of 0.
func stCE(s, t int) []rawCE {
	weights := []int{0, s, t, 0}
	return mkCE(weights, 0)
}
// scCE returns a single raw collation element with a zero primary weight,
// secondary weight s, the default tertiary weight and the given CCC.
func scCE(s int, ccc uint8) []rawCE {
	weights := []int{0, s, defaultTertiary, 0}
	return mkCE(weights, ccc)
}
func mkCE(w []int, ccc uint8) []rawCE {
return []rawCE{rawCE{w, ccc}}
} }
// ducetElem is used to define test data that is used to generate a table. // ducetElem is used to define test data that is used to generate a table.
type ducetElem struct { type ducetElem struct {
str string str string
ces [][]int ces []rawCE
} }
func newBuilder(t *testing.T, ducet []ducetElem) *Builder { func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
b := NewBuilder() b := NewBuilder()
for _, e := range ducet { for _, e := range ducet {
if err := b.Add([]rune(e.str), e.ces, nil); err != nil { ces := [][]int{}
for _, ce := range e.ces {
ces = append(ces, ce.w)
}
if err := b.Add([]rune(e.str), ces, nil); err != nil {
t.Errorf(err.Error()) t.Errorf(err.Error())
} }
} }
@ -58,7 +74,7 @@ func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
} }
type convertTest struct { type convertTest struct {
in, out [][]int in, out []rawCE
err bool err bool
} }
@ -173,16 +189,18 @@ func TestSimplify(t *testing.T) {
} }
var expandTest = []ducetElem{ var expandTest = []ducetElem{
{"\u00C0", append(ptCE(100, 8), sCE(30)...)}, {"\u0300", append(scCE(29, 230), scCE(30, 230)...)},
{"\u00C8", append(ptCE(105, 8), sCE(30)...)}, {"\u00C0", append(ptCE(100, 8), scCE(30, 230)...)},
{"\u00C9", append(ptCE(105, 8), sCE(30)...)}, // identical expansion {"\u00C8", append(ptCE(105, 8), scCE(30, 230)...)},
{"\u00C9", append(ptCE(105, 8), scCE(30, 230)...)}, // identical expansion
{"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0], ptCE(200, 4)[0])}, {"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0], ptCE(200, 4)[0])},
{"\u01FF", append(ptCE(200, 4), ptcCE(201, 4, 0)[0], scCE(30, 230)[0])},
} }
func TestExpand(t *testing.T) { func TestExpand(t *testing.T) {
const ( const (
totalExpansions = 3 totalExpansions = 5
totalElements = 2 + 2 + 3 + totalExpansions totalElements = 2 + 2 + 2 + 3 + 3 + totalExpansions
) )
b := newBuilder(t, expandTest) b := newBuilder(t, expandTest)
o := &b.root o := &b.root

View File

@ -16,6 +16,17 @@ const (
maxTertiary = 0x1F maxTertiary = 0x1F
) )
// rawCE is an unpacked collation element: a slice of weights together with
// the canonical combining class (CCC) recorded for it.
type rawCE struct {
	w   []int // weights, indexed by level
	ccc uint8 // canonical combining class
}

// makeRawCE builds a rawCE with exactly four weight slots. The slots are
// filled from w (extra values in w are dropped, missing ones stay zero), and
// the result does not share storage with w.
func makeRawCE(w []int, ccc uint8) rawCE {
	weights := make([]int, 4)
	copy(weights, w)
	return rawCE{w: weights, ccc: ccc}
}
// A collation element is represented as an uint32. // A collation element is represented as an uint32.
// In the typical case, a rune maps to a single collation element. If a rune // In the typical case, a rune maps to a single collation element. If a rune
// can be the start of a contraction or expands into multiple collation elements, // can be the start of a contraction or expands into multiple collation elements,
@ -29,29 +40,36 @@ const (
// 01pppppp pppppppp ppppppp0 ssssssss // 01pppppp pppppppp ppppppp0 ssssssss
// - p* is primary collation value // - p* is primary collation value
// - s* is the secondary collation value // - s* is the secondary collation value
// or
// 00pppppp pppppppp ppppppps sssttttt, where // 00pppppp pppppppp ppppppps sssttttt, where
// - p* is primary collation value // - p* is primary collation value
// - s* offset of secondary from default value. // - s* offset of secondary from default value.
// - t* is the tertiary collation value // - t* is the tertiary collation value
// 100ttttt cccccccc pppppppp pppppppp
// - t* is the tertiary collation value
// - c* is the canonical combining class
// - p* is the primary collation value
// Collation elements with a secondary value are of the form // Collation elements with a secondary value are of the form
// 10000000 0000ssss ssssssss tttttttt, where // 1010cccc ccccssss ssssssss tttttttt, where
// - 16 BMP implicit -> weight // - c* is the canonical combining class
// - 8 bit s // - s* is the secondary collation value
// - default tertiary // - t* is the tertiary collation value
const ( const (
maxPrimaryBits = 21 maxPrimaryBits = 21
maxPrimaryCompactBits = 16
maxSecondaryBits = 12 maxSecondaryBits = 12
maxSecondaryCompactBits = 8 maxSecondaryCompactBits = 8
maxCCCBits = 8
maxSecondaryDiffBits = 4 maxSecondaryDiffBits = 4
maxTertiaryBits = 8 maxTertiaryBits = 8
maxTertiaryCompactBits = 5 maxTertiaryCompactBits = 5
isSecondary = 0x80000000
isPrimary = 0x40000000 isPrimary = 0x40000000
isPrimaryCCC = 0x80000000
isSecondary = 0xA0000000
) )
func makeCE(weights []int) (uint32, error) { func makeCE(rce rawCE) (uint32, error) {
weights := rce.w
if w := weights[0]; w >= 1<<maxPrimaryBits || w < 0 { if w := weights[0]; w >= 1<<maxPrimaryBits || w < 0 {
return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits) return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
} }
@ -63,14 +81,25 @@ func makeCE(weights []int) (uint32, error) {
} }
ce := uint32(0) ce := uint32(0)
if weights[0] != 0 { if weights[0] != 0 {
if weights[2] == defaultTertiary { if rce.ccc != 0 {
if weights[0] >= 1<<maxPrimaryCompactBits {
return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", weights[0], 1<<maxPrimaryCompactBits)
}
if weights[1] != defaultSecondary {
return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", weights[1], rce.ccc)
}
ce = uint32(weights[2] << (maxPrimaryCompactBits + maxCCCBits))
ce |= uint32(rce.ccc) << maxPrimaryCompactBits
ce |= uint32(weights[0])
ce |= isPrimaryCCC
} else if weights[2] == defaultTertiary {
if weights[1] >= 1<<maxSecondaryCompactBits { if weights[1] >= 1<<maxSecondaryCompactBits {
return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights[1], 1<<maxSecondaryCompactBits) return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights[1], 1<<maxSecondaryCompactBits)
} }
ce = uint32(weights[0]<<(maxSecondaryCompactBits+1) + weights[1]) ce = uint32(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
ce |= isPrimary ce |= isPrimary
} else { } else {
d := weights[1] - defaultSecondary + 4 d := weights[1] - defaultSecondary + maxSecondaryDiffBits
if d >= 1<<maxSecondaryDiffBits || d < 0 { if d >= 1<<maxSecondaryDiffBits || d < 0 {
return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits) return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
} }
@ -82,6 +111,7 @@ func makeCE(weights []int) (uint32, error) {
} }
} else { } else {
ce = uint32(weights[1]<<maxTertiaryBits + weights[2]) ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
ce += uint32(rce.ccc) << (maxSecondaryBits + maxTertiaryBits)
ce |= isSecondary ce |= isSecondary
} }
return ce, nil return ce, nil
@ -207,7 +237,7 @@ func implicitPrimary(r rune) int {
// We will rewrite these characters to a single CE. // We will rewrite these characters to a single CE.
// We assume the CJK values start at 0x8000. // We assume the CJK values start at 0x8000.
// See http://unicode.org/reports/tr10/#Implicit_Weights // See http://unicode.org/reports/tr10/#Implicit_Weights
func convertLargeWeights(elems [][]int) (res [][]int, err error) { func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
const ( const (
cjkPrimaryStart = 0xFB40 cjkPrimaryStart = 0xFB40
rarePrimaryStart = 0xFB80 rarePrimaryStart = 0xFB80
@ -219,7 +249,7 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
shiftBits = 15 shiftBits = 15
) )
for i := 0; i < len(elems); i++ { for i := 0; i < len(elems); i++ {
ce := elems[i] ce := elems[i].w
p := ce[0] p := ce[0]
if p < cjkPrimaryStart { if p < cjkPrimaryStart {
continue continue
@ -233,10 +263,10 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
if i+1 >= len(elems) { if i+1 >= len(elems) {
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems) return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
} }
if elems[i+1][0]&lowBitsFlag == 0 { if elems[i+1].w[0]&lowBitsFlag == 0 {
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
} }
np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
switch { switch {
case p < rarePrimaryStart: case p < rarePrimaryStart:
np += commonUnifiedOffset np += commonUnifiedOffset
@ -257,26 +287,25 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
// nextWeight computes the first possible collation weights following elems // nextWeight computes the first possible collation weights following elems
// for the given level. // for the given level.
func nextWeight(level collate.Level, elems [][]int) [][]int { func nextWeight(level collate.Level, elems []rawCE) []rawCE {
if level == collate.Identity { if level == collate.Identity {
next := make([][]int, len(elems)) next := make([]rawCE, len(elems))
copy(next, elems) copy(next, elems)
return next return next
} }
next := [][]int{make([]int, len(elems[0]))} next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
copy(next[0], elems[0]) next[0].w[level]++
next[0][level]++
if level < collate.Secondary { if level < collate.Secondary {
next[0][collate.Secondary] = defaultSecondary next[0].w[collate.Secondary] = defaultSecondary
} }
if level < collate.Tertiary { if level < collate.Tertiary {
next[0][collate.Tertiary] = defaultTertiary next[0].w[collate.Tertiary] = defaultTertiary
} }
// Filter entries that cannot influence ordering. // Filter entries that cannot influence ordering.
for _, ce := range elems[1:] { for _, ce := range elems[1:] {
skip := true skip := true
for i := collate.Primary; i < level; i++ { for i := collate.Primary; i < level; i++ {
skip = skip && ce[i] == 0 skip = skip && ce.w[i] == 0
} }
if !skip { if !skip {
next = append(next, ce) next = append(next, ce)
@ -285,18 +314,18 @@ func nextWeight(level collate.Level, elems [][]int) [][]int {
return next return next
} }
func nextVal(elems [][]int, i int, level collate.Level) (index, value int) { func nextVal(elems []rawCE, i int, level collate.Level) (index, value int) {
for ; i < len(elems) && elems[i][level] == 0; i++ { for ; i < len(elems) && elems[i].w[level] == 0; i++ {
} }
if i < len(elems) { if i < len(elems) {
return i, elems[i][level] return i, elems[i].w[level]
} }
return i, 0 return i, 0
} }
// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise. // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
// It also returns the collation level at which the difference is found. // It also returns the collation level at which the difference is found.
func compareWeights(a, b [][]int) (result int, level collate.Level) { func compareWeights(a, b []rawCE) (result int, level collate.Level) {
for level := collate.Primary; level < collate.Identity; level++ { for level := collate.Primary; level < collate.Identity; level++ {
var va, vb int var va, vb int
for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 { for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
@ -314,19 +343,16 @@ func compareWeights(a, b [][]int) (result int, level collate.Level) {
return 0, collate.Identity return 0, collate.Identity
} }
func equalCE(a, b []int) bool { func equalCE(a, b rawCE) bool {
if len(a) != len(b) {
return false
}
for i := 0; i < 3; i++ { for i := 0; i < 3; i++ {
if b[i] != a[i] { if b.w[i] != a.w[i] {
return false return false
} }
} }
return true return true
} }
func equalCEArrays(a, b [][]int) bool { func equalCEArrays(a, b []rawCE) bool {
if len(a) != len(b) { if len(a) != len(b) {
return false return false
} }

View File

@ -16,7 +16,7 @@ type ceTest struct {
} }
func normalCE(in []int) (ce uint32, err error) { func normalCE(in []int) (ce uint32, err error) {
return makeCE(in) return makeCE(rawCE{w: in[:3], ccc: uint8(in[3])})
} }
func expandCE(in []int) (ce uint32, err error) { func expandCE(in []int) (ce uint32, err error) {
@ -32,17 +32,20 @@ func decompCE(in []int) (ce uint32, err error) {
} }
var ceTests = []ceTest{ var ceTests = []ceTest{
{normalCE, []int{0, 0, 0}, 0x80000000}, {normalCE, []int{0, 0, 0, 0}, 0xA0000000},
{normalCE, []int{0, 0x28, 3}, 0x80002803}, {normalCE, []int{0, 0x28, 3, 0}, 0xA0002803},
{normalCE, []int{100, defaultSecondary, 3}, 0x0000C883}, {normalCE, []int{0, 0x28, 3, 0xFF}, 0xAFF02803},
{normalCE, []int{100, defaultSecondary, 3, 0}, 0x0000C883},
// non-ignorable primary with non-default secondary // non-ignorable primary with non-default secondary
{normalCE, []int{100, 0x28, defaultTertiary}, 0x4000C828}, {normalCE, []int{100, 0x28, defaultTertiary, 0}, 0x4000C828},
{normalCE, []int{100, defaultSecondary + 8, 3}, 0x0000C983}, {normalCE, []int{100, defaultSecondary + 8, 3, 0}, 0x0000C983},
{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-supported secondary {normalCE, []int{100, 0, 3, 0}, 0xFFFF}, // non-ignorable primary with non-supported secondary
{normalCE, []int{100, 1, 3}, 0xFFFF}, {normalCE, []int{100, 1, 3, 0}, 0xFFFF},
{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF}, {normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0, 0}, 0xFFFF},
{normalCE, []int{0, 1 << maxSecondaryBits, 0}, 0xFFFF}, {normalCE, []int{0, 1 << maxSecondaryBits, 0, 0}, 0xFFFF},
{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits}, 0xFFFF}, {normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits, 0}, 0xFFFF},
{normalCE, []int{0x123, defaultSecondary, 8, 0xFF}, 0x88FF0123},
{normalCE, []int{0x123, defaultSecondary + 1, 8, 0xFF}, 0xFFFF},
{contractCE, []int{0, 0, 0}, 0xC0000000}, {contractCE, []int{0, 0, 0}, 0xC0000000},
{contractCE, []int{1, 1, 1}, 0xC0010011}, {contractCE, []int{1, 1, 1}, 0xC0010011},
@ -85,6 +88,14 @@ func TestColElem(t *testing.T) {
} }
} }
// mkRawCES wraps each weight slice of in into a rawCE, leaving the ccc field
// at its zero value. The weight slices are shared with in, not copied. The
// result is always non-nil, even for empty input.
func mkRawCES(in [][]int) []rawCE {
	ces := make([]rawCE, len(in))
	for i := range in {
		ces[i].w = in[i]
	}
	return ces
}
type weightsTest struct { type weightsTest struct {
a, b [][]int a, b [][]int
level collate.Level level collate.Level
@ -119,8 +130,8 @@ var extra = [][]int{{200, 32, 8, 0}, {0, 32, 8, 0}, {0, 0, 8, 0}, {0, 0, 0, 0}}
func TestNextWeight(t *testing.T) { func TestNextWeight(t *testing.T) {
for i, tt := range nextWeightTests { for i, tt := range nextWeightTests {
test := func(l collate.Level, tt weightsTest, a, gold [][]int) { test := func(l collate.Level, tt weightsTest, a, gold [][]int) {
res := nextWeight(tt.level, a) res := nextWeight(tt.level, mkRawCES(a))
if !equalCEArrays(gold, res) { if !equalCEArrays(mkRawCES(gold), res) {
t.Errorf("%d:%d: expected weights %d; found %d", i, l, gold, res) t.Errorf("%d:%d: expected weights %d; found %d", i, l, gold, res)
} }
} }
@ -189,7 +200,7 @@ var compareTests = []weightsTest{
func TestCompareWeights(t *testing.T) { func TestCompareWeights(t *testing.T) {
for i, tt := range compareTests { for i, tt := range compareTests {
test := func(tt weightsTest, a, b [][]int) { test := func(tt weightsTest, a, b [][]int) {
res, level := compareWeights(a, b) res, level := compareWeights(mkRawCES(a), mkRawCES(b))
if res != tt.result { if res != tt.result {
t.Errorf("%d: expected comparisson result %d; found %d", i, tt.result, res) t.Errorf("%d: expected comparisson result %d; found %d", i, tt.result, res)
} }

View File

@ -6,6 +6,7 @@ package build
import ( import (
"exp/locale/collate" "exp/locale/collate"
"exp/norm"
"fmt" "fmt"
"log" "log"
"sort" "sort"
@ -28,7 +29,7 @@ const (
type entry struct { type entry struct {
str string // same as string(runes) str string // same as string(runes)
runes []rune runes []rune
elems [][]int // the collation elements elems []rawCE // the collation elements
extend string // weights of extend to be appended to elems extend string // weights of extend to be appended to elems
before bool // weights relative to next instead of previous. before bool // weights relative to next instead of previous.
lock bool // entry is used in extension and can no longer be moved. lock bool // entry is used in extension and can no longer be moved.
@ -41,6 +42,7 @@ type entry struct {
decompose bool // can use NFKD decomposition to generate elems decompose bool // can use NFKD decomposition to generate elems
exclude bool // do not include in table exclude bool // do not include in table
implicit bool // derived, is not included in the list implicit bool // derived, is not included in the list
modified bool // entry was modified in tailoring
logical logicalAnchor logical logicalAnchor
expansionIndex int // used to store index into expansion table expansionIndex int // used to store index into expansion table
@ -162,10 +164,10 @@ func (e *entry) encode() (ce uint32, err error) {
} }
switch { switch {
case e.decompose: case e.decompose:
t1 := e.elems[0][2] t1 := e.elems[0].w[2]
t2 := 0 t2 := 0
if len(e.elems) > 1 { if len(e.elems) > 1 {
t2 = e.elems[1][2] t2 = e.elems[1].w[2]
} }
ce, err = makeDecompose(t1, t2) ce, err = makeDecompose(t1, t2)
case e.contractionStarter(): case e.contractionStarter():
@ -231,7 +233,7 @@ func (o *ordering) insert(e *entry) {
// newEntry creates a new entry for the given info and inserts it into // newEntry creates a new entry for the given info and inserts it into
// the index. // the index.
func (o *ordering) newEntry(s string, ces [][]int) *entry { func (o *ordering) newEntry(s string, ces []rawCE) *entry {
e := &entry{ e := &entry{
runes: []rune(s), runes: []rune(s),
elems: ces, elems: ces,
@ -249,14 +251,29 @@ func (o *ordering) find(str string) *entry {
if e == nil { if e == nil {
r := []rune(str) r := []rune(str)
if len(r) == 1 { if len(r) == 1 {
e = o.newEntry(string(r[0]), [][]int{ const (
{ firstHangul = 0xAC00
lastHangul = 0xD7A3
)
if r[0] >= firstHangul && r[0] <= lastHangul {
ce := []rawCE{}
nfd := norm.NFD.String(str)
for _, r := range nfd {
ce = append(ce, o.find(string(r)).elems...)
}
e = o.newEntry(nfd, ce)
} else {
e = o.newEntry(string(r[0]), []rawCE{
{w: []int{
implicitPrimary(r[0]), implicitPrimary(r[0]),
defaultSecondary, defaultSecondary,
defaultTertiary, defaultTertiary,
int(r[0]), int(r[0]),
}, },
},
}) })
e.modified = true
}
e.exclude = true // do not index implicits e.exclude = true // do not index implicits
} }
} }
@ -275,7 +292,7 @@ func makeRootOrdering() ordering {
} }
insert := func(typ logicalAnchor, s string, ce []int) { insert := func(typ logicalAnchor, s string, ce []int) {
e := &entry{ e := &entry{
elems: [][]int{ce}, elems: []rawCE{{w: ce}},
str: s, str: s,
exclude: true, exclude: true,
logical: typ, logical: typ,
@ -362,10 +379,14 @@ func (o *ordering) sort() {
// genColElems generates a collation element array from the runes in str. This // genColElems generates a collation element array from the runes in str. This
// assumes that all collation elements have already been added to the Builder. // assumes that all collation elements have already been added to the Builder.
func (o *ordering) genColElems(str string) [][]int { func (o *ordering) genColElems(str string) []rawCE {
elems := [][]int{} elems := []rawCE{}
for _, r := range []rune(str) { for _, r := range []rune(str) {
elems = append(elems, o.find(string(r)).elems...) for _, ce := range o.find(string(r)).elems {
if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 {
elems = append(elems, ce)
}
}
} }
return elems return elems
} }

View File

@ -20,7 +20,7 @@ type entryTest struct {
// entries plus a leading and trailing anchor. // entries plus a leading and trailing anchor.
func makeList(n int) []*entry { func makeList(n int) []*entry {
es := make([]*entry, n+2) es := make([]*entry, n+2)
weights := [][]int{{100, 20, 5, 0}} weights := []rawCE{{w: []int{100, 20, 5, 0}}}
for i := range es { for i := range es {
runes := []rune{rune(i)} runes := []rune{rune(i)}
es[i] = &entry{ es[i] = &entry{
@ -176,8 +176,8 @@ type entryLessTest struct {
} }
var ( var (
w1 = [][]int{{100, 20, 5, 5}} w1 = []rawCE{{w: []int{100, 20, 5, 5}}}
w2 = [][]int{{101, 20, 5, 5}} w2 = []rawCE{{w: []int{101, 20, 5, 5}}}
) )
var entryLessTests = []entryLessTest{ var entryLessTests = []entryLessTest{

View File

@ -23,7 +23,7 @@ const (
type colElem uint32 type colElem uint32
const ( const (
maxCE colElem = 0x80FFFFFF maxCE colElem = 0xAFFFFFFF
minContract = 0xC0000000 minContract = 0xC0000000
maxContract = 0xDFFFFFFF maxContract = 0xDFFFFFFF
minExpand = 0xE0000000 minExpand = 0xE0000000
@ -62,30 +62,37 @@ func (ce colElem) ctype() ceType {
// 01pppppp pppppppp ppppppp0 ssssssss // 01pppppp pppppppp ppppppp0 ssssssss
// - p* is primary collation value // - p* is primary collation value
// - s* is the secondary collation value // - s* is the secondary collation value
// or
// 00pppppp pppppppp ppppppps sssttttt, where // 00pppppp pppppppp ppppppps sssttttt, where
// - p* is primary collation value // - p* is primary collation value
// - s* offset of secondary from default value. // - s* offset of secondary from default value.
// - t* is the tertiary collation value // - t* is the tertiary collation value
// 100ttttt cccccccc pppppppp pppppppp
// - t* is the tertiary collation value
// - c* is the canonical combining class
// - p* is the primary collation value
// Collation elements with a secondary value are of the form // Collation elements with a secondary value are of the form
// 10000000 0000ssss ssssssss tttttttt, where // 1010cccc ccccssss ssssssss tttttttt, where
// - 16 BMP implicit -> weight // - c* is the canonical combining class
// - 8 bit s // - s* is the secondary collation value
// - default tertiary // - t* is the tertiary collation value
// 11qqqqqq qqqqqqqq qqqqqqq0 00000000 // 11qqqqqq qqqqqqqq qqqqqqq0 00000000
// - q* quaternary value // - q* quaternary value
const ( const (
ceTypeMask = 0xC0000000 ceTypeMask = 0xC0000000
ceTypeMaskExt = 0xE0000000
ceType1 = 0x40000000 ceType1 = 0x40000000
ceType2 = 0x00000000 ceType2 = 0x00000000
ceType3 = 0x80000000 ceType3or4 = 0x80000000
ceType4 = 0xA0000000
ceTypeQ = 0xC0000000 ceTypeQ = 0xC0000000
ceIgnore = ceType3 ceIgnore = ceType4
firstNonPrimary = 0x80000000 firstNonPrimary = 0x80000000
lastSpecialPrimary = 0xA0000000
secondaryMask = 0x80000000 secondaryMask = 0x80000000
hasTertiaryMask = 0x40000000 hasTertiaryMask = 0x40000000
primaryValueMask = 0x3FFFFE00 primaryValueMask = 0x3FFFFE00
primaryShift = 9 primaryShift = 9
compactPrimaryBits = 16
compactSecondaryShift = 5 compactSecondaryShift = 5
minCompactSecondary = defaultSecondary - 4 minCompactSecondary = defaultSecondary - 4
) )
@ -98,10 +105,23 @@ func makeQuaternary(primary int) colElem {
return ceTypeQ | colElem(primary<<primaryShift) return ceTypeQ | colElem(primary<<primaryShift)
} }
// ccc returns the canonical combining class stored in ce.
// Only elements with the top bit (ceType3or4) set carry one; for every
// other element the result is 0.
func (ce colElem) ccc() uint8 {
	switch {
	case ce&ceType3or4 == 0:
		// No CCC field in this encoding.
		return 0
	case ce&ceType4 == ceType3or4:
		// Top bits are 1x0: the CCC occupies bits 16-23.
		return uint8(ce >> 16)
	default:
		// Top bits are 1x1: the CCC occupies bits 20-27.
		return uint8(ce >> 20)
	}
}
func (ce colElem) primary() int { func (ce colElem) primary() int {
if ce >= firstNonPrimary { if ce >= firstNonPrimary {
if ce > lastSpecialPrimary {
return 0 return 0
} }
return int(uint16(ce))
}
return int(ce&primaryValueMask) >> primaryShift return int(ce&primaryValueMask) >> primaryShift
} }
@ -111,8 +131,11 @@ func (ce colElem) secondary() int {
return int(uint8(ce)) return int(uint8(ce))
case ceType2: case ceType2:
return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF) return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
case ceType3: case ceType3or4:
return int(uint16(ce >> 8)) if ce < ceType4 {
return defaultSecondary
}
return int(ce>>8) & 0xFFF
case ceTypeQ: case ceTypeQ:
return 0 return 0
} }
@ -121,10 +144,13 @@ func (ce colElem) secondary() int {
func (ce colElem) tertiary() uint8 { func (ce colElem) tertiary() uint8 {
if ce&hasTertiaryMask == 0 { if ce&hasTertiaryMask == 0 {
if ce&ceType3 == 0 { if ce&ceType3or4 == 0 {
return uint8(ce & 0x1F) return uint8(ce & 0x1F)
} }
if ce&ceType4 == ceType4 {
return uint8(ce) return uint8(ce)
}
return uint8(ce>>24) & 0x1F // type 3
} else if ce&ceTypeMask == ceType1 { } else if ce&ceTypeMask == ceType1 {
return defaultTertiary return defaultTertiary
} }
@ -134,10 +160,15 @@ func (ce colElem) tertiary() uint8 {
func (ce colElem) updateTertiary(t uint8) colElem { func (ce colElem) updateTertiary(t uint8) colElem {
if ce&ceTypeMask == ceType1 { if ce&ceTypeMask == ceType1 {
// convert to type 2
nce := ce & primaryValueMask nce := ce & primaryValueMask
nce |= colElem(uint8(ce)-minCompactSecondary) << compactSecondaryShift nce |= colElem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
ce = nce ce = nce
} else if ce&ceTypeMaskExt == ceType3or4 {
ce &= ^colElem(maxTertiary << 24)
return ce | (colElem(t) << 24)
} else { } else {
// type 2 or 4
ce &= ^colElem(maxTertiary) ce &= ^colElem(maxTertiary)
} }
return ce | colElem(t) return ce | colElem(t)

View File

@ -23,12 +23,19 @@ func makeCE(weights []int) colElem {
maxSecondaryDiffBits = 4 maxSecondaryDiffBits = 4
maxTertiaryBits = 8 maxTertiaryBits = 8
maxTertiaryCompactBits = 5 maxTertiaryCompactBits = 5
isSecondary = 0x80000000
isPrimary = 0x40000000 isPrimary = 0x40000000
isPrimaryCCC = 0x80000000
isSecondary = 0xA0000000
) )
var ce colElem var ce colElem
ccc := weights[3]
if weights[0] != 0 { if weights[0] != 0 {
if weights[2] == defaultTertiary { if ccc != 0 {
ce = colElem(weights[2] << 24)
ce |= colElem(ccc) << 16
ce |= colElem(weights[0])
ce |= isPrimaryCCC
} else if weights[2] == defaultTertiary {
ce = colElem(weights[0]<<(maxSecondaryCompactBits+1) + weights[1]) ce = colElem(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
ce |= isPrimary ce |= isPrimary
} else { } else {
@ -38,6 +45,7 @@ func makeCE(weights []int) colElem {
} }
} else { } else {
ce = colElem(weights[1]<<maxTertiaryBits + weights[2]) ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
ce += colElem(ccc) << 20
ce |= isSecondary ce |= isSecondary
} }
return ce return ce
@ -68,10 +76,11 @@ func makeDecompose(t1, t2 int) colElem {
} }
func normalCE(inout []int) (ce colElem, t ceType) { func normalCE(inout []int) (ce colElem, t ceType) {
w := makeCE(inout) ce = makeCE(inout)
inout[0] = w.primary() inout[0] = ce.primary()
inout[1] = w.secondary() inout[1] = ce.secondary()
inout[2] = int(w.tertiary()) inout[2] = int(ce.tertiary())
inout[3] = int(ce.ccc())
return ce, ceNormal return ce, ceNormal
} }
@ -102,9 +111,13 @@ const (
) )
var ceTests = []ceTest{ var ceTests = []ceTest{
{normalCE, []int{0, 0, 0}}, {normalCE, []int{0, 0, 0, 0}},
{normalCE, []int{0, 30, 3}}, {normalCE, []int{0, 30, 3, 0}},
{normalCE, []int{100, defaultSecondary, 3}}, {normalCE, []int{0, 30, 3, 0xFF}},
{normalCE, []int{100, defaultSecondary, defaultTertiary, 0}},
{normalCE, []int{100, defaultSecondary, defaultTertiary, 0xFF}},
{normalCE, []int{100, defaultSecondary, 3, 0}},
{normalCE, []int{0x123, defaultSecondary, 8, 0xFF}},
{contractCE, []int{0, 0, 0}}, {contractCE, []int{0, 0, 0}},
{contractCE, []int{1, 1, 1}}, {contractCE, []int{1, 1, 1}},
@ -127,11 +140,11 @@ func TestColElem(t *testing.T) {
copy(inout, tt.arg) copy(inout, tt.arg)
ce, typ := tt.f(inout) ce, typ := tt.f(inout)
if ce.ctype() != typ { if ce.ctype() != typ {
t.Errorf("%d: type is %d; want %d", i, ce.ctype(), typ) t.Errorf("%d: type is %d; want %d (ColElem: %X)", i, ce.ctype(), typ, ce)
} }
for j, a := range tt.arg { for j, a := range tt.arg {
if inout[j] != a { if inout[j] != a {
t.Errorf("%d: argument %d is %X; want %X", i, j, inout[j], a) t.Errorf("%d: argument %d is %X; want %X (ColElem: %X)", i, j, inout[j], a, ce)
} }
} }
} }
@ -176,7 +189,8 @@ func TestUpdateTertiary(t *testing.T) {
{0x4000FE20, 0x0000FE8A, 0x0A}, {0x4000FE20, 0x0000FE8A, 0x0A},
{0x4000FE21, 0x0000FEAA, 0x0A}, {0x4000FE21, 0x0000FEAA, 0x0A},
{0x0000FE8B, 0x0000FE83, 0x03}, {0x0000FE8B, 0x0000FE83, 0x03},
{0x8000CC02, 0x8000CC1B, 0x1B}, {0x82FF0188, 0x9BFF0188, 0x1B},
{0xAFF0CC02, 0xAFF0CC1B, 0x1B},
} }
for i, tt := range tests { for i, tt := range tests {
if out := tt.in.updateTertiary(tt.t); out != tt.out { if out := tt.in.updateTertiary(tt.t); out != tt.out {
@ -184,3 +198,77 @@ func TestUpdateTertiary(t *testing.T) {
} }
} }
} }
// TestDoNorm checks that iter.doNorm moves a block of collation elements
// with a lower CCC in front of preceding elements with a higher CCC, and that
// it gives up on overly long runs of combining characters.
func TestDoNorm(t *testing.T) {
	const div = -1 // The insertion point of the next block.
	tests := []struct {
		in, out []int
	}{
		{in: []int{4, div, 3},
			out: []int{3, 4},
		},
		{in: []int{4, div, 3, 3, 3},
			out: []int{3, 3, 3, 4},
		},
		{in: []int{0, 4, div, 3},
			out: []int{0, 3, 4},
		},
		{in: []int{0, 0, 4, 5, div, 3, 3},
			out: []int{0, 0, 3, 3, 4, 5},
		},
		{in: []int{0, 0, 1, 4, 5, div, 3, 3},
			out: []int{0, 0, 1, 3, 3, 4, 5},
		},
		{in: []int{0, 0, 1, 4, 5, div, 4, 4},
			out: []int{0, 0, 1, 4, 4, 4, 5},
		},
	}
	for j, tt := range tests {
		i := iter{}
		// w is the primary weight given to new elements: 0 before the
		// divider and 100 after it, so that the inserted block can be told
		// apart from the elements that were already present.
		// p is the index in i.ce of the first element of the block.
		// s is the index of the last element with CCC 0 seen so far.
		var w, p, s int
		for k, cc := range tt.in {
			if cc == 0 {
				// Before the divider, k is also the index in i.ce.
				s = k
			}
			if cc == div {
				w = 100
				p = k
				i.pStarter = s
				continue
			}
			i.ce = append(i.ce, makeCE([]int{w, 20, 2, cc}))
		}
		i.prevCCC = i.ce[p-1].ccc()
		i.doNorm(p, i.ce[p].ccc())
		if len(i.ce) != len(tt.out) {
			t.Errorf("%d: length was %d; want %d", j, len(i.ce), len(tt.out))
			// Indexing tt.out below could go out of range.
			continue
		}
		prevCCC := uint8(0)
		for k, ce := range i.ce {
			if int(ce.ccc()) != tt.out[k] {
				t.Errorf("%d:%d: unexpected CCC. Was %d; want %d", j, k, ce.ccc(), tt.out[k])
			}
			// Within a run of equal CCC values the pre-existing elements
			// (primary 0) must stay in front of the inserted ones (primary 100).
			if k > 0 && ce.ccc() == prevCCC && i.ce[k-1].primary() > ce.primary() {
				t.Errorf("%d:%d: normalization crossed across CCC boundary.", j, k)
			}
			prevCCC = ce.ccc()
		}
	}
	// test cutoff of large sequence of combining characters.
	result := []uint8{8, 8, 8, 5, 5}
	for o := -2; o <= 2; o++ {
		i := iter{pStarter: 2, prevCCC: 8}
		n := maxCombiningCharacters + 1 + o
		for j := 1; j < n+i.pStarter; j++ {
			i.ce = append(i.ce, makeCE([]int{100, 20, 2, 8}))
		}
		p := len(i.ce)
		i.ce = append(i.ce, makeCE([]int{0, 20, 2, 5}))
		i.doNorm(p, 5)
		if i.prevCCC != result[o+2] {
			t.Errorf("%d: i.prevCCC was %d; want %d", n, i.prevCCC, result[o+2])
		}
		if result[o+2] == 5 && i.pStarter != p {
			t.Errorf("%d: i.pStarter was %d; want %d", n, i.pStarter, p)
		}
	}
}

View File

@ -10,6 +10,7 @@ package collate
import ( import (
"bytes" "bytes"
"exp/norm" "exp/norm"
"unicode/utf8"
) )
// Level identifies the collation comparison level. // Level identifies the collation comparison level.
@ -112,7 +113,7 @@ func New(loc string) *Collator {
func newCollator(t *table) *Collator { func newCollator(t *table) *Collator {
c := &Collator{ c := &Collator{
Strength: Quaternary, Strength: Tertiary,
f: norm.NFD, f: norm.NFD,
t: t, t: t,
} }
@ -269,8 +270,7 @@ func (c *Collator) key(buf *Buffer, w []colElem) []byte {
func (c *Collator) getColElems(str []byte) []colElem { func (c *Collator) getColElems(str []byte) []colElem {
i := c.iter(0) i := c.iter(0)
i.setInput(c, str) i.setInput(c, str)
for !i.done() { for i.next() {
i.next()
} }
return i.ce return i.ce
} }
@ -278,88 +278,185 @@ func (c *Collator) getColElems(str []byte) []colElem {
func (c *Collator) getColElemsString(str string) []colElem { func (c *Collator) getColElemsString(str string) []colElem {
i := c.iter(0) i := c.iter(0)
i.setInputString(c, str) i.setInputString(c, str)
for !i.done() { for i.next() {
i.next()
} }
return i.ce return i.ce
} }
// source wraps the collation input, which is held either as a string or as
// a byte slice. The byte slice is used whenever bytes is non-nil; otherwise
// str is used.
type source struct {
	str   string
	bytes []byte
	buf   [16]byte // Used for decomposing Hangul.
}

// done reports whether both representations are empty.
func (src *source) done() bool {
	return len(src.bytes) == 0 && len(src.str) == 0
}

// tail returns a new source holding the input with the first n bytes removed.
// The scratch buffer is not carried over.
func (src *source) tail(n int) (res source) {
	if src.bytes != nil {
		res.bytes = src.bytes[n:]
		return res
	}
	res.str = src.str[n:]
	return res
}

// nfd returns the NFD form of the first end bytes of the input, appended to
// the (reset) scratch buffer of src.
func (src *source) nfd(end int) []byte {
	scratch := src.buf[:0]
	if src.bytes != nil {
		return norm.NFD.Append(scratch, src.bytes[:end]...)
	}
	return norm.NFD.AppendString(scratch, src.str[:end])
}

// properties returns the normalization properties, under form f, of the
// start of the input.
func (src *source) properties(f norm.Form) norm.Properties {
	if src.bytes != nil {
		return f.Properties(src.bytes)
	}
	return f.PropertiesString(src.str)
}

// lookup looks up the start of the input in the main trie of t and returns
// the value found together with the size reported by the trie.
func (src *source) lookup(t *table) (ce colElem, sz int) {
	if src.bytes != nil {
		return t.index.lookup(src.bytes)
	}
	return t.index.lookupString(src.str)
}

// rune decodes the first UTF-8 sequence of the input.
func (src *source) rune() (r rune, sz int) {
	if src.bytes != nil {
		return utf8.DecodeRune(src.bytes)
	}
	return utf8.DecodeRuneInString(src.str)
}
type iter struct { type iter struct {
src norm.Iter src source
norm [1024]byte
buf []byte
p int
minBufSize int
wa [512]colElem wa [512]colElem
ce []colElem ce []colElem
pce int pce int
nce int // nce <= len(ce)
prevCCC uint8
pStarter int
t *table t *table
_done, eof bool
} }
func (i *iter) init(c *Collator) { func (i *iter) init(c *Collator) {
i.t = c.t i.t = c.t
i.minBufSize = c.t.maxContractLen
i.ce = i.wa[:0] i.ce = i.wa[:0]
i.buf = i.norm[:0]
} }
func (i *iter) reset() { func (i *iter) reset() {
i.ce = i.ce[:0] i.ce = i.ce[:0]
i.buf = i.buf[:0] i.nce = 0
i.p = 0 i.prevCCC = 0
i.eof = i.src.Done() i.pStarter = 0
i._done = i.eof
} }
func (i *iter) setInput(c *Collator, s []byte) *iter { func (i *iter) setInput(c *Collator, s []byte) *iter {
i.src.SetInput(c.f, s) i.src.bytes = s
i.src.str = ""
i.reset() i.reset()
return i return i
} }
func (i *iter) setInputString(c *Collator, s string) *iter { func (i *iter) setInputString(c *Collator, s string) *iter {
i.src.SetInputString(c.f, s) i.src.str = s
i.src.bytes = nil
i.reset() i.reset()
return i return i
} }
func (i *iter) done() bool { // next appends colElems to the internal array until it adds an element with CCC=0.
return i._done // In the majority of cases, a colElem with a primary value > 0 will have
// a CCC of 0. The CCC values of collation elements are also used to detect if the
// input string was not normalized and to adjust the result accordingly.
func (i *iter) next() bool {
sz := 0
for !i.src.done() {
p0 := len(i.ce)
i.ce, sz = i.t.appendNext(i.ce, i.src)
i.src = i.src.tail(sz)
last := len(i.ce) - 1
if ccc := i.ce[last].ccc(); ccc == 0 {
i.nce = len(i.ce)
i.pStarter = last
i.prevCCC = 0
return true
} else if p0 < last && i.ce[p0].ccc() == 0 {
// set i.nce to only cover part of i.ce for which ccc == 0 and
// use rest the next call to next.
for p0++; p0 < last && i.ce[p0].ccc() == 0; p0++ {
}
i.nce = p0
i.pStarter = p0 - 1
i.prevCCC = ccc
return true
} else if ccc < i.prevCCC {
i.doNorm(p0, ccc) // should be rare for most common cases
} else {
i.prevCCC = ccc
}
}
if len(i.ce) != i.nce {
i.nce = len(i.ce)
return true
}
return false
} }
func (i *iter) next() { // nextPlain is the same as next, but does not "normalize" the collation
if !i.eof && len(i.buf)-i.p < i.minBufSize { // elements.
// replenish buffer // TODO: remove this function. Using this instead of next does not seem
n := copy(i.buf, i.buf[i.p:]) // to improve performance in any significant way. We retain this until
n += i.src.Next(i.buf[n:cap(i.buf)]) // later for evaluation purposes.
i.buf = i.buf[:n] func (i *iter) nextPlain() bool {
i.p = 0 if i.src.done() {
i.eof = i.src.Done() return false
}
if i.p == len(i.buf) {
i._done = true
return
} }
sz := 0 sz := 0
i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:]) i.ce, sz = i.t.appendNext(i.ce, i.src)
i.p += sz i.src = i.src.tail(sz)
i.nce = len(i.ce)
return true
}
const maxCombiningCharacters = 30
// doNorm reorders the collation elements in i.ce.
// It assumes that blocks of collation elements added with appendNext
// either start and end with the same CCC or start with CCC == 0.
// This allows for a single insertion point for the entire block.
// The correctness of this assumption is verified in builder.go.
func (i *iter) doNorm(p int, ccc uint8) {
if p-i.pStarter > maxCombiningCharacters {
i.prevCCC = i.ce[len(i.ce)-1].ccc()
i.pStarter = len(i.ce) - 1
return
}
n := len(i.ce)
k := p
for p--; p > i.pStarter && ccc < i.ce[p-1].ccc(); p-- {
}
i.ce = append(i.ce, i.ce[p:k]...)
copy(i.ce[p:], i.ce[k:])
i.ce = i.ce[:n]
} }
func (i *iter) nextPrimary() int { func (i *iter) nextPrimary() int {
for { for {
for ; i.pce < len(i.ce); i.pce++ { for ; i.pce < i.nce; i.pce++ {
if v := i.ce[i.pce].primary(); v != 0 { if v := i.ce[i.pce].primary(); v != 0 {
i.pce++ i.pce++
return v return v
} }
} }
if i.done() { if !i.next() {
return 0 return 0
} }
i.next()
} }
panic("should not reach here") panic("should not reach here")
} }

View File

@ -378,6 +378,7 @@ var keyTests = []keyTest{
func TestKey(t *testing.T) { func TestKey(t *testing.T) {
c, _ := makeTable(appendNextTests[4].in) c, _ := makeTable(appendNextTests[4].in)
c.Alternate = collate.AltShifted c.Alternate = collate.AltShifted
c.Strength = collate.Quaternary
buf := collate.Buffer{} buf := collate.Buffer{}
keys1 := [][]byte{} keys1 := [][]byte{}
keys2 := [][]byte{} keys2 := [][]byte{}

View File

@ -27,8 +27,21 @@ type ctScanner struct {
done bool done bool
} }
// ctScannerString is the string-input counterpart of ctScanner. It holds the
// state of a match of the input s against a contraction trie.
type ctScannerString struct {
// states holds the trie entries that scan currently matches against.
states contractTrieSet
// s is the input being scanned.
s string
// n is the number of leading entries of states that scan examines.
n int
// index is the result offset of the last match recorded by scan.
index int
// pindex is the number of bytes of s consumed when index was recorded.
pindex int
// done is set by scan once a final trie entry has matched.
done bool
}
func (t contractTrieSet) scanner(index, n int, b []byte) ctScanner { func (t contractTrieSet) scanner(index, n int, b []byte) ctScanner {
return ctScanner{states: t[index:], s: b, n: n} return ctScanner{s: b, states: t[index:], n: n}
}
func (t contractTrieSet) scannerString(index, n int, str string) ctScannerString {
return ctScannerString{s: str, states: t[index:], n: n}
} }
// result returns the offset i and bytes consumed p so far. If no suffix // result returns the offset i and bytes consumed p so far. If no suffix
@ -37,6 +50,10 @@ func (s *ctScanner) result() (i, p int) {
return s.index, s.pindex return s.index, s.pindex
} }
// result returns the offset i and the number of bytes consumed p that were
// recorded for the last match.
func (s *ctScannerString) result() (i, p int) {
	i, p = s.index, s.pindex
	return i, p
}
const ( const (
final = 0 final = 0
noIndex = 0xFF noIndex = 0xFF
@ -84,3 +101,45 @@ func (s *ctScanner) scan(p int) int {
} }
return pr return pr
} }
// scan is a verbatim copy of ctScanner.scan.
// It matches the bytes of s.s, starting at offset p, against the current trie
// state and records each match in s.index and s.pindex. It returns the offset
// up to which input was consumed: the position right after a final match, or
// otherwise the last position at which the scanner state was saved (a rune
// start or the end of the input).
func (s *ctScannerString) scan(p int) int {
pr := p // the p at the rune start
str := s.s
states, n := s.states, s.n
// i indexes the entries of the current state block; it is only advanced
// when the current entry does not match c.
for i := 0; i < n && p < len(str); {
e := states[i]
c := str[p]
// TODO: a significant number of contractions are of a form that
// cannot match discontiguous UTF-8 in a normalized string. We could let
// a negative value of e.n mean that we can set s.done = true and avoid
// the need for additional matches.
if c >= e.l {
if e.l == c {
// Exact match on a single byte.
p++
if e.i != noIndex {
// This entry carries a result; remember it together with
// the number of bytes consumed so far.
s.index = int(e.i)
s.pindex = p
}
if e.n != final {
// Descend into the next block of e.n entries, located e.h
// entries past the end of the current block.
i, states, n = 0, states[int(e.h)+n:], int(e.n)
// Only save the state at a rune start (or end of input), so
// that a later call to scan resumes from a rune boundary.
if p >= len(str) || utf8.RuneStart(str[p]) {
s.states, s.n, pr = states, n, p
}
} else {
s.done = true
return p
}
continue
} else if e.n == final && c <= e.h {
// Final entry matching the byte range [e.l, e.h]: the result is
// offset by the position of c within the range.
p++
s.done = true
s.index = int(c-e.l) + int(e.i)
s.pindex = p
return p
}
}
i++
}
return pr
}

View File

@ -30,7 +30,7 @@ func W(ce ...int) Weights {
return w return w
} }
func (w Weights) String() string { func (w Weights) String() string {
return fmt.Sprintf("[%d.%d.%d.%d]", w.Primary, w.Secondary, w.Tertiary, w.Quaternary) return fmt.Sprintf("[%X.%X.%X.%X]", w.Primary, w.Secondary, w.Tertiary, w.Quaternary)
} }
type Table struct { type Table struct {
@ -52,7 +52,7 @@ func convertToWeights(ws []colElem) []Weights {
func convertFromWeights(ws []Weights) []colElem { func convertFromWeights(ws []Weights) []colElem {
out := make([]colElem, len(ws)) out := make([]colElem, len(ws))
for i, w := range ws { for i, w := range ws {
out[i] = makeCE([]int{w.Primary, w.Secondary, w.Tertiary}) out[i] = makeCE([]int{w.Primary, w.Secondary, w.Tertiary, 0})
if out[i] == ceIgnore && w.Quaternary > 0 { if out[i] == ceIgnore && w.Quaternary > 0 {
out[i] = makeQuaternary(w.Quaternary) out[i] = makeQuaternary(w.Quaternary)
} }
@ -61,7 +61,7 @@ func convertFromWeights(ws []Weights) []colElem {
} }
func (t *Table) AppendNext(s []byte) ([]Weights, int) { func (t *Table) AppendNext(s []byte) ([]Weights, int) {
w, n := t.t.appendNext(nil, s) w, n := t.t.appendNext(nil, source{bytes: s})
return convertToWeights(w), n return convertToWeights(w), n
} }

View File

@ -42,13 +42,26 @@ func (t *table) indexedTable(idx tableIndex) *table {
// sequence of runes, the weights for the interstitial runes are // sequence of runes, the weights for the interstitial runes are
// appended as well. It returns a new slice that includes the appended // appended as well. It returns a new slice that includes the appended
// weights and the number of bytes consumed from s. // weights and the number of bytes consumed from s.
func (t *table) appendNext(w []colElem, s []byte) ([]colElem, int) { func (t *table) appendNext(w []colElem, src source) (res []colElem, n int) {
v, sz := t.index.lookup(s) ce, sz := src.lookup(t)
ce := colElem(v)
tp := ce.ctype() tp := ce.ctype()
if tp == ceNormal { if tp == ceNormal {
if ce == 0 { if ce == 0 {
r, _ := utf8.DecodeRune(s) r, _ := src.rune()
const (
hangulSize = 3
firstHangul = 0xAC00
lastHangul = 0xD7A3
)
if r >= firstHangul && r <= lastHangul {
// TODO: performance can be considerably improved here.
n = sz
for b := src.nfd(hangulSize); len(b) > 0; b = b[sz:] {
ce, sz = t.index.lookup(b)
w = append(w, ce)
}
return w, n
}
ce = makeImplicitCE(implicitPrimary(r)) ce = makeImplicitCE(implicitPrimary(r))
} }
w = append(w, ce) w = append(w, ce)
@ -56,15 +69,20 @@ func (t *table) appendNext(w []colElem, s []byte) ([]colElem, int) {
w = t.appendExpansion(w, ce) w = t.appendExpansion(w, ce)
} else if tp == ceContractionIndex { } else if tp == ceContractionIndex {
n := 0 n := 0
w, n = t.matchContraction(w, ce, s[sz:]) src = src.tail(sz)
if src.bytes == nil {
w, n = t.matchContractionString(w, ce, src.str)
} else {
w, n = t.matchContraction(w, ce, src.bytes)
}
sz += n sz += n
} else if tp == ceDecompose { } else if tp == ceDecompose {
// Decompose using NFCK and replace tertiary weights. // Decompose using NFKD and replace tertiary weights.
t1, t2 := splitDecompose(ce) t1, t2 := splitDecompose(ce)
i := len(w) i := len(w)
nfkd := norm.NFKD.Properties(s).Decomposition() nfkd := src.properties(norm.NFKD).Decomposition()
for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] { for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
w, p = t.appendNext(w, nfkd) w, p = t.appendNext(w, source{bytes: nfkd})
} }
w[i] = w[i].updateTertiary(t1) w[i] = w[i].updateTertiary(t1)
if i++; i < len(w) { if i++; i < len(w) {
@ -99,16 +117,17 @@ func (t *table) matchContraction(w []colElem, ce colElem, suffix []byte) ([]colE
// By now we should have filtered most cases. // By now we should have filtered most cases.
p0 := p p0 := p
bufn := 0 bufn := 0
rune := norm.NFC.Properties(suffix[p:]) rune := norm.NFD.Properties(suffix[p:])
p += rune.Size() p += rune.Size()
if prevCC := rune.TrailCCC(); prevCC != 0 { if rune.LeadCCC() != 0 {
prevCC := rune.TrailCCC()
// A gap may only occur in the last normalization segment. // A gap may only occur in the last normalization segment.
// This also ensures that len(scan.s) < norm.MaxSegmentSize. // This also ensures that len(scan.s) < norm.MaxSegmentSize.
if end := norm.NFC.FirstBoundary(suffix[p:]); end != -1 { if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
scan.s = suffix[:p+end] scan.s = suffix[:p+end]
} }
for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf { for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
rune = norm.NFC.Properties(suffix[p:]) rune = norm.NFD.Properties(suffix[p:])
if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc { if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
break break
} }
@ -136,7 +155,65 @@ func (t *table) matchContraction(w []colElem, ce colElem, suffix []byte) ([]colE
} }
// Append weights for the runes in the segment not part of the contraction. // Append weights for the runes in the segment not part of the contraction.
for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] { for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
w, p = t.appendNext(w, b) w, p = t.appendNext(w, source{bytes: b})
}
return w, n
}
// TODO: unify the two implementations. This is best done after first simplifying
// the algorithm taking into account the inclusion of both NFC and NFD forms
// in the table.
//
// matchContractionString matches suffix against the contraction identified by
// ce and appends the resulting collation elements to w. If runes had to be
// skipped to complete the match, their elements are appended afterwards.
// It returns the extended slice and the number of bytes of suffix reported as
// consumed by the scanner.
func (t *table) matchContractionString(w []colElem, ce colElem, suffix string) ([]colElem, int) {
index, n, offset := splitContractIndex(ce)
scan := t.contractTries.scannerString(index, n, suffix)
// buf collects the bytes of skipped (interstitial) runes. bufp is the
// length of buf at the time of the last match recorded by the scanner.
buf := [norm.MaxSegmentSize]byte{}
bufp := 0
// First try to match contiguously from the start of suffix.
p := scan.scan(0)
if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
// By now we should have filtered most cases.
// p0 marks the start of the skipped bytes not yet copied to buf.
p0 := p
bufn := 0
rune := norm.NFD.PropertiesString(suffix[p:])
p += rune.Size()
// Skipping is only attempted when the next rune has a non-zero
// leading CCC.
if rune.LeadCCC() != 0 {
prevCC := rune.TrailCCC()
// A gap may only occur in the last normalization segment.
// This also ensures that len(scan.s) < norm.MaxSegmentSize.
if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
scan.s = suffix[:p+end]
}
for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
rune = norm.NFD.PropertiesString(suffix[p:])
// Stop at a rune with CCC 0 or one whose CCC does not exceed that
// of the previous rune.
if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
break
}
prevCC = rune.TrailCCC()
if pp := scan.scan(p); pp != p {
// Copy the interstitial runes for later processing.
bufn += copy(buf[bufn:], suffix[p0:p])
// Commit the copied bytes only if the scanner recorded a
// match ending exactly at pp.
if scan.pindex == pp {
bufp = bufn
}
p, p0 = pp, pp
} else {
// No progress in the trie: skip this rune.
p += rune.Size()
}
}
}
}
// Append weights for the matched contraction, which may be an expansion.
i, n := scan.result()
ce = colElem(t.contractElem[i+offset])
if ce.ctype() == ceNormal {
w = append(w, ce)
} else {
w = t.appendExpansion(w, ce)
}
// Append weights for the runes in the segment not part of the contraction.
for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
w, p = t.appendNext(w, source{bytes: b})
}
return w, n
}

View File

@ -42,7 +42,9 @@ func pt(p, t int) []int {
func makeTable(in []input) (*collate.Collator, error) { func makeTable(in []input) (*collate.Collator, error) {
b := build.NewBuilder() b := build.NewBuilder()
for _, r := range in { for _, r := range in {
b.Add([]rune(r.str), r.ces, nil) if e := b.Add([]rune(r.str), r.ces, nil); e != nil {
panic(e)
}
} }
return b.Build() return b.Build()
} }
@ -159,6 +161,7 @@ var appendNextTests = []tableTest{
{"b", [][]int{{200}}}, {"b", [][]int{{200}}},
{"c", [][]int{{300}}}, {"c", [][]int{{300}}},
{"\u03B1", [][]int{{900}}}, {"\u03B1", [][]int{{900}}},
{"\x01", [][]int{{0, 0, 0, 0}}},
// contractions // contractions
{"a\u0300", [][]int{{101}}}, {"a\u0300", [][]int{{101}}},
@ -171,10 +174,11 @@ var appendNextTests = []tableTest{
{"a\u0301\u035F", [][]int{{121}}}, {"a\u0301\u035F", [][]int{{121}}},
{"a\u0301\u035Fb", [][]int{{119}}}, {"a\u0301\u035Fb", [][]int{{119}}},
{"\u03B1\u0345", [][]int{{901}, {902}}}, {"\u03B1\u0345", [][]int{{901}, {902}}},
{"\u302E\u18A9", [][]int{{0, 131}, {0, 132}}}, {"\u302E\u302F", [][]int{{0, 131}, {0, 131}}},
{"\u302F\u18A9", [][]int{{0, 130}}}, {"\u302F\u18A9", [][]int{{0, 130}}},
}...), }...),
[]check{ []check{
{"a\x01\u0300", 1, ColElems{w(100)}},
{"ab", 1, ColElems{w(100)}}, // closing segment {"ab", 1, ColElems{w(100)}}, // closing segment
{"a\u0316\u0300b", 5, ColElems{w(101), w(0, 220)}}, // closing segment {"a\u0316\u0300b", 5, ColElems{w(101), w(0, 220)}}, // closing segment
{"a\u0316\u0300", 5, ColElems{w(101), w(0, 220)}}, // no closing segment {"a\u0316\u0300", 5, ColElems{w(101), w(0, 220)}}, // no closing segment
@ -239,12 +243,17 @@ var appendNextTests = []tableTest{
{"a\u302F\u18A9\u0301", 9, ColElems{w(102), w(0, 130)}}, {"a\u302F\u18A9\u0301", 9, ColElems{w(102), w(0, 130)}},
// expansion within a gap // expansion within a gap
{"a\u0317\u0301", 5, ColElems{w(102), w(0, 220), w(0, 220)}}, {"a\u0317\u0301", 5, ColElems{w(102), w(0, 220), w(0, 220)}},
{"a\u302E\u18A9\u0301", 9, ColElems{w(102), w(0, 131), w(0, 132)}}, // repeating CCC blocks last modifier
{ {"a\u302E\u302F\u0301", 1, ColElems{w(100)}},
"a\u0317\u302E\u18A9\u0301", // The trailing combining characters (with lower CCC) should block the first one.
11, // TODO: make the following pass.
ColElems{w(102), w(0, 220), w(0, 220), w(0, 131), w(0, 132)}, // {"a\u035E\u0316\u0316", 1, ColElems{w(100)}},
}, {"a\u035F\u035Eb", 5, ColElems{w(110), w(0, 233)}},
// Last combiner should match after normalization.
// TODO: make the following pass.
// {"a\u035D\u0301", 3, ColElems{w(102), w(0, 234)}},
// The first combiner is blocking the second one as they have the same CCC.
{"a\u035D\u035Eb", 1, ColElems{w(100)}},
}, },
}, },
} }

File diff suppressed because it is too large Load Diff

View File

@ -97,3 +97,64 @@ func (t *trie) lookup(s []byte) (v colElem, sz int) {
// Illegal rune // Illegal rune
return 0, 1 return 0, 1
} }
// lookupString is the string-input counterpart of lookup. It returns the
// value stored in the trie for the UTF-8 sequence at the start of s and the
// number of bytes that make up that sequence.
// For input it cannot decode it returns a zero value: with size 0 when s is
// too short for the sequence announced by its first byte, and otherwise with
// the number of bytes that were accepted before the offending one (at least 1).
// s must not be empty.
func (t *trie) lookupString(s string) (v colElem, sz int) {
	lead := s[0]
	if lead < tx {
		return colElem(t.values0[lead]), 1
	}
	// Derive the length of the sequence from the range the first byte
	// falls in.
	var size int
	switch {
	case lead < t2:
		return 0, 1
	case lead < t3:
		size = 2
	case lead < t4:
		size = 3
	case lead < t5:
		size = 4
	default:
		// Illegal rune
		return 0, 1
	}
	if len(s) < size {
		return 0, 0
	}
	// Walk the index one trailing byte at a time; the last byte selects the
	// value instead of another index block.
	idx := t.index0[lead]
	for k := 1; ; k++ {
		c := s[k]
		if c < tx || t2 <= c {
			return 0, k
		}
		if k == size-1 {
			return t.lookupValue(idx, c), size
		}
		idx = t.index[int(idx)<<6+int(c)]
	}
}