exp/locale/collate: include composed characters into the table. This eliminates
the need to decompose characters for the majority of cases. This considerably
speeds up collation while increasing the table size minimally. To detect
non-normalized strings, rather than relying on exp/norm, the table now includes
CCC information. The inclusion of this information does not increase table size.

DETAILS
 - Raw collation elements are now a struct that includes the CCC, rather than
   a slice of ints.
 - Builder now ensures that NFD and NFC counterparts are included in the table.
   This also fixes a bug for Korean, which is responsible for most of the growth
   of the table size.
 - As there is no more normalization step, code should now handle both strings
   and byte slices as input. Introduced a source type to facilitate this.

NOTES
 - This change does not yet handle normalization entirely correctly for
   contractions. This causes a few failures with the regtest. table_test.go
   contains a few commented-out tests that can be enabled once this is fixed.
   The easiest is to fix this once we have the new norm.Iter.
 - Removed a test case in table_test that covers cases that are now guaranteed
   not to exist.

R=rsc, mpvl
CC=golang-dev
https://golang.org/cl/6971044
commit 9aa70984a9 (parent 43f2fc308b)
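
For reference, the new raw collation element representation that this change introduces (see the hunk further down that adds type rawCE) is, in sketch form:

	// rawCE holds the unpacked collation weights of a single collation
	// element together with the canonical combining class (CCC) of the
	// rune it was derived from. Carrying the CCC in the table is what
	// lets the collator detect non-normalized input without a separate
	// exp/norm pass.
	type rawCE struct {
		w   []int // primary, secondary, tertiary, quaternary weights
		ccc uint8 // canonical combining class
	}

	// makeRawCE copies w into a fresh 4-element slice so that later
	// in-place adjustments (such as filling in default secondary and
	// tertiary weights in Builder.Add) do not alias the caller's slice.
	func makeRawCE(w []int, ccc uint8) rawCE {
		ce := rawCE{w: make([]int, 4), ccc: ccc}
		copy(ce.w, w)
		return ce
	}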
@@ -98,24 +98,24 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
 // a value for each colelem that is a variable. (See the reference above.)
 func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
 	str := string(runes)
-	elems := make([][]int, len(colelems))
+	elems := make([]rawCE, len(colelems))
 	for i, ce := range colelems {
-		elems[i] = append(elems[i], ce...)
 		if len(ce) == 0 {
-			elems[i] = append(elems[i], []int{0, 0, 0, 0}...)
 			break
 		}
+		elems[i] = makeRawCE(ce, 0)
 		if len(ce) == 1 {
-			elems[i] = append(elems[i], defaultSecondary)
+			elems[i].w[1] = defaultSecondary
 		}
 		if len(ce) <= 2 {
-			elems[i] = append(elems[i], defaultTertiary)
+			elems[i].w[2] = defaultTertiary
 		}
 		if len(ce) <= 3 {
-			elems[i] = append(elems[i], ce[0])
+			elems[i].w[3] = ce[0]
 		}
 	}
 	for i, ce := range elems {
+		p := ce.w[0]
 		isvar := false
 		for _, j := range variables {
 			if i == j {
@@ -123,18 +123,18 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
 			}
 		}
 		if isvar {
-			if ce[0] >= b.minNonVar && b.minNonVar > 0 {
-				return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", ce[0], b.minNonVar)
+			if p >= b.minNonVar && b.minNonVar > 0 {
+				return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
 			}
-			if ce[0] > b.varTop {
-				b.varTop = ce[0]
+			if p > b.varTop {
+				b.varTop = p
 			}
-		} else if ce[0] > 1 { // 1 is a special primary value reserved for FFFE
-			if ce[0] <= b.varTop {
-				return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", ce[0], b.varTop)
+		} else if p > 1 { // 1 is a special primary value reserved for FFFE
+			if p <= b.varTop {
+				return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
 			}
-			if b.minNonVar == 0 || ce[0] < b.minNonVar {
-				b.minNonVar = ce[0]
+			if b.minNonVar == 0 || p < b.minNonVar {
+				b.minNonVar = p
 			}
 		}
 	}
@@ -142,16 +142,42 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
 	if err != nil {
 		return err
 	}
+	cccs := []uint8{}
+	nfd := norm.NFD.String(str)
+	for i := range nfd {
+		cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
+	}
+	if len(cccs) < len(elems) {
+		if len(cccs) > 2 {
+			return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
+		}
+		p := len(elems) - 1
+		for ; p > 0 && elems[p].w[0] == 0; p-- {
+			elems[p].ccc = cccs[len(cccs)-1]
+		}
+		for ; p >= 0; p-- {
+			elems[p].ccc = cccs[0]
+		}
+	} else {
+		for i := range elems {
+			elems[i].ccc = cccs[i]
+		}
+	}
+	// doNorm in collate.go assumes that the following conditions hold.
+	if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
+		return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
+	}
 	b.root.newEntry(str, elems)
 	return nil
 }
 
 func (t *Tailoring) setAnchor(anchor string) error {
-	anchor = norm.NFD.String(anchor)
+	anchor = norm.NFC.String(anchor)
 	a := t.index.find(anchor)
 	if a == nil {
 		a = t.index.newEntry(anchor, nil)
 		a.implicit = true
+		a.modified = true
 		for _, r := range []rune(anchor) {
 			e := t.index.find(string(r))
 			e.lock = true
@@ -221,7 +247,7 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
 	if t.anchor == nil {
 		return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
 	}
-	str = norm.NFD.String(str)
+	str = norm.NFC.String(str)
 	e := t.index.find(str)
 	if e == nil {
 		e = t.index.newEntry(str, nil)
@@ -262,12 +288,13 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
 	}
 	e.extend = norm.NFD.String(extend)
 	e.exclude = false
+	e.modified = true
 	e.elems = nil
 	t.anchor = e
 	return nil
 }
 
-func (o *ordering) getWeight(e *entry) [][]int {
+func (o *ordering) getWeight(e *entry) []rawCE {
 	if len(e.elems) == 0 && e.logical == noAnchor {
 		if e.implicit {
 			for _, r := range e.runes {
@@ -279,11 +306,10 @@ func (o *ordering) getWeight(e *entry) [][]int {
 			for ; a.elems == nil && !a.implicit; a = a.next {
 				count[a.level]++
 			}
-			e.elems = append([][]int(nil), make([]int, len(a.elems[0])))
-			copy(e.elems[0], a.elems[0])
+			e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
 			for i := collate.Primary; i < collate.Quaternary; i++ {
 				if count[i] != 0 {
-					e.elems[0][i] -= count[i]
+					e.elems[0].w[i] -= count[i]
 					break
 				}
 			}
@@ -315,11 +341,11 @@ func (o *ordering) verifyWeights(a, b *entry, level collate.Level) error {
 		return nil
 	}
 	for i := collate.Primary; i < level; i++ {
-		if a.elems[0][i] < b.elems[0][i] {
+		if a.elems[0].w[i] < b.elems[0].w[i] {
 			return nil
 		}
 	}
-	if a.elems[0][level] >= b.elems[0][level] {
+	if a.elems[0].w[level] >= b.elems[0].w[level] {
 		err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
 		log.Println(err)
 		// TODO: return the error instead, or better, fix the conflicting entry by making room.
@@ -339,6 +365,54 @@ func (b *Builder) errorID(locale string, e error) {
 	}
 }
 
+// patchNorm ensures that NFC and NFD counterparts are consistent.
+func (o *ordering) patchNorm() {
+	// Insert the NFD counterparts, if necessary.
+	for _, e := range o.ordered {
+		nfd := norm.NFD.String(e.str)
+		if nfd != e.str {
+			if e0 := o.find(nfd); e0 != nil && !e0.modified {
+				e0.elems = e.elems
+			} else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) {
+				e := o.newEntry(nfd, e.elems)
+				e.modified = true
+			}
+		}
+	}
+	// Update unchanged composed forms if one of their parts changed.
+	for _, e := range o.ordered {
+		nfd := norm.NFD.String(e.str)
+		if e.modified || nfd == e.str {
+			continue
+		}
+		if e0 := o.find(nfd); e0 != nil {
+			e.elems = e0.elems
+		} else {
+			e.elems = o.genColElems(nfd)
+			if norm.NFD.LastBoundary([]byte(nfd)) == 0 {
+				r := []rune(nfd)
+				head := string(r[0])
+				tail := ""
+				for i := 1; i < len(r); i++ {
+					s := norm.NFC.String(head + string(r[i]))
+					if e0 := o.find(s); e0 != nil && e0.modified {
+						head = s
+					} else {
+						tail += string(r[i])
+					}
+				}
+				e.elems = append(o.genColElems(head), o.genColElems(tail)...)
+			}
+		}
+	}
+	// Exclude entries for which the individual runes generate the same collation elements.
+	for _, e := range o.ordered {
+		if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) {
+			e.exclude = true
+		}
+	}
+}
 
 func (b *Builder) buildOrdering(o *ordering) {
 	for _, e := range o.ordered {
 		o.getWeight(e)
@@ -346,6 +420,7 @@ func (b *Builder) buildOrdering(o *ordering) {
 	for _, e := range o.ordered {
 		o.addExtension(e)
 	}
+	o.patchNorm()
 	o.sort()
 	simplify(o)
 	b.processExpansions(o) // requires simplify
@@ -436,20 +511,20 @@ func (b *Builder) Print(w io.Writer) (n int, err error) {
 
 // reproducibleFromNFKD checks whether the given expansion could be generated
 // from an NFKD expansion.
-func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool {
+func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
 	// Length must be equal.
 	if len(exp) != len(nfkd) {
 		return false
 	}
 	for i, ce := range exp {
 		// Primary and secondary values should be equal.
-		if ce[0] != nfkd[i][0] || ce[1] != nfkd[i][1] {
+		if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
 			return false
 		}
 		// Tertiary values should be equal to maxTertiary for third element onwards.
 		// TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
 		// simply be dropped. Try this out by dropping the following code.
-		if i >= 2 && ce[2] != maxTertiary {
+		if i >= 2 && ce.w[2] != maxTertiary {
 			return false
 		}
 		if _, err := makeCE(ce); err != nil {
@@ -469,22 +544,12 @@ func simplify(o *ordering) {
 			keep[e.runes[0]] = true
 		}
 	}
-	// Remove entries for which the runes normalize (using NFD) to identical values.
-	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
-		s := e.str
-		nfd := norm.NFD.String(s)
-		if len(e.runes) > 1 || keep[e.runes[0]] || nfd == s {
-			continue
-		}
-		if equalCEArrays(o.genColElems(nfd), e.elems) {
-			e.remove()
-		}
-	}
 	// Tag entries for which the runes NFKD decompose to identical values.
 	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
 		s := e.str
 		nfkd := norm.NFKD.String(s)
-		if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == s {
+		nfd := norm.NFD.String(s)
+		if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
 			continue
 		}
 		if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
@@ -589,18 +654,18 @@ func (b *Builder) processContractions(o *ordering) {
 		// Bucket sort entries in index order.
 		es := make([]*entry, len(l))
 		for _, e := range l {
-			var o, sn int
+			var p, sn int
 			if len(e.runes) > 1 {
 				str := []byte(string(e.runes[1:]))
-				o, sn = t.contractTries.lookup(handle, str)
+				p, sn = t.contractTries.lookup(handle, str)
 				if sn != len(str) {
-					log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", e.runes, sn, len(str))
+					log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
 				}
 			}
-			if es[o] != nil {
-				log.Fatalf("Multiple contractions for position %d for rune %U", o, e.runes[0])
+			if es[p] != nil {
+				log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
 			}
-			es[o] = e
+			es[p] = e
 		}
 		// Create collation elements for contractions.
 		elems := []uint32{}
|
@ -7,48 +7,64 @@ package build
|
|||||||
import "testing"
|
import "testing"
|
||||||
|
|
||||||
// cjk returns an implicit collation element for a CJK rune.
|
// cjk returns an implicit collation element for a CJK rune.
|
||||||
func cjk(r rune) [][]int {
|
func cjk(r rune) []rawCE {
|
||||||
// A CJK character C is represented in the DUCET as
|
// A CJK character C is represented in the DUCET as
|
||||||
// [.AAAA.0020.0002.C][.BBBB.0000.0000.C]
|
// [.AAAA.0020.0002.C][.BBBB.0000.0000.C]
|
||||||
// Where AAAA is the most significant 15 bits plus a base value.
|
// Where AAAA is the most significant 15 bits plus a base value.
|
||||||
// Any base value will work for the test, so we pick the common value of FB40.
|
// Any base value will work for the test, so we pick the common value of FB40.
|
||||||
const base = 0xFB40
|
const base = 0xFB40
|
||||||
return [][]int{
|
return []rawCE{
|
||||||
{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)},
|
{w: []int{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)}},
|
||||||
{int(r&0x7FFF) | 0x8000, 0, 0, int(r)},
|
{w: []int{int(r&0x7FFF) | 0x8000, 0, 0, int(r)}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func pCE(p int) [][]int {
|
func pCE(p int) []rawCE {
|
||||||
return [][]int{{p, defaultSecondary, defaultTertiary, 0}}
|
return mkCE([]int{p, defaultSecondary, defaultTertiary, 0}, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
func pqCE(p, q int) [][]int {
|
func pqCE(p, q int) []rawCE {
|
||||||
return [][]int{{p, defaultSecondary, defaultTertiary, q}}
|
return mkCE([]int{p, defaultSecondary, defaultTertiary, q}, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
func ptCE(p, t int) [][]int {
|
func ptCE(p, t int) []rawCE {
|
||||||
return [][]int{{p, defaultSecondary, t, 0}}
|
return mkCE([]int{p, defaultSecondary, t, 0}, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
func sCE(s int) [][]int {
|
func ptcCE(p, t int, ccc uint8) []rawCE {
|
||||||
return [][]int{{0, s, defaultTertiary, 0}}
|
return mkCE([]int{p, defaultSecondary, t, 0}, ccc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func stCE(s, t int) [][]int {
|
func sCE(s int) []rawCE {
|
||||||
return [][]int{{0, s, t, 0}}
|
return mkCE([]int{0, s, defaultTertiary, 0}, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
func stCE(s, t int) []rawCE {
|
||||||
|
return mkCE([]int{0, s, t, 0}, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
func scCE(s int, ccc uint8) []rawCE {
|
||||||
|
return mkCE([]int{0, s, defaultTertiary, 0}, ccc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func mkCE(w []int, ccc uint8) []rawCE {
|
||||||
|
return []rawCE{rawCE{w, ccc}}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ducetElem is used to define test data that is used to generate a table.
|
// ducetElem is used to define test data that is used to generate a table.
|
||||||
type ducetElem struct {
|
type ducetElem struct {
|
||||||
str string
|
str string
|
||||||
ces [][]int
|
ces []rawCE
|
||||||
}
|
}
|
||||||
|
|
||||||
func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
|
func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
|
||||||
b := NewBuilder()
|
b := NewBuilder()
|
||||||
for _, e := range ducet {
|
for _, e := range ducet {
|
||||||
if err := b.Add([]rune(e.str), e.ces, nil); err != nil {
|
ces := [][]int{}
|
||||||
|
for _, ce := range e.ces {
|
||||||
|
ces = append(ces, ce.w)
|
||||||
|
}
|
||||||
|
if err := b.Add([]rune(e.str), ces, nil); err != nil {
|
||||||
t.Errorf(err.Error())
|
t.Errorf(err.Error())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -58,7 +74,7 @@ func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type convertTest struct {
|
type convertTest struct {
|
||||||
in, out [][]int
|
in, out []rawCE
|
||||||
err bool
|
err bool
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -173,16 +189,18 @@ func TestSimplify(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var expandTest = []ducetElem{
|
var expandTest = []ducetElem{
|
||||||
{"\u00C0", append(ptCE(100, 8), sCE(30)...)},
|
{"\u0300", append(scCE(29, 230), scCE(30, 230)...)},
|
||||||
{"\u00C8", append(ptCE(105, 8), sCE(30)...)},
|
{"\u00C0", append(ptCE(100, 8), scCE(30, 230)...)},
|
||||||
{"\u00C9", append(ptCE(105, 8), sCE(30)...)}, // identical expansion
|
{"\u00C8", append(ptCE(105, 8), scCE(30, 230)...)},
|
||||||
|
{"\u00C9", append(ptCE(105, 8), scCE(30, 230)...)}, // identical expansion
|
||||||
{"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0], ptCE(200, 4)[0])},
|
{"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0], ptCE(200, 4)[0])},
|
||||||
|
{"\u01FF", append(ptCE(200, 4), ptcCE(201, 4, 0)[0], scCE(30, 230)[0])},
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestExpand(t *testing.T) {
|
func TestExpand(t *testing.T) {
|
||||||
const (
|
const (
|
||||||
totalExpansions = 3
|
totalExpansions = 5
|
||||||
totalElements = 2 + 2 + 3 + totalExpansions
|
totalElements = 2 + 2 + 2 + 3 + 3 + totalExpansions
|
||||||
)
|
)
|
||||||
b := newBuilder(t, expandTest)
|
b := newBuilder(t, expandTest)
|
||||||
o := &b.root
|
o := &b.root
|
||||||
|
@@ -16,6 +16,17 @@ const (
 	maxTertiary = 0x1F
 )
 
+type rawCE struct {
+	w   []int
+	ccc uint8
+}
+
+func makeRawCE(w []int, ccc uint8) rawCE {
+	ce := rawCE{w: make([]int, 4), ccc: ccc}
+	copy(ce.w, w)
+	return ce
+}
+
 // A collation element is represented as an uint32.
 // In the typical case, a rune maps to a single collation element. If a rune
 // can be the start of a contraction or expands into multiple collation elements,
@@ -29,29 +40,36 @@ const (
 // 01pppppp pppppppp ppppppp0 ssssssss
 // - p* is primary collation value
 // - s* is the secondary collation value
-// or
 // 00pppppp pppppppp ppppppps sssttttt, where
 // - p* is primary collation value
 // - s* offset of secondary from default value.
 // - t* is the tertiary collation value
+// 100ttttt cccccccc pppppppp pppppppp
+// - t* is the tertiar collation value
+// - c* is the cannonical combining class
+// - p* is the primary collation value
 // Collation elements with a secondary value are of the form
-// 10000000 0000ssss ssssssss tttttttt, where
-// - 16 BMP implicit -> weight
-// - 8 bit s
-// - default tertiary
+// 1010cccc ccccssss ssssssss tttttttt, where
+// - c* is the canonical combining class
+// - s* is the secondary collation value
+// - t* is the tertiary collation value
 const (
 	maxPrimaryBits = 21
+	maxPrimaryCompactBits = 16
 	maxSecondaryBits = 12
 	maxSecondaryCompactBits = 8
+	maxCCCBits = 8
 	maxSecondaryDiffBits = 4
 	maxTertiaryBits = 8
 	maxTertiaryCompactBits = 5
 
-	isSecondary = 0x80000000
 	isPrimary = 0x40000000
+	isPrimaryCCC = 0x80000000
+	isSecondary = 0xA0000000
 )
 
-func makeCE(weights []int) (uint32, error) {
+func makeCE(rce rawCE) (uint32, error) {
+	weights := rce.w
 	if w := weights[0]; w >= 1<<maxPrimaryBits || w < 0 {
 		return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
 	}
@@ -63,14 +81,25 @@ func makeCE(weights []int) (uint32, error) {
 	}
 	ce := uint32(0)
 	if weights[0] != 0 {
-		if weights[2] == defaultTertiary {
+		if rce.ccc != 0 {
+			if weights[0] >= 1<<maxPrimaryCompactBits {
+				return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", weights[0], 1<<maxPrimaryCompactBits)
+			}
+			if weights[1] != defaultSecondary {
+				return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", weights[1], rce.ccc)
+			}
+			ce = uint32(weights[2] << (maxPrimaryCompactBits + maxCCCBits))
+			ce |= uint32(rce.ccc) << maxPrimaryCompactBits
+			ce |= uint32(weights[0])
+			ce |= isPrimaryCCC
+		} else if weights[2] == defaultTertiary {
 			if weights[1] >= 1<<maxSecondaryCompactBits {
 				return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights[1], 1<<maxSecondaryCompactBits)
 			}
 			ce = uint32(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
 			ce |= isPrimary
 		} else {
-			d := weights[1] - defaultSecondary + 4
+			d := weights[1] - defaultSecondary + maxSecondaryDiffBits
 			if d >= 1<<maxSecondaryDiffBits || d < 0 {
 				return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
 			}
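
To make the new primary-with-CCC encoding concrete, here is a worked example (a sketch only; it reuses the constants defined in the hunk above, and the resulting value matches the ceTests entry {0x123, defaultSecondary, 8, 0xFF} -> 0x88FF0123 added later in this diff):

	// Packing tertiary 8, CCC 0xFF and primary 0x123 into the
	// 100ttttt cccccccc pppppppp pppppppp layout used by makeCE:
	ce := uint32(8) << (maxPrimaryCompactBits + maxCCCBits) // 8 << 24   = 0x08000000
	ce |= uint32(0xFF) << maxPrimaryCompactBits             // 0xFF << 16 = 0x08FF0000
	ce |= uint32(0x123)                                     // primary    = 0x08FF0123
	ce |= isPrimaryCCC                                      // tag bit    = 0x88FF0123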
@ -82,6 +111,7 @@ func makeCE(weights []int) (uint32, error) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
|
ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
|
||||||
|
ce += uint32(rce.ccc) << (maxSecondaryBits + maxTertiaryBits)
|
||||||
ce |= isSecondary
|
ce |= isSecondary
|
||||||
}
|
}
|
||||||
return ce, nil
|
return ce, nil
|
||||||
@ -207,7 +237,7 @@ func implicitPrimary(r rune) int {
|
|||||||
// We will rewrite these characters to a single CE.
|
// We will rewrite these characters to a single CE.
|
||||||
// We assume the CJK values start at 0x8000.
|
// We assume the CJK values start at 0x8000.
|
||||||
// See http://unicode.org/reports/tr10/#Implicit_Weights
|
// See http://unicode.org/reports/tr10/#Implicit_Weights
|
||||||
func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
|
||||||
const (
|
const (
|
||||||
cjkPrimaryStart = 0xFB40
|
cjkPrimaryStart = 0xFB40
|
||||||
rarePrimaryStart = 0xFB80
|
rarePrimaryStart = 0xFB80
|
||||||
@ -219,7 +249,7 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
|||||||
shiftBits = 15
|
shiftBits = 15
|
||||||
)
|
)
|
||||||
for i := 0; i < len(elems); i++ {
|
for i := 0; i < len(elems); i++ {
|
||||||
ce := elems[i]
|
ce := elems[i].w
|
||||||
p := ce[0]
|
p := ce[0]
|
||||||
if p < cjkPrimaryStart {
|
if p < cjkPrimaryStart {
|
||||||
continue
|
continue
|
||||||
@ -233,10 +263,10 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
|||||||
if i+1 >= len(elems) {
|
if i+1 >= len(elems) {
|
||||||
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
|
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
|
||||||
}
|
}
|
||||||
if elems[i+1][0]&lowBitsFlag == 0 {
|
if elems[i+1].w[0]&lowBitsFlag == 0 {
|
||||||
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
|
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
|
||||||
}
|
}
|
||||||
np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask
|
np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
|
||||||
switch {
|
switch {
|
||||||
case p < rarePrimaryStart:
|
case p < rarePrimaryStart:
|
||||||
np += commonUnifiedOffset
|
np += commonUnifiedOffset
|
||||||
@ -257,26 +287,25 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
|||||||
|
|
||||||
// nextWeight computes the first possible collation weights following elems
|
// nextWeight computes the first possible collation weights following elems
|
||||||
// for the given level.
|
// for the given level.
|
||||||
func nextWeight(level collate.Level, elems [][]int) [][]int {
|
func nextWeight(level collate.Level, elems []rawCE) []rawCE {
|
||||||
if level == collate.Identity {
|
if level == collate.Identity {
|
||||||
next := make([][]int, len(elems))
|
next := make([]rawCE, len(elems))
|
||||||
copy(next, elems)
|
copy(next, elems)
|
||||||
return next
|
return next
|
||||||
}
|
}
|
||||||
next := [][]int{make([]int, len(elems[0]))}
|
next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
|
||||||
copy(next[0], elems[0])
|
next[0].w[level]++
|
||||||
next[0][level]++
|
|
||||||
if level < collate.Secondary {
|
if level < collate.Secondary {
|
||||||
next[0][collate.Secondary] = defaultSecondary
|
next[0].w[collate.Secondary] = defaultSecondary
|
||||||
}
|
}
|
||||||
if level < collate.Tertiary {
|
if level < collate.Tertiary {
|
||||||
next[0][collate.Tertiary] = defaultTertiary
|
next[0].w[collate.Tertiary] = defaultTertiary
|
||||||
}
|
}
|
||||||
// Filter entries that cannot influence ordering.
|
// Filter entries that cannot influence ordering.
|
||||||
for _, ce := range elems[1:] {
|
for _, ce := range elems[1:] {
|
||||||
skip := true
|
skip := true
|
||||||
for i := collate.Primary; i < level; i++ {
|
for i := collate.Primary; i < level; i++ {
|
||||||
skip = skip && ce[i] == 0
|
skip = skip && ce.w[i] == 0
|
||||||
}
|
}
|
||||||
if !skip {
|
if !skip {
|
||||||
next = append(next, ce)
|
next = append(next, ce)
|
||||||
@ -285,18 +314,18 @@ func nextWeight(level collate.Level, elems [][]int) [][]int {
|
|||||||
return next
|
return next
|
||||||
}
|
}
|
||||||
|
|
||||||
func nextVal(elems [][]int, i int, level collate.Level) (index, value int) {
|
func nextVal(elems []rawCE, i int, level collate.Level) (index, value int) {
|
||||||
for ; i < len(elems) && elems[i][level] == 0; i++ {
|
for ; i < len(elems) && elems[i].w[level] == 0; i++ {
|
||||||
}
|
}
|
||||||
if i < len(elems) {
|
if i < len(elems) {
|
||||||
return i, elems[i][level]
|
return i, elems[i].w[level]
|
||||||
}
|
}
|
||||||
return i, 0
|
return i, 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
|
// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
|
||||||
// It also returns the collation level at which the difference is found.
|
// It also returns the collation level at which the difference is found.
|
||||||
func compareWeights(a, b [][]int) (result int, level collate.Level) {
|
func compareWeights(a, b []rawCE) (result int, level collate.Level) {
|
||||||
for level := collate.Primary; level < collate.Identity; level++ {
|
for level := collate.Primary; level < collate.Identity; level++ {
|
||||||
var va, vb int
|
var va, vb int
|
||||||
for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
|
for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
|
||||||
@ -314,19 +343,16 @@ func compareWeights(a, b [][]int) (result int, level collate.Level) {
|
|||||||
return 0, collate.Identity
|
return 0, collate.Identity
|
||||||
}
|
}
|
||||||
|
|
||||||
func equalCE(a, b []int) bool {
|
func equalCE(a, b rawCE) bool {
|
||||||
if len(a) != len(b) {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
for i := 0; i < 3; i++ {
|
for i := 0; i < 3; i++ {
|
||||||
if b[i] != a[i] {
|
if b.w[i] != a.w[i] {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
func equalCEArrays(a, b [][]int) bool {
|
func equalCEArrays(a, b []rawCE) bool {
|
||||||
if len(a) != len(b) {
|
if len(a) != len(b) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
@ -16,7 +16,7 @@ type ceTest struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func normalCE(in []int) (ce uint32, err error) {
|
func normalCE(in []int) (ce uint32, err error) {
|
||||||
return makeCE(in)
|
return makeCE(rawCE{w: in[:3], ccc: uint8(in[3])})
|
||||||
}
|
}
|
||||||
|
|
||||||
func expandCE(in []int) (ce uint32, err error) {
|
func expandCE(in []int) (ce uint32, err error) {
|
||||||
@ -32,17 +32,20 @@ func decompCE(in []int) (ce uint32, err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var ceTests = []ceTest{
|
var ceTests = []ceTest{
|
||||||
{normalCE, []int{0, 0, 0}, 0x80000000},
|
{normalCE, []int{0, 0, 0, 0}, 0xA0000000},
|
||||||
{normalCE, []int{0, 0x28, 3}, 0x80002803},
|
{normalCE, []int{0, 0x28, 3, 0}, 0xA0002803},
|
||||||
{normalCE, []int{100, defaultSecondary, 3}, 0x0000C883},
|
{normalCE, []int{0, 0x28, 3, 0xFF}, 0xAFF02803},
|
||||||
|
{normalCE, []int{100, defaultSecondary, 3, 0}, 0x0000C883},
|
||||||
// non-ignorable primary with non-default secondary
|
// non-ignorable primary with non-default secondary
|
||||||
{normalCE, []int{100, 0x28, defaultTertiary}, 0x4000C828},
|
{normalCE, []int{100, 0x28, defaultTertiary, 0}, 0x4000C828},
|
||||||
{normalCE, []int{100, defaultSecondary + 8, 3}, 0x0000C983},
|
{normalCE, []int{100, defaultSecondary + 8, 3, 0}, 0x0000C983},
|
||||||
{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-supported secondary
|
{normalCE, []int{100, 0, 3, 0}, 0xFFFF}, // non-ignorable primary with non-supported secondary
|
||||||
{normalCE, []int{100, 1, 3}, 0xFFFF},
|
{normalCE, []int{100, 1, 3, 0}, 0xFFFF},
|
||||||
{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF},
|
{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0, 0}, 0xFFFF},
|
||||||
{normalCE, []int{0, 1 << maxSecondaryBits, 0}, 0xFFFF},
|
{normalCE, []int{0, 1 << maxSecondaryBits, 0, 0}, 0xFFFF},
|
||||||
{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits}, 0xFFFF},
|
{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits, 0}, 0xFFFF},
|
||||||
|
{normalCE, []int{0x123, defaultSecondary, 8, 0xFF}, 0x88FF0123},
|
||||||
|
{normalCE, []int{0x123, defaultSecondary + 1, 8, 0xFF}, 0xFFFF},
|
||||||
|
|
||||||
{contractCE, []int{0, 0, 0}, 0xC0000000},
|
{contractCE, []int{0, 0, 0}, 0xC0000000},
|
||||||
{contractCE, []int{1, 1, 1}, 0xC0010011},
|
{contractCE, []int{1, 1, 1}, 0xC0010011},
|
||||||
@ -85,6 +88,14 @@ func TestColElem(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func mkRawCES(in [][]int) []rawCE {
|
||||||
|
out := []rawCE{}
|
||||||
|
for _, w := range in {
|
||||||
|
out = append(out, rawCE{w: w})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
type weightsTest struct {
|
type weightsTest struct {
|
||||||
a, b [][]int
|
a, b [][]int
|
||||||
level collate.Level
|
level collate.Level
|
||||||
@ -119,8 +130,8 @@ var extra = [][]int{{200, 32, 8, 0}, {0, 32, 8, 0}, {0, 0, 8, 0}, {0, 0, 0, 0}}
|
|||||||
func TestNextWeight(t *testing.T) {
|
func TestNextWeight(t *testing.T) {
|
||||||
for i, tt := range nextWeightTests {
|
for i, tt := range nextWeightTests {
|
||||||
test := func(l collate.Level, tt weightsTest, a, gold [][]int) {
|
test := func(l collate.Level, tt weightsTest, a, gold [][]int) {
|
||||||
res := nextWeight(tt.level, a)
|
res := nextWeight(tt.level, mkRawCES(a))
|
||||||
if !equalCEArrays(gold, res) {
|
if !equalCEArrays(mkRawCES(gold), res) {
|
||||||
t.Errorf("%d:%d: expected weights %d; found %d", i, l, gold, res)
|
t.Errorf("%d:%d: expected weights %d; found %d", i, l, gold, res)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -189,7 +200,7 @@ var compareTests = []weightsTest{
|
|||||||
func TestCompareWeights(t *testing.T) {
|
func TestCompareWeights(t *testing.T) {
|
||||||
for i, tt := range compareTests {
|
for i, tt := range compareTests {
|
||||||
test := func(tt weightsTest, a, b [][]int) {
|
test := func(tt weightsTest, a, b [][]int) {
|
||||||
res, level := compareWeights(a, b)
|
res, level := compareWeights(mkRawCES(a), mkRawCES(b))
|
||||||
if res != tt.result {
|
if res != tt.result {
|
||||||
t.Errorf("%d: expected comparisson result %d; found %d", i, tt.result, res)
|
t.Errorf("%d: expected comparisson result %d; found %d", i, tt.result, res)
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,7 @@ package build
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"exp/locale/collate"
|
"exp/locale/collate"
|
||||||
|
"exp/norm"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"sort"
|
"sort"
|
||||||
@ -28,7 +29,7 @@ const (
|
|||||||
type entry struct {
|
type entry struct {
|
||||||
str string // same as string(runes)
|
str string // same as string(runes)
|
||||||
runes []rune
|
runes []rune
|
||||||
elems [][]int // the collation elements
|
elems []rawCE // the collation elements
|
||||||
extend string // weights of extend to be appended to elems
|
extend string // weights of extend to be appended to elems
|
||||||
before bool // weights relative to next instead of previous.
|
before bool // weights relative to next instead of previous.
|
||||||
lock bool // entry is used in extension and can no longer be moved.
|
lock bool // entry is used in extension and can no longer be moved.
|
||||||
@ -41,6 +42,7 @@ type entry struct {
|
|||||||
decompose bool // can use NFKD decomposition to generate elems
|
decompose bool // can use NFKD decomposition to generate elems
|
||||||
exclude bool // do not include in table
|
exclude bool // do not include in table
|
||||||
implicit bool // derived, is not included in the list
|
implicit bool // derived, is not included in the list
|
||||||
|
modified bool // entry was modified in tailoring
|
||||||
logical logicalAnchor
|
logical logicalAnchor
|
||||||
|
|
||||||
expansionIndex int // used to store index into expansion table
|
expansionIndex int // used to store index into expansion table
|
||||||
@ -162,10 +164,10 @@ func (e *entry) encode() (ce uint32, err error) {
|
|||||||
}
|
}
|
||||||
switch {
|
switch {
|
||||||
case e.decompose:
|
case e.decompose:
|
||||||
t1 := e.elems[0][2]
|
t1 := e.elems[0].w[2]
|
||||||
t2 := 0
|
t2 := 0
|
||||||
if len(e.elems) > 1 {
|
if len(e.elems) > 1 {
|
||||||
t2 = e.elems[1][2]
|
t2 = e.elems[1].w[2]
|
||||||
}
|
}
|
||||||
ce, err = makeDecompose(t1, t2)
|
ce, err = makeDecompose(t1, t2)
|
||||||
case e.contractionStarter():
|
case e.contractionStarter():
|
||||||
@ -231,7 +233,7 @@ func (o *ordering) insert(e *entry) {
|
|||||||
|
|
||||||
// newEntry creates a new entry for the given info and inserts it into
|
// newEntry creates a new entry for the given info and inserts it into
|
||||||
// the index.
|
// the index.
|
||||||
func (o *ordering) newEntry(s string, ces [][]int) *entry {
|
func (o *ordering) newEntry(s string, ces []rawCE) *entry {
|
||||||
e := &entry{
|
e := &entry{
|
||||||
runes: []rune(s),
|
runes: []rune(s),
|
||||||
elems: ces,
|
elems: ces,
|
||||||
@ -249,14 +251,29 @@ func (o *ordering) find(str string) *entry {
|
|||||||
if e == nil {
|
if e == nil {
|
||||||
r := []rune(str)
|
r := []rune(str)
|
||||||
if len(r) == 1 {
|
if len(r) == 1 {
|
||||||
e = o.newEntry(string(r[0]), [][]int{
|
const (
|
||||||
{
|
firstHangul = 0xAC00
|
||||||
|
lastHangul = 0xD7A3
|
||||||
|
)
|
||||||
|
if r[0] >= firstHangul && r[0] <= lastHangul {
|
||||||
|
ce := []rawCE{}
|
||||||
|
nfd := norm.NFD.String(str)
|
||||||
|
for _, r := range nfd {
|
||||||
|
ce = append(ce, o.find(string(r)).elems...)
|
||||||
|
}
|
||||||
|
e = o.newEntry(nfd, ce)
|
||||||
|
} else {
|
||||||
|
e = o.newEntry(string(r[0]), []rawCE{
|
||||||
|
{w: []int{
|
||||||
implicitPrimary(r[0]),
|
implicitPrimary(r[0]),
|
||||||
defaultSecondary,
|
defaultSecondary,
|
||||||
defaultTertiary,
|
defaultTertiary,
|
||||||
int(r[0]),
|
int(r[0]),
|
||||||
},
|
},
|
||||||
|
},
|
||||||
})
|
})
|
||||||
|
e.modified = true
|
||||||
|
}
|
||||||
e.exclude = true // do not index implicits
|
e.exclude = true // do not index implicits
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -275,7 +292,7 @@ func makeRootOrdering() ordering {
|
|||||||
}
|
}
|
||||||
insert := func(typ logicalAnchor, s string, ce []int) {
|
insert := func(typ logicalAnchor, s string, ce []int) {
|
||||||
e := &entry{
|
e := &entry{
|
||||||
elems: [][]int{ce},
|
elems: []rawCE{{w: ce}},
|
||||||
str: s,
|
str: s,
|
||||||
exclude: true,
|
exclude: true,
|
||||||
logical: typ,
|
logical: typ,
|
||||||
@ -362,10 +379,14 @@ func (o *ordering) sort() {
|
|||||||
|
|
||||||
// genColElems generates a collation element array from the runes in str. This
|
// genColElems generates a collation element array from the runes in str. This
|
||||||
// assumes that all collation elements have already been added to the Builder.
|
// assumes that all collation elements have already been added to the Builder.
|
||||||
func (o *ordering) genColElems(str string) [][]int {
|
func (o *ordering) genColElems(str string) []rawCE {
|
||||||
elems := [][]int{}
|
elems := []rawCE{}
|
||||||
for _, r := range []rune(str) {
|
for _, r := range []rune(str) {
|
||||||
elems = append(elems, o.find(string(r)).elems...)
|
for _, ce := range o.find(string(r)).elems {
|
||||||
|
if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 {
|
||||||
|
elems = append(elems, ce)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return elems
|
return elems
|
||||||
}
|
}
|
||||||
|
@ -20,7 +20,7 @@ type entryTest struct {
|
|||||||
// entries plus a leading and trailing anchor.
|
// entries plus a leading and trailing anchor.
|
||||||
func makeList(n int) []*entry {
|
func makeList(n int) []*entry {
|
||||||
es := make([]*entry, n+2)
|
es := make([]*entry, n+2)
|
||||||
weights := [][]int{{100, 20, 5, 0}}
|
weights := []rawCE{{w: []int{100, 20, 5, 0}}}
|
||||||
for i := range es {
|
for i := range es {
|
||||||
runes := []rune{rune(i)}
|
runes := []rune{rune(i)}
|
||||||
es[i] = &entry{
|
es[i] = &entry{
|
||||||
@ -176,8 +176,8 @@ type entryLessTest struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
w1 = [][]int{{100, 20, 5, 5}}
|
w1 = []rawCE{{w: []int{100, 20, 5, 5}}}
|
||||||
w2 = [][]int{{101, 20, 5, 5}}
|
w2 = []rawCE{{w: []int{101, 20, 5, 5}}}
|
||||||
)
|
)
|
||||||
|
|
||||||
var entryLessTests = []entryLessTest{
|
var entryLessTests = []entryLessTest{
|
||||||
|
@ -23,7 +23,7 @@ const (
|
|||||||
type colElem uint32
|
type colElem uint32
|
||||||
|
|
||||||
const (
|
const (
|
||||||
maxCE colElem = 0x80FFFFFF
|
maxCE colElem = 0xAFFFFFFF
|
||||||
minContract = 0xC0000000
|
minContract = 0xC0000000
|
||||||
maxContract = 0xDFFFFFFF
|
maxContract = 0xDFFFFFFF
|
||||||
minExpand = 0xE0000000
|
minExpand = 0xE0000000
|
||||||
@ -62,30 +62,37 @@ func (ce colElem) ctype() ceType {
|
|||||||
// 01pppppp pppppppp ppppppp0 ssssssss
|
// 01pppppp pppppppp ppppppp0 ssssssss
|
||||||
// - p* is primary collation value
|
// - p* is primary collation value
|
||||||
// - s* is the secondary collation value
|
// - s* is the secondary collation value
|
||||||
// or
|
|
||||||
// 00pppppp pppppppp ppppppps sssttttt, where
|
// 00pppppp pppppppp ppppppps sssttttt, where
|
||||||
// - p* is primary collation value
|
// - p* is primary collation value
|
||||||
// - s* offset of secondary from default value.
|
// - s* offset of secondary from default value.
|
||||||
// - t* is the tertiary collation value
|
// - t* is the tertiary collation value
|
||||||
|
// 100ttttt cccccccc pppppppp pppppppp
|
||||||
|
// - t* is the tertiar collation value
|
||||||
|
// - c* is the cannonical combining class
|
||||||
|
// - p* is the primary collation value
|
||||||
// Collation elements with a secondary value are of the form
|
// Collation elements with a secondary value are of the form
|
||||||
// 10000000 0000ssss ssssssss tttttttt, where
|
// 1010cccc ccccssss ssssssss tttttttt, where
|
||||||
// - 16 BMP implicit -> weight
|
// - c* is the canonical combining class
|
||||||
// - 8 bit s
|
// - s* is the secondary collation value
|
||||||
// - default tertiary
|
// - t* is the tertiary collation value
|
||||||
// 11qqqqqq qqqqqqqq qqqqqqq0 00000000
|
// 11qqqqqq qqqqqqqq qqqqqqq0 00000000
|
||||||
// - q* quaternary value
|
// - q* quaternary value
|
||||||
const (
|
const (
|
||||||
ceTypeMask = 0xC0000000
|
ceTypeMask = 0xC0000000
|
||||||
|
ceTypeMaskExt = 0xE0000000
|
||||||
ceType1 = 0x40000000
|
ceType1 = 0x40000000
|
||||||
ceType2 = 0x00000000
|
ceType2 = 0x00000000
|
||||||
ceType3 = 0x80000000
|
ceType3or4 = 0x80000000
|
||||||
|
ceType4 = 0xA0000000
|
||||||
ceTypeQ = 0xC0000000
|
ceTypeQ = 0xC0000000
|
||||||
ceIgnore = ceType3
|
ceIgnore = ceType4
|
||||||
firstNonPrimary = 0x80000000
|
firstNonPrimary = 0x80000000
|
||||||
|
lastSpecialPrimary = 0xA0000000
|
||||||
secondaryMask = 0x80000000
|
secondaryMask = 0x80000000
|
||||||
hasTertiaryMask = 0x40000000
|
hasTertiaryMask = 0x40000000
|
||||||
primaryValueMask = 0x3FFFFE00
|
primaryValueMask = 0x3FFFFE00
|
||||||
primaryShift = 9
|
primaryShift = 9
|
||||||
|
compactPrimaryBits = 16
|
||||||
compactSecondaryShift = 5
|
compactSecondaryShift = 5
|
||||||
minCompactSecondary = defaultSecondary - 4
|
minCompactSecondary = defaultSecondary - 4
|
||||||
)
|
)
|
||||||
@ -98,10 +105,23 @@ func makeQuaternary(primary int) colElem {
|
|||||||
return ceTypeQ | colElem(primary<<primaryShift)
|
return ceTypeQ | colElem(primary<<primaryShift)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (ce colElem) ccc() uint8 {
|
||||||
|
if ce&ceType3or4 != 0 {
|
||||||
|
if ce&ceType4 == ceType3or4 {
|
||||||
|
return uint8(ce >> 16)
|
||||||
|
}
|
||||||
|
return uint8(ce >> 20)
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func (ce colElem) primary() int {
|
func (ce colElem) primary() int {
|
||||||
if ce >= firstNonPrimary {
|
if ce >= firstNonPrimary {
|
||||||
|
if ce > lastSpecialPrimary {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
return int(uint16(ce))
|
||||||
|
}
|
||||||
return int(ce&primaryValueMask) >> primaryShift
|
return int(ce&primaryValueMask) >> primaryShift
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -111,8 +131,11 @@ func (ce colElem) secondary() int {
|
|||||||
return int(uint8(ce))
|
return int(uint8(ce))
|
||||||
case ceType2:
|
case ceType2:
|
||||||
return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
|
return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
|
||||||
case ceType3:
|
case ceType3or4:
|
||||||
return int(uint16(ce >> 8))
|
if ce < ceType4 {
|
||||||
|
return defaultSecondary
|
||||||
|
}
|
||||||
|
return int(ce>>8) & 0xFFF
|
||||||
case ceTypeQ:
|
case ceTypeQ:
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
@ -121,10 +144,13 @@ func (ce colElem) secondary() int {
|
|||||||
|
|
||||||
func (ce colElem) tertiary() uint8 {
|
func (ce colElem) tertiary() uint8 {
|
||||||
if ce&hasTertiaryMask == 0 {
|
if ce&hasTertiaryMask == 0 {
|
||||||
if ce&ceType3 == 0 {
|
if ce&ceType3or4 == 0 {
|
||||||
return uint8(ce & 0x1F)
|
return uint8(ce & 0x1F)
|
||||||
}
|
}
|
||||||
|
if ce&ceType4 == ceType4 {
|
||||||
return uint8(ce)
|
return uint8(ce)
|
||||||
|
}
|
||||||
|
return uint8(ce>>24) & 0x1F // type 2
|
||||||
} else if ce&ceTypeMask == ceType1 {
|
} else if ce&ceTypeMask == ceType1 {
|
||||||
return defaultTertiary
|
return defaultTertiary
|
||||||
}
|
}
|
||||||
@ -134,10 +160,15 @@ func (ce colElem) tertiary() uint8 {
|
|||||||
|
|
||||||
func (ce colElem) updateTertiary(t uint8) colElem {
|
func (ce colElem) updateTertiary(t uint8) colElem {
|
||||||
if ce&ceTypeMask == ceType1 {
|
if ce&ceTypeMask == ceType1 {
|
||||||
|
// convert to type 4
|
||||||
nce := ce & primaryValueMask
|
nce := ce & primaryValueMask
|
||||||
nce |= colElem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
|
nce |= colElem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
|
||||||
ce = nce
|
ce = nce
|
||||||
|
} else if ce&ceTypeMaskExt == ceType3or4 {
|
||||||
|
ce &= ^colElem(maxTertiary << 24)
|
||||||
|
return ce | (colElem(t) << 24)
|
||||||
} else {
|
} else {
|
||||||
|
// type 2 or 4
|
||||||
ce &= ^colElem(maxTertiary)
|
ce &= ^colElem(maxTertiary)
|
||||||
}
|
}
|
||||||
return ce | colElem(t)
|
return ce | colElem(t)
|
||||||
|
@ -23,12 +23,19 @@ func makeCE(weights []int) colElem {
|
|||||||
maxSecondaryDiffBits = 4
|
maxSecondaryDiffBits = 4
|
||||||
maxTertiaryBits = 8
|
maxTertiaryBits = 8
|
||||||
maxTertiaryCompactBits = 5
|
maxTertiaryCompactBits = 5
|
||||||
isSecondary = 0x80000000
|
|
||||||
isPrimary = 0x40000000
|
isPrimary = 0x40000000
|
||||||
|
isPrimaryCCC = 0x80000000
|
||||||
|
isSecondary = 0xA0000000
|
||||||
)
|
)
|
||||||
var ce colElem
|
var ce colElem
|
||||||
|
ccc := weights[3]
|
||||||
if weights[0] != 0 {
|
if weights[0] != 0 {
|
||||||
if weights[2] == defaultTertiary {
|
if ccc != 0 {
|
||||||
|
ce = colElem(weights[2] << 24)
|
||||||
|
ce |= colElem(ccc) << 16
|
||||||
|
ce |= colElem(weights[0])
|
||||||
|
ce |= isPrimaryCCC
|
||||||
|
} else if weights[2] == defaultTertiary {
|
||||||
ce = colElem(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
|
ce = colElem(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
|
||||||
ce |= isPrimary
|
ce |= isPrimary
|
||||||
} else {
|
} else {
|
||||||
@ -38,6 +45,7 @@ func makeCE(weights []int) colElem {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
|
ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
|
||||||
|
ce += colElem(ccc) << 20
|
||||||
ce |= isSecondary
|
ce |= isSecondary
|
||||||
}
|
}
|
||||||
return ce
|
return ce
|
||||||
@ -68,10 +76,11 @@ func makeDecompose(t1, t2 int) colElem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func normalCE(inout []int) (ce colElem, t ceType) {
|
func normalCE(inout []int) (ce colElem, t ceType) {
|
||||||
w := makeCE(inout)
|
ce = makeCE(inout)
|
||||||
inout[0] = w.primary()
|
inout[0] = ce.primary()
|
||||||
inout[1] = w.secondary()
|
inout[1] = ce.secondary()
|
||||||
inout[2] = int(w.tertiary())
|
inout[2] = int(ce.tertiary())
|
||||||
|
inout[3] = int(ce.ccc())
|
||||||
return ce, ceNormal
|
return ce, ceNormal
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -102,9 +111,13 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var ceTests = []ceTest{
|
var ceTests = []ceTest{
|
||||||
{normalCE, []int{0, 0, 0}},
|
{normalCE, []int{0, 0, 0, 0}},
|
||||||
{normalCE, []int{0, 30, 3}},
|
{normalCE, []int{0, 30, 3, 0}},
|
||||||
{normalCE, []int{100, defaultSecondary, 3}},
|
{normalCE, []int{0, 30, 3, 0xFF}},
|
||||||
|
{normalCE, []int{100, defaultSecondary, defaultTertiary, 0}},
|
||||||
|
{normalCE, []int{100, defaultSecondary, defaultTertiary, 0xFF}},
|
||||||
|
{normalCE, []int{100, defaultSecondary, 3, 0}},
|
||||||
|
{normalCE, []int{0x123, defaultSecondary, 8, 0xFF}},
|
||||||
|
|
||||||
{contractCE, []int{0, 0, 0}},
|
{contractCE, []int{0, 0, 0}},
|
||||||
{contractCE, []int{1, 1, 1}},
|
{contractCE, []int{1, 1, 1}},
|
||||||
@ -127,11 +140,11 @@ func TestColElem(t *testing.T) {
|
|||||||
copy(inout, tt.arg)
|
copy(inout, tt.arg)
|
||||||
ce, typ := tt.f(inout)
|
ce, typ := tt.f(inout)
|
||||||
if ce.ctype() != typ {
|
if ce.ctype() != typ {
|
||||||
t.Errorf("%d: type is %d; want %d", i, ce.ctype(), typ)
|
t.Errorf("%d: type is %d; want %d (ColElem: %X)", i, ce.ctype(), typ, ce)
|
||||||
}
|
}
|
||||||
for j, a := range tt.arg {
|
for j, a := range tt.arg {
|
||||||
if inout[j] != a {
|
if inout[j] != a {
|
||||||
t.Errorf("%d: argument %d is %X; want %X", i, j, inout[j], a)
|
t.Errorf("%d: argument %d is %X; want %X (ColElem: %X)", i, j, inout[j], a, ce)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -176,7 +189,8 @@ func TestUpdateTertiary(t *testing.T) {
|
|||||||
{0x4000FE20, 0x0000FE8A, 0x0A},
|
{0x4000FE20, 0x0000FE8A, 0x0A},
|
||||||
{0x4000FE21, 0x0000FEAA, 0x0A},
|
{0x4000FE21, 0x0000FEAA, 0x0A},
|
||||||
{0x0000FE8B, 0x0000FE83, 0x03},
|
{0x0000FE8B, 0x0000FE83, 0x03},
|
||||||
{0x8000CC02, 0x8000CC1B, 0x1B},
|
{0x82FF0188, 0x9BFF0188, 0x1B},
|
||||||
|
{0xAFF0CC02, 0xAFF0CC1B, 0x1B},
|
||||||
}
|
}
|
||||||
for i, tt := range tests {
|
for i, tt := range tests {
|
||||||
if out := tt.in.updateTertiary(tt.t); out != tt.out {
|
if out := tt.in.updateTertiary(tt.t); out != tt.out {
|
||||||
@ -184,3 +198,77 @@ func TestUpdateTertiary(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDoNorm(t *testing.T) {
|
||||||
|
const div = -1 // The insertion point of the next block.
|
||||||
|
tests := []struct {
|
||||||
|
in, out []int
|
||||||
|
}{
|
||||||
|
{in: []int{4, div, 3},
|
||||||
|
out: []int{3, 4},
|
||||||
|
},
|
||||||
|
{in: []int{4, div, 3, 3, 3},
|
||||||
|
out: []int{3, 3, 3, 4},
|
||||||
|
},
|
||||||
|
{in: []int{0, 4, div, 3},
|
||||||
|
out: []int{0, 3, 4},
|
||||||
|
},
|
||||||
|
{in: []int{0, 0, 4, 5, div, 3, 3},
|
||||||
|
out: []int{0, 0, 3, 3, 4, 5},
|
||||||
|
},
|
||||||
|
{in: []int{0, 0, 1, 4, 5, div, 3, 3},
|
||||||
|
out: []int{0, 0, 1, 3, 3, 4, 5},
|
||||||
|
},
|
||||||
|
{in: []int{0, 0, 1, 4, 5, div, 4, 4},
|
||||||
|
out: []int{0, 0, 1, 4, 4, 4, 5},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for j, tt := range tests {
|
||||||
|
i := iter{}
|
||||||
|
var w, p, s int
|
||||||
|
for k, cc := range tt.in {
|
||||||
|
if cc == 0 {
|
||||||
|
s = 0
|
||||||
|
}
|
||||||
|
if cc == div {
|
||||||
|
w = 100
|
||||||
|
p = k
|
||||||
|
i.pStarter = s
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
i.ce = append(i.ce, makeCE([]int{w, 20, 2, cc}))
|
||||||
|
}
|
||||||
|
i.prevCCC = i.ce[p-1].ccc()
|
||||||
|
i.doNorm(p, i.ce[p].ccc())
|
||||||
|
if len(i.ce) != len(tt.out) {
|
||||||
|
t.Errorf("%d: length was %d; want %d", j, len(i.ce), len(tt.out))
|
||||||
|
}
|
||||||
|
prevCCC := uint8(0)
|
||||||
|
for k, ce := range i.ce {
|
||||||
|
if int(ce.ccc()) != tt.out[k] {
|
||||||
|
t.Errorf("%d:%d: unexpected CCC. Was %d; want %d", j, k, ce.ccc(), tt.out[k])
|
||||||
|
}
|
||||||
|
if k > 0 && ce.ccc() == prevCCC && i.ce[k-1].primary() > ce.primary() {
|
||||||
|
t.Errorf("%d:%d: normalization crossed across CCC boundary.", j, k)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// test cutoff of large sequence of combining characters.
|
||||||
|
result := []uint8{8, 8, 8, 5, 5}
|
||||||
|
for o := -2; o <= 2; o++ {
|
||||||
|
i := iter{pStarter: 2, prevCCC: 8}
|
||||||
|
n := maxCombiningCharacters + 1 + o
|
||||||
|
for j := 1; j < n+i.pStarter; j++ {
|
||||||
|
i.ce = append(i.ce, makeCE([]int{100, 20, 2, 8}))
|
||||||
|
}
|
||||||
|
p := len(i.ce)
|
||||||
|
i.ce = append(i.ce, makeCE([]int{0, 20, 2, 5}))
|
||||||
|
i.doNorm(p, 5)
|
||||||
|
if i.prevCCC != result[o+2] {
|
||||||
|
t.Errorf("%d: i.prevCCC was %d; want %d", n, i.prevCCC, result[o+2])
|
||||||
|
}
|
||||||
|
if result[o+2] == 5 && i.pStarter != p {
|
||||||
|
t.Errorf("%d: i.pStarter was %d; want %d", n, i.pStarter, p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -10,6 +10,7 @@ package collate
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"exp/norm"
|
"exp/norm"
|
||||||
|
"unicode/utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Level identifies the collation comparison level.
|
// Level identifies the collation comparison level.
|
||||||
@ -112,7 +113,7 @@ func New(loc string) *Collator {
|
|||||||
|
|
||||||
func newCollator(t *table) *Collator {
|
func newCollator(t *table) *Collator {
|
||||||
c := &Collator{
|
c := &Collator{
|
||||||
Strength: Quaternary,
|
Strength: Tertiary,
|
||||||
f: norm.NFD,
|
f: norm.NFD,
|
||||||
t: t,
|
t: t,
|
||||||
}
|
}
|
||||||
@ -269,8 +270,7 @@ func (c *Collator) key(buf *Buffer, w []colElem) []byte {
|
|||||||
func (c *Collator) getColElems(str []byte) []colElem {
|
func (c *Collator) getColElems(str []byte) []colElem {
|
||||||
i := c.iter(0)
|
i := c.iter(0)
|
||||||
i.setInput(c, str)
|
i.setInput(c, str)
|
||||||
for !i.done() {
|
for i.next() {
|
||||||
i.next()
|
|
||||||
}
|
}
|
||||||
return i.ce
|
return i.ce
|
||||||
}
|
}
|
||||||
@@ -278,88 +278,185 @@ func (c *Collator) getColElems(str []byte) []colElem {
 func (c *Collator) getColElemsString(str string) []colElem {
 	i := c.iter(0)
 	i.setInputString(c, str)
-	for !i.done() {
-		i.next()
+	for i.next() {
 	}
 	return i.ce
 }
 
+type source struct {
+	str   string
+	bytes []byte
+	buf   [16]byte // Used for decomposing Hangul.
+}
+
+func (src *source) done() bool {
+	return len(src.str) == 0 && len(src.bytes) == 0
+}
+
+func (src *source) tail(n int) (res source) {
+	if src.bytes == nil {
+		res.str = src.str[n:]
+	} else {
+		res.bytes = src.bytes[n:]
+	}
+	return res
+}
+
+func (src *source) nfd(end int) []byte {
+	if src.bytes == nil {
+		return norm.NFD.AppendString(src.buf[:0], src.str[:end])
+	}
+	return norm.NFD.Append(src.buf[:0], src.bytes[:end]...)
+}
+
+func (src *source) properties(f norm.Form) norm.Properties {
+	if src.bytes == nil {
+		return f.PropertiesString(src.str)
+	}
+	return f.Properties(src.bytes)
+}
+
+func (src *source) lookup(t *table) (ce colElem, sz int) {
+	if src.bytes == nil {
+		return t.index.lookupString(src.str)
+	}
+	return t.index.lookup(src.bytes)
+}
+
+func (src *source) rune() (r rune, sz int) {
+	if src.bytes == nil {
+		return utf8.DecodeRuneInString(src.str)
+	}
+	return utf8.DecodeRune(src.bytes)
+}
+
 type iter struct {
-	src        norm.Iter
-	norm       [1024]byte
-	buf        []byte
-	p          int
-	minBufSize int
+	src source
 
 	wa  [512]colElem
 	ce  []colElem
 	pce int
+	nce int // nce <= len(nce)
+
+	prevCCC  uint8
+	pStarter int
 
 	t *table
-	_done, eof bool
 }
 
 func (i *iter) init(c *Collator) {
 	i.t = c.t
-	i.minBufSize = c.t.maxContractLen
 	i.ce = i.wa[:0]
-	i.buf = i.norm[:0]
 }
 
 func (i *iter) reset() {
 	i.ce = i.ce[:0]
-	i.buf = i.buf[:0]
-	i.p = 0
-	i.eof = i.src.Done()
-	i._done = i.eof
+	i.nce = 0
+	i.prevCCC = 0
+	i.pStarter = 0
 }
 
 func (i *iter) setInput(c *Collator, s []byte) *iter {
-	i.src.SetInput(c.f, s)
+	i.src.bytes = s
+	i.src.str = ""
 	i.reset()
 	return i
 }
 
 func (i *iter) setInputString(c *Collator, s string) *iter {
-	i.src.SetInputString(c.f, s)
+	i.src.str = s
+	i.src.bytes = nil
 	i.reset()
 	return i
 }
 
-func (i *iter) done() bool {
-	return i._done
+// next appends colElems to the internal array until it adds an element with CCC=0.
+// In the majority of cases, a colElem with a primary value > 0 will have
+// a CCC of 0. The CCC values of collation elements are also used to detect if the
+// input string was not normalized and to adjust the result accordingly.
+func (i *iter) next() bool {
+	sz := 0
+	for !i.src.done() {
+		p0 := len(i.ce)
+		i.ce, sz = i.t.appendNext(i.ce, i.src)
+		i.src = i.src.tail(sz)
+		last := len(i.ce) - 1
+		if ccc := i.ce[last].ccc(); ccc == 0 {
+			i.nce = len(i.ce)
+			i.pStarter = last
+			i.prevCCC = 0
+			return true
+		} else if p0 < last && i.ce[p0].ccc() == 0 {
+			// Set i.nce to only cover the part of i.ce for which ccc == 0
+			// and use the rest in the next call to next.
+			for p0++; p0 < last && i.ce[p0].ccc() == 0; p0++ {
+			}
+			i.nce = p0
+			i.pStarter = p0 - 1
+			i.prevCCC = ccc
+			return true
+		} else if ccc < i.prevCCC {
+			i.doNorm(p0, ccc) // should be rare for most common cases
+		} else {
+			i.prevCCC = ccc
+		}
+	}
+	if len(i.ce) != i.nce {
+		i.nce = len(i.ce)
+		return true
+	}
+	return false
 }
 
-func (i *iter) next() {
-	if !i.eof && len(i.buf)-i.p < i.minBufSize {
-		// replenish buffer
-		n := copy(i.buf, i.buf[i.p:])
-		n += i.src.Next(i.buf[n:cap(i.buf)])
-		i.buf = i.buf[:n]
-		i.p = 0
-		i.eof = i.src.Done()
-	}
-	if i.p == len(i.buf) {
-		i._done = true
-		return
+// nextPlain is the same as next, but does not "normalize" the collation
+// elements.
+// TODO: remove this function. Using this instead of next does not seem
+// to improve performance in any significant way. We retain this until
+// later for evaluation purposes.
+func (i *iter) nextPlain() bool {
+	if i.src.done() {
+		return false
 	}
 	sz := 0
-	i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:])
-	i.p += sz
+	i.ce, sz = i.t.appendNext(i.ce, i.src)
+	i.src = i.src.tail(sz)
+	i.nce = len(i.ce)
+	return true
+}
+
+const maxCombiningCharacters = 30
+
+// doNorm reorders the collation elements in i.ce.
+// It assumes that blocks of collation elements added with appendNext
+// either start and end with the same CCC or start with CCC == 0.
+// This allows for a single insertion point for the entire block.
+// The correctness of this assumption is verified in builder.go.
+func (i *iter) doNorm(p int, ccc uint8) {
+	if p-i.pStarter > maxCombiningCharacters {
+		i.prevCCC = i.ce[len(i.ce)-1].ccc()
+		i.pStarter = len(i.ce) - 1
+		return
+	}
+	n := len(i.ce)
+	k := p
+	for p--; p > i.pStarter && ccc < i.ce[p-1].ccc(); p-- {
+	}
+	i.ce = append(i.ce, i.ce[p:k]...)
+	copy(i.ce[p:], i.ce[k:])
+	i.ce = i.ce[:n]
 }
 
 func (i *iter) nextPrimary() int {
 	for {
-		for ; i.pce < len(i.ce); i.pce++ {
+		for ; i.pce < i.nce; i.pce++ {
 			if v := i.ce[i.pce].primary(); v != 0 {
 				i.pce++
 				return v
 			}
 		}
-		if i.done() {
+		if !i.next() {
 			return 0
 		}
-		i.next()
 	}
 	panic("should not reach here")
 }
 
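Aside: doNorm above is a bounded, in-place reordering of a block of collation elements by their combining class (CCC). Below is a minimal, self-contained sketch of that kind of block move, using plain ints in place of collation elements; moveBlock and class are illustrative names, not part of this CL.

```go
package main

import "fmt"

// moveBlock moves the tail block es[k:] left so that it ends up before any
// preceding element (down to, but not past, start) with a larger class,
// loosely modeling how doNorm repositions a lower-CCC block.
func moveBlock(es []int, start, k int, class func(int) int) {
	c := class(es[k])
	p := k
	for ; p > start && c < class(es[p-1]); p-- {
	}
	block := append([]int(nil), es[k:]...)
	copy(es[p+len(block):], es[p:k]) // shift the larger-class run right
	copy(es[p:], block)              // drop the block into the gap
}

func main() {
	// CCC values of a starter followed by combining marks: 4, 4, 3 is not
	// in canonical order; the trailing 3 belongs before the 4s.
	ccc := []int{0, 4, 4, 3}
	moveBlock(ccc, 1, 3, func(v int) int { return v })
	fmt.Println(ccc) // [0 3 4 4]
}
```

As in doNorm, the move never crosses the last starter (the start index), so elements with CCC 0 keep their position.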
@@ -378,6 +378,7 @@ var keyTests = []keyTest{
 func TestKey(t *testing.T) {
 	c, _ := makeTable(appendNextTests[4].in)
 	c.Alternate = collate.AltShifted
+	c.Strength = collate.Quaternary
 	buf := collate.Buffer{}
 	keys1 := [][]byte{}
 	keys2 := [][]byte{}
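Aside: the TestKey change above exercises quaternary strength with shifted alternates. A hedged usage sketch of the key path is below; it assumes the exp-era API surface (collate.New, a Buffer, and a Key method returning []byte), which this hunk does not show in full.

```go
package main

import (
	"bytes"
	"exp/locale/collate"
	"fmt"
)

func main() {
	// Assumed exp-era API: New(locale), Buffer, and Key(buf, s) []byte.
	c := collate.New("en")
	c.Strength = collate.Quaternary
	c.Alternate = collate.AltShifted

	buf := collate.Buffer{}
	k1 := c.Key(&buf, []byte("côte"))
	k2 := c.Key(&buf, []byte("coté"))

	// Once keys are built, collation order is plain byte order.
	fmt.Println(bytes.Compare(k1, k2))
}
```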
@@ -27,8 +27,21 @@ type ctScanner struct {
 	done   bool
 }
 
+type ctScannerString struct {
+	states contractTrieSet
+	s      string
+	n      int
+	index  int
+	pindex int
+	done   bool
+}
+
 func (t contractTrieSet) scanner(index, n int, b []byte) ctScanner {
-	return ctScanner{states: t[index:], s: b, n: n}
+	return ctScanner{s: b, states: t[index:], n: n}
+}
+
+func (t contractTrieSet) scannerString(index, n int, str string) ctScannerString {
+	return ctScannerString{s: str, states: t[index:], n: n}
 }
 
 // result returns the offset i and bytes consumed p so far. If no suffix
@@ -37,6 +50,10 @@ func (s *ctScanner) result() (i, p int) {
 	return s.index, s.pindex
 }
 
+func (s *ctScannerString) result() (i, p int) {
+	return s.index, s.pindex
+}
+
 const (
 	final   = 0
 	noIndex = 0xFF
@@ -84,3 +101,45 @@ func (s *ctScanner) scan(p int) int {
 	}
 	return pr
 }
+
+// scan is a verbatim copy of ctScanner.scan.
+func (s *ctScannerString) scan(p int) int {
+	pr := p // the p at the rune start
+	str := s.s
+	states, n := s.states, s.n
+	for i := 0; i < n && p < len(str); {
+		e := states[i]
+		c := str[p]
+		// TODO: a significant number of contractions are of a form that
+		// cannot match discontiguous UTF-8 in a normalized string. We could let
+		// a negative value of e.n mean that we can set s.done = true and avoid
+		// the need for additional matches.
+		if c >= e.l {
+			if e.l == c {
+				p++
+				if e.i != noIndex {
+					s.index = int(e.i)
+					s.pindex = p
+				}
+				if e.n != final {
+					i, states, n = 0, states[int(e.h)+n:], int(e.n)
+					if p >= len(str) || utf8.RuneStart(str[p]) {
+						s.states, s.n, pr = states, n, p
+					}
+				} else {
+					s.done = true
+					return p
+				}
+				continue
+			} else if e.n == final && c <= e.h {
+				p++
+				s.done = true
+				s.index = int(c-e.l) + int(e.i)
+				s.pindex = p
+				return p
+			}
+		}
+		i++
+	}
+	return pr
+}
@@ -30,7 +30,7 @@ func W(ce ...int) Weights {
 	return w
 }
 func (w Weights) String() string {
-	return fmt.Sprintf("[%d.%d.%d.%d]", w.Primary, w.Secondary, w.Tertiary, w.Quaternary)
+	return fmt.Sprintf("[%X.%X.%X.%X]", w.Primary, w.Secondary, w.Tertiary, w.Quaternary)
 }
 
 type Table struct {
@@ -52,7 +52,7 @@ func convertToWeights(ws []colElem) []Weights {
 func convertFromWeights(ws []Weights) []colElem {
 	out := make([]colElem, len(ws))
 	for i, w := range ws {
-		out[i] = makeCE([]int{w.Primary, w.Secondary, w.Tertiary})
+		out[i] = makeCE([]int{w.Primary, w.Secondary, w.Tertiary, 0})
 		if out[i] == ceIgnore && w.Quaternary > 0 {
 			out[i] = makeQuaternary(w.Quaternary)
 		}
@@ -61,7 +61,7 @@ func convertFromWeights(ws []Weights) []colElem {
 }
 
 func (t *Table) AppendNext(s []byte) ([]Weights, int) {
-	w, n := t.t.appendNext(nil, s)
+	w, n := t.t.appendNext(nil, source{bytes: s})
 	return convertToWeights(w), n
 }
 
@@ -42,13 +42,26 @@ func (t *table) indexedTable(idx tableIndex) *table {
 // sequence of runes, the weights for the interstitial runes are
 // appended as well. It returns a new slice that includes the appended
 // weights and the number of bytes consumed from s.
-func (t *table) appendNext(w []colElem, s []byte) ([]colElem, int) {
-	v, sz := t.index.lookup(s)
-	ce := colElem(v)
+func (t *table) appendNext(w []colElem, src source) (res []colElem, n int) {
+	ce, sz := src.lookup(t)
 	tp := ce.ctype()
 	if tp == ceNormal {
 		if ce == 0 {
-			r, _ := utf8.DecodeRune(s)
+			r, _ := src.rune()
+			const (
+				hangulSize  = 3
+				firstHangul = 0xAC00
+				lastHangul  = 0xD7A3
+			)
+			if r >= firstHangul && r <= lastHangul {
+				// TODO: performance can be considerably improved here.
+				n = sz
+				for b := src.nfd(hangulSize); len(b) > 0; b = b[sz:] {
+					ce, sz = t.index.lookup(b)
+					w = append(w, ce)
+				}
+				return w, n
+			}
 			ce = makeImplicitCE(implicitPrimary(r))
 		}
 		w = append(w, ce)
|
|||||||
w = t.appendExpansion(w, ce)
|
w = t.appendExpansion(w, ce)
|
||||||
} else if tp == ceContractionIndex {
|
} else if tp == ceContractionIndex {
|
||||||
n := 0
|
n := 0
|
||||||
w, n = t.matchContraction(w, ce, s[sz:])
|
src = src.tail(sz)
|
||||||
|
if src.bytes == nil {
|
||||||
|
w, n = t.matchContractionString(w, ce, src.str)
|
||||||
|
} else {
|
||||||
|
w, n = t.matchContraction(w, ce, src.bytes)
|
||||||
|
}
|
||||||
sz += n
|
sz += n
|
||||||
} else if tp == ceDecompose {
|
} else if tp == ceDecompose {
|
||||||
// Decompose using NFCK and replace tertiary weights.
|
// Decompose using NFKD and replace tertiary weights.
|
||||||
t1, t2 := splitDecompose(ce)
|
t1, t2 := splitDecompose(ce)
|
||||||
i := len(w)
|
i := len(w)
|
||||||
nfkd := norm.NFKD.Properties(s).Decomposition()
|
nfkd := src.properties(norm.NFKD).Decomposition()
|
||||||
for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
|
for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
|
||||||
w, p = t.appendNext(w, nfkd)
|
w, p = t.appendNext(w, source{bytes: nfkd})
|
||||||
}
|
}
|
||||||
w[i] = w[i].updateTertiary(t1)
|
w[i] = w[i].updateTertiary(t1)
|
||||||
if i++; i < len(w) {
|
if i++; i < len(w) {
|
||||||
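Aside: in the ceDecompose branch above, the tertiary weights of the decomposed sequence are rewritten: the first appended element receives t1 and every later element receives t2. A small stand-alone model of that adjustment (with a simplified weights struct; the names are illustrative) might look like this:

```go
package main

import "fmt"

// weights is a simplified stand-in for a collation element.
type weights struct{ primary, secondary, tertiary int }

// retarget mimics the ceDecompose branch: element i gets tertiary t1 and
// every following element gets t2, after the decomposition's weights were
// appended to the slice.
func retarget(w []weights, i, t1, t2 int) {
	w[i].tertiary = t1
	for i++; i < len(w); i++ {
		w[i].tertiary = t2
	}
}

func main() {
	// Weights appended for a two-rune compatibility decomposition.
	w := []weights{{100, 20, 2}, {200, 20, 2}}
	retarget(w, 0, 4, 5)
	fmt.Println(w) // [{100 20 4} {200 20 5}]
}
```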
@@ -99,16 +117,17 @@ func (t *table) matchContraction(w []colElem, ce colElem, suffix []byte) ([]colElem, int) {
 		// By now we should have filtered most cases.
 		p0 := p
 		bufn := 0
-		rune := norm.NFC.Properties(suffix[p:])
+		rune := norm.NFD.Properties(suffix[p:])
 		p += rune.Size()
-		if prevCC := rune.TrailCCC(); prevCC != 0 {
+		if rune.LeadCCC() != 0 {
+			prevCC := rune.TrailCCC()
 			// A gap may only occur in the last normalization segment.
 			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
-			if end := norm.NFC.FirstBoundary(suffix[p:]); end != -1 {
+			if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
 				scan.s = suffix[:p+end]
 			}
 			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
-				rune = norm.NFC.Properties(suffix[p:])
+				rune = norm.NFD.Properties(suffix[p:])
 				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
 					break
 				}
@@ -136,7 +155,65 @@ func (t *table) matchContraction(w []colElem, ce colElem, suffix []byte) ([]colElem, int) {
 	}
 	// Append weights for the runes in the segment not part of the contraction.
 	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
-		w, p = t.appendNext(w, b)
+		w, p = t.appendNext(w, source{bytes: b})
+	}
+	return w, n
+}
+
+// TODO: unify the two implementations. This is best done after first simplifying
+// the algorithm taking into account the inclusion of both NFC and NFD forms
+// in the table.
+func (t *table) matchContractionString(w []colElem, ce colElem, suffix string) ([]colElem, int) {
+	index, n, offset := splitContractIndex(ce)
+
+	scan := t.contractTries.scannerString(index, n, suffix)
+	buf := [norm.MaxSegmentSize]byte{}
+	bufp := 0
+	p := scan.scan(0)
+
+	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
+		// By now we should have filtered most cases.
+		p0 := p
+		bufn := 0
+		rune := norm.NFD.PropertiesString(suffix[p:])
+		p += rune.Size()
+		if rune.LeadCCC() != 0 {
+			prevCC := rune.TrailCCC()
+			// A gap may only occur in the last normalization segment.
+			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
+			if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
+				scan.s = suffix[:p+end]
+			}
+			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
+				rune = norm.NFD.PropertiesString(suffix[p:])
+				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
+					break
+				}
+				prevCC = rune.TrailCCC()
+				if pp := scan.scan(p); pp != p {
+					// Copy the interstitial runes for later processing.
+					bufn += copy(buf[bufn:], suffix[p0:p])
+					if scan.pindex == pp {
+						bufp = bufn
+					}
+					p, p0 = pp, pp
+				} else {
+					p += rune.Size()
+				}
+			}
+		}
+	}
+	// Append weights for the matched contraction, which may be an expansion.
+	i, n := scan.result()
+	ce = colElem(t.contractElem[i+offset])
+	if ce.ctype() == ceNormal {
+		w = append(w, ce)
+	} else {
+		w = t.appendExpansion(w, ce)
+	}
+	// Append weights for the runes in the segment not part of the contraction.
+	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
+		w, p = t.appendNext(w, source{bytes: b})
 	}
 	return w, n
 }
@@ -42,7 +42,9 @@ func pt(p, t int) []int {
 func makeTable(in []input) (*collate.Collator, error) {
 	b := build.NewBuilder()
 	for _, r := range in {
-		b.Add([]rune(r.str), r.ces, nil)
+		if e := b.Add([]rune(r.str), r.ces, nil); e != nil {
+			panic(e)
+		}
 	}
 	return b.Build()
 }
@@ -159,6 +161,7 @@ var appendNextTests = []tableTest{
 		{"b", [][]int{{200}}},
 		{"c", [][]int{{300}}},
 		{"\u03B1", [][]int{{900}}},
+		{"\x01", [][]int{{0, 0, 0, 0}}},
 
 		// contractions
 		{"a\u0300", [][]int{{101}}},
@@ -171,10 +174,11 @@ var appendNextTests = []tableTest{
 		{"a\u0301\u035F", [][]int{{121}}},
 		{"a\u0301\u035Fb", [][]int{{119}}},
 		{"\u03B1\u0345", [][]int{{901}, {902}}},
-		{"\u302E\u18A9", [][]int{{0, 131}, {0, 132}}},
+		{"\u302E\u302F", [][]int{{0, 131}, {0, 131}}},
 		{"\u302F\u18A9", [][]int{{0, 130}}},
 	}...),
 	[]check{
+		{"a\x01\u0300", 1, ColElems{w(100)}},
 		{"ab", 1, ColElems{w(100)}},                        // closing segment
 		{"a\u0316\u0300b", 5, ColElems{w(101), w(0, 220)}}, // closing segment
 		{"a\u0316\u0300", 5, ColElems{w(101), w(0, 220)}},  // no closing segment
@@ -239,12 +243,17 @@ var appendNextTests = []tableTest{
 		{"a\u302F\u18A9\u0301", 9, ColElems{w(102), w(0, 130)}},
 		// expansion within a gap
 		{"a\u0317\u0301", 5, ColElems{w(102), w(0, 220), w(0, 220)}},
-		{"a\u302E\u18A9\u0301", 9, ColElems{w(102), w(0, 131), w(0, 132)}},
-		{
-			"a\u0317\u302E\u18A9\u0301",
-			11,
-			ColElems{w(102), w(0, 220), w(0, 220), w(0, 131), w(0, 132)},
-		},
+		// repeating CCC blocks last modifier
+		{"a\u302E\u302F\u0301", 1, ColElems{w(100)}},
+		// The trailing combining characters (with lower CCC) should block the first one.
+		// TODO: make the following pass.
+		// {"a\u035E\u0316\u0316", 1, ColElems{w(100)}},
+		{"a\u035F\u035Eb", 5, ColElems{w(110), w(0, 233)}},
+		// Last combiner should match after normalization.
+		// TODO: make the following pass.
+		// {"a\u035D\u0301", 3, ColElems{w(102), w(0, 234)}},
+		// The first combiner is blocking the second one as they have the same CCC.
+		{"a\u035D\u035Eb", 1, ColElems{w(100)}},
 	},
 },
}

File diff suppressed because it is too large
@@ -97,3 +97,64 @@ func (t *trie) lookup(s []byte) (v colElem, sz int) {
 	// Illegal rune
 	return 0, 1
 }
+
+// The body of lookupString is a verbatim copy of that of lookup.
+func (t *trie) lookupString(s string) (v colElem, sz int) {
+	c0 := s[0]
+	switch {
+	case c0 < tx:
+		return colElem(t.values0[c0]), 1
+	case c0 < t2:
+		return 0, 1
+	case c0 < t3:
+		if len(s) < 2 {
+			return 0, 0
+		}
+		i := t.index0[c0]
+		c1 := s[1]
+		if c1 < tx || t2 <= c1 {
+			return 0, 1
+		}
+		return t.lookupValue(i, c1), 2
+	case c0 < t4:
+		if len(s) < 3 {
+			return 0, 0
+		}
+		i := t.index0[c0]
+		c1 := s[1]
+		if c1 < tx || t2 <= c1 {
+			return 0, 1
+		}
+		o := int(i)<<6 + int(c1)
+		i = t.index[o]
+		c2 := s[2]
+		if c2 < tx || t2 <= c2 {
+			return 0, 2
+		}
+		return t.lookupValue(i, c2), 3
+	case c0 < t5:
+		if len(s) < 4 {
+			return 0, 0
+		}
+		i := t.index0[c0]
+		c1 := s[1]
+		if c1 < tx || t2 <= c1 {
+			return 0, 1
+		}
+		o := int(i)<<6 + int(c1)
+		i = t.index[o]
+		c2 := s[2]
+		if c2 < tx || t2 <= c2 {
+			return 0, 2
+		}
+		o = int(i)<<6 + int(c2)
+		i = t.index[o]
+		c3 := s[3]
+		if c3 < tx || t2 <= c3 {
+			return 0, 3
+		}
+		return t.lookupValue(i, c3), 4
+	}
+	// Illegal rune
+	return 0, 1
+}
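Aside: the switch in lookupString keys off the UTF-8 lead byte. A self-contained sketch of that classification is below; the threshold values are the standard UTF-8 lead-byte boundaries and are assumed to correspond to the package's unexported tx and t2..t5 constants.

```go
package main

import "fmt"

// Lead-byte thresholds mirroring the tx/t2..t5 split used in the trie lookup.
// These are the standard UTF-8 lead-byte boundaries; the names are assumed to
// match the unexported constants in the package.
const (
	tx = 0x80 // 1000 0000: continuation byte
	t2 = 0xC0 // 1100 0000: first byte of a 2-byte sequence
	t3 = 0xE0 // 1110 0000: first byte of a 3-byte sequence
	t4 = 0xF0 // 1111 0000: first byte of a 4-byte sequence
	t5 = 0xF8 // 1111 1000: illegal lead byte
)

// seqLen reports how many bytes a UTF-8 sequence starting with lead occupies,
// treating ASCII and invalid lead bytes as length 1, mirroring the case
// structure of lookupString.
func seqLen(lead byte) int {
	switch {
	case lead < tx:
		return 1
	case lead < t2:
		return 1 // continuation byte in lead position: illegal
	case lead < t3:
		return 2
	case lead < t4:
		return 3
	case lead < t5:
		return 4
	}
	return 1
}

func main() {
	for _, s := range []string{"a", "é", "한", "𝔸"} {
		fmt.Println(s, seqLen(s[0]), len(s))
	}
}
```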