mirror of
https://github.com/golang/go
synced 2024-11-19 00:44:40 -07:00
exp/locale/collate: include composed characters into the table. This eliminates
the need to decompose characters for the majority of cases. This considerably speeds up collation while increasing the table size minimally. To detect non-normalized strings, rather than relying on exp/norm, the table now includes CCC information. The inclusion of this information does not increase table size. DETAILS - Raw collation elements are now a struct that includes the CCC, rather than a slice of ints. - Builder now ensures that NFD and NFC counterparts are included in the table. This also fixes a bug for Korean, which is responsible for most of the growth of the table size. - As there is no longer a normalization step, code should now handle both strings and byte slices as input. Introduced a source type to facilitate this. NOTES - This change does not yet handle normalization entirely correctly for contractions. This causes a few failures with the regtest. table_test.go contains a few commented-out tests that can be enabled once this is fixed. The easiest approach is to fix this once we have the new norm.Iter. - Removed test cases in table_test that cover cases that are now guaranteed not to exist. R=rsc, mpvl CC=golang-dev https://golang.org/cl/6971044
This commit is contained in:
parent
43f2fc308b
commit
9aa70984a9
@ -98,24 +98,24 @@ func (b *Builder) Tailoring(locale string) *Tailoring {
|
||||
// a value for each colelem that is a variable. (See the reference above.)
|
||||
func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
|
||||
str := string(runes)
|
||||
elems := make([][]int, len(colelems))
|
||||
elems := make([]rawCE, len(colelems))
|
||||
for i, ce := range colelems {
|
||||
elems[i] = append(elems[i], ce...)
|
||||
if len(ce) == 0 {
|
||||
elems[i] = append(elems[i], []int{0, 0, 0, 0}...)
|
||||
break
|
||||
}
|
||||
elems[i] = makeRawCE(ce, 0)
|
||||
if len(ce) == 1 {
|
||||
elems[i] = append(elems[i], defaultSecondary)
|
||||
elems[i].w[1] = defaultSecondary
|
||||
}
|
||||
if len(ce) <= 2 {
|
||||
elems[i] = append(elems[i], defaultTertiary)
|
||||
elems[i].w[2] = defaultTertiary
|
||||
}
|
||||
if len(ce) <= 3 {
|
||||
elems[i] = append(elems[i], ce[0])
|
||||
elems[i].w[3] = ce[0]
|
||||
}
|
||||
}
|
||||
for i, ce := range elems {
|
||||
p := ce.w[0]
|
||||
isvar := false
|
||||
for _, j := range variables {
|
||||
if i == j {
|
||||
@ -123,18 +123,18 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
|
||||
}
|
||||
}
|
||||
if isvar {
|
||||
if ce[0] >= b.minNonVar && b.minNonVar > 0 {
|
||||
return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", ce[0], b.minNonVar)
|
||||
if p >= b.minNonVar && b.minNonVar > 0 {
|
||||
return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
|
||||
}
|
||||
if ce[0] > b.varTop {
|
||||
b.varTop = ce[0]
|
||||
if p > b.varTop {
|
||||
b.varTop = p
|
||||
}
|
||||
} else if ce[0] > 1 { // 1 is a special primary value reserved for FFFE
|
||||
if ce[0] <= b.varTop {
|
||||
return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", ce[0], b.varTop)
|
||||
} else if p > 1 { // 1 is a special primary value reserved for FFFE
|
||||
if p <= b.varTop {
|
||||
return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
|
||||
}
|
||||
if b.minNonVar == 0 || ce[0] < b.minNonVar {
|
||||
b.minNonVar = ce[0]
|
||||
if b.minNonVar == 0 || p < b.minNonVar {
|
||||
b.minNonVar = p
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -142,16 +142,42 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cccs := []uint8{}
|
||||
nfd := norm.NFD.String(str)
|
||||
for i := range nfd {
|
||||
cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
|
||||
}
|
||||
if len(cccs) < len(elems) {
|
||||
if len(cccs) > 2 {
|
||||
return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
|
||||
}
|
||||
p := len(elems) - 1
|
||||
for ; p > 0 && elems[p].w[0] == 0; p-- {
|
||||
elems[p].ccc = cccs[len(cccs)-1]
|
||||
}
|
||||
for ; p >= 0; p-- {
|
||||
elems[p].ccc = cccs[0]
|
||||
}
|
||||
} else {
|
||||
for i := range elems {
|
||||
elems[i].ccc = cccs[i]
|
||||
}
|
||||
}
|
||||
// doNorm in collate.go assumes that the following conditions hold.
|
||||
if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
|
||||
return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
|
||||
}
|
||||
b.root.newEntry(str, elems)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *Tailoring) setAnchor(anchor string) error {
|
||||
anchor = norm.NFD.String(anchor)
|
||||
anchor = norm.NFC.String(anchor)
|
||||
a := t.index.find(anchor)
|
||||
if a == nil {
|
||||
a = t.index.newEntry(anchor, nil)
|
||||
a.implicit = true
|
||||
a.modified = true
|
||||
for _, r := range []rune(anchor) {
|
||||
e := t.index.find(string(r))
|
||||
e.lock = true
|
||||
@ -221,7 +247,7 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
|
||||
if t.anchor == nil {
|
||||
return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
|
||||
}
|
||||
str = norm.NFD.String(str)
|
||||
str = norm.NFC.String(str)
|
||||
e := t.index.find(str)
|
||||
if e == nil {
|
||||
e = t.index.newEntry(str, nil)
|
||||
@ -262,12 +288,13 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
|
||||
}
|
||||
e.extend = norm.NFD.String(extend)
|
||||
e.exclude = false
|
||||
e.modified = true
|
||||
e.elems = nil
|
||||
t.anchor = e
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *ordering) getWeight(e *entry) [][]int {
|
||||
func (o *ordering) getWeight(e *entry) []rawCE {
|
||||
if len(e.elems) == 0 && e.logical == noAnchor {
|
||||
if e.implicit {
|
||||
for _, r := range e.runes {
|
||||
@ -279,11 +306,10 @@ func (o *ordering) getWeight(e *entry) [][]int {
|
||||
for ; a.elems == nil && !a.implicit; a = a.next {
|
||||
count[a.level]++
|
||||
}
|
||||
e.elems = append([][]int(nil), make([]int, len(a.elems[0])))
|
||||
copy(e.elems[0], a.elems[0])
|
||||
e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
|
||||
for i := collate.Primary; i < collate.Quaternary; i++ {
|
||||
if count[i] != 0 {
|
||||
e.elems[0][i] -= count[i]
|
||||
e.elems[0].w[i] -= count[i]
|
||||
break
|
||||
}
|
||||
}
|
||||
@ -315,11 +341,11 @@ func (o *ordering) verifyWeights(a, b *entry, level collate.Level) error {
|
||||
return nil
|
||||
}
|
||||
for i := collate.Primary; i < level; i++ {
|
||||
if a.elems[0][i] < b.elems[0][i] {
|
||||
if a.elems[0].w[i] < b.elems[0].w[i] {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
if a.elems[0][level] >= b.elems[0][level] {
|
||||
if a.elems[0].w[level] >= b.elems[0].w[level] {
|
||||
err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
|
||||
log.Println(err)
|
||||
// TODO: return the error instead, or better, fix the conflicting entry by making room.
|
||||
@ -339,6 +365,54 @@ func (b *Builder) errorID(locale string, e error) {
|
||||
}
|
||||
}
|
||||
|
||||
// patchNorm ensures that NFC and NFD counterparts are consistent.
|
||||
func (o *ordering) patchNorm() {
|
||||
// Insert the NFD counterparts, if necessary.
|
||||
for _, e := range o.ordered {
|
||||
nfd := norm.NFD.String(e.str)
|
||||
if nfd != e.str {
|
||||
if e0 := o.find(nfd); e0 != nil && !e0.modified {
|
||||
e0.elems = e.elems
|
||||
} else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) {
|
||||
e := o.newEntry(nfd, e.elems)
|
||||
e.modified = true
|
||||
}
|
||||
}
|
||||
}
|
||||
// Update unchanged composed forms if one of their parts changed.
|
||||
for _, e := range o.ordered {
|
||||
nfd := norm.NFD.String(e.str)
|
||||
if e.modified || nfd == e.str {
|
||||
continue
|
||||
}
|
||||
if e0 := o.find(nfd); e0 != nil {
|
||||
e.elems = e0.elems
|
||||
} else {
|
||||
e.elems = o.genColElems(nfd)
|
||||
if norm.NFD.LastBoundary([]byte(nfd)) == 0 {
|
||||
r := []rune(nfd)
|
||||
head := string(r[0])
|
||||
tail := ""
|
||||
for i := 1; i < len(r); i++ {
|
||||
s := norm.NFC.String(head + string(r[i]))
|
||||
if e0 := o.find(s); e0 != nil && e0.modified {
|
||||
head = s
|
||||
} else {
|
||||
tail += string(r[i])
|
||||
}
|
||||
}
|
||||
e.elems = append(o.genColElems(head), o.genColElems(tail)...)
|
||||
}
|
||||
}
|
||||
}
|
||||
// Exclude entries for which the individual runes generate the same collation elements.
|
||||
for _, e := range o.ordered {
|
||||
if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) {
|
||||
e.exclude = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Builder) buildOrdering(o *ordering) {
|
||||
for _, e := range o.ordered {
|
||||
o.getWeight(e)
|
||||
@ -346,6 +420,7 @@ func (b *Builder) buildOrdering(o *ordering) {
|
||||
for _, e := range o.ordered {
|
||||
o.addExtension(e)
|
||||
}
|
||||
o.patchNorm()
|
||||
o.sort()
|
||||
simplify(o)
|
||||
b.processExpansions(o) // requires simplify
|
||||
@ -436,20 +511,20 @@ func (b *Builder) Print(w io.Writer) (n int, err error) {
|
||||
|
||||
// reproducibleFromNFKD checks whether the given expansion could be generated
|
||||
// from an NFKD expansion.
|
||||
func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool {
|
||||
func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
|
||||
// Length must be equal.
|
||||
if len(exp) != len(nfkd) {
|
||||
return false
|
||||
}
|
||||
for i, ce := range exp {
|
||||
// Primary and secondary values should be equal.
|
||||
if ce[0] != nfkd[i][0] || ce[1] != nfkd[i][1] {
|
||||
if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
|
||||
return false
|
||||
}
|
||||
// Tertiary values should be equal to maxTertiary for third element onwards.
|
||||
// TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
|
||||
// simply be dropped. Try this out by dropping the following code.
|
||||
if i >= 2 && ce[2] != maxTertiary {
|
||||
if i >= 2 && ce.w[2] != maxTertiary {
|
||||
return false
|
||||
}
|
||||
if _, err := makeCE(ce); err != nil {
|
||||
@ -469,22 +544,12 @@ func simplify(o *ordering) {
|
||||
keep[e.runes[0]] = true
|
||||
}
|
||||
}
|
||||
// Remove entries for which the runes normalize (using NFD) to identical values.
|
||||
for e := o.front(); e != nil; e, _ = e.nextIndexed() {
|
||||
s := e.str
|
||||
nfd := norm.NFD.String(s)
|
||||
if len(e.runes) > 1 || keep[e.runes[0]] || nfd == s {
|
||||
continue
|
||||
}
|
||||
if equalCEArrays(o.genColElems(nfd), e.elems) {
|
||||
e.remove()
|
||||
}
|
||||
}
|
||||
// Tag entries for which the runes NFKD decompose to identical values.
|
||||
for e := o.front(); e != nil; e, _ = e.nextIndexed() {
|
||||
s := e.str
|
||||
nfkd := norm.NFKD.String(s)
|
||||
if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == s {
|
||||
nfd := norm.NFD.String(s)
|
||||
if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
|
||||
continue
|
||||
}
|
||||
if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
|
||||
@ -589,18 +654,18 @@ func (b *Builder) processContractions(o *ordering) {
|
||||
// Bucket sort entries in index order.
|
||||
es := make([]*entry, len(l))
|
||||
for _, e := range l {
|
||||
var o, sn int
|
||||
var p, sn int
|
||||
if len(e.runes) > 1 {
|
||||
str := []byte(string(e.runes[1:]))
|
||||
o, sn = t.contractTries.lookup(handle, str)
|
||||
p, sn = t.contractTries.lookup(handle, str)
|
||||
if sn != len(str) {
|
||||
log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", e.runes, sn, len(str))
|
||||
log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
|
||||
}
|
||||
}
|
||||
if es[o] != nil {
|
||||
log.Fatalf("Multiple contractions for position %d for rune %U", o, e.runes[0])
|
||||
if es[p] != nil {
|
||||
log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
|
||||
}
|
||||
es[o] = e
|
||||
es[p] = e
|
||||
}
|
||||
// Create collation elements for contractions.
|
||||
elems := []uint32{}
|
||||
|
@ -7,48 +7,64 @@ package build
|
||||
import "testing"
|
||||
|
||||
// cjk returns an implicit collation element for a CJK rune.
|
||||
func cjk(r rune) [][]int {
|
||||
func cjk(r rune) []rawCE {
|
||||
// A CJK character C is represented in the DUCET as
|
||||
// [.AAAA.0020.0002.C][.BBBB.0000.0000.C]
|
||||
// Where AAAA is the most significant 15 bits plus a base value.
|
||||
// Any base value will work for the test, so we pick the common value of FB40.
|
||||
const base = 0xFB40
|
||||
return [][]int{
|
||||
{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)},
|
||||
{int(r&0x7FFF) | 0x8000, 0, 0, int(r)},
|
||||
return []rawCE{
|
||||
{w: []int{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)}},
|
||||
{w: []int{int(r&0x7FFF) | 0x8000, 0, 0, int(r)}},
|
||||
}
|
||||
}
|
||||
|
||||
func pCE(p int) [][]int {
|
||||
return [][]int{{p, defaultSecondary, defaultTertiary, 0}}
|
||||
func pCE(p int) []rawCE {
|
||||
return mkCE([]int{p, defaultSecondary, defaultTertiary, 0}, 0)
|
||||
}
|
||||
|
||||
func pqCE(p, q int) [][]int {
|
||||
return [][]int{{p, defaultSecondary, defaultTertiary, q}}
|
||||
func pqCE(p, q int) []rawCE {
|
||||
return mkCE([]int{p, defaultSecondary, defaultTertiary, q}, 0)
|
||||
}
|
||||
|
||||
func ptCE(p, t int) [][]int {
|
||||
return [][]int{{p, defaultSecondary, t, 0}}
|
||||
func ptCE(p, t int) []rawCE {
|
||||
return mkCE([]int{p, defaultSecondary, t, 0}, 0)
|
||||
}
|
||||
|
||||
func sCE(s int) [][]int {
|
||||
return [][]int{{0, s, defaultTertiary, 0}}
|
||||
func ptcCE(p, t int, ccc uint8) []rawCE {
|
||||
return mkCE([]int{p, defaultSecondary, t, 0}, ccc)
|
||||
}
|
||||
|
||||
func stCE(s, t int) [][]int {
|
||||
return [][]int{{0, s, t, 0}}
|
||||
func sCE(s int) []rawCE {
|
||||
return mkCE([]int{0, s, defaultTertiary, 0}, 0)
|
||||
}
|
||||
|
||||
func stCE(s, t int) []rawCE {
|
||||
return mkCE([]int{0, s, t, 0}, 0)
|
||||
}
|
||||
|
||||
func scCE(s int, ccc uint8) []rawCE {
|
||||
return mkCE([]int{0, s, defaultTertiary, 0}, ccc)
|
||||
}
|
||||
|
||||
func mkCE(w []int, ccc uint8) []rawCE {
|
||||
return []rawCE{rawCE{w, ccc}}
|
||||
}
|
||||
|
||||
// ducetElem is used to define test data that is used to generate a table.
|
||||
type ducetElem struct {
|
||||
str string
|
||||
ces [][]int
|
||||
ces []rawCE
|
||||
}
|
||||
|
||||
func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
|
||||
b := NewBuilder()
|
||||
for _, e := range ducet {
|
||||
if err := b.Add([]rune(e.str), e.ces, nil); err != nil {
|
||||
ces := [][]int{}
|
||||
for _, ce := range e.ces {
|
||||
ces = append(ces, ce.w)
|
||||
}
|
||||
if err := b.Add([]rune(e.str), ces, nil); err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
}
|
||||
@ -58,7 +74,7 @@ func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
|
||||
}
|
||||
|
||||
type convertTest struct {
|
||||
in, out [][]int
|
||||
in, out []rawCE
|
||||
err bool
|
||||
}
|
||||
|
||||
@ -173,16 +189,18 @@ func TestSimplify(t *testing.T) {
|
||||
}
|
||||
|
||||
var expandTest = []ducetElem{
|
||||
{"\u00C0", append(ptCE(100, 8), sCE(30)...)},
|
||||
{"\u00C8", append(ptCE(105, 8), sCE(30)...)},
|
||||
{"\u00C9", append(ptCE(105, 8), sCE(30)...)}, // identical expansion
|
||||
{"\u0300", append(scCE(29, 230), scCE(30, 230)...)},
|
||||
{"\u00C0", append(ptCE(100, 8), scCE(30, 230)...)},
|
||||
{"\u00C8", append(ptCE(105, 8), scCE(30, 230)...)},
|
||||
{"\u00C9", append(ptCE(105, 8), scCE(30, 230)...)}, // identical expansion
|
||||
{"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0], ptCE(200, 4)[0])},
|
||||
{"\u01FF", append(ptCE(200, 4), ptcCE(201, 4, 0)[0], scCE(30, 230)[0])},
|
||||
}
|
||||
|
||||
func TestExpand(t *testing.T) {
|
||||
const (
|
||||
totalExpansions = 3
|
||||
totalElements = 2 + 2 + 3 + totalExpansions
|
||||
totalExpansions = 5
|
||||
totalElements = 2 + 2 + 2 + 3 + 3 + totalExpansions
|
||||
)
|
||||
b := newBuilder(t, expandTest)
|
||||
o := &b.root
|
||||
|
@ -16,6 +16,17 @@ const (
|
||||
maxTertiary = 0x1F
|
||||
)
|
||||
|
||||
type rawCE struct {
|
||||
w []int
|
||||
ccc uint8
|
||||
}
|
||||
|
||||
func makeRawCE(w []int, ccc uint8) rawCE {
|
||||
ce := rawCE{w: make([]int, 4), ccc: ccc}
|
||||
copy(ce.w, w)
|
||||
return ce
|
||||
}
|
||||
|
||||
// A collation element is represented as an uint32.
|
||||
// In the typical case, a rune maps to a single collation element. If a rune
|
||||
// can be the start of a contraction or expands into multiple collation elements,
|
||||
@ -29,29 +40,36 @@ const (
|
||||
// 01pppppp pppppppp ppppppp0 ssssssss
|
||||
// - p* is primary collation value
|
||||
// - s* is the secondary collation value
|
||||
// or
|
||||
// 00pppppp pppppppp ppppppps sssttttt, where
|
||||
// - p* is primary collation value
|
||||
// - s* offset of secondary from default value.
|
||||
// - t* is the tertiary collation value
|
||||
// 100ttttt cccccccc pppppppp pppppppp
|
||||
// - t* is the tertiary collation value
|
||||
// - c* is the canonical combining class
|
||||
// - p* is the primary collation value
|
||||
// Collation elements with a secondary value are of the form
|
||||
// 10000000 0000ssss ssssssss tttttttt, where
|
||||
// - 16 BMP implicit -> weight
|
||||
// - 8 bit s
|
||||
// - default tertiary
|
||||
// 1010cccc ccccssss ssssssss tttttttt, where
|
||||
// - c* is the canonical combining class
|
||||
// - s* is the secondary collation value
|
||||
// - t* is the tertiary collation value
|
||||
const (
|
||||
maxPrimaryBits = 21
|
||||
maxPrimaryCompactBits = 16
|
||||
maxSecondaryBits = 12
|
||||
maxSecondaryCompactBits = 8
|
||||
maxCCCBits = 8
|
||||
maxSecondaryDiffBits = 4
|
||||
maxTertiaryBits = 8
|
||||
maxTertiaryCompactBits = 5
|
||||
|
||||
isSecondary = 0x80000000
|
||||
isPrimary = 0x40000000
|
||||
isPrimary = 0x40000000
|
||||
isPrimaryCCC = 0x80000000
|
||||
isSecondary = 0xA0000000
|
||||
)
|
||||
|
||||
func makeCE(weights []int) (uint32, error) {
|
||||
func makeCE(rce rawCE) (uint32, error) {
|
||||
weights := rce.w
|
||||
if w := weights[0]; w >= 1<<maxPrimaryBits || w < 0 {
|
||||
return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
|
||||
}
|
||||
@ -63,14 +81,25 @@ func makeCE(weights []int) (uint32, error) {
|
||||
}
|
||||
ce := uint32(0)
|
||||
if weights[0] != 0 {
|
||||
if weights[2] == defaultTertiary {
|
||||
if rce.ccc != 0 {
|
||||
if weights[0] >= 1<<maxPrimaryCompactBits {
|
||||
return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", weights[0], 1<<maxPrimaryCompactBits)
|
||||
}
|
||||
if weights[1] != defaultSecondary {
|
||||
return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", weights[1], rce.ccc)
|
||||
}
|
||||
ce = uint32(weights[2] << (maxPrimaryCompactBits + maxCCCBits))
|
||||
ce |= uint32(rce.ccc) << maxPrimaryCompactBits
|
||||
ce |= uint32(weights[0])
|
||||
ce |= isPrimaryCCC
|
||||
} else if weights[2] == defaultTertiary {
|
||||
if weights[1] >= 1<<maxSecondaryCompactBits {
|
||||
return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights[1], 1<<maxSecondaryCompactBits)
|
||||
}
|
||||
ce = uint32(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
|
||||
ce |= isPrimary
|
||||
} else {
|
||||
d := weights[1] - defaultSecondary + 4
|
||||
d := weights[1] - defaultSecondary + maxSecondaryDiffBits
|
||||
if d >= 1<<maxSecondaryDiffBits || d < 0 {
|
||||
return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
|
||||
}
|
||||
@ -82,6 +111,7 @@ func makeCE(weights []int) (uint32, error) {
|
||||
}
|
||||
} else {
|
||||
ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
|
||||
ce += uint32(rce.ccc) << (maxSecondaryBits + maxTertiaryBits)
|
||||
ce |= isSecondary
|
||||
}
|
||||
return ce, nil
|
||||
@ -207,7 +237,7 @@ func implicitPrimary(r rune) int {
|
||||
// We will rewrite these characters to a single CE.
|
||||
// We assume the CJK values start at 0x8000.
|
||||
// See http://unicode.org/reports/tr10/#Implicit_Weights
|
||||
func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
||||
func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
|
||||
const (
|
||||
cjkPrimaryStart = 0xFB40
|
||||
rarePrimaryStart = 0xFB80
|
||||
@ -219,7 +249,7 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
||||
shiftBits = 15
|
||||
)
|
||||
for i := 0; i < len(elems); i++ {
|
||||
ce := elems[i]
|
||||
ce := elems[i].w
|
||||
p := ce[0]
|
||||
if p < cjkPrimaryStart {
|
||||
continue
|
||||
@ -233,10 +263,10 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
||||
if i+1 >= len(elems) {
|
||||
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
|
||||
}
|
||||
if elems[i+1][0]&lowBitsFlag == 0 {
|
||||
if elems[i+1].w[0]&lowBitsFlag == 0 {
|
||||
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
|
||||
}
|
||||
np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask
|
||||
np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
|
||||
switch {
|
||||
case p < rarePrimaryStart:
|
||||
np += commonUnifiedOffset
|
||||
@ -257,26 +287,25 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
||||
|
||||
// nextWeight computes the first possible collation weights following elems
|
||||
// for the given level.
|
||||
func nextWeight(level collate.Level, elems [][]int) [][]int {
|
||||
func nextWeight(level collate.Level, elems []rawCE) []rawCE {
|
||||
if level == collate.Identity {
|
||||
next := make([][]int, len(elems))
|
||||
next := make([]rawCE, len(elems))
|
||||
copy(next, elems)
|
||||
return next
|
||||
}
|
||||
next := [][]int{make([]int, len(elems[0]))}
|
||||
copy(next[0], elems[0])
|
||||
next[0][level]++
|
||||
next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
|
||||
next[0].w[level]++
|
||||
if level < collate.Secondary {
|
||||
next[0][collate.Secondary] = defaultSecondary
|
||||
next[0].w[collate.Secondary] = defaultSecondary
|
||||
}
|
||||
if level < collate.Tertiary {
|
||||
next[0][collate.Tertiary] = defaultTertiary
|
||||
next[0].w[collate.Tertiary] = defaultTertiary
|
||||
}
|
||||
// Filter entries that cannot influence ordering.
|
||||
for _, ce := range elems[1:] {
|
||||
skip := true
|
||||
for i := collate.Primary; i < level; i++ {
|
||||
skip = skip && ce[i] == 0
|
||||
skip = skip && ce.w[i] == 0
|
||||
}
|
||||
if !skip {
|
||||
next = append(next, ce)
|
||||
@ -285,18 +314,18 @@ func nextWeight(level collate.Level, elems [][]int) [][]int {
|
||||
return next
|
||||
}
|
||||
|
||||
func nextVal(elems [][]int, i int, level collate.Level) (index, value int) {
|
||||
for ; i < len(elems) && elems[i][level] == 0; i++ {
|
||||
func nextVal(elems []rawCE, i int, level collate.Level) (index, value int) {
|
||||
for ; i < len(elems) && elems[i].w[level] == 0; i++ {
|
||||
}
|
||||
if i < len(elems) {
|
||||
return i, elems[i][level]
|
||||
return i, elems[i].w[level]
|
||||
}
|
||||
return i, 0
|
||||
}
|
||||
|
||||
// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
|
||||
// It also returns the collation level at which the difference is found.
|
||||
func compareWeights(a, b [][]int) (result int, level collate.Level) {
|
||||
func compareWeights(a, b []rawCE) (result int, level collate.Level) {
|
||||
for level := collate.Primary; level < collate.Identity; level++ {
|
||||
var va, vb int
|
||||
for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
|
||||
@ -314,19 +343,16 @@ func compareWeights(a, b [][]int) (result int, level collate.Level) {
|
||||
return 0, collate.Identity
|
||||
}
|
||||
|
||||
func equalCE(a, b []int) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
func equalCE(a, b rawCE) bool {
|
||||
for i := 0; i < 3; i++ {
|
||||
if b[i] != a[i] {
|
||||
if b.w[i] != a.w[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func equalCEArrays(a, b [][]int) bool {
|
||||
func equalCEArrays(a, b []rawCE) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
|
@ -16,7 +16,7 @@ type ceTest struct {
|
||||
}
|
||||
|
||||
func normalCE(in []int) (ce uint32, err error) {
|
||||
return makeCE(in)
|
||||
return makeCE(rawCE{w: in[:3], ccc: uint8(in[3])})
|
||||
}
|
||||
|
||||
func expandCE(in []int) (ce uint32, err error) {
|
||||
@ -32,17 +32,20 @@ func decompCE(in []int) (ce uint32, err error) {
|
||||
}
|
||||
|
||||
var ceTests = []ceTest{
|
||||
{normalCE, []int{0, 0, 0}, 0x80000000},
|
||||
{normalCE, []int{0, 0x28, 3}, 0x80002803},
|
||||
{normalCE, []int{100, defaultSecondary, 3}, 0x0000C883},
|
||||
{normalCE, []int{0, 0, 0, 0}, 0xA0000000},
|
||||
{normalCE, []int{0, 0x28, 3, 0}, 0xA0002803},
|
||||
{normalCE, []int{0, 0x28, 3, 0xFF}, 0xAFF02803},
|
||||
{normalCE, []int{100, defaultSecondary, 3, 0}, 0x0000C883},
|
||||
// non-ignorable primary with non-default secondary
|
||||
{normalCE, []int{100, 0x28, defaultTertiary}, 0x4000C828},
|
||||
{normalCE, []int{100, defaultSecondary + 8, 3}, 0x0000C983},
|
||||
{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-supported secondary
|
||||
{normalCE, []int{100, 1, 3}, 0xFFFF},
|
||||
{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF},
|
||||
{normalCE, []int{0, 1 << maxSecondaryBits, 0}, 0xFFFF},
|
||||
{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits}, 0xFFFF},
|
||||
{normalCE, []int{100, 0x28, defaultTertiary, 0}, 0x4000C828},
|
||||
{normalCE, []int{100, defaultSecondary + 8, 3, 0}, 0x0000C983},
|
||||
{normalCE, []int{100, 0, 3, 0}, 0xFFFF}, // non-ignorable primary with non-supported secondary
|
||||
{normalCE, []int{100, 1, 3, 0}, 0xFFFF},
|
||||
{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0, 0}, 0xFFFF},
|
||||
{normalCE, []int{0, 1 << maxSecondaryBits, 0, 0}, 0xFFFF},
|
||||
{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits, 0}, 0xFFFF},
|
||||
{normalCE, []int{0x123, defaultSecondary, 8, 0xFF}, 0x88FF0123},
|
||||
{normalCE, []int{0x123, defaultSecondary + 1, 8, 0xFF}, 0xFFFF},
|
||||
|
||||
{contractCE, []int{0, 0, 0}, 0xC0000000},
|
||||
{contractCE, []int{1, 1, 1}, 0xC0010011},
|
||||
@ -85,6 +88,14 @@ func TestColElem(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func mkRawCES(in [][]int) []rawCE {
|
||||
out := []rawCE{}
|
||||
for _, w := range in {
|
||||
out = append(out, rawCE{w: w})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
type weightsTest struct {
|
||||
a, b [][]int
|
||||
level collate.Level
|
||||
@ -119,8 +130,8 @@ var extra = [][]int{{200, 32, 8, 0}, {0, 32, 8, 0}, {0, 0, 8, 0}, {0, 0, 0, 0}}
|
||||
func TestNextWeight(t *testing.T) {
|
||||
for i, tt := range nextWeightTests {
|
||||
test := func(l collate.Level, tt weightsTest, a, gold [][]int) {
|
||||
res := nextWeight(tt.level, a)
|
||||
if !equalCEArrays(gold, res) {
|
||||
res := nextWeight(tt.level, mkRawCES(a))
|
||||
if !equalCEArrays(mkRawCES(gold), res) {
|
||||
t.Errorf("%d:%d: expected weights %d; found %d", i, l, gold, res)
|
||||
}
|
||||
}
|
||||
@ -189,7 +200,7 @@ var compareTests = []weightsTest{
|
||||
func TestCompareWeights(t *testing.T) {
|
||||
for i, tt := range compareTests {
|
||||
test := func(tt weightsTest, a, b [][]int) {
|
||||
res, level := compareWeights(a, b)
|
||||
res, level := compareWeights(mkRawCES(a), mkRawCES(b))
|
||||
if res != tt.result {
|
||||
t.Errorf("%d: expected comparisson result %d; found %d", i, tt.result, res)
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ package build
|
||||
|
||||
import (
|
||||
"exp/locale/collate"
|
||||
"exp/norm"
|
||||
"fmt"
|
||||
"log"
|
||||
"sort"
|
||||
@ -28,7 +29,7 @@ const (
|
||||
type entry struct {
|
||||
str string // same as string(runes)
|
||||
runes []rune
|
||||
elems [][]int // the collation elements
|
||||
elems []rawCE // the collation elements
|
||||
extend string // weights of extend to be appended to elems
|
||||
before bool // weights relative to next instead of previous.
|
||||
lock bool // entry is used in extension and can no longer be moved.
|
||||
@ -41,6 +42,7 @@ type entry struct {
|
||||
decompose bool // can use NFKD decomposition to generate elems
|
||||
exclude bool // do not include in table
|
||||
implicit bool // derived, is not included in the list
|
||||
modified bool // entry was modified in tailoring
|
||||
logical logicalAnchor
|
||||
|
||||
expansionIndex int // used to store index into expansion table
|
||||
@ -162,10 +164,10 @@ func (e *entry) encode() (ce uint32, err error) {
|
||||
}
|
||||
switch {
|
||||
case e.decompose:
|
||||
t1 := e.elems[0][2]
|
||||
t1 := e.elems[0].w[2]
|
||||
t2 := 0
|
||||
if len(e.elems) > 1 {
|
||||
t2 = e.elems[1][2]
|
||||
t2 = e.elems[1].w[2]
|
||||
}
|
||||
ce, err = makeDecompose(t1, t2)
|
||||
case e.contractionStarter():
|
||||
@ -231,7 +233,7 @@ func (o *ordering) insert(e *entry) {
|
||||
|
||||
// newEntry creates a new entry for the given info and inserts it into
|
||||
// the index.
|
||||
func (o *ordering) newEntry(s string, ces [][]int) *entry {
|
||||
func (o *ordering) newEntry(s string, ces []rawCE) *entry {
|
||||
e := &entry{
|
||||
runes: []rune(s),
|
||||
elems: ces,
|
||||
@ -249,14 +251,29 @@ func (o *ordering) find(str string) *entry {
|
||||
if e == nil {
|
||||
r := []rune(str)
|
||||
if len(r) == 1 {
|
||||
e = o.newEntry(string(r[0]), [][]int{
|
||||
{
|
||||
implicitPrimary(r[0]),
|
||||
defaultSecondary,
|
||||
defaultTertiary,
|
||||
int(r[0]),
|
||||
},
|
||||
})
|
||||
const (
|
||||
firstHangul = 0xAC00
|
||||
lastHangul = 0xD7A3
|
||||
)
|
||||
if r[0] >= firstHangul && r[0] <= lastHangul {
|
||||
ce := []rawCE{}
|
||||
nfd := norm.NFD.String(str)
|
||||
for _, r := range nfd {
|
||||
ce = append(ce, o.find(string(r)).elems...)
|
||||
}
|
||||
e = o.newEntry(nfd, ce)
|
||||
} else {
|
||||
e = o.newEntry(string(r[0]), []rawCE{
|
||||
{w: []int{
|
||||
implicitPrimary(r[0]),
|
||||
defaultSecondary,
|
||||
defaultTertiary,
|
||||
int(r[0]),
|
||||
},
|
||||
},
|
||||
})
|
||||
e.modified = true
|
||||
}
|
||||
e.exclude = true // do not index implicits
|
||||
}
|
||||
}
|
||||
@ -275,7 +292,7 @@ func makeRootOrdering() ordering {
|
||||
}
|
||||
insert := func(typ logicalAnchor, s string, ce []int) {
|
||||
e := &entry{
|
||||
elems: [][]int{ce},
|
||||
elems: []rawCE{{w: ce}},
|
||||
str: s,
|
||||
exclude: true,
|
||||
logical: typ,
|
||||
@ -362,10 +379,14 @@ func (o *ordering) sort() {
|
||||
|
||||
// genColElems generates a collation element array from the runes in str. This
|
||||
// assumes that all collation elements have already been added to the Builder.
|
||||
func (o *ordering) genColElems(str string) [][]int {
|
||||
elems := [][]int{}
|
||||
func (o *ordering) genColElems(str string) []rawCE {
|
||||
elems := []rawCE{}
|
||||
for _, r := range []rune(str) {
|
||||
elems = append(elems, o.find(string(r)).elems...)
|
||||
for _, ce := range o.find(string(r)).elems {
|
||||
if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 {
|
||||
elems = append(elems, ce)
|
||||
}
|
||||
}
|
||||
}
|
||||
return elems
|
||||
}
|
||||
|
@ -20,7 +20,7 @@ type entryTest struct {
|
||||
// entries plus a leading and trailing anchor.
|
||||
func makeList(n int) []*entry {
|
||||
es := make([]*entry, n+2)
|
||||
weights := [][]int{{100, 20, 5, 0}}
|
||||
weights := []rawCE{{w: []int{100, 20, 5, 0}}}
|
||||
for i := range es {
|
||||
runes := []rune{rune(i)}
|
||||
es[i] = &entry{
|
||||
@ -176,8 +176,8 @@ type entryLessTest struct {
|
||||
}
|
||||
|
||||
var (
|
||||
w1 = [][]int{{100, 20, 5, 5}}
|
||||
w2 = [][]int{{101, 20, 5, 5}}
|
||||
w1 = []rawCE{{w: []int{100, 20, 5, 5}}}
|
||||
w2 = []rawCE{{w: []int{101, 20, 5, 5}}}
|
||||
)
|
||||
|
||||
var entryLessTests = []entryLessTest{
|
||||
|
@ -23,7 +23,7 @@ const (
|
||||
type colElem uint32
|
||||
|
||||
const (
|
||||
maxCE colElem = 0x80FFFFFF
|
||||
maxCE colElem = 0xAFFFFFFF
|
||||
minContract = 0xC0000000
|
||||
maxContract = 0xDFFFFFFF
|
||||
minExpand = 0xE0000000
|
||||
@ -62,30 +62,37 @@ func (ce colElem) ctype() ceType {
|
||||
// 01pppppp pppppppp ppppppp0 ssssssss
|
||||
// - p* is primary collation value
|
||||
// - s* is the secondary collation value
|
||||
// or
|
||||
// 00pppppp pppppppp ppppppps sssttttt, where
|
||||
// - p* is primary collation value
|
||||
// - s* offset of secondary from default value.
|
||||
// - t* is the tertiary collation value
|
||||
// 100ttttt cccccccc pppppppp pppppppp
|
||||
// - t* is the tertiary collation value
|
||||
// - c* is the canonical combining class
|
||||
// - p* is the primary collation value
|
||||
// Collation elements with a secondary value are of the form
|
||||
// 10000000 0000ssss ssssssss tttttttt, where
|
||||
// - 16 BMP implicit -> weight
|
||||
// - 8 bit s
|
||||
// - default tertiary
|
||||
// 1010cccc ccccssss ssssssss tttttttt, where
|
||||
// - c* is the canonical combining class
|
||||
// - s* is the secondary collation value
|
||||
// - t* is the tertiary collation value
|
||||
// 11qqqqqq qqqqqqqq qqqqqqq0 00000000
|
||||
// - q* quaternary value
|
||||
const (
|
||||
ceTypeMask = 0xC0000000
|
||||
ceTypeMaskExt = 0xE0000000
|
||||
ceType1 = 0x40000000
|
||||
ceType2 = 0x00000000
|
||||
ceType3 = 0x80000000
|
||||
ceType3or4 = 0x80000000
|
||||
ceType4 = 0xA0000000
|
||||
ceTypeQ = 0xC0000000
|
||||
ceIgnore = ceType3
|
||||
ceIgnore = ceType4
|
||||
firstNonPrimary = 0x80000000
|
||||
lastSpecialPrimary = 0xA0000000
|
||||
secondaryMask = 0x80000000
|
||||
hasTertiaryMask = 0x40000000
|
||||
primaryValueMask = 0x3FFFFE00
|
||||
primaryShift = 9
|
||||
compactPrimaryBits = 16
|
||||
compactSecondaryShift = 5
|
||||
minCompactSecondary = defaultSecondary - 4
|
||||
)
|
||||
@ -98,9 +105,22 @@ func makeQuaternary(primary int) colElem {
|
||||
return ceTypeQ | colElem(primary<<primaryShift)
|
||||
}
|
||||
|
||||
func (ce colElem) ccc() uint8 {
|
||||
if ce&ceType3or4 != 0 {
|
||||
if ce&ceType4 == ceType3or4 {
|
||||
return uint8(ce >> 16)
|
||||
}
|
||||
return uint8(ce >> 20)
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func (ce colElem) primary() int {
|
||||
if ce >= firstNonPrimary {
|
||||
return 0
|
||||
if ce > lastSpecialPrimary {
|
||||
return 0
|
||||
}
|
||||
return int(uint16(ce))
|
||||
}
|
||||
return int(ce&primaryValueMask) >> primaryShift
|
||||
}
|
||||
@ -111,8 +131,11 @@ func (ce colElem) secondary() int {
|
||||
return int(uint8(ce))
|
||||
case ceType2:
|
||||
return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
|
||||
case ceType3:
|
||||
return int(uint16(ce >> 8))
|
||||
case ceType3or4:
|
||||
if ce < ceType4 {
|
||||
return defaultSecondary
|
||||
}
|
||||
return int(ce>>8) & 0xFFF
|
||||
case ceTypeQ:
|
||||
return 0
|
||||
}
|
||||
@ -121,10 +144,13 @@ func (ce colElem) secondary() int {
|
||||
|
||||
func (ce colElem) tertiary() uint8 {
|
||||
if ce&hasTertiaryMask == 0 {
|
||||
if ce&ceType3 == 0 {
|
||||
if ce&ceType3or4 == 0 {
|
||||
return uint8(ce & 0x1F)
|
||||
}
|
||||
return uint8(ce)
|
||||
if ce&ceType4 == ceType4 {
|
||||
return uint8(ce)
|
||||
}
|
||||
return uint8(ce>>24) & 0x1F // type 2
|
||||
} else if ce&ceTypeMask == ceType1 {
|
||||
return defaultTertiary
|
||||
}
|
||||
@ -134,10 +160,15 @@ func (ce colElem) tertiary() uint8 {
|
||||
|
||||
func (ce colElem) updateTertiary(t uint8) colElem {
|
||||
if ce&ceTypeMask == ceType1 {
|
||||
// convert to type 4
|
||||
nce := ce & primaryValueMask
|
||||
nce |= colElem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
|
||||
ce = nce
|
||||
} else if ce&ceTypeMaskExt == ceType3or4 {
|
||||
ce &= ^colElem(maxTertiary << 24)
|
||||
return ce | (colElem(t) << 24)
|
||||
} else {
|
||||
// type 2 or 4
|
||||
ce &= ^colElem(maxTertiary)
|
||||
}
|
||||
return ce | colElem(t)
|
||||
|
@ -23,12 +23,19 @@ func makeCE(weights []int) colElem {
|
||||
maxSecondaryDiffBits = 4
|
||||
maxTertiaryBits = 8
|
||||
maxTertiaryCompactBits = 5
|
||||
isSecondary = 0x80000000
|
||||
isPrimary = 0x40000000
|
||||
isPrimaryCCC = 0x80000000
|
||||
isSecondary = 0xA0000000
|
||||
)
|
||||
var ce colElem
|
||||
ccc := weights[3]
|
||||
if weights[0] != 0 {
|
||||
if weights[2] == defaultTertiary {
|
||||
if ccc != 0 {
|
||||
ce = colElem(weights[2] << 24)
|
||||
ce |= colElem(ccc) << 16
|
||||
ce |= colElem(weights[0])
|
||||
ce |= isPrimaryCCC
|
||||
} else if weights[2] == defaultTertiary {
|
||||
ce = colElem(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
|
||||
ce |= isPrimary
|
||||
} else {
|
||||
@ -38,6 +45,7 @@ func makeCE(weights []int) colElem {
|
||||
}
|
||||
} else {
|
||||
ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
|
||||
ce += colElem(ccc) << 20
|
||||
ce |= isSecondary
|
||||
}
|
||||
return ce
|
||||
@ -68,10 +76,11 @@ func makeDecompose(t1, t2 int) colElem {
|
||||
}
|
||||
|
||||
func normalCE(inout []int) (ce colElem, t ceType) {
|
||||
w := makeCE(inout)
|
||||
inout[0] = w.primary()
|
||||
inout[1] = w.secondary()
|
||||
inout[2] = int(w.tertiary())
|
||||
ce = makeCE(inout)
|
||||
inout[0] = ce.primary()
|
||||
inout[1] = ce.secondary()
|
||||
inout[2] = int(ce.tertiary())
|
||||
inout[3] = int(ce.ccc())
|
||||
return ce, ceNormal
|
||||
}
|
||||
|
||||
@ -102,9 +111,13 @@ const (
|
||||
)
|
||||
|
||||
var ceTests = []ceTest{
|
||||
{normalCE, []int{0, 0, 0}},
|
||||
{normalCE, []int{0, 30, 3}},
|
||||
{normalCE, []int{100, defaultSecondary, 3}},
|
||||
{normalCE, []int{0, 0, 0, 0}},
|
||||
{normalCE, []int{0, 30, 3, 0}},
|
||||
{normalCE, []int{0, 30, 3, 0xFF}},
|
||||
{normalCE, []int{100, defaultSecondary, defaultTertiary, 0}},
|
||||
{normalCE, []int{100, defaultSecondary, defaultTertiary, 0xFF}},
|
||||
{normalCE, []int{100, defaultSecondary, 3, 0}},
|
||||
{normalCE, []int{0x123, defaultSecondary, 8, 0xFF}},
|
||||
|
||||
{contractCE, []int{0, 0, 0}},
|
||||
{contractCE, []int{1, 1, 1}},
|
||||
@ -127,11 +140,11 @@ func TestColElem(t *testing.T) {
|
||||
copy(inout, tt.arg)
|
||||
ce, typ := tt.f(inout)
|
||||
if ce.ctype() != typ {
|
||||
t.Errorf("%d: type is %d; want %d", i, ce.ctype(), typ)
|
||||
t.Errorf("%d: type is %d; want %d (ColElem: %X)", i, ce.ctype(), typ, ce)
|
||||
}
|
||||
for j, a := range tt.arg {
|
||||
if inout[j] != a {
|
||||
t.Errorf("%d: argument %d is %X; want %X", i, j, inout[j], a)
|
||||
t.Errorf("%d: argument %d is %X; want %X (ColElem: %X)", i, j, inout[j], a, ce)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -176,7 +189,8 @@ func TestUpdateTertiary(t *testing.T) {
|
||||
{0x4000FE20, 0x0000FE8A, 0x0A},
|
||||
{0x4000FE21, 0x0000FEAA, 0x0A},
|
||||
{0x0000FE8B, 0x0000FE83, 0x03},
|
||||
{0x8000CC02, 0x8000CC1B, 0x1B},
|
||||
{0x82FF0188, 0x9BFF0188, 0x1B},
|
||||
{0xAFF0CC02, 0xAFF0CC1B, 0x1B},
|
||||
}
|
||||
for i, tt := range tests {
|
||||
if out := tt.in.updateTertiary(tt.t); out != tt.out {
|
||||
@ -184,3 +198,77 @@ func TestUpdateTertiary(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDoNorm(t *testing.T) {
|
||||
const div = -1 // The insertion point of the next block.
|
||||
tests := []struct {
|
||||
in, out []int
|
||||
}{
|
||||
{in: []int{4, div, 3},
|
||||
out: []int{3, 4},
|
||||
},
|
||||
{in: []int{4, div, 3, 3, 3},
|
||||
out: []int{3, 3, 3, 4},
|
||||
},
|
||||
{in: []int{0, 4, div, 3},
|
||||
out: []int{0, 3, 4},
|
||||
},
|
||||
{in: []int{0, 0, 4, 5, div, 3, 3},
|
||||
out: []int{0, 0, 3, 3, 4, 5},
|
||||
},
|
||||
{in: []int{0, 0, 1, 4, 5, div, 3, 3},
|
||||
out: []int{0, 0, 1, 3, 3, 4, 5},
|
||||
},
|
||||
{in: []int{0, 0, 1, 4, 5, div, 4, 4},
|
||||
out: []int{0, 0, 1, 4, 4, 4, 5},
|
||||
},
|
||||
}
|
||||
for j, tt := range tests {
|
||||
i := iter{}
|
||||
var w, p, s int
|
||||
for k, cc := range tt.in {
|
||||
if cc == 0 {
|
||||
s = 0
|
||||
}
|
||||
if cc == div {
|
||||
w = 100
|
||||
p = k
|
||||
i.pStarter = s
|
||||
continue
|
||||
}
|
||||
i.ce = append(i.ce, makeCE([]int{w, 20, 2, cc}))
|
||||
}
|
||||
i.prevCCC = i.ce[p-1].ccc()
|
||||
i.doNorm(p, i.ce[p].ccc())
|
||||
if len(i.ce) != len(tt.out) {
|
||||
t.Errorf("%d: length was %d; want %d", j, len(i.ce), len(tt.out))
|
||||
}
|
||||
prevCCC := uint8(0)
|
||||
for k, ce := range i.ce {
|
||||
if int(ce.ccc()) != tt.out[k] {
|
||||
t.Errorf("%d:%d: unexpected CCC. Was %d; want %d", j, k, ce.ccc(), tt.out[k])
|
||||
}
|
||||
if k > 0 && ce.ccc() == prevCCC && i.ce[k-1].primary() > ce.primary() {
|
||||
t.Errorf("%d:%d: normalization crossed across CCC boundary.", j, k)
|
||||
}
|
||||
}
|
||||
}
|
||||
// test cutoff of large sequence of combining characters.
|
||||
result := []uint8{8, 8, 8, 5, 5}
|
||||
for o := -2; o <= 2; o++ {
|
||||
i := iter{pStarter: 2, prevCCC: 8}
|
||||
n := maxCombiningCharacters + 1 + o
|
||||
for j := 1; j < n+i.pStarter; j++ {
|
||||
i.ce = append(i.ce, makeCE([]int{100, 20, 2, 8}))
|
||||
}
|
||||
p := len(i.ce)
|
||||
i.ce = append(i.ce, makeCE([]int{0, 20, 2, 5}))
|
||||
i.doNorm(p, 5)
|
||||
if i.prevCCC != result[o+2] {
|
||||
t.Errorf("%d: i.prevCCC was %d; want %d", n, i.prevCCC, result[o+2])
|
||||
}
|
||||
if result[o+2] == 5 && i.pStarter != p {
|
||||
t.Errorf("%d: i.pStarter was %d; want %d", n, i.pStarter, p)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -10,6 +10,7 @@ package collate
|
||||
import (
|
||||
"bytes"
|
||||
"exp/norm"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Level identifies the collation comparison level.
|
||||
@ -112,7 +113,7 @@ func New(loc string) *Collator {
|
||||
|
||||
func newCollator(t *table) *Collator {
|
||||
c := &Collator{
|
||||
Strength: Quaternary,
|
||||
Strength: Tertiary,
|
||||
f: norm.NFD,
|
||||
t: t,
|
||||
}
|
||||
@ -269,8 +270,7 @@ func (c *Collator) key(buf *Buffer, w []colElem) []byte {
|
||||
func (c *Collator) getColElems(str []byte) []colElem {
|
||||
i := c.iter(0)
|
||||
i.setInput(c, str)
|
||||
for !i.done() {
|
||||
i.next()
|
||||
for i.next() {
|
||||
}
|
||||
return i.ce
|
||||
}
|
||||
@ -278,88 +278,185 @@ func (c *Collator) getColElems(str []byte) []colElem {
|
||||
func (c *Collator) getColElemsString(str string) []colElem {
|
||||
i := c.iter(0)
|
||||
i.setInputString(c, str)
|
||||
for !i.done() {
|
||||
i.next()
|
||||
for i.next() {
|
||||
}
|
||||
return i.ce
|
||||
}
|
||||
|
||||
type source struct {
|
||||
str string
|
||||
bytes []byte
|
||||
buf [16]byte // Used for decomposing Hangul.
|
||||
}
|
||||
|
||||
func (src *source) done() bool {
|
||||
return len(src.str) == 0 && len(src.bytes) == 0
|
||||
}
|
||||
|
||||
func (src *source) tail(n int) (res source) {
|
||||
if src.bytes == nil {
|
||||
res.str = src.str[n:]
|
||||
} else {
|
||||
res.bytes = src.bytes[n:]
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func (src *source) nfd(end int) []byte {
|
||||
if src.bytes == nil {
|
||||
return norm.NFD.AppendString(src.buf[:0], src.str[:end])
|
||||
}
|
||||
return norm.NFD.Append(src.buf[:0], src.bytes[:end]...)
|
||||
}
|
||||
|
||||
func (src *source) properties(f norm.Form) norm.Properties {
|
||||
if src.bytes == nil {
|
||||
return f.PropertiesString(src.str)
|
||||
}
|
||||
return f.Properties(src.bytes)
|
||||
}
|
||||
|
||||
func (src *source) lookup(t *table) (ce colElem, sz int) {
|
||||
if src.bytes == nil {
|
||||
return t.index.lookupString(src.str)
|
||||
}
|
||||
return t.index.lookup(src.bytes)
|
||||
}
|
||||
|
||||
func (src *source) rune() (r rune, sz int) {
|
||||
if src.bytes == nil {
|
||||
return utf8.DecodeRuneInString(src.str)
|
||||
}
|
||||
return utf8.DecodeRune(src.bytes)
|
||||
}
|
||||
|
||||
type iter struct {
|
||||
src norm.Iter
|
||||
norm [1024]byte
|
||||
buf []byte
|
||||
p int
|
||||
minBufSize int
|
||||
src source
|
||||
|
||||
wa [512]colElem
|
||||
ce []colElem
|
||||
pce int
|
||||
nce int // nce <= len(ce)
|
||||
|
||||
t *table
|
||||
_done, eof bool
|
||||
prevCCC uint8
|
||||
pStarter int
|
||||
|
||||
t *table
|
||||
}
|
||||
|
||||
func (i *iter) init(c *Collator) {
|
||||
i.t = c.t
|
||||
i.minBufSize = c.t.maxContractLen
|
||||
i.ce = i.wa[:0]
|
||||
i.buf = i.norm[:0]
|
||||
}
|
||||
|
||||
func (i *iter) reset() {
|
||||
i.ce = i.ce[:0]
|
||||
i.buf = i.buf[:0]
|
||||
i.p = 0
|
||||
i.eof = i.src.Done()
|
||||
i._done = i.eof
|
||||
i.nce = 0
|
||||
i.prevCCC = 0
|
||||
i.pStarter = 0
|
||||
}
|
||||
|
||||
func (i *iter) setInput(c *Collator, s []byte) *iter {
|
||||
i.src.SetInput(c.f, s)
|
||||
i.src.bytes = s
|
||||
i.src.str = ""
|
||||
i.reset()
|
||||
return i
|
||||
}
|
||||
|
||||
func (i *iter) setInputString(c *Collator, s string) *iter {
|
||||
i.src.SetInputString(c.f, s)
|
||||
i.src.str = s
|
||||
i.src.bytes = nil
|
||||
i.reset()
|
||||
return i
|
||||
}
|
||||
|
||||
func (i *iter) done() bool {
|
||||
return i._done
|
||||
// next appends colElems to the internal array until it adds an element with CCC=0.
|
||||
// In the majority of cases, a colElem with a primary value > 0 will have
|
||||
// a CCC of 0. The CCC values of colation elements are also used to detect if the
|
||||
// input string was not normalized and to adjust the result accordingly.
|
||||
func (i *iter) next() bool {
|
||||
sz := 0
|
||||
for !i.src.done() {
|
||||
p0 := len(i.ce)
|
||||
i.ce, sz = i.t.appendNext(i.ce, i.src)
|
||||
i.src = i.src.tail(sz)
|
||||
last := len(i.ce) - 1
|
||||
if ccc := i.ce[last].ccc(); ccc == 0 {
|
||||
i.nce = len(i.ce)
|
||||
i.pStarter = last
|
||||
i.prevCCC = 0
|
||||
return true
|
||||
} else if p0 < last && i.ce[p0].ccc() == 0 {
|
||||
// set i.nce to only cover part of i.ce for which ccc == 0 and
|
||||
// use rest the next call to next.
|
||||
for p0++; p0 < last && i.ce[p0].ccc() == 0; p0++ {
|
||||
}
|
||||
i.nce = p0
|
||||
i.pStarter = p0 - 1
|
||||
i.prevCCC = ccc
|
||||
return true
|
||||
} else if ccc < i.prevCCC {
|
||||
i.doNorm(p0, ccc) // should be rare for most common cases
|
||||
} else {
|
||||
i.prevCCC = ccc
|
||||
}
|
||||
}
|
||||
if len(i.ce) != i.nce {
|
||||
i.nce = len(i.ce)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (i *iter) next() {
|
||||
if !i.eof && len(i.buf)-i.p < i.minBufSize {
|
||||
// replenish buffer
|
||||
n := copy(i.buf, i.buf[i.p:])
|
||||
n += i.src.Next(i.buf[n:cap(i.buf)])
|
||||
i.buf = i.buf[:n]
|
||||
i.p = 0
|
||||
i.eof = i.src.Done()
|
||||
}
|
||||
if i.p == len(i.buf) {
|
||||
i._done = true
|
||||
return
|
||||
// nextPlain is the same as next, but does not "normalize" the collation
|
||||
// elements.
|
||||
// TODO: remove this function. Using this instead of next does not seem
|
||||
// to improve performance in any significant way. We retain this until
|
||||
// later for evaluation purposes.
|
||||
func (i *iter) nextPlain() bool {
|
||||
if i.src.done() {
|
||||
return false
|
||||
}
|
||||
sz := 0
|
||||
i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:])
|
||||
i.p += sz
|
||||
i.ce, sz = i.t.appendNext(i.ce, i.src)
|
||||
i.src = i.src.tail(sz)
|
||||
i.nce = len(i.ce)
|
||||
return true
|
||||
}
|
||||
|
||||
const maxCombiningCharacters = 30
|
||||
|
||||
// doNorm reorders the collation elements in i.ce.
|
||||
// It assumes that blocks of collation elements added with appendNext
|
||||
// either start and end with the same CCC or start with CCC == 0.
|
||||
// This allows for a single insertion point for the entire block.
|
||||
// The correctness of this assumption is verified in builder.go.
|
||||
func (i *iter) doNorm(p int, ccc uint8) {
|
||||
if p-i.pStarter > maxCombiningCharacters {
|
||||
i.prevCCC = i.ce[len(i.ce)-1].ccc()
|
||||
i.pStarter = len(i.ce) - 1
|
||||
return
|
||||
}
|
||||
n := len(i.ce)
|
||||
k := p
|
||||
for p--; p > i.pStarter && ccc < i.ce[p-1].ccc(); p-- {
|
||||
}
|
||||
i.ce = append(i.ce, i.ce[p:k]...)
|
||||
copy(i.ce[p:], i.ce[k:])
|
||||
i.ce = i.ce[:n]
|
||||
}
|
||||
|
||||
func (i *iter) nextPrimary() int {
|
||||
for {
|
||||
for ; i.pce < len(i.ce); i.pce++ {
|
||||
for ; i.pce < i.nce; i.pce++ {
|
||||
if v := i.ce[i.pce].primary(); v != 0 {
|
||||
i.pce++
|
||||
return v
|
||||
}
|
||||
}
|
||||
if i.done() {
|
||||
if !i.next() {
|
||||
return 0
|
||||
}
|
||||
i.next()
|
||||
}
|
||||
panic("should not reach here")
|
||||
}
|
||||
|
@ -378,6 +378,7 @@ var keyTests = []keyTest{
|
||||
func TestKey(t *testing.T) {
|
||||
c, _ := makeTable(appendNextTests[4].in)
|
||||
c.Alternate = collate.AltShifted
|
||||
c.Strength = collate.Quaternary
|
||||
buf := collate.Buffer{}
|
||||
keys1 := [][]byte{}
|
||||
keys2 := [][]byte{}
|
||||
|
@ -27,8 +27,21 @@ type ctScanner struct {
|
||||
done bool
|
||||
}
|
||||
|
||||
type ctScannerString struct {
|
||||
states contractTrieSet
|
||||
s string
|
||||
n int
|
||||
index int
|
||||
pindex int
|
||||
done bool
|
||||
}
|
||||
|
||||
func (t contractTrieSet) scanner(index, n int, b []byte) ctScanner {
|
||||
return ctScanner{states: t[index:], s: b, n: n}
|
||||
return ctScanner{s: b, states: t[index:], n: n}
|
||||
}
|
||||
|
||||
func (t contractTrieSet) scannerString(index, n int, str string) ctScannerString {
|
||||
return ctScannerString{s: str, states: t[index:], n: n}
|
||||
}
|
||||
|
||||
// result returns the offset i and bytes consumed p so far. If no suffix
|
||||
@ -37,6 +50,10 @@ func (s *ctScanner) result() (i, p int) {
|
||||
return s.index, s.pindex
|
||||
}
|
||||
|
||||
func (s *ctScannerString) result() (i, p int) {
|
||||
return s.index, s.pindex
|
||||
}
|
||||
|
||||
const (
|
||||
final = 0
|
||||
noIndex = 0xFF
|
||||
@ -84,3 +101,45 @@ func (s *ctScanner) scan(p int) int {
|
||||
}
|
||||
return pr
|
||||
}
|
||||
|
||||
// scan is a verbatim copy of ctScanner.scan.
|
||||
func (s *ctScannerString) scan(p int) int {
|
||||
pr := p // the p at the rune start
|
||||
str := s.s
|
||||
states, n := s.states, s.n
|
||||
for i := 0; i < n && p < len(str); {
|
||||
e := states[i]
|
||||
c := str[p]
|
||||
// TODO: a significant number of contractions are of a form that
|
||||
// cannot match discontiguous UTF-8 in a normalized string. We could let
|
||||
// a negative value of e.n mean that we can set s.done = true and avoid
|
||||
// the need for additional matches.
|
||||
if c >= e.l {
|
||||
if e.l == c {
|
||||
p++
|
||||
if e.i != noIndex {
|
||||
s.index = int(e.i)
|
||||
s.pindex = p
|
||||
}
|
||||
if e.n != final {
|
||||
i, states, n = 0, states[int(e.h)+n:], int(e.n)
|
||||
if p >= len(str) || utf8.RuneStart(str[p]) {
|
||||
s.states, s.n, pr = states, n, p
|
||||
}
|
||||
} else {
|
||||
s.done = true
|
||||
return p
|
||||
}
|
||||
continue
|
||||
} else if e.n == final && c <= e.h {
|
||||
p++
|
||||
s.done = true
|
||||
s.index = int(c-e.l) + int(e.i)
|
||||
s.pindex = p
|
||||
return p
|
||||
}
|
||||
}
|
||||
i++
|
||||
}
|
||||
return pr
|
||||
}
|
||||
|
@ -30,7 +30,7 @@ func W(ce ...int) Weights {
|
||||
return w
|
||||
}
|
||||
func (w Weights) String() string {
|
||||
return fmt.Sprintf("[%d.%d.%d.%d]", w.Primary, w.Secondary, w.Tertiary, w.Quaternary)
|
||||
return fmt.Sprintf("[%X.%X.%X.%X]", w.Primary, w.Secondary, w.Tertiary, w.Quaternary)
|
||||
}
|
||||
|
||||
type Table struct {
|
||||
@ -52,7 +52,7 @@ func convertToWeights(ws []colElem) []Weights {
|
||||
func convertFromWeights(ws []Weights) []colElem {
|
||||
out := make([]colElem, len(ws))
|
||||
for i, w := range ws {
|
||||
out[i] = makeCE([]int{w.Primary, w.Secondary, w.Tertiary})
|
||||
out[i] = makeCE([]int{w.Primary, w.Secondary, w.Tertiary, 0})
|
||||
if out[i] == ceIgnore && w.Quaternary > 0 {
|
||||
out[i] = makeQuaternary(w.Quaternary)
|
||||
}
|
||||
@ -61,7 +61,7 @@ func convertFromWeights(ws []Weights) []colElem {
|
||||
}
|
||||
|
||||
func (t *Table) AppendNext(s []byte) ([]Weights, int) {
|
||||
w, n := t.t.appendNext(nil, s)
|
||||
w, n := t.t.appendNext(nil, source{bytes: s})
|
||||
return convertToWeights(w), n
|
||||
}
|
||||
|
||||
|
@ -42,13 +42,26 @@ func (t *table) indexedTable(idx tableIndex) *table {
|
||||
// sequence of runes, the weights for the interstitial runes are
|
||||
// appended as well. It returns a new slice that includes the appended
|
||||
// weights and the number of bytes consumed from s.
|
||||
func (t *table) appendNext(w []colElem, s []byte) ([]colElem, int) {
|
||||
v, sz := t.index.lookup(s)
|
||||
ce := colElem(v)
|
||||
func (t *table) appendNext(w []colElem, src source) (res []colElem, n int) {
|
||||
ce, sz := src.lookup(t)
|
||||
tp := ce.ctype()
|
||||
if tp == ceNormal {
|
||||
if ce == 0 {
|
||||
r, _ := utf8.DecodeRune(s)
|
||||
r, _ := src.rune()
|
||||
const (
|
||||
hangulSize = 3
|
||||
firstHangul = 0xAC00
|
||||
lastHangul = 0xD7A3
|
||||
)
|
||||
if r >= firstHangul && r <= lastHangul {
|
||||
// TODO: performance can be considerably improved here.
|
||||
n = sz
|
||||
for b := src.nfd(hangulSize); len(b) > 0; b = b[sz:] {
|
||||
ce, sz = t.index.lookup(b)
|
||||
w = append(w, ce)
|
||||
}
|
||||
return w, n
|
||||
}
|
||||
ce = makeImplicitCE(implicitPrimary(r))
|
||||
}
|
||||
w = append(w, ce)
|
||||
@ -56,15 +69,20 @@ func (t *table) appendNext(w []colElem, s []byte) ([]colElem, int) {
|
||||
w = t.appendExpansion(w, ce)
|
||||
} else if tp == ceContractionIndex {
|
||||
n := 0
|
||||
w, n = t.matchContraction(w, ce, s[sz:])
|
||||
src = src.tail(sz)
|
||||
if src.bytes == nil {
|
||||
w, n = t.matchContractionString(w, ce, src.str)
|
||||
} else {
|
||||
w, n = t.matchContraction(w, ce, src.bytes)
|
||||
}
|
||||
sz += n
|
||||
} else if tp == ceDecompose {
|
||||
// Decompose using NFCK and replace tertiary weights.
|
||||
// Decompose using NFKD and replace tertiary weights.
|
||||
t1, t2 := splitDecompose(ce)
|
||||
i := len(w)
|
||||
nfkd := norm.NFKD.Properties(s).Decomposition()
|
||||
nfkd := src.properties(norm.NFKD).Decomposition()
|
||||
for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
|
||||
w, p = t.appendNext(w, nfkd)
|
||||
w, p = t.appendNext(w, source{bytes: nfkd})
|
||||
}
|
||||
w[i] = w[i].updateTertiary(t1)
|
||||
if i++; i < len(w) {
|
||||
@ -99,16 +117,17 @@ func (t *table) matchContraction(w []colElem, ce colElem, suffix []byte) ([]colE
|
||||
// By now we should have filtered most cases.
|
||||
p0 := p
|
||||
bufn := 0
|
||||
rune := norm.NFC.Properties(suffix[p:])
|
||||
rune := norm.NFD.Properties(suffix[p:])
|
||||
p += rune.Size()
|
||||
if prevCC := rune.TrailCCC(); prevCC != 0 {
|
||||
if rune.LeadCCC() != 0 {
|
||||
prevCC := rune.TrailCCC()
|
||||
// A gap may only occur in the last normalization segment.
|
||||
// This also ensures that len(scan.s) < norm.MaxSegmentSize.
|
||||
if end := norm.NFC.FirstBoundary(suffix[p:]); end != -1 {
|
||||
if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
|
||||
scan.s = suffix[:p+end]
|
||||
}
|
||||
for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
|
||||
rune = norm.NFC.Properties(suffix[p:])
|
||||
rune = norm.NFD.Properties(suffix[p:])
|
||||
if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
|
||||
break
|
||||
}
|
||||
@ -136,7 +155,65 @@ func (t *table) matchContraction(w []colElem, ce colElem, suffix []byte) ([]colE
|
||||
}
|
||||
// Append weights for the runes in the segment not part of the contraction.
|
||||
for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
|
||||
w, p = t.appendNext(w, b)
|
||||
w, p = t.appendNext(w, source{bytes: b})
|
||||
}
|
||||
return w, n
|
||||
}
|
||||
|
||||
// TODO: unify the two implementations. This is best done after first simplifying
|
||||
// the algorithm taking into account the inclusion of both NFC and NFD forms
|
||||
// in the table.
|
||||
func (t *table) matchContractionString(w []colElem, ce colElem, suffix string) ([]colElem, int) {
|
||||
index, n, offset := splitContractIndex(ce)
|
||||
|
||||
scan := t.contractTries.scannerString(index, n, suffix)
|
||||
buf := [norm.MaxSegmentSize]byte{}
|
||||
bufp := 0
|
||||
p := scan.scan(0)
|
||||
|
||||
if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
|
||||
// By now we should have filtered most cases.
|
||||
p0 := p
|
||||
bufn := 0
|
||||
rune := norm.NFD.PropertiesString(suffix[p:])
|
||||
p += rune.Size()
|
||||
if rune.LeadCCC() != 0 {
|
||||
prevCC := rune.TrailCCC()
|
||||
// A gap may only occur in the last normalization segment.
|
||||
// This also ensures that len(scan.s) < norm.MaxSegmentSize.
|
||||
if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
|
||||
scan.s = suffix[:p+end]
|
||||
}
|
||||
for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
|
||||
rune = norm.NFD.PropertiesString(suffix[p:])
|
||||
if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
|
||||
break
|
||||
}
|
||||
prevCC = rune.TrailCCC()
|
||||
if pp := scan.scan(p); pp != p {
|
||||
// Copy the interstitial runes for later processing.
|
||||
bufn += copy(buf[bufn:], suffix[p0:p])
|
||||
if scan.pindex == pp {
|
||||
bufp = bufn
|
||||
}
|
||||
p, p0 = pp, pp
|
||||
} else {
|
||||
p += rune.Size()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Append weights for the matched contraction, which may be an expansion.
|
||||
i, n := scan.result()
|
||||
ce = colElem(t.contractElem[i+offset])
|
||||
if ce.ctype() == ceNormal {
|
||||
w = append(w, ce)
|
||||
} else {
|
||||
w = t.appendExpansion(w, ce)
|
||||
}
|
||||
// Append weights for the runes in the segment not part of the contraction.
|
||||
for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
|
||||
w, p = t.appendNext(w, source{bytes: b})
|
||||
}
|
||||
return w, n
|
||||
}
|
||||
|
@ -42,7 +42,9 @@ func pt(p, t int) []int {
|
||||
func makeTable(in []input) (*collate.Collator, error) {
|
||||
b := build.NewBuilder()
|
||||
for _, r := range in {
|
||||
b.Add([]rune(r.str), r.ces, nil)
|
||||
if e := b.Add([]rune(r.str), r.ces, nil); e != nil {
|
||||
panic(e)
|
||||
}
|
||||
}
|
||||
return b.Build()
|
||||
}
|
||||
@ -159,6 +161,7 @@ var appendNextTests = []tableTest{
|
||||
{"b", [][]int{{200}}},
|
||||
{"c", [][]int{{300}}},
|
||||
{"\u03B1", [][]int{{900}}},
|
||||
{"\x01", [][]int{{0, 0, 0, 0}}},
|
||||
|
||||
// contractions
|
||||
{"a\u0300", [][]int{{101}}},
|
||||
@ -171,10 +174,11 @@ var appendNextTests = []tableTest{
|
||||
{"a\u0301\u035F", [][]int{{121}}},
|
||||
{"a\u0301\u035Fb", [][]int{{119}}},
|
||||
{"\u03B1\u0345", [][]int{{901}, {902}}},
|
||||
{"\u302E\u18A9", [][]int{{0, 131}, {0, 132}}},
|
||||
{"\u302E\u302F", [][]int{{0, 131}, {0, 131}}},
|
||||
{"\u302F\u18A9", [][]int{{0, 130}}},
|
||||
}...),
|
||||
[]check{
|
||||
{"a\x01\u0300", 1, ColElems{w(100)}},
|
||||
{"ab", 1, ColElems{w(100)}}, // closing segment
|
||||
{"a\u0316\u0300b", 5, ColElems{w(101), w(0, 220)}}, // closing segment
|
||||
{"a\u0316\u0300", 5, ColElems{w(101), w(0, 220)}}, // no closing segment
|
||||
@ -239,12 +243,17 @@ var appendNextTests = []tableTest{
|
||||
{"a\u302F\u18A9\u0301", 9, ColElems{w(102), w(0, 130)}},
|
||||
// expansion within a gap
|
||||
{"a\u0317\u0301", 5, ColElems{w(102), w(0, 220), w(0, 220)}},
|
||||
{"a\u302E\u18A9\u0301", 9, ColElems{w(102), w(0, 131), w(0, 132)}},
|
||||
{
|
||||
"a\u0317\u302E\u18A9\u0301",
|
||||
11,
|
||||
ColElems{w(102), w(0, 220), w(0, 220), w(0, 131), w(0, 132)},
|
||||
},
|
||||
// repeating CCC blocks last modifier
|
||||
{"a\u302E\u302F\u0301", 1, ColElems{w(100)}},
|
||||
// The trailing combining characters (with lower CCC) should block the first one.
|
||||
// TODO: make the following pass.
|
||||
// {"a\u035E\u0316\u0316", 1, ColElems{w(100)}},
|
||||
{"a\u035F\u035Eb", 5, ColElems{w(110), w(0, 233)}},
|
||||
// Last combiner should match after normalization.
|
||||
// TODO: make the following pass.
|
||||
// {"a\u035D\u0301", 3, ColElems{w(102), w(0, 234)}},
|
||||
// The first combiner is blocking the second one as they have the same CCC.
|
||||
{"a\u035D\u035Eb", 1, ColElems{w(100)}},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -97,3 +97,64 @@ func (t *trie) lookup(s []byte) (v colElem, sz int) {
|
||||
// Illegal rune
|
||||
return 0, 1
|
||||
}
|
||||
|
||||
// The body of lookupString is a verbatim copy of that of lookup.
|
||||
func (t *trie) lookupString(s string) (v colElem, sz int) {
|
||||
c0 := s[0]
|
||||
switch {
|
||||
case c0 < tx:
|
||||
return colElem(t.values0[c0]), 1
|
||||
case c0 < t2:
|
||||
return 0, 1
|
||||
case c0 < t3:
|
||||
if len(s) < 2 {
|
||||
return 0, 0
|
||||
}
|
||||
i := t.index0[c0]
|
||||
c1 := s[1]
|
||||
if c1 < tx || t2 <= c1 {
|
||||
return 0, 1
|
||||
}
|
||||
return t.lookupValue(i, c1), 2
|
||||
case c0 < t4:
|
||||
if len(s) < 3 {
|
||||
return 0, 0
|
||||
}
|
||||
i := t.index0[c0]
|
||||
c1 := s[1]
|
||||
if c1 < tx || t2 <= c1 {
|
||||
return 0, 1
|
||||
}
|
||||
o := int(i)<<6 + int(c1)
|
||||
i = t.index[o]
|
||||
c2 := s[2]
|
||||
if c2 < tx || t2 <= c2 {
|
||||
return 0, 2
|
||||
}
|
||||
return t.lookupValue(i, c2), 3
|
||||
case c0 < t5:
|
||||
if len(s) < 4 {
|
||||
return 0, 0
|
||||
}
|
||||
i := t.index0[c0]
|
||||
c1 := s[1]
|
||||
if c1 < tx || t2 <= c1 {
|
||||
return 0, 1
|
||||
}
|
||||
o := int(i)<<6 + int(c1)
|
||||
i = t.index[o]
|
||||
c2 := s[2]
|
||||
if c2 < tx || t2 <= c2 {
|
||||
return 0, 2
|
||||
}
|
||||
o = int(i)<<6 + int(c2)
|
||||
i = t.index[o]
|
||||
c3 := s[3]
|
||||
if c3 < tx || t2 <= c3 {
|
||||
return 0, 3
|
||||
}
|
||||
return t.lookupValue(i, c3), 4
|
||||
}
|
||||
// Illegal rune
|
||||
return 0, 1
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user