1
0
mirror of https://github.com/golang/go synced 2024-11-19 00:44:40 -07:00

exp/locale/collate: include composed characters into the table. This eliminates

the need to decompose characters for the majority of cases.  This considerably
speeds up collation while increasing the table size minimally.

To detect non-normalized strings, rather than relying on exp/norm, the table
now includes CCC information. The inclusion of this information does not
increase table size.

DETAILS
 - Raw collation elements are now a struct that includes the CCC, rather
   than a slice of ints.
 - Builder now ensures that NFD and NFC counterparts are included in the table.
   This also fixes a bug for Korean which is responsible for most of the growth
   of the table size.
 - As there is no more normalization step, code should now handle both strings
   and byte slices as input. Introduced source type to facilitate this.

NOTES
 - This change does not handle normalization correctly entirely for contractions.
   This causes a few failures with the regtest. table_test.go contains a few
   uncommented tests that can be enabled once this is fixed.  The easiest is to
   fix this once we have the new norm.Iter.
 - Removed a test cases in table_test that covers cases that are now guaranteed
   to not exist.

R=rsc, mpvl
CC=golang-dev
https://golang.org/cl/6971044
This commit is contained in:
Marcel van Lohuizen 2012-12-24 16:42:29 +01:00
parent 43f2fc308b
commit 9aa70984a9
16 changed files with 46794 additions and 42287 deletions

View File

@ -98,24 +98,24 @@ func (b *Builder) Tailoring(locale string) *Tailoring {
// a value for each colelem that is a variable. (See the reference above.)
func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
str := string(runes)
elems := make([][]int, len(colelems))
elems := make([]rawCE, len(colelems))
for i, ce := range colelems {
elems[i] = append(elems[i], ce...)
if len(ce) == 0 {
elems[i] = append(elems[i], []int{0, 0, 0, 0}...)
break
}
elems[i] = makeRawCE(ce, 0)
if len(ce) == 1 {
elems[i] = append(elems[i], defaultSecondary)
elems[i].w[1] = defaultSecondary
}
if len(ce) <= 2 {
elems[i] = append(elems[i], defaultTertiary)
elems[i].w[2] = defaultTertiary
}
if len(ce) <= 3 {
elems[i] = append(elems[i], ce[0])
elems[i].w[3] = ce[0]
}
}
for i, ce := range elems {
p := ce.w[0]
isvar := false
for _, j := range variables {
if i == j {
@ -123,18 +123,18 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
}
}
if isvar {
if ce[0] >= b.minNonVar && b.minNonVar > 0 {
return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", ce[0], b.minNonVar)
if p >= b.minNonVar && b.minNonVar > 0 {
return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
}
if ce[0] > b.varTop {
b.varTop = ce[0]
if p > b.varTop {
b.varTop = p
}
} else if ce[0] > 1 { // 1 is a special primary value reserved for FFFE
if ce[0] <= b.varTop {
return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", ce[0], b.varTop)
} else if p > 1 { // 1 is a special primary value reserved for FFFE
if p <= b.varTop {
return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
}
if b.minNonVar == 0 || ce[0] < b.minNonVar {
b.minNonVar = ce[0]
if b.minNonVar == 0 || p < b.minNonVar {
b.minNonVar = p
}
}
}
@ -142,16 +142,42 @@ func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
if err != nil {
return err
}
cccs := []uint8{}
nfd := norm.NFD.String(str)
for i := range nfd {
cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
}
if len(cccs) < len(elems) {
if len(cccs) > 2 {
return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
}
p := len(elems) - 1
for ; p > 0 && elems[p].w[0] == 0; p-- {
elems[p].ccc = cccs[len(cccs)-1]
}
for ; p >= 0; p-- {
elems[p].ccc = cccs[0]
}
} else {
for i := range elems {
elems[i].ccc = cccs[i]
}
}
// doNorm in collate.go assumes that the following conditions hold.
if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
}
b.root.newEntry(str, elems)
return nil
}
func (t *Tailoring) setAnchor(anchor string) error {
anchor = norm.NFD.String(anchor)
anchor = norm.NFC.String(anchor)
a := t.index.find(anchor)
if a == nil {
a = t.index.newEntry(anchor, nil)
a.implicit = true
a.modified = true
for _, r := range []rune(anchor) {
e := t.index.find(string(r))
e.lock = true
@ -221,7 +247,7 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
if t.anchor == nil {
return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
}
str = norm.NFD.String(str)
str = norm.NFC.String(str)
e := t.index.find(str)
if e == nil {
e = t.index.newEntry(str, nil)
@ -262,12 +288,13 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
}
e.extend = norm.NFD.String(extend)
e.exclude = false
e.modified = true
e.elems = nil
t.anchor = e
return nil
}
func (o *ordering) getWeight(e *entry) [][]int {
func (o *ordering) getWeight(e *entry) []rawCE {
if len(e.elems) == 0 && e.logical == noAnchor {
if e.implicit {
for _, r := range e.runes {
@ -279,11 +306,10 @@ func (o *ordering) getWeight(e *entry) [][]int {
for ; a.elems == nil && !a.implicit; a = a.next {
count[a.level]++
}
e.elems = append([][]int(nil), make([]int, len(a.elems[0])))
copy(e.elems[0], a.elems[0])
e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
for i := collate.Primary; i < collate.Quaternary; i++ {
if count[i] != 0 {
e.elems[0][i] -= count[i]
e.elems[0].w[i] -= count[i]
break
}
}
@ -315,11 +341,11 @@ func (o *ordering) verifyWeights(a, b *entry, level collate.Level) error {
return nil
}
for i := collate.Primary; i < level; i++ {
if a.elems[0][i] < b.elems[0][i] {
if a.elems[0].w[i] < b.elems[0].w[i] {
return nil
}
}
if a.elems[0][level] >= b.elems[0][level] {
if a.elems[0].w[level] >= b.elems[0].w[level] {
err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
log.Println(err)
// TODO: return the error instead, or better, fix the conflicting entry by making room.
@ -339,6 +365,54 @@ func (b *Builder) errorID(locale string, e error) {
}
}
// patchNorm ensures that NFC and NFD counterparts are consistent.
func (o *ordering) patchNorm() {
// Insert the NFD counterparts, if necessary.
for _, e := range o.ordered {
nfd := norm.NFD.String(e.str)
if nfd != e.str {
if e0 := o.find(nfd); e0 != nil && !e0.modified {
e0.elems = e.elems
} else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) {
e := o.newEntry(nfd, e.elems)
e.modified = true
}
}
}
// Update unchanged composed forms if one of their parts changed.
for _, e := range o.ordered {
nfd := norm.NFD.String(e.str)
if e.modified || nfd == e.str {
continue
}
if e0 := o.find(nfd); e0 != nil {
e.elems = e0.elems
} else {
e.elems = o.genColElems(nfd)
if norm.NFD.LastBoundary([]byte(nfd)) == 0 {
r := []rune(nfd)
head := string(r[0])
tail := ""
for i := 1; i < len(r); i++ {
s := norm.NFC.String(head + string(r[i]))
if e0 := o.find(s); e0 != nil && e0.modified {
head = s
} else {
tail += string(r[i])
}
}
e.elems = append(o.genColElems(head), o.genColElems(tail)...)
}
}
}
// Exclude entries for which the individual runes generate the same collation elements.
for _, e := range o.ordered {
if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) {
e.exclude = true
}
}
}
func (b *Builder) buildOrdering(o *ordering) {
for _, e := range o.ordered {
o.getWeight(e)
@ -346,6 +420,7 @@ func (b *Builder) buildOrdering(o *ordering) {
for _, e := range o.ordered {
o.addExtension(e)
}
o.patchNorm()
o.sort()
simplify(o)
b.processExpansions(o) // requires simplify
@ -436,20 +511,20 @@ func (b *Builder) Print(w io.Writer) (n int, err error) {
// reproducibleFromNFKD checks whether the given expansion could be generated
// from an NFKD expansion.
func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool {
func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
// Length must be equal.
if len(exp) != len(nfkd) {
return false
}
for i, ce := range exp {
// Primary and secondary values should be equal.
if ce[0] != nfkd[i][0] || ce[1] != nfkd[i][1] {
if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
return false
}
// Tertiary values should be equal to maxTertiary for third element onwards.
// TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
// simply be dropped. Try this out by dropping the following code.
if i >= 2 && ce[2] != maxTertiary {
if i >= 2 && ce.w[2] != maxTertiary {
return false
}
if _, err := makeCE(ce); err != nil {
@ -469,22 +544,12 @@ func simplify(o *ordering) {
keep[e.runes[0]] = true
}
}
// Remove entries for which the runes normalize (using NFD) to identical values.
for e := o.front(); e != nil; e, _ = e.nextIndexed() {
s := e.str
nfd := norm.NFD.String(s)
if len(e.runes) > 1 || keep[e.runes[0]] || nfd == s {
continue
}
if equalCEArrays(o.genColElems(nfd), e.elems) {
e.remove()
}
}
// Tag entries for which the runes NFKD decompose to identical values.
for e := o.front(); e != nil; e, _ = e.nextIndexed() {
s := e.str
nfkd := norm.NFKD.String(s)
if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == s {
nfd := norm.NFD.String(s)
if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
continue
}
if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
@ -589,18 +654,18 @@ func (b *Builder) processContractions(o *ordering) {
// Bucket sort entries in index order.
es := make([]*entry, len(l))
for _, e := range l {
var o, sn int
var p, sn int
if len(e.runes) > 1 {
str := []byte(string(e.runes[1:]))
o, sn = t.contractTries.lookup(handle, str)
p, sn = t.contractTries.lookup(handle, str)
if sn != len(str) {
log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", e.runes, sn, len(str))
log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
}
}
if es[o] != nil {
log.Fatalf("Multiple contractions for position %d for rune %U", o, e.runes[0])
if es[p] != nil {
log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
}
es[o] = e
es[p] = e
}
// Create collation elements for contractions.
elems := []uint32{}

View File

@ -7,48 +7,64 @@ package build
import "testing"
// cjk returns an implicit collation element for a CJK rune.
func cjk(r rune) [][]int {
func cjk(r rune) []rawCE {
// A CJK character C is represented in the DUCET as
// [.AAAA.0020.0002.C][.BBBB.0000.0000.C]
// Where AAAA is the most significant 15 bits plus a base value.
// Any base value will work for the test, so we pick the common value of FB40.
const base = 0xFB40
return [][]int{
{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)},
{int(r&0x7FFF) | 0x8000, 0, 0, int(r)},
return []rawCE{
{w: []int{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)}},
{w: []int{int(r&0x7FFF) | 0x8000, 0, 0, int(r)}},
}
}
func pCE(p int) [][]int {
return [][]int{{p, defaultSecondary, defaultTertiary, 0}}
func pCE(p int) []rawCE {
return mkCE([]int{p, defaultSecondary, defaultTertiary, 0}, 0)
}
func pqCE(p, q int) [][]int {
return [][]int{{p, defaultSecondary, defaultTertiary, q}}
func pqCE(p, q int) []rawCE {
return mkCE([]int{p, defaultSecondary, defaultTertiary, q}, 0)
}
func ptCE(p, t int) [][]int {
return [][]int{{p, defaultSecondary, t, 0}}
func ptCE(p, t int) []rawCE {
return mkCE([]int{p, defaultSecondary, t, 0}, 0)
}
func sCE(s int) [][]int {
return [][]int{{0, s, defaultTertiary, 0}}
func ptcCE(p, t int, ccc uint8) []rawCE {
return mkCE([]int{p, defaultSecondary, t, 0}, ccc)
}
func stCE(s, t int) [][]int {
return [][]int{{0, s, t, 0}}
func sCE(s int) []rawCE {
return mkCE([]int{0, s, defaultTertiary, 0}, 0)
}
func stCE(s, t int) []rawCE {
return mkCE([]int{0, s, t, 0}, 0)
}
func scCE(s int, ccc uint8) []rawCE {
return mkCE([]int{0, s, defaultTertiary, 0}, ccc)
}
func mkCE(w []int, ccc uint8) []rawCE {
return []rawCE{rawCE{w, ccc}}
}
// ducetElem is used to define test data that is used to generate a table.
type ducetElem struct {
str string
ces [][]int
ces []rawCE
}
func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
b := NewBuilder()
for _, e := range ducet {
if err := b.Add([]rune(e.str), e.ces, nil); err != nil {
ces := [][]int{}
for _, ce := range e.ces {
ces = append(ces, ce.w)
}
if err := b.Add([]rune(e.str), ces, nil); err != nil {
t.Errorf(err.Error())
}
}
@ -58,7 +74,7 @@ func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
}
type convertTest struct {
in, out [][]int
in, out []rawCE
err bool
}
@ -173,16 +189,18 @@ func TestSimplify(t *testing.T) {
}
var expandTest = []ducetElem{
{"\u00C0", append(ptCE(100, 8), sCE(30)...)},
{"\u00C8", append(ptCE(105, 8), sCE(30)...)},
{"\u00C9", append(ptCE(105, 8), sCE(30)...)}, // identical expansion
{"\u0300", append(scCE(29, 230), scCE(30, 230)...)},
{"\u00C0", append(ptCE(100, 8), scCE(30, 230)...)},
{"\u00C8", append(ptCE(105, 8), scCE(30, 230)...)},
{"\u00C9", append(ptCE(105, 8), scCE(30, 230)...)}, // identical expansion
{"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0], ptCE(200, 4)[0])},
{"\u01FF", append(ptCE(200, 4), ptcCE(201, 4, 0)[0], scCE(30, 230)[0])},
}
func TestExpand(t *testing.T) {
const (
totalExpansions = 3
totalElements = 2 + 2 + 3 + totalExpansions
totalExpansions = 5
totalElements = 2 + 2 + 2 + 3 + 3 + totalExpansions
)
b := newBuilder(t, expandTest)
o := &b.root

View File

@ -16,6 +16,17 @@ const (
maxTertiary = 0x1F
)
type rawCE struct {
w []int
ccc uint8
}
func makeRawCE(w []int, ccc uint8) rawCE {
ce := rawCE{w: make([]int, 4), ccc: ccc}
copy(ce.w, w)
return ce
}
// A collation element is represented as an uint32.
// In the typical case, a rune maps to a single collation element. If a rune
// can be the start of a contraction or expands into multiple collation elements,
@ -29,29 +40,36 @@ const (
// 01pppppp pppppppp ppppppp0 ssssssss
// - p* is primary collation value
// - s* is the secondary collation value
// or
// 00pppppp pppppppp ppppppps sssttttt, where
// - p* is primary collation value
// - s* offset of secondary from default value.
// - t* is the tertiary collation value
// 100ttttt cccccccc pppppppp pppppppp
// - t* is the tertiar collation value
// - c* is the cannonical combining class
// - p* is the primary collation value
// Collation elements with a secondary value are of the form
// 10000000 0000ssss ssssssss tttttttt, where
// - 16 BMP implicit -> weight
// - 8 bit s
// - default tertiary
// 1010cccc ccccssss ssssssss tttttttt, where
// - c* is the canonical combining class
// - s* is the secondary collation value
// - t* is the tertiary collation value
const (
maxPrimaryBits = 21
maxPrimaryCompactBits = 16
maxSecondaryBits = 12
maxSecondaryCompactBits = 8
maxCCCBits = 8
maxSecondaryDiffBits = 4
maxTertiaryBits = 8
maxTertiaryCompactBits = 5
isSecondary = 0x80000000
isPrimary = 0x40000000
isPrimary = 0x40000000
isPrimaryCCC = 0x80000000
isSecondary = 0xA0000000
)
func makeCE(weights []int) (uint32, error) {
func makeCE(rce rawCE) (uint32, error) {
weights := rce.w
if w := weights[0]; w >= 1<<maxPrimaryBits || w < 0 {
return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
}
@ -63,14 +81,25 @@ func makeCE(weights []int) (uint32, error) {
}
ce := uint32(0)
if weights[0] != 0 {
if weights[2] == defaultTertiary {
if rce.ccc != 0 {
if weights[0] >= 1<<maxPrimaryCompactBits {
return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", weights[0], 1<<maxPrimaryCompactBits)
}
if weights[1] != defaultSecondary {
return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", weights[1], rce.ccc)
}
ce = uint32(weights[2] << (maxPrimaryCompactBits + maxCCCBits))
ce |= uint32(rce.ccc) << maxPrimaryCompactBits
ce |= uint32(weights[0])
ce |= isPrimaryCCC
} else if weights[2] == defaultTertiary {
if weights[1] >= 1<<maxSecondaryCompactBits {
return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights[1], 1<<maxSecondaryCompactBits)
}
ce = uint32(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
ce |= isPrimary
} else {
d := weights[1] - defaultSecondary + 4
d := weights[1] - defaultSecondary + maxSecondaryDiffBits
if d >= 1<<maxSecondaryDiffBits || d < 0 {
return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
}
@ -82,6 +111,7 @@ func makeCE(weights []int) (uint32, error) {
}
} else {
ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
ce += uint32(rce.ccc) << (maxSecondaryBits + maxTertiaryBits)
ce |= isSecondary
}
return ce, nil
@ -207,7 +237,7 @@ func implicitPrimary(r rune) int {
// We will rewrite these characters to a single CE.
// We assume the CJK values start at 0x8000.
// See http://unicode.org/reports/tr10/#Implicit_Weights
func convertLargeWeights(elems [][]int) (res [][]int, err error) {
func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
const (
cjkPrimaryStart = 0xFB40
rarePrimaryStart = 0xFB80
@ -219,7 +249,7 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
shiftBits = 15
)
for i := 0; i < len(elems); i++ {
ce := elems[i]
ce := elems[i].w
p := ce[0]
if p < cjkPrimaryStart {
continue
@ -233,10 +263,10 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
if i+1 >= len(elems) {
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
}
if elems[i+1][0]&lowBitsFlag == 0 {
if elems[i+1].w[0]&lowBitsFlag == 0 {
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
}
np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask
np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
switch {
case p < rarePrimaryStart:
np += commonUnifiedOffset
@ -257,26 +287,25 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
// nextWeight computes the first possible collation weights following elems
// for the given level.
func nextWeight(level collate.Level, elems [][]int) [][]int {
func nextWeight(level collate.Level, elems []rawCE) []rawCE {
if level == collate.Identity {
next := make([][]int, len(elems))
next := make([]rawCE, len(elems))
copy(next, elems)
return next
}
next := [][]int{make([]int, len(elems[0]))}
copy(next[0], elems[0])
next[0][level]++
next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
next[0].w[level]++
if level < collate.Secondary {
next[0][collate.Secondary] = defaultSecondary
next[0].w[collate.Secondary] = defaultSecondary
}
if level < collate.Tertiary {
next[0][collate.Tertiary] = defaultTertiary
next[0].w[collate.Tertiary] = defaultTertiary
}
// Filter entries that cannot influence ordering.
for _, ce := range elems[1:] {
skip := true
for i := collate.Primary; i < level; i++ {
skip = skip && ce[i] == 0
skip = skip && ce.w[i] == 0
}
if !skip {
next = append(next, ce)
@ -285,18 +314,18 @@ func nextWeight(level collate.Level, elems [][]int) [][]int {
return next
}
func nextVal(elems [][]int, i int, level collate.Level) (index, value int) {
for ; i < len(elems) && elems[i][level] == 0; i++ {
func nextVal(elems []rawCE, i int, level collate.Level) (index, value int) {
for ; i < len(elems) && elems[i].w[level] == 0; i++ {
}
if i < len(elems) {
return i, elems[i][level]
return i, elems[i].w[level]
}
return i, 0
}
// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
// It also returns the collation level at which the difference is found.
func compareWeights(a, b [][]int) (result int, level collate.Level) {
func compareWeights(a, b []rawCE) (result int, level collate.Level) {
for level := collate.Primary; level < collate.Identity; level++ {
var va, vb int
for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
@ -314,19 +343,16 @@ func compareWeights(a, b [][]int) (result int, level collate.Level) {
return 0, collate.Identity
}
func equalCE(a, b []int) bool {
if len(a) != len(b) {
return false
}
func equalCE(a, b rawCE) bool {
for i := 0; i < 3; i++ {
if b[i] != a[i] {
if b.w[i] != a.w[i] {
return false
}
}
return true
}
func equalCEArrays(a, b [][]int) bool {
func equalCEArrays(a, b []rawCE) bool {
if len(a) != len(b) {
return false
}

View File

@ -16,7 +16,7 @@ type ceTest struct {
}
func normalCE(in []int) (ce uint32, err error) {
return makeCE(in)
return makeCE(rawCE{w: in[:3], ccc: uint8(in[3])})
}
func expandCE(in []int) (ce uint32, err error) {
@ -32,17 +32,20 @@ func decompCE(in []int) (ce uint32, err error) {
}
var ceTests = []ceTest{
{normalCE, []int{0, 0, 0}, 0x80000000},
{normalCE, []int{0, 0x28, 3}, 0x80002803},
{normalCE, []int{100, defaultSecondary, 3}, 0x0000C883},
{normalCE, []int{0, 0, 0, 0}, 0xA0000000},
{normalCE, []int{0, 0x28, 3, 0}, 0xA0002803},
{normalCE, []int{0, 0x28, 3, 0xFF}, 0xAFF02803},
{normalCE, []int{100, defaultSecondary, 3, 0}, 0x0000C883},
// non-ignorable primary with non-default secondary
{normalCE, []int{100, 0x28, defaultTertiary}, 0x4000C828},
{normalCE, []int{100, defaultSecondary + 8, 3}, 0x0000C983},
{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-supported secondary
{normalCE, []int{100, 1, 3}, 0xFFFF},
{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF},
{normalCE, []int{0, 1 << maxSecondaryBits, 0}, 0xFFFF},
{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits}, 0xFFFF},
{normalCE, []int{100, 0x28, defaultTertiary, 0}, 0x4000C828},
{normalCE, []int{100, defaultSecondary + 8, 3, 0}, 0x0000C983},
{normalCE, []int{100, 0, 3, 0}, 0xFFFF}, // non-ignorable primary with non-supported secondary
{normalCE, []int{100, 1, 3, 0}, 0xFFFF},
{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0, 0}, 0xFFFF},
{normalCE, []int{0, 1 << maxSecondaryBits, 0, 0}, 0xFFFF},
{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits, 0}, 0xFFFF},
{normalCE, []int{0x123, defaultSecondary, 8, 0xFF}, 0x88FF0123},
{normalCE, []int{0x123, defaultSecondary + 1, 8, 0xFF}, 0xFFFF},
{contractCE, []int{0, 0, 0}, 0xC0000000},
{contractCE, []int{1, 1, 1}, 0xC0010011},
@ -85,6 +88,14 @@ func TestColElem(t *testing.T) {
}
}
func mkRawCES(in [][]int) []rawCE {
out := []rawCE{}
for _, w := range in {
out = append(out, rawCE{w: w})
}
return out
}
type weightsTest struct {
a, b [][]int
level collate.Level
@ -119,8 +130,8 @@ var extra = [][]int{{200, 32, 8, 0}, {0, 32, 8, 0}, {0, 0, 8, 0}, {0, 0, 0, 0}}
func TestNextWeight(t *testing.T) {
for i, tt := range nextWeightTests {
test := func(l collate.Level, tt weightsTest, a, gold [][]int) {
res := nextWeight(tt.level, a)
if !equalCEArrays(gold, res) {
res := nextWeight(tt.level, mkRawCES(a))
if !equalCEArrays(mkRawCES(gold), res) {
t.Errorf("%d:%d: expected weights %d; found %d", i, l, gold, res)
}
}
@ -189,7 +200,7 @@ var compareTests = []weightsTest{
func TestCompareWeights(t *testing.T) {
for i, tt := range compareTests {
test := func(tt weightsTest, a, b [][]int) {
res, level := compareWeights(a, b)
res, level := compareWeights(mkRawCES(a), mkRawCES(b))
if res != tt.result {
t.Errorf("%d: expected comparisson result %d; found %d", i, tt.result, res)
}

View File

@ -6,6 +6,7 @@ package build
import (
"exp/locale/collate"
"exp/norm"
"fmt"
"log"
"sort"
@ -28,7 +29,7 @@ const (
type entry struct {
str string // same as string(runes)
runes []rune
elems [][]int // the collation elements
elems []rawCE // the collation elements
extend string // weights of extend to be appended to elems
before bool // weights relative to next instead of previous.
lock bool // entry is used in extension and can no longer be moved.
@ -41,6 +42,7 @@ type entry struct {
decompose bool // can use NFKD decomposition to generate elems
exclude bool // do not include in table
implicit bool // derived, is not included in the list
modified bool // entry was modified in tailoring
logical logicalAnchor
expansionIndex int // used to store index into expansion table
@ -162,10 +164,10 @@ func (e *entry) encode() (ce uint32, err error) {
}
switch {
case e.decompose:
t1 := e.elems[0][2]
t1 := e.elems[0].w[2]
t2 := 0
if len(e.elems) > 1 {
t2 = e.elems[1][2]
t2 = e.elems[1].w[2]
}
ce, err = makeDecompose(t1, t2)
case e.contractionStarter():
@ -231,7 +233,7 @@ func (o *ordering) insert(e *entry) {
// newEntry creates a new entry for the given info and inserts it into
// the index.
func (o *ordering) newEntry(s string, ces [][]int) *entry {
func (o *ordering) newEntry(s string, ces []rawCE) *entry {
e := &entry{
runes: []rune(s),
elems: ces,
@ -249,14 +251,29 @@ func (o *ordering) find(str string) *entry {
if e == nil {
r := []rune(str)
if len(r) == 1 {
e = o.newEntry(string(r[0]), [][]int{
{
implicitPrimary(r[0]),
defaultSecondary,
defaultTertiary,
int(r[0]),
},
})
const (
firstHangul = 0xAC00
lastHangul = 0xD7A3
)
if r[0] >= firstHangul && r[0] <= lastHangul {
ce := []rawCE{}
nfd := norm.NFD.String(str)
for _, r := range nfd {
ce = append(ce, o.find(string(r)).elems...)
}
e = o.newEntry(nfd, ce)
} else {
e = o.newEntry(string(r[0]), []rawCE{
{w: []int{
implicitPrimary(r[0]),
defaultSecondary,
defaultTertiary,
int(r[0]),
},
},
})
e.modified = true
}
e.exclude = true // do not index implicits
}
}
@ -275,7 +292,7 @@ func makeRootOrdering() ordering {
}
insert := func(typ logicalAnchor, s string, ce []int) {
e := &entry{
elems: [][]int{ce},
elems: []rawCE{{w: ce}},
str: s,
exclude: true,
logical: typ,
@ -362,10 +379,14 @@ func (o *ordering) sort() {
// genColElems generates a collation element array from the runes in str. This
// assumes that all collation elements have already been added to the Builder.
func (o *ordering) genColElems(str string) [][]int {
elems := [][]int{}
func (o *ordering) genColElems(str string) []rawCE {
elems := []rawCE{}
for _, r := range []rune(str) {
elems = append(elems, o.find(string(r)).elems...)
for _, ce := range o.find(string(r)).elems {
if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 {
elems = append(elems, ce)
}
}
}
return elems
}

View File

@ -20,7 +20,7 @@ type entryTest struct {
// entries plus a leading and trailing anchor.
func makeList(n int) []*entry {
es := make([]*entry, n+2)
weights := [][]int{{100, 20, 5, 0}}
weights := []rawCE{{w: []int{100, 20, 5, 0}}}
for i := range es {
runes := []rune{rune(i)}
es[i] = &entry{
@ -176,8 +176,8 @@ type entryLessTest struct {
}
var (
w1 = [][]int{{100, 20, 5, 5}}
w2 = [][]int{{101, 20, 5, 5}}
w1 = []rawCE{{w: []int{100, 20, 5, 5}}}
w2 = []rawCE{{w: []int{101, 20, 5, 5}}}
)
var entryLessTests = []entryLessTest{

View File

@ -23,7 +23,7 @@ const (
type colElem uint32
const (
maxCE colElem = 0x80FFFFFF
maxCE colElem = 0xAFFFFFFF
minContract = 0xC0000000
maxContract = 0xDFFFFFFF
minExpand = 0xE0000000
@ -62,30 +62,37 @@ func (ce colElem) ctype() ceType {
// 01pppppp pppppppp ppppppp0 ssssssss
// - p* is primary collation value
// - s* is the secondary collation value
// or
// 00pppppp pppppppp ppppppps sssttttt, where
// - p* is primary collation value
// - s* offset of secondary from default value.
// - t* is the tertiary collation value
// 100ttttt cccccccc pppppppp pppppppp
// - t* is the tertiar collation value
// - c* is the cannonical combining class
// - p* is the primary collation value
// Collation elements with a secondary value are of the form
// 10000000 0000ssss ssssssss tttttttt, where
// - 16 BMP implicit -> weight
// - 8 bit s
// - default tertiary
// 1010cccc ccccssss ssssssss tttttttt, where
// - c* is the canonical combining class
// - s* is the secondary collation value
// - t* is the tertiary collation value
// 11qqqqqq qqqqqqqq qqqqqqq0 00000000
// - q* quaternary value
const (
ceTypeMask = 0xC0000000
ceTypeMaskExt = 0xE0000000
ceType1 = 0x40000000
ceType2 = 0x00000000
ceType3 = 0x80000000
ceType3or4 = 0x80000000
ceType4 = 0xA0000000
ceTypeQ = 0xC0000000
ceIgnore = ceType3
ceIgnore = ceType4
firstNonPrimary = 0x80000000
lastSpecialPrimary = 0xA0000000
secondaryMask = 0x80000000
hasTertiaryMask = 0x40000000
primaryValueMask = 0x3FFFFE00
primaryShift = 9
compactPrimaryBits = 16
compactSecondaryShift = 5
minCompactSecondary = defaultSecondary - 4
)
@ -98,9 +105,22 @@ func makeQuaternary(primary int) colElem {
return ceTypeQ | colElem(primary<<primaryShift)
}
func (ce colElem) ccc() uint8 {
if ce&ceType3or4 != 0 {
if ce&ceType4 == ceType3or4 {
return uint8(ce >> 16)
}
return uint8(ce >> 20)
}
return 0
}
func (ce colElem) primary() int {
if ce >= firstNonPrimary {
return 0
if ce > lastSpecialPrimary {
return 0
}
return int(uint16(ce))
}
return int(ce&primaryValueMask) >> primaryShift
}
@ -111,8 +131,11 @@ func (ce colElem) secondary() int {
return int(uint8(ce))
case ceType2:
return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
case ceType3:
return int(uint16(ce >> 8))
case ceType3or4:
if ce < ceType4 {
return defaultSecondary
}
return int(ce>>8) & 0xFFF
case ceTypeQ:
return 0
}
@ -121,10 +144,13 @@ func (ce colElem) secondary() int {
func (ce colElem) tertiary() uint8 {
if ce&hasTertiaryMask == 0 {
if ce&ceType3 == 0 {
if ce&ceType3or4 == 0 {
return uint8(ce & 0x1F)
}
return uint8(ce)
if ce&ceType4 == ceType4 {
return uint8(ce)
}
return uint8(ce>>24) & 0x1F // type 2
} else if ce&ceTypeMask == ceType1 {
return defaultTertiary
}
@ -134,10 +160,15 @@ func (ce colElem) tertiary() uint8 {
func (ce colElem) updateTertiary(t uint8) colElem {
if ce&ceTypeMask == ceType1 {
// convert to type 4
nce := ce & primaryValueMask
nce |= colElem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
ce = nce
} else if ce&ceTypeMaskExt == ceType3or4 {
ce &= ^colElem(maxTertiary << 24)
return ce | (colElem(t) << 24)
} else {
// type 2 or 4
ce &= ^colElem(maxTertiary)
}
return ce | colElem(t)

View File

@ -23,12 +23,19 @@ func makeCE(weights []int) colElem {
maxSecondaryDiffBits = 4
maxTertiaryBits = 8
maxTertiaryCompactBits = 5
isSecondary = 0x80000000
isPrimary = 0x40000000
isPrimaryCCC = 0x80000000
isSecondary = 0xA0000000
)
var ce colElem
ccc := weights[3]
if weights[0] != 0 {
if weights[2] == defaultTertiary {
if ccc != 0 {
ce = colElem(weights[2] << 24)
ce |= colElem(ccc) << 16
ce |= colElem(weights[0])
ce |= isPrimaryCCC
} else if weights[2] == defaultTertiary {
ce = colElem(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
ce |= isPrimary
} else {
@ -38,6 +45,7 @@ func makeCE(weights []int) colElem {
}
} else {
ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
ce += colElem(ccc) << 20
ce |= isSecondary
}
return ce
@ -68,10 +76,11 @@ func makeDecompose(t1, t2 int) colElem {
}
func normalCE(inout []int) (ce colElem, t ceType) {
w := makeCE(inout)
inout[0] = w.primary()
inout[1] = w.secondary()
inout[2] = int(w.tertiary())
ce = makeCE(inout)
inout[0] = ce.primary()
inout[1] = ce.secondary()
inout[2] = int(ce.tertiary())
inout[3] = int(ce.ccc())
return ce, ceNormal
}
@ -102,9 +111,13 @@ const (
)
var ceTests = []ceTest{
{normalCE, []int{0, 0, 0}},
{normalCE, []int{0, 30, 3}},
{normalCE, []int{100, defaultSecondary, 3}},
{normalCE, []int{0, 0, 0, 0}},
{normalCE, []int{0, 30, 3, 0}},
{normalCE, []int{0, 30, 3, 0xFF}},
{normalCE, []int{100, defaultSecondary, defaultTertiary, 0}},
{normalCE, []int{100, defaultSecondary, defaultTertiary, 0xFF}},
{normalCE, []int{100, defaultSecondary, 3, 0}},
{normalCE, []int{0x123, defaultSecondary, 8, 0xFF}},
{contractCE, []int{0, 0, 0}},
{contractCE, []int{1, 1, 1}},
@ -127,11 +140,11 @@ func TestColElem(t *testing.T) {
copy(inout, tt.arg)
ce, typ := tt.f(inout)
if ce.ctype() != typ {
t.Errorf("%d: type is %d; want %d", i, ce.ctype(), typ)
t.Errorf("%d: type is %d; want %d (ColElem: %X)", i, ce.ctype(), typ, ce)
}
for j, a := range tt.arg {
if inout[j] != a {
t.Errorf("%d: argument %d is %X; want %X", i, j, inout[j], a)
t.Errorf("%d: argument %d is %X; want %X (ColElem: %X)", i, j, inout[j], a, ce)
}
}
}
@ -176,7 +189,8 @@ func TestUpdateTertiary(t *testing.T) {
{0x4000FE20, 0x0000FE8A, 0x0A},
{0x4000FE21, 0x0000FEAA, 0x0A},
{0x0000FE8B, 0x0000FE83, 0x03},
{0x8000CC02, 0x8000CC1B, 0x1B},
{0x82FF0188, 0x9BFF0188, 0x1B},
{0xAFF0CC02, 0xAFF0CC1B, 0x1B},
}
for i, tt := range tests {
if out := tt.in.updateTertiary(tt.t); out != tt.out {
@ -184,3 +198,77 @@ func TestUpdateTertiary(t *testing.T) {
}
}
}
func TestDoNorm(t *testing.T) {
const div = -1 // The insertion point of the next block.
tests := []struct {
in, out []int
}{
{in: []int{4, div, 3},
out: []int{3, 4},
},
{in: []int{4, div, 3, 3, 3},
out: []int{3, 3, 3, 4},
},
{in: []int{0, 4, div, 3},
out: []int{0, 3, 4},
},
{in: []int{0, 0, 4, 5, div, 3, 3},
out: []int{0, 0, 3, 3, 4, 5},
},
{in: []int{0, 0, 1, 4, 5, div, 3, 3},
out: []int{0, 0, 1, 3, 3, 4, 5},
},
{in: []int{0, 0, 1, 4, 5, div, 4, 4},
out: []int{0, 0, 1, 4, 4, 4, 5},
},
}
for j, tt := range tests {
i := iter{}
var w, p, s int
for k, cc := range tt.in {
if cc == 0 {
s = 0
}
if cc == div {
w = 100
p = k
i.pStarter = s
continue
}
i.ce = append(i.ce, makeCE([]int{w, 20, 2, cc}))
}
i.prevCCC = i.ce[p-1].ccc()
i.doNorm(p, i.ce[p].ccc())
if len(i.ce) != len(tt.out) {
t.Errorf("%d: length was %d; want %d", j, len(i.ce), len(tt.out))
}
prevCCC := uint8(0)
for k, ce := range i.ce {
if int(ce.ccc()) != tt.out[k] {
t.Errorf("%d:%d: unexpected CCC. Was %d; want %d", j, k, ce.ccc(), tt.out[k])
}
if k > 0 && ce.ccc() == prevCCC && i.ce[k-1].primary() > ce.primary() {
t.Errorf("%d:%d: normalization crossed across CCC boundary.", j, k)
}
}
}
// test cutoff of large sequence of combining characters.
result := []uint8{8, 8, 8, 5, 5}
for o := -2; o <= 2; o++ {
i := iter{pStarter: 2, prevCCC: 8}
n := maxCombiningCharacters + 1 + o
for j := 1; j < n+i.pStarter; j++ {
i.ce = append(i.ce, makeCE([]int{100, 20, 2, 8}))
}
p := len(i.ce)
i.ce = append(i.ce, makeCE([]int{0, 20, 2, 5}))
i.doNorm(p, 5)
if i.prevCCC != result[o+2] {
t.Errorf("%d: i.prevCCC was %d; want %d", n, i.prevCCC, result[o+2])
}
if result[o+2] == 5 && i.pStarter != p {
t.Errorf("%d: i.pStarter was %d; want %d", n, i.pStarter, p)
}
}
}

View File

@ -10,6 +10,7 @@ package collate
import (
"bytes"
"exp/norm"
"unicode/utf8"
)
// Level identifies the collation comparison level.
@ -112,7 +113,7 @@ func New(loc string) *Collator {
func newCollator(t *table) *Collator {
c := &Collator{
Strength: Quaternary,
Strength: Tertiary,
f: norm.NFD,
t: t,
}
@ -269,8 +270,7 @@ func (c *Collator) key(buf *Buffer, w []colElem) []byte {
func (c *Collator) getColElems(str []byte) []colElem {
i := c.iter(0)
i.setInput(c, str)
for !i.done() {
i.next()
for i.next() {
}
return i.ce
}
@ -278,88 +278,185 @@ func (c *Collator) getColElems(str []byte) []colElem {
func (c *Collator) getColElemsString(str string) []colElem {
i := c.iter(0)
i.setInputString(c, str)
for !i.done() {
i.next()
for i.next() {
}
return i.ce
}
type source struct {
str string
bytes []byte
buf [16]byte // Used for decomposing Hangul.
}
func (src *source) done() bool {
return len(src.str) == 0 && len(src.bytes) == 0
}
func (src *source) tail(n int) (res source) {
if src.bytes == nil {
res.str = src.str[n:]
} else {
res.bytes = src.bytes[n:]
}
return res
}
func (src *source) nfd(end int) []byte {
if src.bytes == nil {
return norm.NFD.AppendString(src.buf[:0], src.str[:end])
}
return norm.NFD.Append(src.buf[:0], src.bytes[:end]...)
}
func (src *source) properties(f norm.Form) norm.Properties {
if src.bytes == nil {
return f.PropertiesString(src.str)
}
return f.Properties(src.bytes)
}
func (src *source) lookup(t *table) (ce colElem, sz int) {
if src.bytes == nil {
return t.index.lookupString(src.str)
}
return t.index.lookup(src.bytes)
}
func (src *source) rune() (r rune, sz int) {
if src.bytes == nil {
return utf8.DecodeRuneInString(src.str)
}
return utf8.DecodeRune(src.bytes)
}
type iter struct {
src norm.Iter
norm [1024]byte
buf []byte
p int
minBufSize int
src source
wa [512]colElem
ce []colElem
pce int
nce int // nce <= len(nce)
t *table
_done, eof bool
prevCCC uint8
pStarter int
t *table
}
func (i *iter) init(c *Collator) {
i.t = c.t
i.minBufSize = c.t.maxContractLen
i.ce = i.wa[:0]
i.buf = i.norm[:0]
}
func (i *iter) reset() {
i.ce = i.ce[:0]
i.buf = i.buf[:0]
i.p = 0
i.eof = i.src.Done()
i._done = i.eof
i.nce = 0
i.prevCCC = 0
i.pStarter = 0
}
func (i *iter) setInput(c *Collator, s []byte) *iter {
i.src.SetInput(c.f, s)
i.src.bytes = s
i.src.str = ""
i.reset()
return i
}
func (i *iter) setInputString(c *Collator, s string) *iter {
i.src.SetInputString(c.f, s)
i.src.str = s
i.src.bytes = nil
i.reset()
return i
}
func (i *iter) done() bool {
return i._done
// next appends colElems to the internal array until it adds an element with CCC=0.
// In the majority of cases, a colElem with a primary value > 0 will have
// a CCC of 0. The CCC values of colation elements are also used to detect if the
// input string was not normalized and to adjust the result accordingly.
func (i *iter) next() bool {
sz := 0
for !i.src.done() {
p0 := len(i.ce)
i.ce, sz = i.t.appendNext(i.ce, i.src)
i.src = i.src.tail(sz)
last := len(i.ce) - 1
if ccc := i.ce[last].ccc(); ccc == 0 {
i.nce = len(i.ce)
i.pStarter = last
i.prevCCC = 0
return true
} else if p0 < last && i.ce[p0].ccc() == 0 {
// set i.nce to only cover part of i.ce for which ccc == 0 and
// use rest the next call to next.
for p0++; p0 < last && i.ce[p0].ccc() == 0; p0++ {
}
i.nce = p0
i.pStarter = p0 - 1
i.prevCCC = ccc
return true
} else if ccc < i.prevCCC {
i.doNorm(p0, ccc) // should be rare for most common cases
} else {
i.prevCCC = ccc
}
}
if len(i.ce) != i.nce {
i.nce = len(i.ce)
return true
}
return false
}
func (i *iter) next() {
if !i.eof && len(i.buf)-i.p < i.minBufSize {
// replenish buffer
n := copy(i.buf, i.buf[i.p:])
n += i.src.Next(i.buf[n:cap(i.buf)])
i.buf = i.buf[:n]
i.p = 0
i.eof = i.src.Done()
}
if i.p == len(i.buf) {
i._done = true
return
// nextPlain is the same as next, but does not "normalize" the collation
// elements.
// TODO: remove this function. Using this instead of next does not seem
// to improve performance in any significant way. We retain this until
// later for evaluation purposes.
func (i *iter) nextPlain() bool {
if i.src.done() {
return false
}
sz := 0
i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:])
i.p += sz
i.ce, sz = i.t.appendNext(i.ce, i.src)
i.src = i.src.tail(sz)
i.nce = len(i.ce)
return true
}
const maxCombiningCharacters = 30
// doNorm reorders the collation elements in i.ce.
// It assumes that blocks of collation elements added with appendNext
// either start and end with the same CCC or start with CCC == 0.
// This allows for a single insertion point for the entire block.
// The correctness of this assumption is verified in builder.go.
func (i *iter) doNorm(p int, ccc uint8) {
if p-i.pStarter > maxCombiningCharacters {
i.prevCCC = i.ce[len(i.ce)-1].ccc()
i.pStarter = len(i.ce) - 1
return
}
n := len(i.ce)
k := p
for p--; p > i.pStarter && ccc < i.ce[p-1].ccc(); p-- {
}
i.ce = append(i.ce, i.ce[p:k]...)
copy(i.ce[p:], i.ce[k:])
i.ce = i.ce[:n]
}
func (i *iter) nextPrimary() int {
for {
for ; i.pce < len(i.ce); i.pce++ {
for ; i.pce < i.nce; i.pce++ {
if v := i.ce[i.pce].primary(); v != 0 {
i.pce++
return v
}
}
if i.done() {
if !i.next() {
return 0
}
i.next()
}
panic("should not reach here")
}

View File

@ -378,6 +378,7 @@ var keyTests = []keyTest{
func TestKey(t *testing.T) {
c, _ := makeTable(appendNextTests[4].in)
c.Alternate = collate.AltShifted
c.Strength = collate.Quaternary
buf := collate.Buffer{}
keys1 := [][]byte{}
keys2 := [][]byte{}

View File

@ -27,8 +27,21 @@ type ctScanner struct {
done bool
}
type ctScannerString struct {
states contractTrieSet
s string
n int
index int
pindex int
done bool
}
func (t contractTrieSet) scanner(index, n int, b []byte) ctScanner {
return ctScanner{states: t[index:], s: b, n: n}
return ctScanner{s: b, states: t[index:], n: n}
}
func (t contractTrieSet) scannerString(index, n int, str string) ctScannerString {
return ctScannerString{s: str, states: t[index:], n: n}
}
// result returns the offset i and bytes consumed p so far. If no suffix
@ -37,6 +50,10 @@ func (s *ctScanner) result() (i, p int) {
return s.index, s.pindex
}
func (s *ctScannerString) result() (i, p int) {
return s.index, s.pindex
}
const (
final = 0
noIndex = 0xFF
@ -84,3 +101,45 @@ func (s *ctScanner) scan(p int) int {
}
return pr
}
// scan is a verbatim copy of ctScanner.scan.
func (s *ctScannerString) scan(p int) int {
pr := p // the p at the rune start
str := s.s
states, n := s.states, s.n
for i := 0; i < n && p < len(str); {
e := states[i]
c := str[p]
// TODO: a significant number of contractions are of a form that
// cannot match discontiguous UTF-8 in a normalized string. We could let
// a negative value of e.n mean that we can set s.done = true and avoid
// the need for additional matches.
if c >= e.l {
if e.l == c {
p++
if e.i != noIndex {
s.index = int(e.i)
s.pindex = p
}
if e.n != final {
i, states, n = 0, states[int(e.h)+n:], int(e.n)
if p >= len(str) || utf8.RuneStart(str[p]) {
s.states, s.n, pr = states, n, p
}
} else {
s.done = true
return p
}
continue
} else if e.n == final && c <= e.h {
p++
s.done = true
s.index = int(c-e.l) + int(e.i)
s.pindex = p
return p
}
}
i++
}
return pr
}

View File

@ -30,7 +30,7 @@ func W(ce ...int) Weights {
return w
}
func (w Weights) String() string {
return fmt.Sprintf("[%d.%d.%d.%d]", w.Primary, w.Secondary, w.Tertiary, w.Quaternary)
return fmt.Sprintf("[%X.%X.%X.%X]", w.Primary, w.Secondary, w.Tertiary, w.Quaternary)
}
type Table struct {
@ -52,7 +52,7 @@ func convertToWeights(ws []colElem) []Weights {
func convertFromWeights(ws []Weights) []colElem {
out := make([]colElem, len(ws))
for i, w := range ws {
out[i] = makeCE([]int{w.Primary, w.Secondary, w.Tertiary})
out[i] = makeCE([]int{w.Primary, w.Secondary, w.Tertiary, 0})
if out[i] == ceIgnore && w.Quaternary > 0 {
out[i] = makeQuaternary(w.Quaternary)
}
@ -61,7 +61,7 @@ func convertFromWeights(ws []Weights) []colElem {
}
func (t *Table) AppendNext(s []byte) ([]Weights, int) {
w, n := t.t.appendNext(nil, s)
w, n := t.t.appendNext(nil, source{bytes: s})
return convertToWeights(w), n
}

View File

@ -42,13 +42,26 @@ func (t *table) indexedTable(idx tableIndex) *table {
// sequence of runes, the weights for the interstitial runes are
// appended as well. It returns a new slice that includes the appended
// weights and the number of bytes consumed from s.
func (t *table) appendNext(w []colElem, s []byte) ([]colElem, int) {
v, sz := t.index.lookup(s)
ce := colElem(v)
func (t *table) appendNext(w []colElem, src source) (res []colElem, n int) {
ce, sz := src.lookup(t)
tp := ce.ctype()
if tp == ceNormal {
if ce == 0 {
r, _ := utf8.DecodeRune(s)
r, _ := src.rune()
const (
hangulSize = 3
firstHangul = 0xAC00
lastHangul = 0xD7A3
)
if r >= firstHangul && r <= lastHangul {
// TODO: performance can be considerably improved here.
n = sz
for b := src.nfd(hangulSize); len(b) > 0; b = b[sz:] {
ce, sz = t.index.lookup(b)
w = append(w, ce)
}
return w, n
}
ce = makeImplicitCE(implicitPrimary(r))
}
w = append(w, ce)
@ -56,15 +69,20 @@ func (t *table) appendNext(w []colElem, s []byte) ([]colElem, int) {
w = t.appendExpansion(w, ce)
} else if tp == ceContractionIndex {
n := 0
w, n = t.matchContraction(w, ce, s[sz:])
src = src.tail(sz)
if src.bytes == nil {
w, n = t.matchContractionString(w, ce, src.str)
} else {
w, n = t.matchContraction(w, ce, src.bytes)
}
sz += n
} else if tp == ceDecompose {
// Decompose using NFCK and replace tertiary weights.
// Decompose using NFKD and replace tertiary weights.
t1, t2 := splitDecompose(ce)
i := len(w)
nfkd := norm.NFKD.Properties(s).Decomposition()
nfkd := src.properties(norm.NFKD).Decomposition()
for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
w, p = t.appendNext(w, nfkd)
w, p = t.appendNext(w, source{bytes: nfkd})
}
w[i] = w[i].updateTertiary(t1)
if i++; i < len(w) {
@ -99,16 +117,17 @@ func (t *table) matchContraction(w []colElem, ce colElem, suffix []byte) ([]colE
// By now we should have filtered most cases.
p0 := p
bufn := 0
rune := norm.NFC.Properties(suffix[p:])
rune := norm.NFD.Properties(suffix[p:])
p += rune.Size()
if prevCC := rune.TrailCCC(); prevCC != 0 {
if rune.LeadCCC() != 0 {
prevCC := rune.TrailCCC()
// A gap may only occur in the last normalization segment.
// This also ensures that len(scan.s) < norm.MaxSegmentSize.
if end := norm.NFC.FirstBoundary(suffix[p:]); end != -1 {
if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
scan.s = suffix[:p+end]
}
for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
rune = norm.NFC.Properties(suffix[p:])
rune = norm.NFD.Properties(suffix[p:])
if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
break
}
@ -136,7 +155,65 @@ func (t *table) matchContraction(w []colElem, ce colElem, suffix []byte) ([]colE
}
// Append weights for the runes in the segment not part of the contraction.
for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
w, p = t.appendNext(w, b)
w, p = t.appendNext(w, source{bytes: b})
}
return w, n
}
// TODO: unify the two implementations. This is best done after first simplifying
// the algorithm taking into account the inclusion of both NFC and NFD forms
// in the table.
func (t *table) matchContractionString(w []colElem, ce colElem, suffix string) ([]colElem, int) {
index, n, offset := splitContractIndex(ce)
scan := t.contractTries.scannerString(index, n, suffix)
buf := [norm.MaxSegmentSize]byte{}
bufp := 0
p := scan.scan(0)
if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
// By now we should have filtered most cases.
p0 := p
bufn := 0
rune := norm.NFD.PropertiesString(suffix[p:])
p += rune.Size()
if rune.LeadCCC() != 0 {
prevCC := rune.TrailCCC()
// A gap may only occur in the last normalization segment.
// This also ensures that len(scan.s) < norm.MaxSegmentSize.
if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
scan.s = suffix[:p+end]
}
for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
rune = norm.NFD.PropertiesString(suffix[p:])
if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
break
}
prevCC = rune.TrailCCC()
if pp := scan.scan(p); pp != p {
// Copy the interstitial runes for later processing.
bufn += copy(buf[bufn:], suffix[p0:p])
if scan.pindex == pp {
bufp = bufn
}
p, p0 = pp, pp
} else {
p += rune.Size()
}
}
}
}
// Append weights for the matched contraction, which may be an expansion.
i, n := scan.result()
ce = colElem(t.contractElem[i+offset])
if ce.ctype() == ceNormal {
w = append(w, ce)
} else {
w = t.appendExpansion(w, ce)
}
// Append weights for the runes in the segment not part of the contraction.
for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
w, p = t.appendNext(w, source{bytes: b})
}
return w, n
}

View File

@ -42,7 +42,9 @@ func pt(p, t int) []int {
func makeTable(in []input) (*collate.Collator, error) {
b := build.NewBuilder()
for _, r := range in {
b.Add([]rune(r.str), r.ces, nil)
if e := b.Add([]rune(r.str), r.ces, nil); e != nil {
panic(e)
}
}
return b.Build()
}
@ -159,6 +161,7 @@ var appendNextTests = []tableTest{
{"b", [][]int{{200}}},
{"c", [][]int{{300}}},
{"\u03B1", [][]int{{900}}},
{"\x01", [][]int{{0, 0, 0, 0}}},
// contractions
{"a\u0300", [][]int{{101}}},
@ -171,10 +174,11 @@ var appendNextTests = []tableTest{
{"a\u0301\u035F", [][]int{{121}}},
{"a\u0301\u035Fb", [][]int{{119}}},
{"\u03B1\u0345", [][]int{{901}, {902}}},
{"\u302E\u18A9", [][]int{{0, 131}, {0, 132}}},
{"\u302E\u302F", [][]int{{0, 131}, {0, 131}}},
{"\u302F\u18A9", [][]int{{0, 130}}},
}...),
[]check{
{"a\x01\u0300", 1, ColElems{w(100)}},
{"ab", 1, ColElems{w(100)}}, // closing segment
{"a\u0316\u0300b", 5, ColElems{w(101), w(0, 220)}}, // closing segment
{"a\u0316\u0300", 5, ColElems{w(101), w(0, 220)}}, // no closing segment
@ -239,12 +243,17 @@ var appendNextTests = []tableTest{
{"a\u302F\u18A9\u0301", 9, ColElems{w(102), w(0, 130)}},
// expansion within a gap
{"a\u0317\u0301", 5, ColElems{w(102), w(0, 220), w(0, 220)}},
{"a\u302E\u18A9\u0301", 9, ColElems{w(102), w(0, 131), w(0, 132)}},
{
"a\u0317\u302E\u18A9\u0301",
11,
ColElems{w(102), w(0, 220), w(0, 220), w(0, 131), w(0, 132)},
},
// repeating CCC blocks last modifier
{"a\u302E\u302F\u0301", 1, ColElems{w(100)}},
// The trailing combining characters (with lower CCC) should block the first one.
// TODO: make the following pass.
// {"a\u035E\u0316\u0316", 1, ColElems{w(100)}},
{"a\u035F\u035Eb", 5, ColElems{w(110), w(0, 233)}},
// Last combiner should match after normalization.
// TODO: make the following pass.
// {"a\u035D\u0301", 3, ColElems{w(102), w(0, 234)}},
// The first combiner is blocking the second one as they have the same CCC.
{"a\u035D\u035Eb", 1, ColElems{w(100)}},
},
},
}

File diff suppressed because it is too large Load Diff

View File

@ -97,3 +97,64 @@ func (t *trie) lookup(s []byte) (v colElem, sz int) {
// Illegal rune
return 0, 1
}
// The body of lookupString is a verbatim copy of that of lookup.
func (t *trie) lookupString(s string) (v colElem, sz int) {
c0 := s[0]
switch {
case c0 < tx:
return colElem(t.values0[c0]), 1
case c0 < t2:
return 0, 1
case c0 < t3:
if len(s) < 2 {
return 0, 0
}
i := t.index0[c0]
c1 := s[1]
if c1 < tx || t2 <= c1 {
return 0, 1
}
return t.lookupValue(i, c1), 2
case c0 < t4:
if len(s) < 3 {
return 0, 0
}
i := t.index0[c0]
c1 := s[1]
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := int(i)<<6 + int(c1)
i = t.index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
return 0, 2
}
return t.lookupValue(i, c2), 3
case c0 < t5:
if len(s) < 4 {
return 0, 0
}
i := t.index0[c0]
c1 := s[1]
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := int(i)<<6 + int(c1)
i = t.index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
return 0, 2
}
o = int(i)<<6 + int(c2)
i = t.index[o]
c3 := s[3]
if c3 < tx || t2 <= c3 {
return 0, 3
}
return t.lookupValue(i, c3), 4
}
// Illegal rune
return 0, 1
}