1
0
mirror of https://github.com/golang/go synced 2024-11-12 04:40:22 -07:00

exp/locale/collate: implementation of main collation functionality for

key and simple comparisson. Search is not yet implemented in this CL.
Changed some of the types of table_test.go to allow reuse in the new test.
Also reduced number of primary values for illegal runes to 1 (both map to
the same).

R=r
CC=golang-dev
https://golang.org/cl/6202062
This commit is contained in:
Marcel van Lohuizen 2012-05-17 19:48:56 +02:00
parent a2004546a9
commit ec099b2bc7
6 changed files with 750 additions and 89 deletions

View File

@ -153,7 +153,7 @@ const (
rareUnifiedOffset = 0x1FB40
otherOffset = 0x4FB40
illegalOffset = otherOffset + unicode.MaxRune
maxPrimary = illegalOffset + 2 // there are 2 illegal values.
maxPrimary = illegalOffset + 1
)
// implicitPrimary returns the primary weight for the a rune

View File

@ -22,6 +22,7 @@ const (
defaultSecondary = 0x20
defaultTertiary = 0x2
maxTertiary = 0x1F
maxQuaternary = 0x1FFFFF // 21 bits.
)
// colElem is a representation of a collation element.
@ -145,7 +146,8 @@ const (
commonUnifiedOffset = 0xFB40
rareUnifiedOffset = 0x1FB40
otherOffset = 0x4FB40
maxPrimary = otherOffset + unicode.MaxRune
illegalOffset = otherOffset + unicode.MaxRune
maxPrimary = illegalOffset + 1
)
// implicitPrimary returns the primary weight for the a rune

View File

@ -8,6 +8,7 @@
package collate
import (
"bytes"
"exp/norm"
)
@ -67,6 +68,7 @@ type Collator struct {
// This option exists predominantly to support reverse sorting of accents in French.
Backwards bool
// TODO: implement:
// With HiraganaQuaternary enabled, Hiragana codepoints will get lower values
// than all the other non-variable code points. Strength must be greater or
// equal to Quaternary for this to take effect.
@ -122,25 +124,46 @@ func (b *Buffer) ResetKeys() {
// Compare returns an integer comparing the two byte slices.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
// Compare calls ResetKeys, thereby invalidating keys
// previously generated using Key or KeyFromString using buf.
func (c *Collator) Compare(buf *Buffer, a, b []byte) int {
// TODO: implement
return 0
// TODO: for now we simply compute keys and compare. Once we
// have good benchmarks, move to an implementation that works
// incrementally for the majority of cases.
// - Benchmark with long strings that only vary in modifiers.
buf.ResetKeys()
ka := c.Key(buf, a)
kb := c.Key(buf, b)
defer buf.ResetKeys()
return bytes.Compare(ka, kb)
}
// CompareString returns an integer comparing the two strings.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
// CompareString calls ResetKeys, thereby invalidating keys
// previously generated using Key or KeyFromString using buf.
func (c *Collator) CompareString(buf *Buffer, a, b string) int {
// TODO: implement
buf.ResetKeys()
ka := c.KeyFromString(buf, a)
kb := c.KeyFromString(buf, b)
defer buf.ResetKeys()
return bytes.Compare(ka, kb)
}
func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int {
// iterate over s, track bytes consumed.
return 0
}
// Key returns the collation key for str.
// Passing the buffer buf may avoid memory allocations.
// The returned slice will point to an allocation in Buffer and will retain
// The returned slice will point to an allocation in Buffer and will remain
// valid until the next call to buf.ResetKeys().
func (c *Collator) Key(buf *Buffer, str []byte) []byte {
// TODO: implement
return nil
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
buf.init()
c.getColElems(buf, str)
return c.key(buf, buf.ce)
}
// KeyFromString returns the collation key for str.
@ -148,6 +171,175 @@ func (c *Collator) Key(buf *Buffer, str []byte) []byte {
// The returned slice will point to an allocation in Buffer and will retain
// valid until the next call to buf.ResetKeys().
func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
// TODO: implement
return nil
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
buf.init()
c.getColElemsString(buf, str)
return c.key(buf, buf.ce)
}
func (c *Collator) key(buf *Buffer, w []weights) []byte {
processWeights(c.Alternate, c.variableTop, w)
kn := len(buf.key)
c.keyFromElems(buf, w)
return buf.key[kn:]
}
func (c *Collator) getColElems(buf *Buffer, str []byte) {
i := c.iter()
i.src.SetInput(c.f, str)
for !i.done() {
buf.ce = i.next(buf.ce)
}
}
func (c *Collator) getColElemsString(buf *Buffer, str string) {
i := c.iter()
i.src.SetInputString(c.f, str)
for !i.done() {
buf.ce = i.next(buf.ce)
}
}
type iter struct {
src norm.Iter
ba [1024]byte
buf []byte
t *table
p int
minBufSize int
_done, eof bool
}
func (c *Collator) iter() iter {
i := iter{t: c.t, minBufSize: c.t.maxContractLen}
i.buf = i.ba[:0]
return i
}
func (i *iter) done() bool {
return i._done
}
func (i *iter) next(ce []weights) []weights {
if !i.eof && len(i.buf)-i.p < i.minBufSize {
// replenish buffer
n := copy(i.buf, i.buf[i.p:])
n += i.src.Next(i.buf[n:cap(i.buf)])
i.buf = i.buf[:n]
i.p = 0
i.eof = i.src.Done()
}
if i.p == len(i.buf) {
i._done = true
return ce
}
ce, sz := i.t.appendNext(ce, i.buf[i.p:])
i.p += sz
return ce
}
func appendPrimary(key []byte, p uint32) []byte {
// Convert to variable length encoding; supports up to 23 bits.
if p <= 0x7FFF {
key = append(key, uint8(p>>8), uint8(p))
} else {
key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
}
return key
}
// keyFromElems converts the weights ws to a compact sequence of bytes.
// The result will be appended to the byte buffer in buf.
func (c *Collator) keyFromElems(buf *Buffer, ws []weights) {
for _, v := range ws {
if w := v.primary; w > 0 {
buf.key = appendPrimary(buf.key, w)
}
}
if Secondary <= c.Strength {
buf.key = append(buf.key, 0, 0)
// TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
if !c.Backwards {
for _, v := range ws {
if w := v.secondary; w > 0 {
buf.key = append(buf.key, uint8(w>>8), uint8(w))
}
}
} else {
for i := len(ws) - 1; i >= 0; i-- {
if w := ws[i].secondary; w > 0 {
buf.key = append(buf.key, uint8(w>>8), uint8(w))
}
}
}
} else if c.CaseLevel {
buf.key = append(buf.key, 0, 0)
}
if Tertiary <= c.Strength || c.CaseLevel {
buf.key = append(buf.key, 0, 0)
for _, v := range ws {
if w := v.tertiary; w > 0 {
buf.key = append(buf.key, w)
}
}
// Derive the quaternary weights from the options and other levels.
// Note that we represent maxQuaternary as 0xFF. The first byte of the
// representation of a a primary weight is always smaller than 0xFF,
// so using this single byte value will compare correctly.
if Quaternary <= c.Strength {
if c.Alternate == AltShiftTrimmed {
lastNonFFFF := len(buf.key)
buf.key = append(buf.key, 0)
for _, v := range ws {
if w := v.quaternary; w == maxQuaternary {
buf.key = append(buf.key, 0xFF)
} else if w > 0 {
buf.key = appendPrimary(buf.key, w)
lastNonFFFF = len(buf.key)
}
}
buf.key = buf.key[:lastNonFFFF]
} else {
buf.key = append(buf.key, 0)
for _, v := range ws {
if w := v.quaternary; w == maxQuaternary {
buf.key = append(buf.key, 0xFF)
} else if w > 0 {
buf.key = appendPrimary(buf.key, w)
}
}
}
}
}
}
func processWeights(vw AlternateHandling, top uint32, wa []weights) {
ignore := false
switch vw {
case AltShifted, AltShiftTrimmed:
for i := range wa {
if p := wa[i].primary; p <= top && p != 0 {
wa[i] = weights{quaternary: p}
ignore = true
} else if p == 0 {
if ignore {
wa[i] = weights{}
} else if wa[i].tertiary != 0 {
wa[i].quaternary = maxQuaternary
}
} else {
wa[i].quaternary = maxQuaternary
ignore = false
}
}
case AltBlanked:
for i := range wa {
if p := wa[i].primary; p <= top && (ignore || p != 0) {
wa[i] = weights{}
ignore = true
} else {
ignore = false
}
}
}
}

View File

@ -0,0 +1,422 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package collate_test
import (
"bytes"
"exp/locale/collate"
"testing"
)
type weightsTest struct {
opt opts
in, out ColElems
}
type opts struct {
lev int
alt collate.AlternateHandling
top int
backwards bool
caseLevel bool
}
func (o opts) level() collate.Level {
if o.lev == 0 {
return collate.Quaternary
}
return collate.Level(o.lev - 1)
}
func (o opts) collator() *collate.Collator {
c := &collate.Collator{
Strength: o.level(),
Alternate: o.alt,
Backwards: o.backwards,
CaseLevel: o.caseLevel,
}
collate.SetTop(c, o.top)
return c
}
const (
maxQ = 0x1FFFFF
)
func wpq(p, q int) collate.Weights {
return collate.W(p, defaults.Secondary, defaults.Tertiary, q)
}
func wsq(s, q int) collate.Weights {
return collate.W(0, s, defaults.Tertiary, q)
}
func wq(q int) collate.Weights {
return collate.W(0, 0, 0, q)
}
var zero = w(0, 0, 0, 0)
var processTests = []weightsTest{
// Shifted
{ // simple sequence of non-variables
opt: opts{alt: collate.AltShifted, top: 100},
in: ColElems{w(200), w(300), w(400)},
out: ColElems{wpq(200, maxQ), wpq(300, maxQ), wpq(400, maxQ)},
},
{ // first is a variable
opt: opts{alt: collate.AltShifted, top: 250},
in: ColElems{w(200), w(300), w(400)},
out: ColElems{wq(200), wpq(300, maxQ), wpq(400, maxQ)},
},
{ // all but first are variable
opt: opts{alt: collate.AltShifted, top: 999},
in: ColElems{w(1000), w(200), w(300), w(400)},
out: ColElems{wpq(1000, maxQ), wq(200), wq(300), wq(400)},
},
{ // first is a modifier
opt: opts{alt: collate.AltShifted, top: 999},
in: ColElems{w(0, 10), w(1000)},
out: ColElems{wsq(10, maxQ), wpq(1000, maxQ)},
},
{ // primary ignorables
opt: opts{alt: collate.AltShifted, top: 250},
in: ColElems{w(200), w(0, 10), w(300), w(0, 15), w(400)},
out: ColElems{wq(200), zero, wpq(300, maxQ), wsq(15, maxQ), wpq(400, maxQ)},
},
{ // secondary ignorables
opt: opts{alt: collate.AltShifted, top: 250},
in: ColElems{w(200), w(0, 0, 10), w(300), w(0, 0, 15), w(400)},
out: ColElems{wq(200), zero, wpq(300, maxQ), w(0, 0, 15, maxQ), wpq(400, maxQ)},
},
{ // tertiary ignorables, no change
opt: opts{alt: collate.AltShifted, top: 250},
in: ColElems{w(200), zero, w(300), zero, w(400)},
out: ColElems{wq(200), zero, wpq(300, maxQ), zero, wpq(400, maxQ)},
},
// ShiftTrimmed (same as Shifted)
{ // simple sequence of non-variables
opt: opts{alt: collate.AltShiftTrimmed, top: 100},
in: ColElems{w(200), w(300), w(400)},
out: ColElems{wpq(200, maxQ), wpq(300, maxQ), wpq(400, maxQ)},
},
{ // first is a variable
opt: opts{alt: collate.AltShiftTrimmed, top: 250},
in: ColElems{w(200), w(300), w(400)},
out: ColElems{wq(200), wpq(300, maxQ), wpq(400, maxQ)},
},
{ // all but first are variable
opt: opts{alt: collate.AltShiftTrimmed, top: 999},
in: ColElems{w(1000), w(200), w(300), w(400)},
out: ColElems{wpq(1000, maxQ), wq(200), wq(300), wq(400)},
},
{ // first is a modifier
opt: opts{alt: collate.AltShiftTrimmed, top: 999},
in: ColElems{w(0, 10), w(1000)},
out: ColElems{wsq(10, maxQ), wpq(1000, maxQ)},
},
{ // primary ignorables
opt: opts{alt: collate.AltShiftTrimmed, top: 250},
in: ColElems{w(200), w(0, 10), w(300), w(0, 15), w(400)},
out: ColElems{wq(200), zero, wpq(300, maxQ), wsq(15, maxQ), wpq(400, maxQ)},
},
{ // secondary ignorables
opt: opts{alt: collate.AltShiftTrimmed, top: 250},
in: ColElems{w(200), w(0, 0, 10), w(300), w(0, 0, 15), w(400)},
out: ColElems{wq(200), zero, wpq(300, maxQ), w(0, 0, 15, maxQ), wpq(400, maxQ)},
},
{ // tertiary ignorables, no change
opt: opts{alt: collate.AltShiftTrimmed, top: 250},
in: ColElems{w(200), zero, w(300), zero, w(400)},
out: ColElems{wq(200), zero, wpq(300, maxQ), zero, wpq(400, maxQ)},
},
// Blanked
{ // simple sequence of non-variables
opt: opts{alt: collate.AltBlanked, top: 100},
in: ColElems{w(200), w(300), w(400)},
out: ColElems{w(200), w(300), w(400)},
},
{ // first is a variable
opt: opts{alt: collate.AltBlanked, top: 250},
in: ColElems{w(200), w(300), w(400)},
out: ColElems{zero, w(300), w(400)},
},
{ // all but first are variable
opt: opts{alt: collate.AltBlanked, top: 999},
in: ColElems{w(1000), w(200), w(300), w(400)},
out: ColElems{w(1000), zero, zero, zero},
},
{ // first is a modifier
opt: opts{alt: collate.AltBlanked, top: 999},
in: ColElems{w(0, 10), w(1000)},
out: ColElems{w(0, 10), w(1000)},
},
{ // primary ignorables
opt: opts{alt: collate.AltBlanked, top: 250},
in: ColElems{w(200), w(0, 10), w(300), w(0, 15), w(400)},
out: ColElems{zero, zero, w(300), w(0, 15), w(400)},
},
{ // secondary ignorables
opt: opts{alt: collate.AltBlanked, top: 250},
in: ColElems{w(200), w(0, 0, 10), w(300), w(0, 0, 15), w(400)},
out: ColElems{zero, zero, w(300), w(0, 0, 15), w(400)},
},
{ // tertiary ignorables, no change
opt: opts{alt: collate.AltBlanked, top: 250},
in: ColElems{w(200), zero, w(300), zero, w(400)},
out: ColElems{zero, zero, w(300), zero, w(400)},
},
// Non-ignorable: input is always equal to output.
{ // all but first are variable
opt: opts{alt: collate.AltNonIgnorable, top: 999},
in: ColElems{w(1000), w(200), w(300), w(400)},
out: ColElems{w(1000), w(200), w(300), w(400)},
},
{ // primary ignorables
opt: opts{alt: collate.AltNonIgnorable, top: 250},
in: ColElems{w(200), w(0, 10), w(300), w(0, 15), w(400)},
out: ColElems{w(200), w(0, 10), w(300), w(0, 15), w(400)},
},
{ // secondary ignorables
opt: opts{alt: collate.AltNonIgnorable, top: 250},
in: ColElems{w(200), w(0, 0, 10), w(300), w(0, 0, 15), w(400)},
out: ColElems{w(200), w(0, 0, 10), w(300), w(0, 0, 15), w(400)},
},
{ // tertiary ignorables, no change
opt: opts{alt: collate.AltNonIgnorable, top: 250},
in: ColElems{w(200), zero, w(300), zero, w(400)},
out: ColElems{w(200), zero, w(300), zero, w(400)},
},
}
func TestProcessWeights(t *testing.T) {
for i, tt := range processTests {
res := collate.ProcessWeights(tt.opt.alt, tt.opt.top, tt.in)
if len(res) != len(tt.out) {
t.Errorf("%d: len(ws) was %d; want %d (%v should be %v)", i, len(res), len(tt.out), res, tt.out)
continue
}
for j, w := range res {
if w != tt.out[j] {
t.Errorf("%d: Weights %d was %v; want %v", i, j, w, tt.out[j])
}
}
}
}
type keyFromElemTest struct {
opt opts
in ColElems
out []byte
}
var defS = byte(defaults.Secondary)
var defT = byte(defaults.Tertiary)
const sep = 0 // separator byte
var keyFromElemTests = []keyFromElemTest{
{ // simple primary and secondary weights.
opts{},
ColElems{w(0x200), w(0x7FFF), w(0, 0x30), w(0x100)},
[]byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
sep, sep, defT, defT, defT, defT, // tertiary
sep, 0xFF, 0xFF, 0xFF, 0xFF, // quaternary
},
},
{ // same as first, but with zero element that need to be removed
opts{},
ColElems{w(0x200), zero, w(0x7FFF), w(0, 0x30), zero, w(0x100)},
[]byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
sep, sep, defT, defT, defT, defT, // tertiary
sep, 0xFF, 0xFF, 0xFF, 0xFF, // quaternary
},
},
{ // same as first, with large primary values
opts{},
ColElems{w(0x200), w(0x8000), w(0, 0x30), w(0x12345)},
[]byte{0x2, 0, 0x80, 0x80, 0x00, 0x81, 0x23, 0x45, // primary
sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
sep, sep, defT, defT, defT, defT, // tertiary
sep, 0xFF, 0xFF, 0xFF, 0xFF, // quaternary
},
},
{ // same as first, but with the secondary level backwards
opts{backwards: true},
ColElems{w(0x200), w(0x7FFF), w(0, 0x30), w(0x100)},
[]byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
sep, sep, 0, defS, 0, 0x30, 0, defS, 0, defS, // secondary
sep, sep, defT, defT, defT, defT, // tertiary
sep, 0xFF, 0xFF, 0xFF, 0xFF, // quaternary
},
},
{ // same as first, ignoring quaternary level
opts{lev: 3},
ColElems{w(0x200), zero, w(0x7FFF), w(0, 0x30), zero, w(0x100)},
[]byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
sep, sep, defT, defT, defT, defT, // tertiary
},
},
{ // same as first, ignoring tertiary level
opts{lev: 2},
ColElems{w(0x200), zero, w(0x7FFF), w(0, 0x30), zero, w(0x100)},
[]byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
},
},
{ // same as first, ignoring secondary level
opts{lev: 1},
ColElems{w(0x200), zero, w(0x7FFF), w(0, 0x30), zero, w(0x100)},
[]byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00},
},
{ // simple primary and secondary weights.
opts{alt: collate.AltShiftTrimmed, top: 0x250},
ColElems{w(0x300), w(0x200), w(0x7FFF), w(0, 0x30), w(0x800)},
[]byte{0x3, 0, 0x7F, 0xFF, 0x8, 0x00, // primary
sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
sep, sep, defT, defT, defT, defT, // tertiary
sep, 0xFF, 0x2, 0, // quaternary
},
},
{ // as first, primary with case level enabled
opts{lev: 1, caseLevel: true},
ColElems{w(0x200), w(0x7FFF), w(0, 0x30), w(0x100)},
[]byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
sep, sep, // secondary
sep, sep, defT, defT, defT, defT, // tertiary
},
},
}
func TestKeyFromElems(t *testing.T) {
buf := collate.Buffer{}
for i, tt := range keyFromElemTests {
buf.ResetKeys()
ws := collate.ProcessWeights(tt.opt.alt, tt.opt.top, tt.in)
res := collate.KeyFromElems(tt.opt.collator(), &buf, ws)
if len(res) != len(tt.out) {
t.Errorf("%d: len(ws) was %d; want %d (%X should be %X)", i, len(res), len(tt.out), res, tt.out)
}
n := len(res)
if len(tt.out) < n {
n = len(tt.out)
}
for j, c := range res[:n] {
if c != tt.out[j] {
t.Errorf("%d: byte %d was %X; want %X", i, j, c, tt.out[j])
}
}
}
}
func TestGetColElems(t *testing.T) {
for i, tt := range appendNextTests {
c, err := makeTable(tt.in)
if err != nil {
// error is reported in TestAppendNext
continue
}
buf := collate.Buffer{}
// Create one large test per table
str := make([]byte, 0, 4000)
out := ColElems{}
for len(str) < 3000 {
for _, chk := range tt.chk {
str = append(str, chk.in[:chk.n]...)
out = append(out, chk.out...)
}
}
for j, chk := range append(tt.chk, check{string(str), len(str), out}) {
ws := collate.GetColElems(c, &buf, []byte(chk.in)[:chk.n])
if len(ws) != len(chk.out) {
t.Errorf("%d:%d: len(ws) was %d; want %d", i, j, len(ws), len(chk.out))
continue
}
cnt := 0
for k, w := range ws {
if w != chk.out[k] {
t.Errorf("%d:%d: Weights %d was %v; want %v", i, j, k, w, chk.out[k])
cnt++
}
if cnt > 10 {
break
}
}
}
}
}
type keyTest struct {
in string
out []byte
}
var keyTests = []keyTest{
{"abc",
[]byte{0, 100, 0, 200, 1, 44, 0, 0, 0, 32, 0, 32, 0, 32, 0, 0, 2, 2, 2, 0, 255, 255, 255},
},
{"a\u0301",
[]byte{0, 102, 0, 0, 0, 32, 0, 0, 2, 0, 255},
},
{"aaaaa",
[]byte{0, 100, 0, 100, 0, 100, 0, 100, 0, 100, 0, 0,
0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 0,
2, 2, 2, 2, 2, 0,
255, 255, 255, 255, 255,
},
},
}
func TestKey(t *testing.T) {
c, _ := makeTable(appendNextTests[4].in)
buf := collate.Buffer{}
keys1 := [][]byte{}
keys2 := [][]byte{}
for _, tt := range keyTests {
keys1 = append(keys1, c.Key(&buf, []byte(tt.in)))
keys2 = append(keys2, c.KeyFromString(&buf, tt.in))
}
// Separate generation from testing to ensure buffers are not overwritten.
for i, tt := range keyTests {
if bytes.Compare(keys1[i], tt.out) != 0 {
t.Errorf("%d: Key(%q) = %d; want %d", i, tt.in, keys1[i], tt.out)
}
if bytes.Compare(keys2[i], tt.out) != 0 {
t.Errorf("%d: KeyFromString(%q) = %d; want %d", i, tt.in, keys2[i], tt.out)
}
}
}
type compareTest struct {
a, b string
res int // comparison result
}
var compareTests = []compareTest{
{"a\u0301", "a", 1},
{"a", "a\u0301", -1},
{"a\u0301", "a\u0301", 0},
{"a", "a", 0},
}
func TestCompare(t *testing.T) {
c, _ := makeTable(appendNextTests[4].in)
buf := collate.Buffer{}
for i, tt := range compareTests {
if res := c.Compare(&buf, []byte(tt.a), []byte(tt.b)); res != tt.res {
t.Errorf("%d: Compare(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
}
if res := c.CompareString(&buf, tt.a, tt.b); res != tt.res {
t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
}
}
}

View File

@ -6,24 +6,30 @@ package collate
// Export for testing.
import "fmt"
import (
"exp/norm"
"fmt"
)
type Weights struct {
Primary, Secondary, Tertiary int
Primary, Secondary, Tertiary, Quaternary int
}
func W(ce ...int) Weights {
w := Weights{ce[0], defaultSecondary, defaultTertiary}
w := Weights{ce[0], defaultSecondary, defaultTertiary, 0}
if len(ce) > 1 {
w.Secondary = ce[1]
}
if len(ce) > 2 {
w.Tertiary = ce[2]
}
if len(ce) > 3 {
w.Quaternary = ce[3]
}
return w
}
func (w Weights) String() string {
return fmt.Sprintf("[%d.%d.%d]", w.Primary, w.Secondary, w.Tertiary)
return fmt.Sprintf("[%d.%d.%d.%d]", w.Primary, w.Secondary, w.Tertiary, w.Quaternary)
}
type Table struct {
@ -35,15 +41,52 @@ func GetTable(c *Collator) *Table {
return &Table{c.t, nil}
}
func convertWeights(ws []weights) []Weights {
func convertToWeights(ws []weights) []Weights {
out := make([]Weights, len(ws))
for i, w := range ws {
out[i] = Weights{int(w.primary), int(w.secondary), int(w.tertiary)}
out[i] = Weights{int(w.primary), int(w.secondary), int(w.tertiary), int(w.quaternary)}
}
return out
}
func convertFromWeights(ws []Weights) []weights {
out := make([]weights, len(ws))
for i, w := range ws {
out[i] = weights{uint32(w.Primary), uint16(w.Secondary), uint8(w.Tertiary), uint32(w.Quaternary)}
}
return out
}
func (t *Table) AppendNext(s []byte) ([]Weights, int) {
w, n := t.t.appendNext(nil, s)
return convertWeights(w), n
return convertToWeights(w), n
}
func SetTop(c *Collator, top int) {
c.variableTop = uint32(top)
}
func InitCollator(c *Collator) {
c.Strength = Quaternary
c.f = norm.NFD
c.t.maxContractLen = 30
}
func GetColElems(c *Collator, buf *Buffer, str []byte) []Weights {
buf.ResetKeys()
InitCollator(c)
c.getColElems(buf, str)
return convertToWeights(buf.ce)
}
func ProcessWeights(h AlternateHandling, top int, w []Weights) []Weights {
in := convertFromWeights(w)
processWeights(h, uint32(top), in)
return convertToWeights(in)
}
func KeyFromElems(c *Collator, buf *Buffer, w []Weights) []byte {
k := len(buf.key)
c.keyFromElems(buf, convertFromWeights(w))
return buf.key[k:]
}

View File

@ -11,9 +11,7 @@ import (
"testing"
)
type Weights struct {
collate.Weights
}
type ColElems []collate.Weights
type input struct {
str string
@ -23,7 +21,7 @@ type input struct {
type check struct {
in string
n int
out []Weights
out ColElems
}
type tableTest struct {
@ -31,8 +29,8 @@ type tableTest struct {
chk []check
}
func w(ce ...int) Weights {
return Weights{collate.W(ce...)}
func w(ce ...int) collate.Weights {
return collate.W(ce...)
}
var defaults = w(0)
@ -46,7 +44,11 @@ func makeTable(in []input) (*collate.Collator, error) {
for _, r := range in {
b.Add([]rune(r.str), r.ces)
}
return b.Build("")
c, err := b.Build("")
if err == nil {
collate.InitCollator(c)
}
return c, err
}
// modSeq holds a seqeunce of modifiers in increasing order of CCC long enough
@ -60,8 +62,8 @@ var modSeq = []rune{
}
var mods []input
var modW = func() []Weights {
ws := []Weights{}
var modW = func() ColElems {
ws := ColElems{}
for _, r := range modSeq {
rune := norm.NFC.PropertiesString(string(r))
ws = append(ws, w(0, int(rune.CCC())))
@ -79,14 +81,14 @@ var appendNextTests = []tableTest{
{"ß", [][]int{{120}}},
},
[]check{
{"a", 1, []Weights{w(100)}},
{"b", 1, []Weights{w(105)}},
{"c", 1, []Weights{w(110)}},
{"d", 1, []Weights{w(0x4FBA4)}},
{"ab", 1, []Weights{w(100)}},
{"bc", 1, []Weights{w(105)}},
{"dd", 1, []Weights{w(0x4FBA4)}},
{"ß", 2, []Weights{w(120)}},
{"a", 1, ColElems{w(100)}},
{"b", 1, ColElems{w(105)}},
{"c", 1, ColElems{w(110)}},
{"d", 1, ColElems{w(0x4FBA4)}},
{"ab", 1, ColElems{w(100)}},
{"bc", 1, ColElems{w(105)}},
{"dd", 1, ColElems{w(0x4FBA4)}},
{"ß", 2, ColElems{w(120)}},
},
},
{ // test expansion
@ -97,10 +99,10 @@ var appendNextTests = []tableTest{
{"W", [][]int{{100}, {0, 25}, {100}, {0, 25}}},
},
[]check{
{"u", 1, []Weights{w(100)}},
{"U", 1, []Weights{w(100), w(0, 25)}},
{"w", 1, []Weights{w(100), w(100)}},
{"W", 1, []Weights{w(100), w(0, 25), w(100), w(0, 25)}},
{"u", 1, ColElems{w(100)}},
{"U", 1, ColElems{w(100), w(0, 25)}},
{"w", 1, ColElems{w(100), w(100)}},
{"W", 1, ColElems{w(100), w(0, 25), w(100), w(0, 25)}},
},
},
{ // test decompose
@ -111,7 +113,7 @@ var appendNextTests = []tableTest{
{"\u01C5", [][]int{pt(104, 9), pt(130, 4), {0, 40, 0x1F}}}, // Dž = D+z+caron
},
[]check{
{"\u01C5", 2, []Weights{w(pt(104, 9)...), w(pt(130, 4)...), w(0, 40, 0x1F)}},
{"\u01C5", 2, ColElems{w(pt(104, 9)...), w(pt(130, 4)...), w(0, 40, 0x1F)}},
},
},
{ // test basic contraction
@ -125,16 +127,16 @@ var appendNextTests = []tableTest{
{"d", [][]int{{400}}},
},
[]check{
{"a", 1, []Weights{w(100)}},
{"aa", 1, []Weights{w(100)}},
{"aac", 1, []Weights{w(100)}},
{"ab", 2, []Weights{w(101)}},
{"abb", 2, []Weights{w(101)}},
{"aab", 3, []Weights{w(101), w(101)}},
{"aaba", 3, []Weights{w(101), w(101)}},
{"abc", 3, []Weights{w(102)}},
{"abcd", 3, []Weights{w(102)}},
{"d", 1, []Weights{w(400)}},
{"a", 1, ColElems{w(100)}},
{"aa", 1, ColElems{w(100)}},
{"aac", 1, ColElems{w(100)}},
{"d", 1, ColElems{w(400)}},
{"ab", 2, ColElems{w(101)}},
{"abb", 2, ColElems{w(101)}},
{"aab", 3, ColElems{w(101), w(101)}},
{"aaba", 3, ColElems{w(101), w(101)}},
{"abc", 3, ColElems{w(102)}},
{"abcd", 3, ColElems{w(102)}},
},
},
{ // test discontinuous contraction
@ -177,75 +179,75 @@ var appendNextTests = []tableTest{
{"\u302F\u18A9", [][]int{{0, 130}}},
}...),
[]check{
{"ab", 1, []Weights{w(100)}}, // closing segment
{"a\u0316\u0300b", 5, []Weights{w(101), w(0, 220)}}, // closing segment
{"a\u0316\u0300", 5, []Weights{w(101), w(0, 220)}}, // no closing segment
{"a\u0316\u0300\u035Cb", 5, []Weights{w(101), w(0, 220)}}, // completes before segment end
{"a\u0316\u0300\u035C", 5, []Weights{w(101), w(0, 220)}}, // completes before segment end
{"ab", 1, ColElems{w(100)}}, // closing segment
{"a\u0316\u0300b", 5, ColElems{w(101), w(0, 220)}}, // closing segment
{"a\u0316\u0300", 5, ColElems{w(101), w(0, 220)}}, // no closing segment
{"a\u0316\u0300\u035Cb", 5, ColElems{w(101), w(0, 220)}}, // completes before segment end
{"a\u0316\u0300\u035C", 5, ColElems{w(101), w(0, 220)}}, // completes before segment end
{"a\u0316\u0301b", 5, []Weights{w(102), w(0, 220)}}, // closing segment
{"a\u0316\u0301", 5, []Weights{w(102), w(0, 220)}}, // no closing segment
{"a\u0316\u0301\u035Cb", 5, []Weights{w(102), w(0, 220)}}, // completes before segment end
{"a\u0316\u0301\u035C", 5, []Weights{w(102), w(0, 220)}}, // completes before segment end
{"a\u0316\u0301b", 5, ColElems{w(102), w(0, 220)}}, // closing segment
{"a\u0316\u0301", 5, ColElems{w(102), w(0, 220)}}, // no closing segment
{"a\u0316\u0301\u035Cb", 5, ColElems{w(102), w(0, 220)}}, // completes before segment end
{"a\u0316\u0301\u035C", 5, ColElems{w(102), w(0, 220)}}, // completes before segment end
// match blocked by modifier with same ccc
{"a\u0301\u0315\u031A\u035Fb", 3, []Weights{w(102)}},
{"a\u0301\u0315\u031A\u035Fb", 3, ColElems{w(102)}},
// multiple gaps
{"a\u0301\u035Db", 6, []Weights{w(120)}},
{"a\u0301\u035F", 5, []Weights{w(121)}},
{"a\u0301\u035Fb", 6, []Weights{w(122)}},
{"a\u0316\u0301\u035F", 7, []Weights{w(121), w(0, 220)}},
{"a\u0301\u0315\u035Fb", 7, []Weights{w(121), w(0, 232)}},
{"a\u0316\u0301\u0315\u035Db", 5, []Weights{w(102), w(0, 220)}},
{"a\u0316\u0301\u0315\u035F", 9, []Weights{w(121), w(0, 220), w(0, 232)}},
{"a\u0316\u0301\u0315\u035Fb", 9, []Weights{w(121), w(0, 220), w(0, 232)}},
{"a\u0316\u0301\u0315\u035F\u035D", 9, []Weights{w(121), w(0, 220), w(0, 232)}},
{"a\u0316\u0301\u0315\u035F\u035Db", 9, []Weights{w(121), w(0, 220), w(0, 232)}},
{"a\u0301\u035Db", 6, ColElems{w(120)}},
{"a\u0301\u035F", 5, ColElems{w(121)}},
{"a\u0301\u035Fb", 6, ColElems{w(122)}},
{"a\u0316\u0301\u035F", 7, ColElems{w(121), w(0, 220)}},
{"a\u0301\u0315\u035Fb", 7, ColElems{w(121), w(0, 232)}},
{"a\u0316\u0301\u0315\u035Db", 5, ColElems{w(102), w(0, 220)}},
{"a\u0316\u0301\u0315\u035F", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
{"a\u0316\u0301\u0315\u035Fb", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
{"a\u0316\u0301\u0315\u035F\u035D", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
{"a\u0316\u0301\u0315\u035F\u035Db", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
// handling of segment overflow
{ // just fits within segment
"a" + string(modSeq[:30]) + "\u0301",
3 + len(string(modSeq[:30])),
append([]Weights{w(102)}, modW[:30]...),
append(ColElems{w(102)}, modW[:30]...),
},
{"a" + string(modSeq[:31]) + "\u0301", 1, []Weights{w(100)}}, // overflow
{"a" + string(modSeq) + "\u0301", 1, []Weights{w(100)}},
{"a" + string(modSeq[:31]) + "\u0301", 1, ColElems{w(100)}}, // overflow
{"a" + string(modSeq) + "\u0301", 1, ColElems{w(100)}},
{ // just fits within segment with two interstitial runes
"a" + string(modSeq[:28]) + "\u0301\u0315\u035F",
7 + len(string(modSeq[:28])),
append(append([]Weights{w(121)}, modW[:28]...), w(0, 232)),
append(append(ColElems{w(121)}, modW[:28]...), w(0, 232)),
},
{ // second half does not fit within segment
"a" + string(modSeq[:29]) + "\u0301\u0315\u035F",
3 + len(string(modSeq[:29])),
append([]Weights{w(102)}, modW[:29]...),
append(ColElems{w(102)}, modW[:29]...),
},
// discontinuity can only occur in last normalization segment
{"a\u035Eb\u035E", 6, []Weights{w(115)}},
{"a\u0316\u035Eb\u035E", 5, []Weights{w(110), w(0, 220)}},
{"a\u035Db\u035D", 6, []Weights{w(117)}},
{"a\u0316\u035Db\u035D", 1, []Weights{w(100)}},
{"a\u035Eb\u0316\u035E", 8, []Weights{w(115), w(0, 220)}},
{"a\u035Db\u0316\u035D", 8, []Weights{w(117), w(0, 220)}},
{"ac\u035Eaca\u035E", 9, []Weights{w(116)}},
{"a\u0316c\u035Eaca\u035E", 1, []Weights{w(100)}},
{"ac\u035Eac\u0316a\u035E", 1, []Weights{w(100)}},
{"a\u035Eb\u035E", 6, ColElems{w(115)}},
{"a\u0316\u035Eb\u035E", 5, ColElems{w(110), w(0, 220)}},
{"a\u035Db\u035D", 6, ColElems{w(117)}},
{"a\u0316\u035Db\u035D", 1, ColElems{w(100)}},
{"a\u035Eb\u0316\u035E", 8, ColElems{w(115), w(0, 220)}},
{"a\u035Db\u0316\u035D", 8, ColElems{w(117), w(0, 220)}},
{"ac\u035Eaca\u035E", 9, ColElems{w(116)}},
{"a\u0316c\u035Eaca\u035E", 1, ColElems{w(100)}},
{"ac\u035Eac\u0316a\u035E", 1, ColElems{w(100)}},
// expanding contraction
{"\u03B1\u0345", 4, []Weights{w(901), w(902)}},
{"\u03B1\u0345", 4, ColElems{w(901), w(902)}},
// Theoretical possibilities
// contraction within a gap
{"a\u302F\u18A9\u0301", 9, []Weights{w(102), w(0, 130)}},
{"a\u302F\u18A9\u0301", 9, ColElems{w(102), w(0, 130)}},
// expansion within a gap
{"a\u0317\u0301", 5, []Weights{w(102), w(0, 220), w(0, 220)}},
{"a\u302E\u18A9\u0301", 9, []Weights{w(102), w(0, 131), w(0, 132)}},
{"a\u0317\u0301", 5, ColElems{w(102), w(0, 220), w(0, 220)}},
{"a\u302E\u18A9\u0301", 9, ColElems{w(102), w(0, 131), w(0, 132)}},
{
"a\u0317\u302E\u18A9\u0301",
11,
[]Weights{w(102), w(0, 220), w(0, 220), w(0, 131), w(0, 132)},
ColElems{w(102), w(0, 220), w(0, 220), w(0, 131), w(0, 132)},
},
},
},
@ -269,7 +271,7 @@ func TestAppendNext(t *testing.T) {
continue
}
for k, w := range ws {
if w != chk.out[k].Weights {
if w != chk.out[k] {
t.Errorf("%d:%d: Weights %d was %v; want %v", i, j, k, w, chk.out[k])
}
}