mirror of
https://github.com/golang/go
synced 2024-10-03 17:11:21 -06:00
exp/locale/collate: This CL includes the following changes:
- Changed the representation of colElem to support a few cases, needed by some languages, that the current format does not support.
- Changed the offsets for implicit primary values. This makes the values easier to read and debug (the last 4 nibbles are identical to the implicit primary value) and also results in better packing.
- Fixed a bug in the weight conversion code that, by sheer luck, had not surfaced yet.

Note that tables.go also includes changes to the contraction trie from CL 6346092.

R=r, mpvl
CC=golang-dev
https://golang.org/cl/6392060
This commit is contained in:
parent
adc19ac5e3
commit
882b6ef454
@ -111,6 +111,11 @@ func (b *Builder) Add(str []rune, colelems [][]int) error {
|
||||
e.elems[i] = append(e.elems[i], ce[0])
|
||||
}
|
||||
}
|
||||
elems, err := convertLargeWeights(e.elems)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
e.elems = elems
|
||||
b.entryMap[string(str)] = e
|
||||
b.entry = append(b.entry, e)
|
||||
return nil
|
||||
@ -184,8 +189,7 @@ func (b *Builder) build() (*table, error) {
|
||||
b.built = true
|
||||
b.t = &table{}
|
||||
|
||||
b.contractCJK()
|
||||
b.simplify() // requires contractCJK
|
||||
b.simplify()
|
||||
b.processExpansions() // requires simplify
|
||||
b.processContractions() // requires simplify
|
||||
b.buildTrie() // requires process*
|
||||
@ -231,6 +235,8 @@ func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool {
|
||||
return false
|
||||
}
|
||||
// Tertiary values should be equal to maxTertiary for third element onwards.
|
||||
// TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
|
||||
// simply be dropped. Try this out by dropping the following code.
|
||||
if i >= 2 && ce[2] != maxTertiary {
|
||||
return false
|
||||
}
|
||||
@ -322,10 +328,16 @@ func (b *Builder) simplify() {
|
||||
// convertLargeWeights converts collation elements with large
|
||||
// primaries (either double primaries or for illegal runes)
|
||||
// to our own representation.
|
||||
// A CJK character C is represented in the DUCET as
|
||||
// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
|
||||
// We will rewrite these characters to a single CE.
|
||||
// We assume the CJK values start at 0x8000.
|
||||
// See http://unicode.org/reports/tr10/#Implicit_Weights
|
||||
func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
||||
const (
|
||||
firstLargePrimary = 0xFB40
|
||||
cjkPrimaryStart = 0xFB40
|
||||
rarePrimaryStart = 0xFB80
|
||||
otherPrimaryStart = 0xFBC0
|
||||
illegalPrimary = 0xFFFE
|
||||
highBitsMask = 0x3F
|
||||
lowBitsMask = 0x7FFF
|
||||
@ -335,7 +347,7 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
||||
for i := 0; i < len(elems); i++ {
|
||||
ce := elems[i]
|
||||
p := ce[0]
|
||||
if p < firstLargePrimary {
|
||||
if p < cjkPrimaryStart {
|
||||
continue
|
||||
}
|
||||
if p > 0xFFFF {
|
||||
@ -350,8 +362,16 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
||||
if elems[i+1][0]&lowBitsFlag == 0 {
|
||||
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
|
||||
}
|
||||
r := rune(((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask)
|
||||
ce[0] = implicitPrimary(r)
|
||||
np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask
|
||||
switch {
|
||||
case p < rarePrimaryStart:
|
||||
np += commonUnifiedOffset
|
||||
case p < otherPrimaryStart:
|
||||
np += rareUnifiedOffset
|
||||
default:
|
||||
p += otherOffset
|
||||
}
|
||||
ce[0] = np
|
||||
for j := i + 1; j+1 < len(elems); j++ {
|
||||
elems[j] = elems[j+1]
|
||||
}
|
||||
@ -361,21 +381,6 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
||||
return elems, nil
|
||||
}
|
||||
|
||||
// A CJK character C is represented in the DUCET as
|
||||
// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
|
||||
// We will rewrite these characters to a single CE.
|
||||
// We assume the CJK values start at 0x8000.
|
||||
func (b *Builder) contractCJK() {
|
||||
for _, e := range b.entry {
|
||||
elms, err := convertLargeWeights(e.elems)
|
||||
e.elems = elms
|
||||
if err != nil {
|
||||
err = fmt.Errorf("%U: %s", e.runes, err)
|
||||
}
|
||||
b.error(err)
|
||||
}
|
||||
}
|
||||
|
||||
// appendExpansion converts the given collation sequence to
|
||||
// collation elements and adds them to the expansion table.
|
||||
// It returns an index to the expansion table.
|
||||
@ -479,7 +484,7 @@ func (b *Builder) processContractions() {
|
||||
str := []byte(string(e.runes[1:]))
|
||||
o, sn = t.contractTries.lookup(handle, str)
|
||||
if sn != len(str) {
|
||||
log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", []rune(string(str)), sn, len(str))
|
||||
log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", e.runes, sn, len(str))
|
||||
}
|
||||
}
|
||||
if es[o] != nil {
|
||||
|
@ -63,7 +63,7 @@ type convertTest struct {
|
||||
|
||||
var convLargeTests = []convertTest{
|
||||
{pCE(0xFB39), pCE(0xFB39), false},
|
||||
{cjk(0x2F9B2), pqCE(0x4F4F2, 0x2F9B2), false},
|
||||
{cjk(0x2F9B2), pqCE(0x3F9B2, 0x2F9B2), false},
|
||||
{pCE(0xFB40), pCE(0), true},
|
||||
{append(pCE(0xFB40), pCE(0)[0]), pCE(0), true},
|
||||
{pCE(0xFFFE), pCE(illegalOffset), false},
|
||||
@ -109,9 +109,9 @@ var simplifyTest = []ducetElem{
|
||||
}
|
||||
|
||||
var genColTests = []ducetElem{
|
||||
{"\uFA70", pqCE(0x1F5B0, 0xFA70)},
|
||||
{"\uFA70", pqCE(0x1FA70, 0xFA70)},
|
||||
{"A\u0300", append(ptCE(100, 8), sCE(30)...)},
|
||||
{"A\u0300\uFA70", append(ptCE(100, 8), sCE(30)[0], pqCE(0x1F5B0, 0xFA70)[0])},
|
||||
{"A\u0300\uFA70", append(ptCE(100, 8), sCE(30)[0], pqCE(0x1FA70, 0xFA70)[0])},
|
||||
{"A\u0300A\u0300", append(ptCE(100, 8), sCE(30)[0], ptCE(100, 8)[0], sCE(30)[0])},
|
||||
}
|
||||
|
||||
|
@ -25,19 +25,29 @@ const (
|
||||
// For normal collation elements, we assume that a collation element either has
|
||||
// a primary or non-default secondary value, not both.
|
||||
// Collation elements with a primary value are of the form
|
||||
// 000ppppp pppppppp pppppppp tttttttt, where
|
||||
// 010ppppp pppppppp pppppppp ssssssss
|
||||
// - p* is primary collation value
|
||||
// - s* is the secondary collation value
|
||||
// or
|
||||
// 00pppppp pppppppp ppppppps sssttttt, where
|
||||
// - p* is primary collation value
|
||||
// - s* offset of secondary from default value.
|
||||
// - t* is the tertiary collation value
|
||||
// Collation elements with a secondary value are of the form
|
||||
// 01000000 ssssssss ssssssss tttttttt, where
|
||||
// - s* is the secondary collation value
|
||||
// - t* is the tertiary collation value
|
||||
// 10000000 0000ssss ssssssss tttttttt, where
|
||||
// - 16 BMP implicit -> weight
|
||||
// - 8 bit s
|
||||
// - default tertiary
|
||||
const (
|
||||
maxPrimaryBits = 21
|
||||
maxSecondaryBits = 16
|
||||
maxTertiaryBits = 8
|
||||
maxPrimaryBits = 21
|
||||
maxSecondaryBits = 12
|
||||
maxSecondaryCompactBits = 8
|
||||
maxSecondaryDiffBits = 4
|
||||
maxTertiaryBits = 8
|
||||
maxTertiaryCompactBits = 5
|
||||
|
||||
isSecondary = 0x40000000
|
||||
isSecondary = 0x80000000
|
||||
isPrimary = 0x40000000
|
||||
)
|
||||
|
||||
func makeCE(weights []int) (uint32, error) {
|
||||
@ -48,17 +58,28 @@ func makeCE(weights []int) (uint32, error) {
|
||||
return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
|
||||
}
|
||||
if w := weights[2]; w >= 1<<maxTertiaryBits || w < 0 {
|
||||
return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %d >= %d", w, 1<<maxTertiaryBits)
|
||||
return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits)
|
||||
}
|
||||
ce := uint32(0)
|
||||
if weights[0] != 0 {
|
||||
// primary weight form
|
||||
if weights[1] != defaultSecondary {
|
||||
return 0, fmt.Errorf("makeCE: non-default secondary weight for non-zero primary: %X", weights)
|
||||
if weights[2] == defaultTertiary {
|
||||
if weights[1] >= 1<<maxSecondaryCompactBits {
|
||||
return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights[1], 1<<maxSecondaryCompactBits)
|
||||
}
|
||||
ce = uint32(weights[0]<<maxSecondaryCompactBits + weights[1])
|
||||
ce |= isPrimary
|
||||
} else {
|
||||
d := weights[1] - defaultSecondary
|
||||
if d >= 1<<maxSecondaryDiffBits || d < 0 {
|
||||
return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
|
||||
}
|
||||
if weights[2] >= 1<<maxTertiaryCompactBits {
|
||||
return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x (%X)", weights[2], 1<<maxTertiaryCompactBits, weights)
|
||||
}
|
||||
ce = uint32(weights[0]<<maxSecondaryDiffBits + d)
|
||||
ce = ce<<maxTertiaryCompactBits + uint32(weights[2])
|
||||
}
|
||||
ce = uint32(weights[0]<<maxTertiaryBits + weights[2])
|
||||
} else {
|
||||
// secondary weight form
|
||||
ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
|
||||
ce |= isSecondary
|
||||
}
|
||||
@ -66,16 +87,16 @@ func makeCE(weights []int) (uint32, error) {
|
||||
}
|
||||
|
||||
// For contractions, collation elements are of the form
|
||||
// 10bbbbbb bbbbbbbb iiiiiiii iiinnnnn, where
|
||||
// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
|
||||
// - n* is the size of the first node in the contraction trie.
|
||||
// - i* is the index of the first node in the contraction trie.
|
||||
// - b* is the offset into the contraction collation element table.
|
||||
// See contract.go for details on the contraction trie.
|
||||
const (
|
||||
contractID = 0x80000000
|
||||
maxNBits = 5
|
||||
maxTrieIndexBits = 11
|
||||
maxContractOffsetBits = 14
|
||||
contractID = 0xC0000000
|
||||
maxNBits = 4
|
||||
maxTrieIndexBits = 12
|
||||
maxContractOffsetBits = 13
|
||||
)
|
||||
|
||||
func makeContractIndex(h ctHandle, offset int) (uint32, error) {
|
||||
@ -86,26 +107,26 @@ func makeContractIndex(h ctHandle, offset int) (uint32, error) {
|
||||
return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
|
||||
}
|
||||
if offset >= 1<<maxContractOffsetBits {
|
||||
return 0, fmt.Errorf("offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
|
||||
return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
|
||||
}
|
||||
ce := uint32(contractID)
|
||||
ce += uint32(offset << (maxTrieIndexBits + maxNBits))
|
||||
ce += uint32(offset << (maxNBits + maxTrieIndexBits))
|
||||
ce += uint32(h.index << maxNBits)
|
||||
ce += uint32(h.n)
|
||||
return ce, nil
|
||||
}
|
||||
|
||||
// For expansions, collation elements are of the form
|
||||
// 110bbbbb bbbbbbbb bbbbbbbb bbbbbbbb,
|
||||
// 11100000 00000000 bbbbbbbb bbbbbbbb,
|
||||
// where b* is the index into the expansion sequence table.
|
||||
const (
|
||||
expandID = 0xC0000000
|
||||
maxExpandIndexBits = 29
|
||||
expandID = 0xE0000000
|
||||
maxExpandIndexBits = 16
|
||||
)
|
||||
|
||||
func makeExpandIndex(index int) (uint32, error) {
|
||||
if index >= 1<<maxExpandIndexBits {
|
||||
return 0, fmt.Errorf("index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
|
||||
return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
|
||||
}
|
||||
return expandID + uint32(index), nil
|
||||
}
|
||||
@ -120,13 +141,13 @@ func makeExpansionHeader(n int) (uint32, error) {
|
||||
// sequence of collation elements, we decompose the rune and lookup the collation
|
||||
// elements for each rune in the decomposition and modify the tertiary weights.
|
||||
// The collation element, in this case, is of the form
|
||||
// 11100000 00000000 wwwwwwww vvvvvvvv, where
|
||||
// 11110000 00000000 wwwwwwww vvvvvvvv, where
|
||||
// - v* is the replacement tertiary weight for the first rune,
|
||||
// - w* is the replacement tertiary weight for the second rune,
|
||||
// Tertiary weights of subsequent runes should be replaced with maxTertiary.
|
||||
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
|
||||
const (
|
||||
decompID = 0xE0000000
|
||||
decompID = 0xF0000000
|
||||
)
|
||||
|
||||
func makeDecompose(t1, t2 int) (uint32, error) {
|
||||
@ -149,10 +170,10 @@ const (
|
||||
maxRare = 0x4DBF
|
||||
)
|
||||
const (
|
||||
commonUnifiedOffset = 0xFB40
|
||||
rareUnifiedOffset = 0x1FB40
|
||||
otherOffset = 0x4FB40
|
||||
illegalOffset = otherOffset + unicode.MaxRune
|
||||
commonUnifiedOffset = 0x10000
|
||||
rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
|
||||
otherOffset = 0x50000 // largest rune in rare is U+2FA1D
|
||||
illegalOffset = otherOffset + int(unicode.MaxRune)
|
||||
maxPrimary = illegalOffset + 1
|
||||
)
|
||||
|
||||
|
@ -29,32 +29,35 @@ func decompCE(in []int) (ce uint32, err error) {
|
||||
}
|
||||
|
||||
var ceTests = []ceTest{
|
||||
{normalCE, []int{0, 0, 0}, 0x40000000},
|
||||
{normalCE, []int{0, 30, 3}, 0x40001E03},
|
||||
{normalCE, []int{100, defaultSecondary, 3}, 0x6403},
|
||||
{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-default secondary
|
||||
{normalCE, []int{0, 0, 0}, 0x80000000},
|
||||
{normalCE, []int{0, 0x28, 3}, 0x80002803},
|
||||
{normalCE, []int{100, defaultSecondary, 3}, 0x0000C803},
|
||||
// non-ignorable primary with non-default secondary
|
||||
{normalCE, []int{100, 0x28, defaultTertiary}, 0x40006428},
|
||||
{normalCE, []int{100, defaultSecondary + 8, 3}, 0x0000C903},
|
||||
{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-supported secondary
|
||||
{normalCE, []int{100, 1, 3}, 0xFFFF},
|
||||
{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF},
|
||||
{normalCE, []int{0, 1 << maxSecondaryBits, 0}, 0xFFFF},
|
||||
{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits}, 0xFFFF},
|
||||
|
||||
{contractCE, []int{0, 0, 0}, 0x80000000},
|
||||
{contractCE, []int{1, 1, 1}, 0x80010021},
|
||||
{contractCE, []int{1, (1 << maxNBits) - 1, 1}, 0x8001003F},
|
||||
{contractCE, []int{(1 << maxTrieIndexBits) - 1, 1, 1}, 0x8001FFE1},
|
||||
{contractCE, []int{1, 1, (1 << maxContractOffsetBits) - 1}, 0xBFFF0021},
|
||||
{contractCE, []int{0, 0, 0}, 0xC0000000},
|
||||
{contractCE, []int{1, 1, 1}, 0xC0010011},
|
||||
{contractCE, []int{1, (1 << maxNBits) - 1, 1}, 0xC001001F},
|
||||
{contractCE, []int{(1 << maxTrieIndexBits) - 1, 1, 1}, 0xC001FFF1},
|
||||
{contractCE, []int{1, 1, (1 << maxContractOffsetBits) - 1}, 0xDFFF0011},
|
||||
{contractCE, []int{1, (1 << maxNBits), 1}, 0xFFFF},
|
||||
{contractCE, []int{(1 << maxTrieIndexBits), 1, 1}, 0xFFFF},
|
||||
{contractCE, []int{1, (1 << maxContractOffsetBits), 1}, 0xFFFF},
|
||||
|
||||
{expandCE, []int{0}, 0xC0000000},
|
||||
{expandCE, []int{5}, 0xC0000005},
|
||||
{expandCE, []int{(1 << maxExpandIndexBits) - 1}, 0xDFFFFFFF},
|
||||
{expandCE, []int{0}, 0xE0000000},
|
||||
{expandCE, []int{5}, 0xE0000005},
|
||||
{expandCE, []int{(1 << maxExpandIndexBits) - 1}, 0xE000FFFF},
|
||||
{expandCE, []int{1 << maxExpandIndexBits}, 0xFFFF},
|
||||
|
||||
{decompCE, []int{0, 0}, 0xE0000000},
|
||||
{decompCE, []int{1, 1}, 0xE0000101},
|
||||
{decompCE, []int{0x1F, 0x1F}, 0xE0001F1F},
|
||||
{decompCE, []int{0, 0}, 0xF0000000},
|
||||
{decompCE, []int{1, 1}, 0xF0000101},
|
||||
{decompCE, []int{0x1F, 0x1F}, 0xF0001F1F},
|
||||
{decompCE, []int{256, 0x1F}, 0xFFFF},
|
||||
{decompCE, []int{0x1F, 256}, 0xFFFF},
|
||||
}
|
||||
|
@ -33,12 +33,12 @@ const (
|
||||
type colElem uint32
|
||||
|
||||
const (
|
||||
maxCE colElem = 0x7FFFFFFF
|
||||
minContract = 0x80000000
|
||||
maxContract = 0xBFFFFFFF
|
||||
minExpand = 0xC0000000
|
||||
maxExpand = 0xDFFFFFFF
|
||||
minDecomp = 0xE0000000
|
||||
maxCE colElem = 0x80FFFFFF
|
||||
minContract = 0xC0000000
|
||||
maxContract = 0xDFFFFFFF
|
||||
minExpand = 0xE0000000
|
||||
maxExpand = 0xEFFFFFFF
|
||||
minDecomp = 0xF0000000
|
||||
)
|
||||
|
||||
type ceType int
|
||||
@ -69,66 +69,77 @@ func (ce colElem) ctype() ceType {
|
||||
// For normal collation elements, we assume that a collation element either has
|
||||
// a primary or non-default secondary value, not both.
|
||||
// Collation elements with a primary value are of the form
|
||||
// 000ppppp pppppppp pppppppp tttttttt, where
|
||||
// 010ppppp pppppppp pppppppp ssssssss
|
||||
// - p* is primary collation value
|
||||
// - s* is the secondary collation value
|
||||
// or
|
||||
// 00pppppp pppppppp ppppppps sssttttt, where
|
||||
// - p* is primary collation value
|
||||
// - s* offset of secondary from default value.
|
||||
// - t* is the tertiary collation value
|
||||
// Collation elements with a secondary value are of the form
|
||||
// 01000000 ssssssss ssssssss tttttttt, where
|
||||
// - s* is the secondary collation value
|
||||
// - t* is the tertiary collation value
|
||||
// 10000000 0000ssss ssssssss tttttttt, where
|
||||
// - 16 BMP implicit -> weight
|
||||
// - 8 bit s
|
||||
// - default tertiary
|
||||
func splitCE(ce colElem) weights {
|
||||
const secondaryMask = 0x40000000
|
||||
const primaryMask = 0x40000000
|
||||
const secondaryMask = 0x80000000
|
||||
w := weights{}
|
||||
w.tertiary = uint8(ce)
|
||||
if ce&secondaryMask == 0 {
|
||||
// primary weight form
|
||||
if ce&primaryMask != 0 {
|
||||
w.tertiary = defaultTertiary
|
||||
w.secondary = uint16(uint8(ce))
|
||||
w.primary = uint32((ce >> 8) & 0x1FFFFF)
|
||||
w.secondary = defaultSecondary
|
||||
} else if ce&secondaryMask == 0 {
|
||||
w.tertiary = uint8(ce & 0x1F)
|
||||
ce >>= 5
|
||||
w.secondary = defaultSecondary + uint16(ce&0xF)
|
||||
ce >>= 4
|
||||
w.primary = uint32(ce)
|
||||
} else {
|
||||
// secondary weight form
|
||||
w.tertiary = uint8(ce)
|
||||
w.secondary = uint16(ce >> 8)
|
||||
}
|
||||
return w
|
||||
}
|
||||
|
||||
// For contractions, colElems are of the form 10bbbbbb bbbbbbbb hhhhhhhh hhhhhhhh, where
|
||||
// - h* is the compTrieHandle.
|
||||
// For contractions, collation elements are of the form
|
||||
// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
|
||||
// - n* is the size of the first node in the contraction trie.
|
||||
// - i* is the index of the first node in the contraction trie.
|
||||
// - b* is the offset into the contraction collation element table.
|
||||
// See contract.go for details on the contraction trie.
|
||||
const (
|
||||
maxNBits = 5
|
||||
maxTrieIndexBits = 11
|
||||
maxContractOffsetBits = 14
|
||||
maxNBits = 4
|
||||
maxTrieIndexBits = 12
|
||||
maxContractOffsetBits = 13
|
||||
)
|
||||
|
||||
func splitContractIndex(ce colElem) (index, n, offset int) {
|
||||
h := ce & 0xffff
|
||||
return int(h >> maxNBits), int(h & (1<<maxNBits - 1)), int(ce>>16) & (1<<maxContractOffsetBits - 1)
|
||||
n = int(ce & (1<<maxNBits - 1))
|
||||
ce >>= maxNBits
|
||||
index = int(ce & (1<<maxTrieIndexBits - 1))
|
||||
ce >>= maxTrieIndexBits
|
||||
offset = int(ce & (1<<maxContractOffsetBits - 1))
|
||||
return
|
||||
}
|
||||
|
||||
// For expansions, colElems are of the form 110bbbbb bbbbbbbb bbbbbbbb bbbbbbbb,
|
||||
// For expansions, colElems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
|
||||
// where b* is the index into the expansion sequence table.
|
||||
const (
|
||||
maxExpandIndexBits = 29
|
||||
)
|
||||
const maxExpandIndexBits = 16
|
||||
|
||||
func splitExpandIndex(ce colElem) (index int) {
|
||||
index = int(ce) & (1<<maxExpandIndexBits - 1)
|
||||
return
|
||||
return int(uint16(ce))
|
||||
}
|
||||
|
||||
// Some runes can be expanded using NFKD decomposition. Instead of storing the full
|
||||
// sequence of collation elements, we decompose the rune and lookup the collation
|
||||
// elements for each rune in the decomposition and modify the tertiary weights.
|
||||
// The colElem, in this case, is of the form 11100000 00000000 wwwwwwww vvvvvvvv, where
|
||||
// The colElem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
|
||||
// - v* is the replacement tertiary weight for the first rune,
|
||||
// - w* is the replacement tertiary weight for the second rune,
|
||||
// Tertiary weights of subsequent runes should be replaced with maxTertiary.
|
||||
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
|
||||
const (
|
||||
decompID = 0xE0000000
|
||||
)
|
||||
|
||||
func splitDecompose(ce colElem) (t1, t2 uint8) {
|
||||
return uint8(ce), uint8(ce >> 8)
|
||||
}
|
||||
@ -143,10 +154,10 @@ const (
|
||||
maxRare = 0x4DBF
|
||||
)
|
||||
const (
|
||||
commonUnifiedOffset = 0xFB40
|
||||
rareUnifiedOffset = 0x1FB40
|
||||
otherOffset = 0x4FB40
|
||||
illegalOffset = otherOffset + unicode.MaxRune
|
||||
commonUnifiedOffset = 0x10000
|
||||
rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
|
||||
otherOffset = 0x50000 // largest rune in rare is U+2FA1D
|
||||
illegalOffset = otherOffset + int(unicode.MaxRune)
|
||||
maxPrimary = illegalOffset + 1
|
||||
)
|
||||
|
||||
|
@ -17,14 +17,25 @@ type ceTest struct {
|
||||
// The make* funcs are simplified versions of the functions in build/colelem.go
|
||||
func makeCE(weights []int) colElem {
|
||||
const (
|
||||
maxPrimaryBits = 21
|
||||
maxSecondaryBits = 16
|
||||
maxTertiaryBits = 8
|
||||
isSecondary = 0x40000000
|
||||
maxPrimaryBits = 21
|
||||
maxSecondaryBits = 12
|
||||
maxSecondaryCompactBits = 8
|
||||
maxSecondaryDiffBits = 4
|
||||
maxTertiaryBits = 8
|
||||
maxTertiaryCompactBits = 5
|
||||
isSecondary = 0x80000000
|
||||
isPrimary = 0x40000000
|
||||
)
|
||||
var ce colElem
|
||||
if weights[0] != 0 {
|
||||
ce = colElem(weights[0]<<maxTertiaryBits + weights[2])
|
||||
if weights[2] == defaultTertiary {
|
||||
ce = colElem(weights[0]<<maxSecondaryCompactBits + weights[1])
|
||||
ce |= isPrimary
|
||||
} else {
|
||||
d := weights[1] - defaultSecondary
|
||||
ce = colElem(weights[0]<<maxSecondaryDiffBits + d)
|
||||
ce = ce<<maxTertiaryCompactBits + colElem(weights[2])
|
||||
}
|
||||
} else {
|
||||
ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
|
||||
ce |= isSecondary
|
||||
@ -34,24 +45,25 @@ func makeCE(weights []int) colElem {
|
||||
|
||||
func makeContractIndex(index, n, offset int) colElem {
|
||||
const (
|
||||
contractID = 0x80000000
|
||||
maxNBits = 5
|
||||
maxTrieIndexBits = 11
|
||||
contractID = 0xC0000000
|
||||
maxNBits = 4
|
||||
maxTrieIndexBits = 12
|
||||
maxContractOffsetBits = 13
|
||||
)
|
||||
ce := colElem(contractID)
|
||||
ce += colElem(offset << (maxTrieIndexBits + maxNBits))
|
||||
ce += colElem(offset << (maxNBits + maxTrieIndexBits))
|
||||
ce += colElem(index << maxNBits)
|
||||
ce += colElem(n)
|
||||
return ce
|
||||
}
|
||||
|
||||
func makeExpandIndex(index int) colElem {
|
||||
const expandID = 0xC0000000
|
||||
const expandID = 0xE0000000
|
||||
return expandID + colElem(index)
|
||||
}
|
||||
|
||||
func makeDecompose(t1, t2 int) colElem {
|
||||
const decompID = 0xE0000000
|
||||
const decompID = 0xF0000000
|
||||
return colElem(t2<<8+t1) + decompID
|
||||
}
|
||||
|
||||
@ -119,7 +131,7 @@ func TestColElem(t *testing.T) {
|
||||
}
|
||||
for j, a := range tt.arg {
|
||||
if inout[j] != a {
|
||||
t.Errorf("%d: argument %d is %d; want %d", i, j, inout[j], a)
|
||||
t.Errorf("%d: argument %d is %X; want %X", i, j, inout[j], a)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -131,21 +143,21 @@ type implicitTest struct {
|
||||
}
|
||||
|
||||
var implicitTests = []implicitTest{
|
||||
{0x33FF, 0x52F3F},
|
||||
{0x3400, 0x22F40},
|
||||
{0x4DC0, 0x54900},
|
||||
{0x4DFF, 0x5493F},
|
||||
{0x4E00, 0x14940},
|
||||
{0x9FCB, 0x19B0B},
|
||||
{0xA000, 0x59B40},
|
||||
{0xF8FF, 0x5F43F},
|
||||
{0xF900, 0x1F440},
|
||||
{0xFA23, 0x1F563},
|
||||
{0xFAD9, 0x1F619},
|
||||
{0xFB00, 0x5F640},
|
||||
{0x20000, 0x3FB40},
|
||||
{0x2B81C, 0x4B35C},
|
||||
{unicode.MaxRune, 0x15FB3F}, // maximum primary value
|
||||
{0x33FF, 0x533FF},
|
||||
{0x3400, 0x23400},
|
||||
{0x4DC0, 0x54DC0},
|
||||
{0x4DFF, 0x54DFF},
|
||||
{0x4E00, 0x14E00},
|
||||
{0x9FCB, 0x19FCB},
|
||||
{0xA000, 0x5A000},
|
||||
{0xF8FF, 0x5F8FF},
|
||||
{0xF900, 0x1F900},
|
||||
{0xFA23, 0x1FA23},
|
||||
{0xFAD9, 0x1FAD9},
|
||||
{0xFB00, 0x5FB00},
|
||||
{0x20000, 0x40000},
|
||||
{0x2B81C, 0x4B81C},
|
||||
{unicode.MaxRune, 0x15FFFF}, // maximum primary value
|
||||
}
|
||||
|
||||
func TestImplicit(t *testing.T) {
|
||||
|
@ -84,10 +84,10 @@ var appendNextTests = []tableTest{
|
||||
{"a", 1, ColElems{w(100)}},
|
||||
{"b", 1, ColElems{w(105)}},
|
||||
{"c", 1, ColElems{w(110)}},
|
||||
{"d", 1, ColElems{w(0x4FBA4)}},
|
||||
{"d", 1, ColElems{w(0x50064)}},
|
||||
{"ab", 1, ColElems{w(100)}},
|
||||
{"bc", 1, ColElems{w(105)}},
|
||||
{"dd", 1, ColElems{w(0x4FBA4)}},
|
||||
{"dd", 1, ColElems{w(0x50064)}},
|
||||
{"ß", 2, ColElems{w(120)}},
|
||||
},
|
||||
},
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user