1
0
mirror of https://github.com/golang/go synced 2024-10-03 09:21:21 -06:00

exp/locale/collate: added representation for collation elements

(see http://www.unicode.org/reports/tr10/).

R=r, r
CC=golang-dev
https://golang.org/cl/5981048
This commit is contained in:
Marcel van Lohuizen 2012-04-25 13:16:24 +02:00
parent e456d015fb
commit bb3f3c9775
4 changed files with 586 additions and 0 deletions

View File

@ -0,0 +1,179 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package build
import (
"fmt"
"unicode"
)
const (
	defaultSecondary = 0x20
	defaultTertiary  = 0x2
	maxTertiary      = 0x1F
)

// A collation element is represented as a uint32.
// In the typical case, a rune maps to a single collation element. If a rune
// can be the start of a contraction or expands into multiple collation elements,
// then the collation element that is associated with a rune will have a special
// form to represent such m to n mappings. Such special collation elements
// have a value >= 0x80000000.

// For normal collation elements, we assume that a collation element either has
// a primary or non-default secondary value, not both.
// Collation elements with a primary value are of the form
// 010ppppp pppppppp pppppppp tttttttt, where
//   - p* is primary collation value
//   - t* is the tertiary collation value
// Collation elements with a secondary value are of the form
// 00000000 ssssssss ssssssss tttttttt, where
//   - s* is the secondary collation value
//   - t* is the tertiary collation value
const (
	maxPrimaryBits   = 21
	maxSecondaryBits = 16
	maxTertiaryBits  = 8

	isPrimary = 0x40000000
)

// makeCE packs weights[0] (primary), weights[1] (secondary) and weights[2]
// (tertiary) into a normal collation element.
//
// It returns an error, and a zero element, if fewer than three weights are
// supplied, if any weight is negative or does not fit in its bit field, or if
// a non-zero primary weight is combined with a secondary weight other than
// defaultSecondary. Elements beyond the third are ignored.
func makeCE(weights []int) (uint32, error) {
	// Guard the indexing below so that malformed input is reported as an
	// error rather than as an index-out-of-range panic.
	if len(weights) < 3 {
		return 0, fmt.Errorf("makeCE: expected primary, secondary and tertiary weights; got %d value(s)", len(weights))
	}
	if w := weights[0]; w >= 1<<maxPrimaryBits || w < 0 {
		return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
	}
	if w := weights[1]; w >= 1<<maxSecondaryBits || w < 0 {
		return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
	}
	if w := weights[2]; w >= 1<<maxTertiaryBits || w < 0 {
		return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %d >= %d", w, 1<<maxTertiaryBits)
	}
	ce := uint32(0)
	if weights[0] != 0 {
		// primary weight form: the secondary weight is implied, so it
		// must be the default.
		if weights[1] != defaultSecondary {
			return 0, fmt.Errorf("makeCE: non-default secondary weight for non-zero primary: %X", weights)
		}
		ce = uint32(weights[0]<<maxTertiaryBits + weights[2])
		ce |= isPrimary
	} else {
		// secondary weight form
		ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
	}
	return ce, nil
}
// For contractions, collation elements are of the form
// 10bbbbbb bbbbbbbb iiiiiiii iiinnnnn, where
//   - n* is the size of the first node in the contraction trie.
//   - i* is the index of the first node in the contraction trie.
//   - b* is the offset into the contraction collation element table.
// See contract.go for details on the contraction trie.
const (
	contractID            = 0x80000000
	maxNBits              = 5
	maxTrieIndexBits      = 11
	maxContractOffsetBits = 14
)

// makeContractIndex packs the trie handle h (node index and size) and the
// offset into the contraction collation element table into a contraction
// collation element.
//
// It returns an error, and a zero element, if h.n, h.index or offset is
// negative or does not fit in its bit field. Negative values are rejected
// because they would otherwise spill into the neighbouring fields and the
// type tag.
func makeContractIndex(h ctHandle, offset int) (uint32, error) {
	if h.n >= 1<<maxNBits || h.n < 0 {
		return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
	}
	if h.index >= 1<<maxTrieIndexBits || h.index < 0 {
		return 0, fmt.Errorf("index of contraction trie node too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
	}
	if offset >= 1<<maxContractOffsetBits || offset < 0 {
		return 0, fmt.Errorf("offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
	}
	// The fields are disjoint after the range checks, so OR-ing them
	// together is equivalent to adding them.
	ce := uint32(contractID)
	ce |= uint32(offset << (maxTrieIndexBits + maxNBits))
	ce |= uint32(h.index << maxNBits)
	ce |= uint32(h.n)
	return ce, nil
}
// For expansions, collation elements are of the form
// 110bbbbb bbbbbbbb bbbbbbbb bbbbbbbb,
// where b* is the index into the expansion sequence table.
const (
	expandID           = 0xC0000000
	maxExpandIndexBits = 29
)

// makeExpandIndex returns the expansion collation element that refers to the
// given index into the expansion sequence table.
//
// It returns an error if index is negative or does not fit in
// maxExpandIndexBits bits. A negative index must be rejected because its
// conversion to uint32 would wrap and overwrite the type tag (for example,
// -1 would yield 0xBFFFFFFF, which has the contraction tag).
func makeExpandIndex(index int) (uint32, error) {
	if index >= 1<<maxExpandIndexBits || index < 0 {
		return 0, fmt.Errorf("index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
	}
	return expandID + uint32(index), nil
}
// Each list of collation elements corresponding to an expansion starts with
// a header indicating the length of the sequence.

// makeExpansionHeader returns the header element for an expansion sequence of
// n collation elements. It returns an error if n is negative or cannot be
// represented as a uint32, instead of silently truncating it.
func makeExpansionHeader(n int) (uint32, error) {
	if n < 0 || uint64(n) > 0xFFFFFFFF {
		return 0, fmt.Errorf("expansion length out of bounds: %d", n)
	}
	return uint32(n), nil
}
// Some runes can be expanded using NFKD decomposition. Instead of storing the full
// sequence of collation elements, we decompose the rune and look up the collation
// elements for each rune in the decomposition and modify the tertiary weights.
// The collation element, in this case, is of the form
// 11100000 00000000 wwwwwwww vvvvvvvv, where
//   - v* is the replacement tertiary weight for the first rune,
//   - w* is the replacement tertiary weight for the second rune,
// Tertiary weights of subsequent runes should be replaced with maxTertiary.
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
const (
	decompID = 0xE0000000
)

// makeDecompose builds a decomposition collation element that carries t1 as
// the replacement tertiary weight of the first rune and t2 as that of the
// second rune. Both weights must lie in [0, 256); otherwise an error is
// returned.
func makeDecompose(t1, t2 int) (uint32, error) {
	switch {
	case t1 < 0 || t1 >= 256:
		return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
	case t2 < 0 || t2 >= 256:
		return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
	}
	// t1 occupies the low byte and t2 the byte above it.
	packed := uint32(t2)<<8 | uint32(t1)
	return decompID + packed, nil
}
const (
	// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
	minUnified       rune = 0x4E00
	maxUnified            = 0x9FFF
	minCompatibility      = 0xF900
	maxCompatibility      = 0xFAFF
	minRare               = 0x3400
	maxRare               = 0x4DBF
)

const (
	commonUnifiedOffset = 0xFB40
	rareUnifiedOffset   = 0x1FB40
	otherOffset         = 0x4FB40
	illegalOffset       = otherOffset + unicode.MaxRune
	maxPrimary          = illegalOffset + 2 // there are 2 illegal values.
)

// implicitPrimary returns the primary weight for a rune that has no entry
// in the collation table.
// We take a different approach from the one specified in
// http://unicode.org/reports/tr10/#Implicit_Weights,
// but preserve the resulting relative ordering of the runes.
func implicitPrimary(r rune) int {
	offset := otherOffset
	switch {
	case minUnified <= r && r <= maxUnified:
		// The most common case for CJK.
		offset = commonUnifiedOffset
	case minCompatibility <= r && r <= maxCompatibility:
		// This will never hit as long as we don't remove the characters
		// that would match from the table.
		offset = commonUnifiedOffset
	case unicode.Is(unicode.Unified_Ideograph, r):
		offset = rareUnifiedOffset
	}
	return int(r) + offset
}

View File

@ -0,0 +1,80 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package build
import "testing"
// ceTest is one entry of the ceTests table that drives TestColElem.
type ceTest struct {
// f builds a collation element from the integer arguments it is given.
f func(in []int) (uint32, error)
// arg holds the arguments; TestColElem passes a copy of it to f.
arg []int
// val is the expected collation element. TestColElem treats the value
// 0xFFFF as "f must return an error".
val uint32
}
// normalCE adapts makeCE to the ceTest function signature.
func normalCE(in []int) (ce uint32, err error) {
	ce, err = makeCE(in)
	return ce, err
}
// expandCE adapts makeExpandIndex to the ceTest function signature; in[0] is
// the expansion index.
func expandCE(in []int) (ce uint32, err error) {
	index := in[0]
	ce, err = makeExpandIndex(index)
	return ce, err
}
func contractCE(in []int) (ce uint32, err error) {
return makeContractIndex(ctHandle{in[0], in[1]}, in[2])
}
// decompCE adapts makeDecompose to the ceTest function signature; in[0] and
// in[1] are the two replacement tertiary weights.
func decompCE(in []int) (ce uint32, err error) {
	first, second := in[0], in[1]
	ce, err = makeDecompose(first, second)
	return ce, err
}
// ceTests lists the inputs and expected encodings for TestColElem.
// An expected value of 0xFFFF means that the constructor must fail.
// The contractCE arguments are {index, n, offset}.
var ceTests = []ceTest{
	{normalCE, []int{0, 0, 0}, 0},
	{normalCE, []int{0, 30, 3}, 0x1E03},
	{normalCE, []int{100, defaultSecondary, 3}, 0x40006403},
	{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-default secondary
	{normalCE, []int{100, 1, 3}, 0xFFFF},
	{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF},
	{normalCE, []int{0, 1 << maxSecondaryBits, 0}, 0xFFFF},
	{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits}, 0xFFFF},

	{contractCE, []int{0, 0, 0}, 0x80000000},
	{contractCE, []int{1, 1, 1}, 0x80010021},
	{contractCE, []int{1, (1 << maxNBits) - 1, 1}, 0x8001003F},
	{contractCE, []int{(1 << maxTrieIndexBits) - 1, 1, 1}, 0x8001FFE1},
	{contractCE, []int{1, 1, (1 << maxContractOffsetBits) - 1}, 0xBFFF0021},
	{contractCE, []int{1, (1 << maxNBits), 1}, 0xFFFF},
	{contractCE, []int{(1 << maxTrieIndexBits), 1, 1}, 0xFFFF},
	// The out-of-range value must sit in the offset slot (third argument);
	// in the n slot it would only re-test the node-size bound.
	{contractCE, []int{1, 1, (1 << maxContractOffsetBits)}, 0xFFFF},

	{expandCE, []int{0}, 0xC0000000},
	{expandCE, []int{5}, 0xC0000005},
	{expandCE, []int{(1 << maxExpandIndexBits) - 1}, 0xDFFFFFFF},
	{expandCE, []int{1 << maxExpandIndexBits}, 0xFFFF},

	{decompCE, []int{0, 0}, 0xE0000000},
	{decompCE, []int{1, 1}, 0xE0000101},
	{decompCE, []int{0x1F, 0x1F}, 0xE0001F1F},
	{decompCE, []int{256, 0x1F}, 0xFFFF},
	{decompCE, []int{0x1F, 256}, 0xFFFF},
}
// TestColElem runs every ceTests entry through its constructor and compares
// the resulting collation element with the expected value, or checks that an
// error is reported when the expected value is the 0xFFFF marker.
func TestColElem(t *testing.T) {
	const wantErr = 0xFFFF
	for i := range ceTests {
		tc := ceTests[i]
		// Hand the constructor a private copy of the arguments.
		args := make([]int, len(tc.arg))
		copy(args, tc.arg)
		got, err := tc.f(args)
		switch {
		case tc.val == wantErr && err == nil:
			t.Errorf("%d: expected error for args %x", i, tc.arg)
		case tc.val == wantErr:
			// The expected failure was reported.
		default:
			if err != nil {
				t.Errorf("%d: unexpected error: %v", i, err.Error())
			}
			if got != tc.val {
				t.Errorf("%d: colElem=%X; want %X", i, got, tc.val)
			}
		}
	}
}

View File

@ -0,0 +1,170 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package collate
import (
"unicode"
)
// weights holds the decoded weights per collation level.
// splitCE fills in primary, secondary and tertiary; it leaves quaternary at
// its zero value.
type weights struct {
// primary is the primary weight; it stays zero for elements in secondary
// weight form.
primary uint32
// secondary is the secondary weight; splitCE sets it to defaultSecondary
// for elements in primary weight form.
secondary uint16
// tertiary is the low byte of the collation element.
tertiary uint8
// TODO: compute quaternary on the fly or compress this value into 8 bits
// such that weights fit within 64bit.
quaternary uint32
}
// Default and limit values for collation weights.
const (
// defaultSecondary is the secondary weight that splitCE reports for
// collation elements in primary weight form.
defaultSecondary = 0x20
// NOTE(review): defaultTertiary is not referenced in this file; presumably
// it is the default tertiary weight — confirm against its users.
defaultTertiary = 0x2
// maxTertiary is the replacement tertiary weight for the third and later
// runes of a compatibility decomposition (see the comment on decompID).
maxTertiary = 0x1F
)
// colElem is a representation of a collation element.
// In the typical case, a rune maps to a single collation element. If a rune
// can be the start of a contraction or expands into multiple collation elements,
// then the colElem that is associated with a rune will have a special form to represent
// such m to n mappings. Such special colElems have a value >= 0x80000000.
type colElem uint32

// Value ranges of the different kinds of collation elements.
const (
	maxCE       colElem = 0x7FFFFFFF
	minContract         = 0x80000000
	maxContract         = 0xBFFFFFFF
	minExpand           = 0xC0000000
	maxExpand           = 0xDFFFFFFF
	minDecomp           = 0xE0000000
)

// ceType identifies the kind of a collation element.
type ceType int

const (
	ceNormal           ceType = iota // ceNormal includes implicits (ce == 0)
	ceContractionIndex               // rune can be a start of a contraction
	ceExpansionIndex                 // rune expands into a sequence of collation elements
	ceDecompose                      // rune expands using NFKD decomposition
)

// ctype classifies ce by the value range it falls in. The ranges partition
// the whole uint32 space, so every value maps to exactly one ceType.
func (ce colElem) ctype() ceType {
	switch {
	case ce <= maxCE:
		return ceNormal
	case ce <= maxContract:
		return ceContractionIndex
	case ce <= maxExpand:
		return ceExpansionIndex
	}
	// Everything from minDecomp upwards.
	return ceDecompose
}
// For normal collation elements, we assume that a collation element either has
// a primary or non-default secondary value, not both.
// Collation elements with a primary value are of the form
// 010ppppp pppppppp pppppppp tttttttt, where
//   - p* is primary collation value
//   - t* is the tertiary collation value
// Collation elements with a secondary value are of the form
// 00000000 ssssssss ssssssss tttttttt, where
//   - s* is the secondary collation value
//   - t* is the tertiary collation value

// splitCE decodes a normal collation element into its weights. Elements in
// primary weight form get defaultSecondary as their secondary weight;
// elements in secondary weight form get a zero primary weight.
func splitCE(ce colElem) weights {
	const (
		primaryFlag = 0x40000000 // set for the primary weight form
		primaryMask = 0x1FFFFF   // 21 bits of primary weight
	)
	tertiary := uint8(ce)
	if ce&primaryFlag == 0 {
		// secondary weight form
		return weights{secondary: uint16(ce >> 8), tertiary: tertiary}
	}
	// primary weight form
	return weights{
		primary:   uint32(ce>>8) & primaryMask,
		secondary: defaultSecondary,
		tertiary:  tertiary,
	}
}
// For contractions, colElems are of the form 10bbbbbb bbbbbbbb hhhhhhhh hhhhhhhh, where
//   - h* is the compTrieHandle.
//   - b* is the offset into the contraction collation element table.
// See contract.go for details on the contraction trie.
const (
	maxNBits              = 5
	maxTrieIndexBits      = 11
	maxContractOffsetBits = 14
)

// splitContractIndex unpacks a contraction collation element. The low 16
// bits form the handle, whose low maxNBits bits are n and whose remaining
// bits are index; offset is taken from the bits above the handle.
func splitContractIndex(ce colElem) (index, n, offset int) {
	const (
		nMask      = 1<<maxNBits - 1
		offsetMask = 1<<maxContractOffsetBits - 1
	)
	handle := uint16(ce)
	index = int(handle >> maxNBits)
	n = int(handle & nMask)
	offset = int(ce>>16) & offsetMask
	return index, n, offset
}
// For expansions, colElems are of the form 110bbbbb bbbbbbbb bbbbbbbb bbbbbbbb,
// where b* is the index into the expansion sequence table.
const (
	maxExpandIndexBits = 29
)

// splitExpandIndex extracts the expansion sequence table index stored in the
// low maxExpandIndexBits bits of ce.
func splitExpandIndex(ce colElem) (index int) {
	const indexMask = 1<<maxExpandIndexBits - 1
	return int(ce) & indexMask
}
// Some runes can be expanded using NFKD decomposition. Instead of storing the full
// sequence of collation elements, we decompose the rune and look up the collation
// elements for each rune in the decomposition and modify the tertiary weights.
// The colElem, in this case, is of the form 11100000 00000000 wwwwwwww vvvvvvvv, where
//   - v* is the replacement tertiary weight for the first rune,
//   - w* is the replacement tertiary weight for the second rune,
// Tertiary weights of subsequent runes should be replaced with maxTertiary.
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
const (
	decompID = 0xE0000000
)

// splitDecompose returns the two replacement tertiary weights stored in a
// decomposition collation element: t1 from the lowest byte and t2 from the
// byte above it.
func splitDecompose(ce colElem) (t1, t2 uint8) {
	t1 = uint8(ce)
	t2 = uint8(ce >> 8)
	return t1, t2
}
const (
	// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
	minUnified       rune = 0x4E00
	maxUnified            = 0x9FFF
	minCompatibility      = 0xF900
	maxCompatibility      = 0xFAFF
	minRare               = 0x3400
	maxRare               = 0x4DBF
)

const (
	commonUnifiedOffset = 0xFB40
	rareUnifiedOffset   = 0x1FB40
	otherOffset         = 0x4FB40
	maxPrimary          = otherOffset + unicode.MaxRune
)

// implicitPrimary returns the primary weight for a rune that has no entry
// in the collation table.
// We take a different approach from the one specified in
// http://unicode.org/reports/tr10/#Implicit_Weights,
// but preserve the resulting relative ordering of the runes.
func implicitPrimary(r rune) int {
	inRange := func(lo, hi rune) bool { return lo <= r && r <= hi }

	var offset int
	switch {
	case inRange(minUnified, maxUnified):
		// The most common case for CJK.
		offset = commonUnifiedOffset
	case inRange(minCompatibility, maxCompatibility):
		// This will never hit as long as we don't remove the characters
		// that would match from the table.
		offset = commonUnifiedOffset
	case unicode.Is(unicode.Unified_Ideograph, r):
		offset = rareUnifiedOffset
	default:
		offset = otherOffset
	}
	return int(r) + offset
}

View File

@ -0,0 +1,157 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package collate
import (
"testing"
"unicode"
)
// ceTest is one entry of the ceTests table that drives TestColElem.
type ceTest struct {
// f builds a collation element from inout, splits it again and writes the
// decoded values back into inout. It returns the element together with the
// ceType the element is expected to have.
f func(inout []int) (colElem, ceType)
// arg holds the input values; TestColElem expects the round trip through f
// to reproduce them unchanged.
arg []int
}
// The make* funcs are simplified versions of the functions in build/colelem.go

// makeCE packs {primary, secondary, tertiary} into a normal collation element
// without any validation. A non-zero primary selects the primary weight form,
// in which the secondary weight is not stored.
func makeCE(weights []int) colElem {
	const (
		maxPrimaryBits   = 21
		maxSecondaryBits = 16
		maxTertiaryBits  = 8
		isPrimary        = 0x40000000
	)
	if primary := weights[0]; primary != 0 {
		return colElem(primary<<maxTertiaryBits+weights[2]) | isPrimary
	}
	return colElem(weights[1]<<maxTertiaryBits + weights[2])
}
// makeContractIndex packs a trie node index, node size n and table offset
// into a contraction collation element without any validation.
func makeContractIndex(index, n, offset int) colElem {
	const (
		contractID       = 0x80000000
		maxNBits         = 5
		maxTrieIndexBits = 11
	)
	offsetPart := colElem(offset << (maxTrieIndexBits + maxNBits))
	indexPart := colElem(index << maxNBits)
	return colElem(contractID) + offsetPart + indexPart + colElem(n)
}
// makeExpandIndex tags index as an expansion collation element without any
// validation.
func makeExpandIndex(index int) colElem {
	const expandID colElem = 0xC0000000
	return colElem(index) + expandID
}
// makeDecompose packs the two replacement tertiary weights into a
// decomposition collation element without any validation: t1 goes into the
// low byte and t2 into the byte above it.
func makeDecompose(t1, t2 int) colElem {
	const decompID = 0xE0000000
	packed := t2<<8 + t1
	return decompID + colElem(packed)
}
// normalCE builds a normal collation element from inout, decodes it again
// with splitCE and writes the decoded primary, secondary and tertiary weights
// back into inout. It returns the element that was built, so that the caller
// can check its ctype, together with the expected type ceNormal.
func normalCE(inout []int) (ce colElem, t ceType) {
	// Assign the named result: previously the element was only passed to
	// splitCE and the function always returned a zero colElem.
	ce = makeCE(inout)
	w := splitCE(ce)
	inout[0] = int(w.primary)
	inout[1] = int(w.secondary)
	inout[2] = int(w.tertiary)
	return ce, ceNormal
}
// expandCE round-trips inout[0] through makeExpandIndex and splitExpandIndex,
// storing the decoded index back into inout. It returns the element and the
// expected type ceExpansionIndex.
func expandCE(inout []int) (ce colElem, t ceType) {
	ce, t = makeExpandIndex(inout[0]), ceExpansionIndex
	inout[0] = splitExpandIndex(ce)
	return ce, t
}
// contractCE round-trips {index, n, offset} through makeContractIndex and
// splitContractIndex, storing the decoded values back into inout. It returns
// the element and the expected type ceContractionIndex.
func contractCE(inout []int) (ce colElem, t ceType) {
	index, n, offset := inout[0], inout[1], inout[2]
	ce = makeContractIndex(index, n, offset)
	inout[0], inout[1], inout[2] = splitContractIndex(ce)
	return ce, ceContractionIndex
}
// decompCE round-trips two tertiary weights through makeDecompose and
// splitDecompose, storing the decoded weights back into inout. It returns the
// element and the expected type ceDecompose.
func decompCE(inout []int) (ce colElem, t ceType) {
	ce = makeDecompose(inout[0], inout[1])
	first, second := splitDecompose(ce)
	inout[0] = int(first)
	inout[1] = int(second)
	return ce, ceDecompose
}
// Bit widths of the weight fields in a normal collation element.
const (
	maxPrimaryBits   = 21
	maxSecondaryBits = 16
	maxTertiaryBits  = 8
)

// ceTests lists the round-trip cases for TestColElem. Each entry names the
// helper that builds and splits the element and the values to round-trip.
var ceTests = []ceTest{
	{f: normalCE, arg: []int{0, 0, 0}},
	{f: normalCE, arg: []int{0, 30, 3}},
	{f: normalCE, arg: []int{100, defaultSecondary, 3}},

	{f: contractCE, arg: []int{0, 0, 0}},
	{f: contractCE, arg: []int{1, 1, 1}},
	{f: contractCE, arg: []int{1, (1 << maxNBits) - 1, 1}},
	{f: contractCE, arg: []int{(1 << maxTrieIndexBits) - 1, 1, 1}},
	{f: contractCE, arg: []int{1, 1, (1 << maxContractOffsetBits) - 1}},

	{f: expandCE, arg: []int{0}},
	{f: expandCE, arg: []int{5}},
	{f: expandCE, arg: []int{(1 << maxExpandIndexBits) - 1}},

	{f: decompCE, arg: []int{0, 0}},
	{f: decompCE, arg: []int{1, 1}},
	{f: decompCE, arg: []int{0x1F, 0x1F}},
}
// TestColElem checks, for every ceTests entry, that the element produced by
// the helper has the expected ceType and that splitting it yields the
// original arguments.
func TestColElem(t *testing.T) {
	for i := range ceTests {
		tc := &ceTests[i]
		// The helper overwrites its argument, so give it a private copy.
		args := make([]int, len(tc.arg))
		copy(args, tc.arg)
		ce, want := tc.f(args)
		if got := ce.ctype(); got != want {
			t.Errorf("%d: type is %d; want %d", i, got, want)
		}
		for j := range tc.arg {
			if args[j] != tc.arg[j] {
				t.Errorf("%d: argument %d is %d; want %d", i, j, args[j], tc.arg[j])
			}
		}
	}
}
// implicitTest pairs a rune with the primary weight that implicitPrimary is
// expected to return for it.
type implicitTest struct {
	r rune
	p int
}

// implicitTests lists the cases checked by TestImplicit.
var implicitTests = []implicitTest{
	{r: 0x33FF, p: 0x52F3F},
	{r: 0x3400, p: 0x22F40},
	{r: 0x4DC0, p: 0x54900},
	{r: 0x4DFF, p: 0x5493F},
	{r: 0x4E00, p: 0x14940},
	{r: 0x9FCB, p: 0x19B0B},
	{r: 0xA000, p: 0x59B40},
	{r: 0xF8FF, p: 0x5F43F},
	{r: 0xF900, p: 0x1F440},
	{r: 0xFA23, p: 0x1F563},
	{r: 0xFAFF, p: 0x1F63F},
	{r: 0xFB00, p: 0x5F640},
	{r: 0x20000, p: 0x3FB40},
	{r: 0x2B81C, p: 0x4B35C},
	{r: unicode.MaxRune, p: 0x15FB3F}, // maximum primary value
}
// TestImplicit checks implicitPrimary against every implicitTests entry.
func TestImplicit(t *testing.T) {
	for i := range implicitTests {
		r, want := implicitTests[i].r, implicitTests[i].p
		got := implicitPrimary(r)
		if got == want {
			continue
		}
		t.Errorf("%U: was %X; want %X", r, got, want)
	}
}