exp/locale/collate: added representation for collation elements

(see http://www.unicode.org/reports/tr10/). R=r, r CC=golang-dev https://golang.org/cl/5981048
2024-10-03 09:21:21 -06:00 · 2012-04-25 13:16:24 +02:00 · 2012-04-25 13:16:24 +02:00 · bb3f3c9775
commit bb3f3c9775
parent e456d015fb
4 changed files with 586 additions and 0 deletions
--- a/src/pkg/exp/locale/collate/build/colelem.go
+++ b/src/pkg/exp/locale/collate/build/colelem.go
@ -0,0 +1,179 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package build
 import (
 	"fmt"
 	"unicode"
 )
 const (
 	defaultSecondary = 0x20
 	defaultTertiary  = 0x2
 	maxTertiary      = 0x1F
 )
 // A collation element is represented as an uint32.
 // In the typical case, a rune maps to a single collation element. If a rune
 // can be the start of a contraction or expands into multiple collation elements,
 // then the collation element that is associated with a rune will have a special
 // form to represent such m to n mappings.  Such special collation elements
 // have a value >= 0x80000000.
 // For normal collation elements, we assume that a collation element either has
 // a primary or non-default secondary value, not both.
 // Collation elements with a primary value are of the form
 // 010ppppp pppppppp pppppppp tttttttt, where
 //   - p* is primary collation value
 //   - t* is the tertiary collation value
 // Collation elements with a secondary value are of the form
 // 00000000 ssssssss ssssssss tttttttt, where
 //   - s* is the secondary collation value
 //   - t* is the tertiary collation value
 const (
 	maxPrimaryBits   = 21
 	maxSecondaryBits = 16
 	maxTertiaryBits  = 8
 	isPrimary = 0x40000000
 )
 func makeCE(weights []int) (uint32, error) {
 	if w := weights[0]; w >= 1<<maxPrimaryBits || w < 0 {
 		return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
 	}
 	if w := weights[1]; w >= 1<<maxSecondaryBits || w < 0 {
 		return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
 	}
 	if w := weights[2]; w >= 1<<maxTertiaryBits || w < 0 {
 		return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %d >= %d", w, 1<<maxTertiaryBits)
 	}
 	ce := uint32(0)
 	if weights[0] != 0 {
 		// primary weight form
 		if weights[1] != defaultSecondary {
 			return 0, fmt.Errorf("makeCE: non-default secondary weight for non-zero primary: %X", weights)
 		}
 		ce = uint32(weights[0]<<maxTertiaryBits + weights[2])
 		ce |= isPrimary
 	} else {
 		// secondary weight form
 		ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
 	}
 	return ce, nil
 }
 // For contractions, collation elements are of the form
 // 10bbbbbb bbbbbbbb iiiiiiii iiinnnnn, where
 //   - n* is the size of the first node in the contraction trie.
 //   - i* is the index of the first node in the contraction trie.
 //   - b* is the offset into the contraction collation element table.
 // See contract.go for details on the contraction trie.
 const (
 	contractID            = 0x80000000
 	maxNBits              = 5
 	maxTrieIndexBits      = 11
 	maxContractOffsetBits = 14
 )
 func makeContractIndex(h ctHandle, offset int) (uint32, error) {
 	if h.n >= 1<<maxNBits {
 		return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
 	}
 	if h.index >= 1<<maxTrieIndexBits {
 		return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
 	}
 	if offset >= 1<<maxContractOffsetBits {
 		return 0, fmt.Errorf("offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
 	}
 	ce := uint32(contractID)
 	ce += uint32(offset << (maxTrieIndexBits + maxNBits))
 	ce += uint32(h.index << maxNBits)
 	ce += uint32(h.n)
 	return ce, nil
 }
 // For expansions, collation elements are of the form
 // 110bbbbb bbbbbbbb bbbbbbbb bbbbbbbb,
 // where b* is the index into the expansion sequence table.
 const (
 	expandID           = 0xC0000000
 	maxExpandIndexBits = 29
 )
 func makeExpandIndex(index int) (uint32, error) {
 	if index >= 1<<maxExpandIndexBits {
 		return 0, fmt.Errorf("index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
 	}
 	return expandID + uint32(index), nil
 }
 // Each list of collation elements corresponding to an expansion starts with 
 // a header indicating the length of the sequence.
 func makeExpansionHeader(n int) (uint32, error) {
 	return uint32(n), nil
 }
 // Some runes can be expanded using NFKD decomposition. Instead of storing the full
 // sequence of collation elements, we decompose the rune and lookup the collation
 // elements for each rune in the decomposition and modify the tertiary weights.
 // The collation element, in this case, is of the form
 // 11100000 00000000 wwwwwwww vvvvvvvv, where
 //   - v* is the replacement tertiary weight for the first rune,
 //   - w* is the replacement tertiary weight for the second rune,
 // Tertiary weights of subsequent runes should be replaced with maxTertiary.
 // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
 const (
 	decompID = 0xE0000000
 )
 func makeDecompose(t1, t2 int) (uint32, error) {
 	if t1 >= 256 || t1 < 0 {
 		return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
 	}
 	if t2 >= 256 || t2 < 0 {
 		return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
 	}
 	return uint32(t2<<8+t1) + decompID, nil
 }
 const (
 	// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
 	minUnified       rune = 0x4E00
 	maxUnified            = 0x9FFF
 	minCompatibility      = 0xF900
 	maxCompatibility      = 0xFAFF
 	minRare               = 0x3400
 	maxRare               = 0x4DBF
 )
 const (
 	commonUnifiedOffset = 0xFB40
 	rareUnifiedOffset   = 0x1FB40
 	otherOffset         = 0x4FB40
 	illegalOffset       = otherOffset + unicode.MaxRune
 	maxPrimary          = illegalOffset + 2 // there are 2 illegal values.
 )
 // implicitPrimary returns the primary weight for the a rune
 // for which there is no entry for the rune in the collation table.
 // We take a different approach from the one specified in
 // http://unicode.org/reports/tr10/#Implicit_Weights,
 // but preserve the resulting relative ordering of the runes.
 func implicitPrimary(r rune) int {
 	if r >= minUnified && r <= maxUnified {
 		// The most common case for CJK.
 		return int(r) + commonUnifiedOffset
 	}
 	if r >= minCompatibility && r <= maxCompatibility {
 		// This will never hit as long as we don't remove the characters
 		// that would match from the table.
 		return int(r) + commonUnifiedOffset
 	}
 	if unicode.Is(unicode.Unified_Ideograph, r) {
 		return int(r) + rareUnifiedOffset
 	}
 	return int(r) + otherOffset
 }
--- a/src/pkg/exp/locale/collate/build/colelem_test.go
+++ b/src/pkg/exp/locale/collate/build/colelem_test.go
@ -0,0 +1,80 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package build
 import "testing"
 type ceTest struct {
 	f   func(in []int) (uint32, error)
 	arg []int
 	val uint32
 }
 func normalCE(in []int) (ce uint32, err error) {
 	return makeCE(in)
 }
 func expandCE(in []int) (ce uint32, err error) {
 	return makeExpandIndex(in[0])
 }
 func contractCE(in []int) (ce uint32, err error) {
 	return makeContractIndex(ctHandle{in[0], in[1]}, in[2])
 }
 func decompCE(in []int) (ce uint32, err error) {
 	return makeDecompose(in[0], in[1])
 }
 var ceTests = []ceTest{
 	{normalCE, []int{0, 0, 0}, 000},
 	{normalCE, []int{0, 30, 3}, 0x1E03},
 	{normalCE, []int{100, defaultSecondary, 3}, 0x40006403},
 	{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-default secondary
 	{normalCE, []int{100, 1, 3}, 0xFFFF},
 	{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF},
 	{normalCE, []int{0, 1 << maxSecondaryBits, 0}, 0xFFFF},
 	{normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits}, 0xFFFF},
 	{contractCE, []int{0, 0, 0}, 0x80000000},
 	{contractCE, []int{1, 1, 1}, 0x80010021},
 	{contractCE, []int{1, (1 << maxNBits) - 1, 1}, 0x8001003F},
 	{contractCE, []int{(1 << maxTrieIndexBits) - 1, 1, 1}, 0x8001FFE1},
 	{contractCE, []int{1, 1, (1 << maxContractOffsetBits) - 1}, 0xBFFF0021},
 	{contractCE, []int{1, (1 << maxNBits), 1}, 0xFFFF},
 	{contractCE, []int{(1 << maxTrieIndexBits), 1, 1}, 0xFFFF},
 	{contractCE, []int{1, (1 << maxContractOffsetBits), 1}, 0xFFFF},
 	{expandCE, []int{0}, 0xC0000000},
 	{expandCE, []int{5}, 0xC0000005},
 	{expandCE, []int{(1 << maxExpandIndexBits) - 1}, 0xDFFFFFFF},
 	{expandCE, []int{1 << maxExpandIndexBits}, 0xFFFF},
 	{decompCE, []int{0, 0}, 0xE0000000},
 	{decompCE, []int{1, 1}, 0xE0000101},
 	{decompCE, []int{0x1F, 0x1F}, 0xE0001F1F},
 	{decompCE, []int{256, 0x1F}, 0xFFFF},
 	{decompCE, []int{0x1F, 256}, 0xFFFF},
 }
 func TestColElem(t *testing.T) {
 	for i, tt := range ceTests {
 		in := make([]int, len(tt.arg))
 		copy(in, tt.arg)
 		ce, err := tt.f(in)
 		if tt.val == 0xFFFF {
 			if err == nil {
 				t.Errorf("%d: expected error for args %x", i, tt.arg)
 			}
 			continue
 		}
 		if err != nil {
 			t.Errorf("%d: unexpected error: %v", i, err.Error())
 		}
 		if ce != tt.val {
 			t.Errorf("%d: colElem=%X; want %X", i, ce, tt.val)
 		}
 	}
 }
--- a/src/pkg/exp/locale/collate/colelem.go
+++ b/src/pkg/exp/locale/collate/colelem.go
@ -0,0 +1,170 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package collate
 import (
 	"unicode"
 )
 // weights holds the decoded weights per collation level.
 type weights struct {
 	primary   uint32
 	secondary uint16
 	tertiary  uint8
 	// TODO: compute quaternary on the fly or compress this value into 8 bits
 	// such that weights fit within 64bit.
 	quaternary uint32
 }
 const (
 	defaultSecondary = 0x20
 	defaultTertiary  = 0x2
 	maxTertiary      = 0x1F
 )
 // colElem is a representation of a collation element.
 // In the typical case, a rune maps to a single collation element. If a rune
 // can be the start of a contraction or expands into multiple collation elements,
 // then the colElem that is associated with a rune will have a special form to represent
 // such m to n mappings.  Such special colElems have a value >= 0x80000000.
 type colElem uint32
 const (
 	maxCE       colElem = 0x7FFFFFFF
 	minContract         = 0x80000000
 	maxContract         = 0xBFFFFFFF
 	minExpand           = 0xC0000000
 	maxExpand           = 0xDFFFFFFF
 	minDecomp           = 0xE0000000
 )
 type ceType int
 const (
 	ceNormal           ceType = iota // ceNormal includes implicits (ce == 0)
 	ceContractionIndex               // rune can be a start of a contraction
 	ceExpansionIndex                 // rune expands into a sequence of collation elements
 	ceDecompose                      // rune expands using NFKC decomposition
 )
 func (ce colElem) ctype() ceType {
 	if ce <= maxCE {
 		return ceNormal
 	}
 	if ce <= maxContract {
 		return ceContractionIndex
 	} else {
 		if ce <= maxExpand {
 			return ceExpansionIndex
 		}
 		return ceDecompose
 	}
 	panic("should not reach here")
 	return ceType(-1)
 }
 // For normal collation elements, we assume that a collation element either has
 // a primary or non-default secondary value, not both.
 // Collation elements with a primary value are of the form
 // 010ppppp pppppppp pppppppp tttttttt, where
 //   - p* is primary collation value
 //   - t* is the tertiary collation value
 // Collation elements with a secondary value are of the form
 // 00000000 ssssssss ssssssss tttttttt, where
 //   - s* is the secondary collation value
 //   - t* is the tertiary collation value
 func splitCE(ce colElem) weights {
 	w := weights{}
 	w.tertiary = uint8(ce)
 	if ce&0x40000000 != 0 {
 		// primary weight form
 		w.primary = uint32((ce >> 8) & 0x1FFFFF)
 		w.secondary = defaultSecondary
 	} else {
 		// secondary weight form
 		w.secondary = uint16(ce >> 8)
 	}
 	return w
 }
 // For contractions, colElems are of the form 10bbbbbb bbbbbbbb hhhhhhhh hhhhhhhh, where
 //   - h* is the compTrieHandle.
 //   - b* is the offset into the contraction collation element table.
 // See contract.go for details on the contraction trie.
 const (
 	maxNBits              = 5
 	maxTrieIndexBits      = 11
 	maxContractOffsetBits = 14
 )
 func splitContractIndex(ce colElem) (index, n, offset int) {
 	h := uint16(ce)
 	return int(h >> maxNBits), int(h & (1<<maxNBits - 1)), int(ce>>16) & (1<<maxContractOffsetBits - 1)
 }
 // For expansions, colElems are of the form 110bbbbb bbbbbbbb bbbbbbbb bbbbbbbb,
 // where b* is the index into the expansion sequence table.
 const (
 	maxExpandIndexBits = 29
 )
 func splitExpandIndex(ce colElem) (index int) {
 	index = int(ce) & (1<<maxExpandIndexBits - 1)
 	return
 }
 // Some runes can be expanded using NFKD decomposition. Instead of storing the full
 // sequence of collation elements, we decompose the rune and lookup the collation
 // elements for each rune in the decomposition and modify the tertiary weights.
 // The colElem, in this case, is of the form 11100000 00000000 wwwwwwww vvvvvvvv, where
 //   - v* is the replacement tertiary weight for the first rune,
 //   - w* is the replacement tertiary weight for the second rune,
 // Tertiary weights of subsequent runes should be replaced with maxTertiary.
 // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
 const (
 	decompID = 0xE0000000
 )
 func splitDecompose(ce colElem) (t1, t2 uint8) {
 	return uint8(ce), uint8(ce >> 8)
 }
 const (
 	// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
 	minUnified       rune = 0x4E00
 	maxUnified            = 0x9FFF
 	minCompatibility      = 0xF900
 	maxCompatibility      = 0xFAFF
 	minRare               = 0x3400
 	maxRare               = 0x4DBF
 )
 const (
 	commonUnifiedOffset = 0xFB40
 	rareUnifiedOffset   = 0x1FB40
 	otherOffset         = 0x4FB40
 	maxPrimary          = otherOffset + unicode.MaxRune
 )
 // implicitPrimary returns the primary weight for the a rune
 // for which there is no entry for the rune in the collation table.
 // We take a different approach from the one specified in
 // http://unicode.org/reports/tr10/#Implicit_Weights,
 // but preserve the resulting relative ordering of the runes.
 func implicitPrimary(r rune) int {
 	if r >= minUnified && r <= maxUnified {
 		// The most common case for CJK.
 		return int(r) + commonUnifiedOffset
 	}
 	if r >= minCompatibility && r <= maxCompatibility {
 		// This will never hit as long as we don't remove the characters
 		// that would match from the table.
 		return int(r) + commonUnifiedOffset
 	}
 	if unicode.Is(unicode.Unified_Ideograph, r) {
 		return int(r) + rareUnifiedOffset
 	}
 	return int(r) + otherOffset
 }
--- a/src/pkg/exp/locale/collate/colelem_test.go
+++ b/src/pkg/exp/locale/collate/colelem_test.go
@ -0,0 +1,157 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package collate
 import (
 	"testing"
 	"unicode"
 )
 type ceTest struct {
 	f   func(inout []int) (colElem, ceType)
 	arg []int
 }
 // The make* funcs are simplified versions of the functions in build/colelem.go
 func makeCE(weights []int) colElem {
 	const (
 		maxPrimaryBits   = 21
 		maxSecondaryBits = 16
 		maxTertiaryBits  = 8
 		isPrimary        = 0x40000000
 	)
 	var ce colElem
 	if weights[0] != 0 {
 		ce = colElem(weights[0]<<maxTertiaryBits + weights[2])
 		ce |= isPrimary
 	} else {
 		ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
 	}
 	return ce
 }
 func makeContractIndex(index, n, offset int) colElem {
 	const (
 		contractID       = 0x80000000
 		maxNBits         = 5
 		maxTrieIndexBits = 11
 	)
 	ce := colElem(contractID)
 	ce += colElem(offset << (maxTrieIndexBits + maxNBits))
 	ce += colElem(index << maxNBits)
 	ce += colElem(n)
 	return ce
 }
 func makeExpandIndex(index int) colElem {
 	const expandID = 0xC0000000
 	return expandID + colElem(index)
 }
 func makeDecompose(t1, t2 int) colElem {
 	const decompID = 0xE0000000
 	return colElem(t2<<8+t1) + decompID
 }
 func normalCE(inout []int) (ce colElem, t ceType) {
 	w := splitCE(makeCE(inout))
 	inout[0] = int(w.primary)
 	inout[1] = int(w.secondary)
 	inout[2] = int(w.tertiary)
 	return ce, ceNormal
 }
 func expandCE(inout []int) (ce colElem, t ceType) {
 	ce = makeExpandIndex(inout[0])
 	inout[0] = splitExpandIndex(ce)
 	return ce, ceExpansionIndex
 }
 func contractCE(inout []int) (ce colElem, t ceType) {
 	ce = makeContractIndex(inout[0], inout[1], inout[2])
 	i, n, o := splitContractIndex(ce)
 	inout[0], inout[1], inout[2] = i, n, o
 	return ce, ceContractionIndex
 }
 func decompCE(inout []int) (ce colElem, t ceType) {
 	ce = makeDecompose(inout[0], inout[1])
 	t1, t2 := splitDecompose(ce)
 	inout[0], inout[1] = int(t1), int(t2)
 	return ce, ceDecompose
 }
 const (
 	maxPrimaryBits   = 21
 	maxSecondaryBits = 16
 	maxTertiaryBits  = 8
 )
 var ceTests = []ceTest{
 	{normalCE, []int{0, 0, 0}},
 	{normalCE, []int{0, 30, 3}},
 	{normalCE, []int{100, defaultSecondary, 3}},
 	{contractCE, []int{0, 0, 0}},
 	{contractCE, []int{1, 1, 1}},
 	{contractCE, []int{1, (1 << maxNBits) - 1, 1}},
 	{contractCE, []int{(1 << maxTrieIndexBits) - 1, 1, 1}},
 	{contractCE, []int{1, 1, (1 << maxContractOffsetBits) - 1}},
 	{expandCE, []int{0}},
 	{expandCE, []int{5}},
 	{expandCE, []int{(1 << maxExpandIndexBits) - 1}},
 	{decompCE, []int{0, 0}},
 	{decompCE, []int{1, 1}},
 	{decompCE, []int{0x1F, 0x1F}},
 }
 func TestColElem(t *testing.T) {
 	for i, tt := range ceTests {
 		inout := make([]int, len(tt.arg))
 		copy(inout, tt.arg)
 		ce, typ := tt.f(inout)
 		if ce.ctype() != typ {
 			t.Errorf("%d: type is %d; want %d", i, ce.ctype(), typ)
 		}
 		for j, a := range tt.arg {
 			if inout[j] != a {
 				t.Errorf("%d: argument %d is %d; want %d", i, j, inout[j], a)
 			}
 		}
 	}
 }
 type implicitTest struct {
 	r rune
 	p int
 }
 var implicitTests = []implicitTest{
 	{0x33FF, 0x52F3F},
 	{0x3400, 0x22F40},
 	{0x4DC0, 0x54900},
 	{0x4DFF, 0x5493F},
 	{0x4E00, 0x14940},
 	{0x9FCB, 0x19B0B},
 	{0xA000, 0x59B40},
 	{0xF8FF, 0x5F43F},
 	{0xF900, 0x1F440},
 	{0xFA23, 0x1F563},
 	{0xFAFF, 0x1F63F},
 	{0xFB00, 0x5F640},
 	{0x20000, 0x3FB40},
 	{0x2B81C, 0x4B35C},
 	{unicode.MaxRune, 0x15FB3F}, // maximum primary value
 }
 func TestImplicit(t *testing.T) {
 	for _, tt := range implicitTests {
 		if p := implicitPrimary(tt.r); p != tt.p {
 			t.Errorf("%U: was %X; want %X", tt.r, p, tt.p)
 		}
 	}
 }