diff --git a/src/pkg/exp/locale/collate/build/colelem.go b/src/pkg/exp/locale/collate/build/colelem.go new file mode 100644 index 0000000000..09425320fd --- /dev/null +++ b/src/pkg/exp/locale/collate/build/colelem.go @@ -0,0 +1,179 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package build + +import ( + "fmt" + "unicode" +) + +const ( + defaultSecondary = 0x20 + defaultTertiary = 0x2 + maxTertiary = 0x1F +) + +// A collation element is represented as a uint32. +// In the typical case, a rune maps to a single collation element. If a rune +// can be the start of a contraction or expands into multiple collation elements, +// then the collation element that is associated with a rune will have a special +// form to represent such m to n mappings. Such special collation elements +// have a value >= 0x80000000. + +// For normal collation elements, we assume that a collation element either has +// a primary or non-default secondary value, not both. 
+// Collation elements with a primary value are of the form +// 010ppppp pppppppp pppppppp tttttttt, where +// - p* is primary collation value +// - t* is the tertiary collation value +// Collation elements with a secondary value are of the form +// 00000000 ssssssss ssssssss tttttttt, where +// - s* is the secondary collation value +// - t* is the tertiary collation value +const ( + maxPrimaryBits = 21 + maxSecondaryBits = 16 + maxTertiaryBits = 8 + + isPrimary = 0x40000000 +) + +func makeCE(weights []int) (uint32, error) { + if w := weights[0]; w >= 1<= %x", w, 1<= 1<= %x", w, 1<= 1<= %d", w, 1<= 1<= %d", h.n, 1<= 1<= %d", h.index, 1<= 1<= %x", offset, 1<= 1<= %x", index, 1<= 256 || t1 < 0 { + return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1) + } + if t2 >= 256 || t2 < 0 { + return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2) + } + return uint32(t2<<8+t1) + decompID, nil +} + +const ( + // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. + minUnified rune = 0x4E00 + maxUnified = 0x9FFF + minCompatibility = 0xF900 + maxCompatibility = 0xFAFF + minRare = 0x3400 + maxRare = 0x4DBF +) +const ( + commonUnifiedOffset = 0xFB40 + rareUnifiedOffset = 0x1FB40 + otherOffset = 0x4FB40 + illegalOffset = otherOffset + unicode.MaxRune + maxPrimary = illegalOffset + 2 // there are 2 illegal values. +) + +// implicitPrimary returns the primary weight for a rune +// for which there is no entry for the rune in the collation table. +// We take a different approach from the one specified in +// http://unicode.org/reports/tr10/#Implicit_Weights, +// but preserve the resulting relative ordering of the runes. +func implicitPrimary(r rune) int { + + if r >= minUnified && r <= maxUnified { + // The most common case for CJK. 
+ return int(r) + commonUnifiedOffset + } + if r >= minCompatibility && r <= maxCompatibility { + // This will never hit as long as we don't remove the characters + // that would match from the table. + return int(r) + commonUnifiedOffset + } + if unicode.Is(unicode.Unified_Ideograph, r) { + return int(r) + rareUnifiedOffset + } + return int(r) + otherOffset +} diff --git a/src/pkg/exp/locale/collate/build/colelem_test.go b/src/pkg/exp/locale/collate/build/colelem_test.go new file mode 100644 index 0000000000..841ac11629 --- /dev/null +++ b/src/pkg/exp/locale/collate/build/colelem_test.go @@ -0,0 +1,80 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package build + +import "testing" + +type ceTest struct { + f func(in []int) (uint32, error) + arg []int + val uint32 +} + +func normalCE(in []int) (ce uint32, err error) { + return makeCE(in) +} + +func expandCE(in []int) (ce uint32, err error) { + return makeExpandIndex(in[0]) +} + +func contractCE(in []int) (ce uint32, err error) { + return makeContractIndex(ctHandle{in[0], in[1]}, in[2]) +} + +func decompCE(in []int) (ce uint32, err error) { + return makeDecompose(in[0], in[1]) +} + +var ceTests = []ceTest{ + {normalCE, []int{0, 0, 0}, 000}, + {normalCE, []int{0, 30, 3}, 0x1E03}, + {normalCE, []int{100, defaultSecondary, 3}, 0x40006403}, + {normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-default secondary + {normalCE, []int{100, 1, 3}, 0xFFFF}, + {normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF}, + {normalCE, []int{0, 1 << maxSecondaryBits, 0}, 0xFFFF}, + {normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits}, 0xFFFF}, + + {contractCE, []int{0, 0, 0}, 0x80000000}, + {contractCE, []int{1, 1, 1}, 0x80010021}, + {contractCE, []int{1, (1 << maxNBits) - 1, 1}, 0x8001003F}, + {contractCE, []int{(1 << maxTrieIndexBits) - 1, 1, 1}, 0x8001FFE1}, + {contractCE, 
[]int{1, 1, (1 << maxContractOffsetBits) - 1}, 0xBFFF0021}, + {contractCE, []int{1, (1 << maxNBits), 1}, 0xFFFF}, + {contractCE, []int{(1 << maxTrieIndexBits), 1, 1}, 0xFFFF}, + {contractCE, []int{1, (1 << maxContractOffsetBits), 1}, 0xFFFF}, + + {expandCE, []int{0}, 0xC0000000}, + {expandCE, []int{5}, 0xC0000005}, + {expandCE, []int{(1 << maxExpandIndexBits) - 1}, 0xDFFFFFFF}, + {expandCE, []int{1 << maxExpandIndexBits}, 0xFFFF}, + + {decompCE, []int{0, 0}, 0xE0000000}, + {decompCE, []int{1, 1}, 0xE0000101}, + {decompCE, []int{0x1F, 0x1F}, 0xE0001F1F}, + {decompCE, []int{256, 0x1F}, 0xFFFF}, + {decompCE, []int{0x1F, 256}, 0xFFFF}, +} + +func TestColElem(t *testing.T) { + for i, tt := range ceTests { + in := make([]int, len(tt.arg)) + copy(in, tt.arg) + ce, err := tt.f(in) + if tt.val == 0xFFFF { + if err == nil { + t.Errorf("%d: expected error for args %x", i, tt.arg) + } + continue + } + if err != nil { + t.Errorf("%d: unexpected error: %v", i, err.Error()) + } + if ce != tt.val { + t.Errorf("%d: colElem=%X; want %X", i, ce, tt.val) + } + } +} diff --git a/src/pkg/exp/locale/collate/colelem.go b/src/pkg/exp/locale/collate/colelem.go new file mode 100644 index 0000000000..03cfc678e8 --- /dev/null +++ b/src/pkg/exp/locale/collate/colelem.go @@ -0,0 +1,170 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package collate + +import ( + "unicode" +) + +// weights holds the decoded weights per collation level. +type weights struct { + primary uint32 + secondary uint16 + tertiary uint8 + // TODO: compute quaternary on the fly or compress this value into 8 bits + // such that weights fit within 64bit. + quaternary uint32 +} + +const ( + defaultSecondary = 0x20 + defaultTertiary = 0x2 + maxTertiary = 0x1F +) + +// colElem is a representation of a collation element. +// In the typical case, a rune maps to a single collation element. 
If a rune +// can be the start of a contraction or expands into multiple collation elements, +// then the colElem that is associated with a rune will have a special form to represent +// such m to n mappings. Such special colElems have a value >= 0x80000000. +type colElem uint32 + +const ( + maxCE colElem = 0x7FFFFFFF + minContract = 0x80000000 + maxContract = 0xBFFFFFFF + minExpand = 0xC0000000 + maxExpand = 0xDFFFFFFF + minDecomp = 0xE0000000 +) + +type ceType int + +const ( + ceNormal ceType = iota // ceNormal includes implicits (ce == 0) + ceContractionIndex // rune can be a start of a contraction + ceExpansionIndex // rune expands into a sequence of collation elements + ceDecompose // rune expands using NFKC decomposition +) + +func (ce colElem) ctype() ceType { + if ce <= maxCE { + return ceNormal + } + if ce <= maxContract { + return ceContractionIndex + } else { + if ce <= maxExpand { + return ceExpansionIndex + } + return ceDecompose + } + panic("should not reach here") + return ceType(-1) +} + +// For normal collation elements, we assume that a collation element either has +// a primary or non-default secondary value, not both. +// Collation elements with a primary value are of the form +// 010ppppp pppppppp pppppppp tttttttt, where +// - p* is primary collation value +// - t* is the tertiary collation value +// Collation elements with a secondary value are of the form +// 00000000 ssssssss ssssssss tttttttt, where +// - s* is the secondary collation value +// - t* is the tertiary collation value +func splitCE(ce colElem) weights { + w := weights{} + w.tertiary = uint8(ce) + if ce&0x40000000 != 0 { + // primary weight form + w.primary = uint32((ce >> 8) & 0x1FFFFF) + w.secondary = defaultSecondary + } else { + // secondary weight form + w.secondary = uint16(ce >> 8) + } + return w +} + +// For contractions, colElems are of the form 10bbbbbb bbbbbbbb hhhhhhhh hhhhhhhh, where +// - h* is the compTrieHandle. 
+// - b* is the offset into the contraction collation element table. +// See contract.go for details on the contraction trie. +const ( + maxNBits = 5 + maxTrieIndexBits = 11 + maxContractOffsetBits = 14 +) + +func splitContractIndex(ce colElem) (index, n, offset int) { + h := uint16(ce) + return int(h >> maxNBits), int(h & (1<>16) & (1<> 8) +} + +const ( + // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. + minUnified rune = 0x4E00 + maxUnified = 0x9FFF + minCompatibility = 0xF900 + maxCompatibility = 0xFAFF + minRare = 0x3400 + maxRare = 0x4DBF +) +const ( + commonUnifiedOffset = 0xFB40 + rareUnifiedOffset = 0x1FB40 + otherOffset = 0x4FB40 + maxPrimary = otherOffset + unicode.MaxRune +) + +// implicitPrimary returns the primary weight for a rune +// for which there is no entry for the rune in the collation table. +// We take a different approach from the one specified in +// http://unicode.org/reports/tr10/#Implicit_Weights, +// but preserve the resulting relative ordering of the runes. +func implicitPrimary(r rune) int { + + if r >= minUnified && r <= maxUnified { + // The most common case for CJK. + return int(r) + commonUnifiedOffset + } + if r >= minCompatibility && r <= maxCompatibility { + // This will never hit as long as we don't remove the characters + // that would match from the table. + return int(r) + commonUnifiedOffset + } + if unicode.Is(unicode.Unified_Ideograph, r) { + return int(r) + rareUnifiedOffset + } + return int(r) + otherOffset +} diff --git a/src/pkg/exp/locale/collate/colelem_test.go b/src/pkg/exp/locale/collate/colelem_test.go new file mode 100644 index 0000000000..b201f81457 --- /dev/null +++ b/src/pkg/exp/locale/collate/colelem_test.go @@ -0,0 +1,157 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package collate + +import ( + "testing" + "unicode" +) + +type ceTest struct { + f func(inout []int) (colElem, ceType) + arg []int +} + +// The make* funcs are simplified versions of the functions in build/colelem.go +func makeCE(weights []int) colElem { + const ( + maxPrimaryBits = 21 + maxSecondaryBits = 16 + maxTertiaryBits = 8 + isPrimary = 0x40000000 + ) + var ce colElem + if weights[0] != 0 { + ce = colElem(weights[0]<