From 882b6ef4542e7055c7dcd262b22c434b957195df Mon Sep 17 00:00:00 2001 From: Marcel van Lohuizen Date: Fri, 13 Jul 2012 11:38:22 +0200 Subject: [PATCH] exp/locale/collate: This CL includes the following changes: - Changed the representation of colElem to support a few cases for some languages not supported by the current format. - Changed offsets for implicit primary values. This makes the values both easier to read and debug (last 4 nibbles are identical to implicit primary value) and also results in better packing. - Fixed bug in weight conversion code that did not pop up yet by sheer luck. Note that tables.go also includes changes to the contraction trie from CL 6346092. R=r, mpvl CC=golang-dev https://golang.org/cl/6392060 --- src/pkg/exp/locale/collate/build/builder.go | 49 +- .../exp/locale/collate/build/builder_test.go | 6 +- src/pkg/exp/locale/collate/build/colelem.go | 83 +- .../exp/locale/collate/build/colelem_test.go | 33 +- src/pkg/exp/locale/collate/colelem.go | 87 +- src/pkg/exp/locale/collate/colelem_test.go | 66 +- src/pkg/exp/locale/collate/table_test.go | 4 +- src/pkg/exp/locale/collate/tables.go | 13178 ++++++++-------- 8 files changed, 6779 insertions(+), 6727 deletions(-) diff --git a/src/pkg/exp/locale/collate/build/builder.go b/src/pkg/exp/locale/collate/build/builder.go index 4451361e4d9..4b9fda6e9df 100644 --- a/src/pkg/exp/locale/collate/build/builder.go +++ b/src/pkg/exp/locale/collate/build/builder.go @@ -111,6 +111,11 @@ func (b *Builder) Add(str []rune, colelems [][]int) error { e.elems[i] = append(e.elems[i], ce[0]) } } + elems, err := convertLargeWeights(e.elems) + if err != nil { + return err + } + e.elems = elems b.entryMap[string(str)] = e b.entry = append(b.entry, e) return nil @@ -184,8 +189,7 @@ func (b *Builder) build() (*table, error) { b.built = true b.t = &table{} - b.contractCJK() - b.simplify() // requires contractCJK + b.simplify() b.processExpansions() // requires simplify b.processContractions() // requires simplify 
b.buildTrie() // requires process* @@ -231,6 +235,8 @@ func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool { return false } // Tertiary values should be equal to maxTertiary for third element onwards. + // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can + // simply be dropped. Try this out by dropping the following code. if i >= 2 && ce[2] != maxTertiary { return false } @@ -322,10 +328,16 @@ func (b *Builder) simplify() { // convertLargeWeights converts collation elements with large // primaries (either double primaries or for illegal runes) // to our own representation. +// A CJK character C is represented in the DUCET as +// [.FBxx.0020.0002.C][.BBBB.0000.0000.C] +// We will rewrite these characters to a single CE. +// We assume the CJK values start at 0x8000. // See http://unicode.org/reports/tr10/#Implicit_Weights func convertLargeWeights(elems [][]int) (res [][]int, err error) { const ( - firstLargePrimary = 0xFB40 + cjkPrimaryStart = 0xFB40 + rarePrimaryStart = 0xFB80 + otherPrimaryStart = 0xFBC0 illegalPrimary = 0xFFFE highBitsMask = 0x3F lowBitsMask = 0x7FFF @@ -335,7 +347,7 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) { for i := 0; i < len(elems); i++ { ce := elems[i] p := ce[0] - if p < firstLargePrimary { + if p < cjkPrimaryStart { continue } if p > 0xFFFF { @@ -350,8 +362,16 @@ func convertLargeWeights(elems [][]int) (res [][]int, err error) { if elems[i+1][0]&lowBitsFlag == 0 { return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) } - r := rune(((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask) - ce[0] = implicitPrimary(r) + np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask + switch { + case p < rarePrimaryStart: + np += commonUnifiedOffset + case p < otherPrimaryStart: + np += rareUnifiedOffset + default: + np += otherOffset + } + ce[0] = np for j := i + 1; j+1 < len(elems); j++ { elems[j] = elems[j+1] } @@ -361,21 +381,6 @@ func 
convertLargeWeights(elems [][]int) (res [][]int, err error) { return elems, nil } -// A CJK character C is represented in the DUCET as -// [.FBxx.0020.0002.C][.BBBB.0000.0000.C] -// We will rewrite these characters to a single CE. -// We assume the CJK values start at 0x8000. -func (b *Builder) contractCJK() { - for _, e := range b.entry { - elms, err := convertLargeWeights(e.elems) - e.elems = elms - if err != nil { - err = fmt.Errorf("%U: %s", e.runes, err) - } - b.error(err) - } -} - // appendExpansion converts the given collation sequence to // collation elements and adds them to the expansion table. // It returns an index to the expansion table. @@ -479,7 +484,7 @@ func (b *Builder) processContractions() { str := []byte(string(e.runes[1:])) o, sn = t.contractTries.lookup(handle, str) if sn != len(str) { - log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", []rune(string(str)), sn, len(str)) + log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", e.runes, sn, len(str)) } } if es[o] != nil { diff --git a/src/pkg/exp/locale/collate/build/builder_test.go b/src/pkg/exp/locale/collate/build/builder_test.go index 6f627e478f5..a113d449aaf 100644 --- a/src/pkg/exp/locale/collate/build/builder_test.go +++ b/src/pkg/exp/locale/collate/build/builder_test.go @@ -63,7 +63,7 @@ type convertTest struct { var convLargeTests = []convertTest{ {pCE(0xFB39), pCE(0xFB39), false}, - {cjk(0x2F9B2), pqCE(0x4F4F2, 0x2F9B2), false}, + {cjk(0x2F9B2), pqCE(0x3F9B2, 0x2F9B2), false}, {pCE(0xFB40), pCE(0), true}, {append(pCE(0xFB40), pCE(0)[0]), pCE(0), true}, {pCE(0xFFFE), pCE(illegalOffset), false}, @@ -109,9 +109,9 @@ var simplifyTest = []ducetElem{ } var genColTests = []ducetElem{ - {"\uFA70", pqCE(0x1F5B0, 0xFA70)}, + {"\uFA70", pqCE(0x1FA70, 0xFA70)}, {"A\u0300", append(ptCE(100, 8), sCE(30)...)}, - {"A\u0300\uFA70", append(ptCE(100, 8), sCE(30)[0], pqCE(0x1F5B0, 0xFA70)[0])}, + {"A\u0300\uFA70", append(ptCE(100, 8), sCE(30)[0], 
pqCE(0x1FA70, 0xFA70)[0])}, {"A\u0300A\u0300", append(ptCE(100, 8), sCE(30)[0], ptCE(100, 8)[0], sCE(30)[0])}, } diff --git a/src/pkg/exp/locale/collate/build/colelem.go b/src/pkg/exp/locale/collate/build/colelem.go index 3e951bb7a38..629ec22f208 100644 --- a/src/pkg/exp/locale/collate/build/colelem.go +++ b/src/pkg/exp/locale/collate/build/colelem.go @@ -25,19 +25,29 @@ const ( // For normal collation elements, we assume that a collation element either has // a primary or non-default secondary value, not both. // Collation elements with a primary value are of the form -// 000ppppp pppppppp pppppppp tttttttt, where +// 010ppppp pppppppp pppppppp ssssssss // - p* is primary collation value +// - s* is the secondary collation value +// or +// 00pppppp pppppppp ppppppps sssttttt, where +// - p* is primary collation value +// - s* offset of secondary from default value. // - t* is the tertiary collation value // Collation elements with a secondary value are of the form -// 01000000 ssssssss ssssssss tttttttt, where -// - s* is the secondary collation value -// - t* is the tertiary collation value +// 10000000 0000ssss ssssssss tttttttt, where +// - 16 BMP implicit -> weight +// - 8 bit s +// - default tertiary const ( - maxPrimaryBits = 21 - maxSecondaryBits = 16 - maxTertiaryBits = 8 + maxPrimaryBits = 21 + maxSecondaryBits = 12 + maxSecondaryCompactBits = 8 + maxSecondaryDiffBits = 4 + maxTertiaryBits = 8 + maxTertiaryCompactBits = 5 - isSecondary = 0x40000000 + isSecondary = 0x80000000 + isPrimary = 0x40000000 ) func makeCE(weights []int) (uint32, error) { @@ -48,17 +58,28 @@ func makeCE(weights []int) (uint32, error) { return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<= 1<= %d", w, 1<= %x", w, 1<= 1<= %x", weights[1], 1<= 1< %x", d, d, 1<= 1< %x (%X)", weights[2], 1<= %d", h.index, 1<= 1<= %x", offset, 1<= %x", offset, 1<= 1<= %x", index, 1<= %x", index, 1< weight +// - 8 bit s +// - default tertiary func splitCE(ce colElem) weights { - 
const secondaryMask = 0x40000000 + const primaryMask = 0x40000000 + const secondaryMask = 0x80000000 w := weights{} - w.tertiary = uint8(ce) - if ce&secondaryMask == 0 { - // primary weight form + if ce&primaryMask != 0 { + w.tertiary = defaultTertiary + w.secondary = uint16(uint8(ce)) w.primary = uint32((ce >> 8) & 0x1FFFFF) - w.secondary = defaultSecondary + } else if ce&secondaryMask == 0 { + w.tertiary = uint8(ce & 0x1F) + ce >>= 5 + w.secondary = defaultSecondary + uint16(ce&0xF) + ce >>= 4 + w.primary = uint32(ce) } else { - // secondary weight form + w.tertiary = uint8(ce) w.secondary = uint16(ce >> 8) } return w } -// For contractions, colElems are of the form 10bbbbbb bbbbbbbb hhhhhhhh hhhhhhhh, where -// - h* is the compTrieHandle. +// For contractions, collation elements are of the form +// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where +// - n* is the size of the first node in the contraction trie. +// - i* is the index of the first node in the contraction trie. // - b* is the offset into the contraction collation element table. // See contract.go for details on the contraction trie. 
const ( - maxNBits = 5 - maxTrieIndexBits = 11 - maxContractOffsetBits = 14 + maxNBits = 4 + maxTrieIndexBits = 12 + maxContractOffsetBits = 13 ) func splitContractIndex(ce colElem) (index, n, offset int) { - h := ce & 0xffff - return int(h >> maxNBits), int(h & (1<>16) & (1<>= maxNBits + index = int(ce & (1<>= maxTrieIndexBits + offset = int(ce & (1<> 8) } @@ -143,10 +154,10 @@ const ( maxRare = 0x4DBF ) const ( - commonUnifiedOffset = 0xFB40 - rareUnifiedOffset = 0x1FB40 - otherOffset = 0x4FB40 - illegalOffset = otherOffset + unicode.MaxRune + commonUnifiedOffset = 0x10000 + rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF + otherOffset = 0x50000 // largest rune in rare is U+2FA1D + illegalOffset = otherOffset + int(unicode.MaxRune) maxPrimary = illegalOffset + 1 ) diff --git a/src/pkg/exp/locale/collate/colelem_test.go b/src/pkg/exp/locale/collate/colelem_test.go index 9971151e6da..bcb4ddb68c1 100644 --- a/src/pkg/exp/locale/collate/colelem_test.go +++ b/src/pkg/exp/locale/collate/colelem_test.go @@ -17,14 +17,25 @@ type ceTest struct { // The make* funcs are simplified versions of the functions in build/colelem.go func makeCE(weights []int) colElem { const ( - maxPrimaryBits = 21 - maxSecondaryBits = 16 - maxTertiaryBits = 8 - isSecondary = 0x40000000 + maxPrimaryBits = 21 + maxSecondaryBits = 12 + maxSecondaryCompactBits = 8 + maxSecondaryDiffBits = 4 + maxTertiaryBits = 8 + maxTertiaryCompactBits = 5 + isSecondary = 0x80000000 + isPrimary = 0x40000000 ) var ce colElem if weights[0] != 0 { - ce = colElem(weights[0]<