1
0
mirror of https://github.com/golang/go synced 2024-10-04 12:21:26 -06:00
go/src/pkg/exp/norm/forminfo.go
Marcel van Lohuizen 2517143957 exp/norm: added Reader and Writer and bug fixes to support these.
Needed to ensure that finding the last boundary does not result in O(n^2)-like behavior.
Now prevents lookbacks beyond 31 characters across the board (starter + 30 non-starters).
composition.go:
- maxCombiningCharacters now means exactly that.
- Bug fix.
- Small performance improvement/ made code consistent with other code.
forminfo.go:
- Bug fix: ccc needs to be 0 for inert runes.
normalize.go:
- A few bug fixes.
- Limit the amount of combining characters considered in FirstBoundary.
- Ditto for LastBoundary.
- Changed semantics of LastBoundary to not consider trailing illegal runes a boundary
  as long as adding bytes might still make them legal.
trie.go:
- As utf8.UTFMax is 4, we should treat UTF-8 encodings of size 5 or greater as illegal.
  This has no impact on the normalization process, but it prevents buffer overflows
  where we expect at most UTFMax bytes.

R=r
CC=golang-dev
https://golang.org/cl/4963041
2011-09-02 12:39:35 +02:00

190 lines
5.6 KiB
Go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
// This file contains Form-specific logic and wrappers for data in tables.go.
// runeInfo is the per-rune record produced by the lookupInfo* wrappers in
// this file and consumed by the boundary functions. Those wrappers always
// leave pos at 0 and fill in the remaining three fields from the trie value.
type runeInfo struct {
pos uint8 // start position in reorderBuffer; used in composition.go
size uint8 // length in bytes of the UTF-8 encoding of this rune
ccc uint8 // canonical combining class
flags qcInfo // quick check flags; see qcInfo for the bit layout
}
// Function types that are dispatched per form; formInfo stores one value of
// each kind.
type (
	// boundaryFunc reports whether the rune described by info is a
	// boundary for the form described by f.
	boundaryFunc func(f *formInfo, info runeInfo) bool

	// lookupFunc returns the runeInfo for the rune at the start of b.
	lookupFunc func(b []byte) runeInfo

	// lookupFuncString is the string counterpart of lookupFunc.
	lookupFuncString func(s string) runeInfo

	// decompFunc returns the decomposition bytes for the rune at the
	// start of b.
	decompFunc func(b []byte) []byte

	// decompFuncString is the string counterpart of decompFunc.
	decompFuncString func(s string) []byte
)
// formInfo bundles the functions and flags that are specific to one
// normalization Form. Instances are built by init and stored in formTable.
type formInfo struct {
	form Form

	// Form type.
	composing     bool // set for NFC and NFKC
	compatibility bool // set for NFKC and NFKD

	// Decomposition lookup, for byte slices and strings respectively.
	decompose       decompFunc
	decomposeString decompFuncString

	// Rune info lookup, for byte slices and strings respectively.
	info       lookupFunc
	infoString lookupFuncString

	// Boundary predicates applied to a rune's info.
	boundaryBefore boundaryFunc
	boundaryAfter  boundaryFunc
}
// formTable holds one formInfo per Form, indexed by the Form's numeric value.
// It is allocated and filled in by init.
var formTable []*formInfo
// init builds formTable: one formInfo for each of the four Form values,
// wired to the NFKC or NFC flavor of the decomposition and lookup functions
// and to the composing or decomposing boundary functions.
func init() {
	formTable = make([]*formInfo, 4)
	for i := range formTable {
		form := Form(i)
		fi := &formInfo{
			form:          form,
			compatibility: form == NFKD || form == NFKC,
			composing:     form == NFC || form == NFKC,
		}

		// Compatibility forms share the NFKC tables; canonical forms
		// share the NFC tables.
		if fi.compatibility {
			fi.decompose, fi.decomposeString = decomposeNFKC, decomposeStringNFKC
			fi.info, fi.infoString = lookupInfoNFKC, lookupInfoStringNFKC
		} else {
			fi.decompose, fi.decomposeString = decomposeNFC, decomposeStringNFC
			fi.info, fi.infoString = lookupInfoNFC, lookupInfoStringNFC
		}

		// Decomposing forms use the same predicate on both sides of a rune.
		if fi.composing {
			fi.boundaryBefore, fi.boundaryAfter = compBoundaryBefore, compBoundaryAfter
		} else {
			fi.boundaryBefore, fi.boundaryAfter = decompBoundary, decompBoundary
		}

		formTable[i] = fi
	}
}
// decompBoundary is the boundary predicate installed for the decomposing
// forms. It reports true when the rune has canonical combining class 0 and
// its NFD quick check flag is Yes. The f parameter is not used.
func decompBoundary(f *formInfo, info runeInfo) bool {
	// A true result implies isHangul(b) == true.
	//
	// For every other rune we assume that the CCC of the first character
	// in a decomposition is always non-zero if different from info.ccc,
	// so reporting false is safe. This is verified by maketables.
	return info.ccc == 0 && info.flags.isYesD()
}
// compBoundaryBefore is the "boundary before" predicate installed for the
// composing forms. It reports true when the rune has canonical combining
// class 0 and does not combine backward. The f parameter is not used.
func compBoundaryBefore(f *formInfo, info runeInfo) bool {
	// For every other rune we assume that the CCC of the first character
	// in a decomposition is always non-zero if different from info.ccc,
	// so reporting false is safe. This is verified by maketables.
	return info.ccc == 0 && !info.flags.combinesBackward()
}
// compBoundaryAfter is the "boundary after" predicate installed for the
// composing forms. It reports true exactly when info.isInert() does.
// The f parameter is not used.
func compBoundaryAfter(f *formInfo, info runeInfo) bool {
// This misses values where the last char in a decomposition is a
// boundary such as Hangul with JamoT.
return info.isInert()
}
// qcInfo holds the quick check data for a rune, packed in 4 bits:
//
//	0:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..2: NFC_QC Yes (00), No (01), or Maybe (11)
//	3:    Combines forward (0 == false, 1 == true)
//
// When all 4 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
//
// The bits for both NFC/D and NFKC/D are packed in one byte.
type qcInfo uint8

// isYesC reports whether bit 1 is clear, i.e. NFC_QC is Yes.
func (q qcInfo) isYesC() bool {
	return q&0x2 == 0
}

// isNoC reports whether NFC_QC is No: bit 1 set and bit 2 clear.
func (q qcInfo) isNoC() bool {
	return q&0x2 != 0 && q&0x4 == 0
}

// isMaybe reports whether bit 2 is set, i.e. NFC_QC is Maybe.
func (q qcInfo) isMaybe() bool {
	return q&0x4 != 0
}

// isYesD reports whether bit 0 is clear, i.e. NFD_QC is Yes.
func (q qcInfo) isYesD() bool {
	return q&0x1 == 0
}

// isNoD reports whether bit 0 is set, i.e. NFD_QC is No.
func (q qcInfo) isNoD() bool {
	return !q.isYesD()
}

// combinesForward reports whether bit 3 is set.
func (q qcInfo) combinesForward() bool {
	return q&0x8 != 0
}

// combinesBackward is an alias of isMaybe.
func (q qcInfo) combinesBackward() bool {
	return q.isMaybe()
}

// hasDecomposition is an alias of isNoD.
func (q qcInfo) hasDecomposition() bool {
	return q.isNoD()
}
// isInert reports whether the rune has canonical combining class 0 and all
// four of its low quick check bits clear.
func (r runeInfo) isInert() bool {
	if r.ccc != 0 {
		return false
	}
	// Only the low nibble belongs to the current form's quick check data.
	return r.flags&0xf == 0
}
// Wrappers for tables.go
// The 16-bit value of the decomposition tries is an index into a byte
// array of UTF-8 decomposition sequences. The first byte is the number
// of bytes in the decomposition (excluding this length byte). The actual
// sequence starts at the offset+1.
// decomposeNFC returns the decomposition sequence that nfcDecompTrie maps
// the rune at the start of b to. The trie value indexes decomps: the byte at
// that index is the sequence length and the sequence itself follows it.
// The result is a sub-slice of decomps, not a copy.
func decomposeNFC(b []byte) []byte {
	off := nfcDecompTrie.lookupUnsafe(b)
	start := off + 1 // skip the length byte
	end := start + uint16(decomps[off])
	return decomps[start:end]
}
// decomposeNFKC returns the decomposition sequence that nfkcDecompTrie maps
// the rune at the start of b to. The trie value indexes decomps: the byte at
// that index is the sequence length and the sequence itself follows it.
// The result is a sub-slice of decomps, not a copy.
func decomposeNFKC(b []byte) []byte {
	off := nfkcDecompTrie.lookupUnsafe(b)
	start := off + 1 // skip the length byte
	end := start + uint16(decomps[off])
	return decomps[start:end]
}
// decomposeStringNFC is the string counterpart of decomposeNFC: it looks up
// the rune at the start of s in nfcDecompTrie and returns the matching
// length-prefixed sequence from decomps, without the length byte.
// The result is a sub-slice of decomps, not a copy.
func decomposeStringNFC(s string) []byte {
	off := nfcDecompTrie.lookupStringUnsafe(s)
	start := off + 1 // skip the length byte
	end := start + uint16(decomps[off])
	return decomps[start:end]
}
// decomposeStringNFKC is the string counterpart of decomposeNFKC: it looks
// up the rune at the start of s in nfkcDecompTrie and returns the matching
// length-prefixed sequence from decomps, without the length byte.
// The result is a sub-slice of decomps, not a copy.
func decomposeStringNFKC(s string) []byte {
	off := nfkcDecompTrie.lookupStringUnsafe(s)
	start := off + 1 // skip the length byte
	end := start + uint16(decomps[off])
	return decomps[start:end]
}
// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
// result in a collision. In the unlikely event that changes to
// UnicodeData.txt introduce collisions, the compiler will catch it.
// Note that the recomposition map for NFC and NFKC are identical.
// combine returns the combined rune or 0 if it doesn't exist.
// The lookup key into recompMap is built from the low 16 bits of a (upper
// half of the key) and the low 16 bits of b (lower half); any higher bits
// of either argument are discarded.
func combine(a, b uint32) uint32 {
	hi := (a & 0xffff) << 16
	lo := b & 0xffff
	return recompMap[hi|lo]
}
// The 16-bit character info has the following bit layout:
// 0..7 CCC value.
// 8..11 qcInfo for NFC/NFD
// 12..15 qcInfo for NFKC/NFKD
// lookupInfoNFC looks up the rune at the start of b in charInfoTrie and
// unpacks the result for the NFC/NFD forms: size is the length reported by
// the trie, ccc is the low byte of the value, and flags is the value shifted
// right by 8. pos is left at zero.
func lookupInfoNFC(b []byte) runeInfo {
	v, sz := charInfoTrie.lookup(b)
	return runeInfo{
		size:  uint8(sz),
		ccc:   uint8(v),
		flags: qcInfo(v >> 8),
	}
}
// lookupInfoStringNFC is the string counterpart of lookupInfoNFC: it looks
// up the rune at the start of s in charInfoTrie and unpacks the result for
// the NFC/NFD forms (ccc from the low byte, flags from the value shifted
// right by 8). pos is left at zero.
func lookupInfoStringNFC(s string) runeInfo {
	v, sz := charInfoTrie.lookupString(s)
	return runeInfo{
		size:  uint8(sz),
		ccc:   uint8(v),
		flags: qcInfo(v >> 8),
	}
}
// lookupInfoNFKC looks up the rune at the start of b in charInfoTrie and
// unpacks the result for the NFKC/NFKD forms: size is the length reported by
// the trie, ccc is the low byte of the value, and flags is the value shifted
// right by 12. pos is left at zero.
func lookupInfoNFKC(b []byte) runeInfo {
	v, sz := charInfoTrie.lookup(b)
	return runeInfo{
		size:  uint8(sz),
		ccc:   uint8(v),
		flags: qcInfo(v >> 12),
	}
}
// lookupInfoStringNFKC is the string counterpart of lookupInfoNFKC: it looks
// up the rune at the start of s in charInfoTrie and unpacks the result for
// the NFKC/NFKD forms (ccc from the low byte, flags from the value shifted
// right by 12). pos is left at zero.
func lookupInfoStringNFKC(s string) runeInfo {
	v, sz := charInfoTrie.lookupString(s)
	return runeInfo{
		size:  uint8(sz),
		ccc:   uint8(v),
		flags: qcInfo(v >> 12),
	}
}