mirror of
https://github.com/golang/go
synced 2024-10-05 06:21:24 -06:00
d5e24b6975
- fixed performance bug that could lead to O(n^2) behavior - performance improvement for ASCII case R=r, r CC=golang-dev https://golang.org/cl/4956060
432 lines
9.8 KiB
Go
432 lines
9.8 KiB
Go
// Copyright 2011 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package norm contains types and functions for normalizing Unicode strings.
|
|
package norm
|
|
|
|
import "utf8"
|
|
|
|
// A Form denotes a canonical representation of Unicode code points.
|
|
// The Unicode-defined normalization and equivalence forms are:
|
|
//
|
|
// NFC Unicode Normalization Form C
|
|
// NFD Unicode Normalization Form D
|
|
// NFKC Unicode Normalization Form KC
|
|
// NFKD Unicode Normalization Form KD
|
|
//
|
|
// For a Form f, this documentation uses the notation f(x) to mean
|
|
// the bytes or string x converted to the given form.
|
|
// A position n in x is called a boundary if conversion to the form can
|
|
// proceed independently on both sides:
|
|
// f(x) == append(f(x[0:n]), f(x[n:])...)
|
|
//
|
|
// References: http://unicode.org/reports/tr15/ and
|
|
// http://unicode.org/notes/tn5/.
|
|
type Form int
|
|
|
|
const (
|
|
NFC Form = iota
|
|
NFD
|
|
NFKC
|
|
NFKD
|
|
)
|
|
|
|
// Bytes returns f(b). May return b if f(b) = b.
|
|
func (f Form) Bytes(b []byte) []byte {
|
|
n := f.QuickSpan(b)
|
|
if n == len(b) {
|
|
return b
|
|
}
|
|
out := make([]byte, n, len(b))
|
|
copy(out, b[0:n])
|
|
return f.Append(out, b[n:]...)
|
|
}
|
|
|
|
// String returns f(s).
|
|
func (f Form) String(s string) string {
|
|
n := f.QuickSpanString(s)
|
|
if n == len(s) {
|
|
return s
|
|
}
|
|
out := make([]byte, 0, len(s))
|
|
copy(out, s[0:n])
|
|
return string(f.AppendString(out, s[n:]))
|
|
}
|
|
|
|
// IsNormal returns true if b == f(b).
|
|
func (f Form) IsNormal(b []byte) bool {
|
|
fd := formTable[f]
|
|
bp := quickSpan(fd, b)
|
|
if bp == len(b) {
|
|
return true
|
|
}
|
|
rb := reorderBuffer{f: *fd}
|
|
for bp < len(b) {
|
|
decomposeSegment(&rb, b[bp:])
|
|
if fd.composing {
|
|
rb.compose()
|
|
}
|
|
for i := 0; i < rb.nrune; i++ {
|
|
info := rb.rune[i]
|
|
if bp+int(info.size) > len(b) {
|
|
return false
|
|
}
|
|
p := info.pos
|
|
pe := p + info.size
|
|
for ; p < pe; p++ {
|
|
if b[bp] != rb.byte[p] {
|
|
return false
|
|
}
|
|
bp++
|
|
}
|
|
}
|
|
rb.reset()
|
|
bp += quickSpan(fd, b[bp:])
|
|
}
|
|
return true
|
|
}
|
|
|
|
// IsNormalString returns true if s == f(s).
|
|
func (f Form) IsNormalString(s string) bool {
|
|
panic("not implemented")
|
|
}
|
|
|
|
// patchTail fixes a case where a rune may be incorrectly normalized
|
|
// if it is followed by illegal continuation bytes. It returns the
|
|
// patched buffer and the number of trailing continuation bytes that
|
|
// have been dropped.
|
|
func patchTail(rb *reorderBuffer, buf []byte) ([]byte, int) {
|
|
info, p := lastRuneStart(&rb.f, buf)
|
|
if p == -1 || info.size == 0 {
|
|
return buf, 0
|
|
}
|
|
end := p + int(info.size)
|
|
extra := len(buf) - end
|
|
if extra > 0 {
|
|
buf = decomposeToLastBoundary(rb, buf[:end])
|
|
if rb.f.composing {
|
|
rb.compose()
|
|
}
|
|
return rb.flush(buf), extra
|
|
}
|
|
return buf, 0
|
|
}
|
|
|
|
func appendQuick(f *formInfo, dst, src []byte) ([]byte, int) {
|
|
if len(src) == 0 {
|
|
return dst, 0
|
|
}
|
|
end := quickSpan(f, src)
|
|
return append(dst, src[:end]...), end
|
|
}
|
|
|
|
// Append returns f(append(out, b...)).
|
|
// The buffer out must be nil, empty, or equal to f(out).
|
|
func (f Form) Append(out []byte, src ...byte) []byte {
|
|
if len(src) == 0 {
|
|
return out
|
|
}
|
|
fd := formTable[f]
|
|
rb := &reorderBuffer{f: *fd}
|
|
return doAppend(rb, out, src)
|
|
}
|
|
|
|
func doAppend(rb *reorderBuffer, out, src []byte) []byte {
|
|
doMerge := len(out) > 0
|
|
p := 0
|
|
if !utf8.RuneStart(src[0]) {
|
|
// Move leading non-starters to destination.
|
|
for p++; p < len(src) && !utf8.RuneStart(src[p]); p++ {
|
|
}
|
|
out = append(out, src[:p]...)
|
|
buf, ndropped := patchTail(rb, out)
|
|
if ndropped > 0 {
|
|
out = append(buf, src[p-ndropped:p]...)
|
|
doMerge = false // no need to merge, ends with illegal UTF-8
|
|
} else {
|
|
out = decomposeToLastBoundary(rb, buf) // force decomposition
|
|
}
|
|
}
|
|
fd := &rb.f
|
|
if doMerge {
|
|
var info runeInfo
|
|
if p < len(src) {
|
|
info = fd.info(src[p:])
|
|
if p == 0 && !fd.boundaryBefore(fd, info) {
|
|
out = decomposeToLastBoundary(rb, out)
|
|
}
|
|
}
|
|
if info.size == 0 || fd.boundaryBefore(fd, info) {
|
|
if fd.composing {
|
|
rb.compose()
|
|
}
|
|
out = rb.flush(out)
|
|
if info.size == 0 {
|
|
// Append incomplete UTF-8 encoding.
|
|
return append(out, src[p:]...)
|
|
}
|
|
}
|
|
}
|
|
if rb.nrune == 0 {
|
|
src = src[p:]
|
|
out, p = appendQuick(fd, out, src)
|
|
}
|
|
for n := 0; p < len(src); p += n {
|
|
p += decomposeSegment(rb, src[p:])
|
|
if fd.composing {
|
|
rb.compose()
|
|
}
|
|
out = rb.flush(out)
|
|
out, n = appendQuick(fd, out, src[p:])
|
|
}
|
|
return out
|
|
}
|
|
|
|
// AppendString returns f(append(out, []byte(s))).
|
|
// The buffer out must be nil, empty, or equal to f(out).
|
|
func (f Form) AppendString(out []byte, s string) []byte {
|
|
panic("not implemented")
|
|
}
|
|
|
|
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
|
|
// It is not guaranteed to return the largest such n.
|
|
func (f Form) QuickSpan(b []byte) int {
|
|
return quickSpan(formTable[f], b)
|
|
}
|
|
|
|
func quickSpan(fd *formInfo, b []byte) int {
|
|
var lastCC uint8
|
|
var lastSegStart int
|
|
var i, nc int
|
|
for i < len(b) {
|
|
if b[i] < utf8.RuneSelf {
|
|
// Keep the loop tight for ASCII processing, as this is where
|
|
// most of the time is spent for this case.
|
|
for i++; i < len(b) && b[i] < utf8.RuneSelf; i++ {
|
|
}
|
|
lastSegStart = i - 1
|
|
lastCC = 0
|
|
nc = 0
|
|
continue
|
|
}
|
|
info := fd.info(b[i:])
|
|
if info.size == 0 {
|
|
// include incomplete runes
|
|
return len(b)
|
|
}
|
|
cc := info.ccc
|
|
if fd.composing {
|
|
if !info.flags.isYesC() {
|
|
break
|
|
}
|
|
} else {
|
|
if !info.flags.isYesD() {
|
|
break
|
|
}
|
|
}
|
|
if cc == 0 {
|
|
lastSegStart = i
|
|
nc = 0
|
|
} else {
|
|
if nc >= maxCombiningChars {
|
|
lastSegStart = i
|
|
lastCC = cc
|
|
nc = 1
|
|
} else {
|
|
if lastCC > cc {
|
|
return lastSegStart
|
|
}
|
|
nc++
|
|
}
|
|
}
|
|
lastCC = cc
|
|
i += int(info.size)
|
|
}
|
|
if i == len(b) {
|
|
return len(b)
|
|
}
|
|
if fd.composing {
|
|
return lastSegStart
|
|
}
|
|
return i
|
|
}
|
|
|
|
// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
|
|
// It is not guaranteed to return the largest such n.
|
|
func (f Form) QuickSpanString(s string) int {
|
|
panic("not implemented")
|
|
}
|
|
|
|
// FirstBoundary returns the position i of the first boundary in b
|
|
// or -1 if b contains no boundary.
|
|
func (f Form) FirstBoundary(b []byte) int {
|
|
i := 0
|
|
for ; i < len(b) && !utf8.RuneStart(b[i]); i++ {
|
|
}
|
|
if i >= len(b) {
|
|
return -1
|
|
}
|
|
fd := formTable[f]
|
|
info := fd.info(b[i:])
|
|
for n := 0; info.size != 0 && !fd.boundaryBefore(fd, info); {
|
|
i += int(info.size)
|
|
if n++; n >= maxCombiningChars {
|
|
return i
|
|
}
|
|
if i >= len(b) {
|
|
if !fd.boundaryAfter(fd, info) {
|
|
return -1
|
|
}
|
|
return len(b)
|
|
}
|
|
info = fd.info(b[i:])
|
|
}
|
|
if info.size == 0 {
|
|
return -1
|
|
}
|
|
return i
|
|
}
|
|
|
|
// FirstBoundaryInString returns the position i of the first boundary in s
|
|
// or -1 if s contains no boundary.
|
|
func (f Form) FirstBoundaryInString(s string) (i int, ok bool) {
|
|
panic("not implemented")
|
|
}
|
|
|
|
// LastBoundary returns the position i of the last boundary in b
|
|
// or -1 if b contains no boundary.
|
|
func (f Form) LastBoundary(b []byte) int {
|
|
return lastBoundary(formTable[f], b)
|
|
}
|
|
|
|
func lastBoundary(fd *formInfo, b []byte) int {
|
|
i := len(b)
|
|
info, p := lastRuneStart(fd, b)
|
|
if p == -1 {
|
|
return -1
|
|
}
|
|
if info.size == 0 { // ends with incomplete rune
|
|
if p == 0 { // starts wtih incomplete rune
|
|
return -1
|
|
}
|
|
i = p
|
|
info, p = lastRuneStart(fd, b[:i])
|
|
if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
|
|
return i
|
|
}
|
|
}
|
|
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
|
|
return i
|
|
}
|
|
if fd.boundaryAfter(fd, info) {
|
|
return i
|
|
}
|
|
i = p
|
|
for n := 0; i >= 0 && !fd.boundaryBefore(fd, info); {
|
|
info, p = lastRuneStart(fd, b[:i])
|
|
if n++; n >= maxCombiningChars {
|
|
return len(b)
|
|
}
|
|
if p+int(info.size) != i {
|
|
if p == -1 { // no boundary found
|
|
return -1
|
|
}
|
|
return i // boundary after an illegal UTF-8 encoding
|
|
}
|
|
i = p
|
|
}
|
|
return i
|
|
}
|
|
|
|
// LastBoundaryInString returns the position i of the last boundary in s
|
|
// or -1 if s contains no boundary.
|
|
func (f Form) LastBoundaryInString(s string) int {
|
|
panic("not implemented")
|
|
}
|
|
|
|
// decomposeSegment scans the first segment in src into rb.
|
|
// It returns the number of bytes consumed from src.
|
|
// TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)
|
|
// when we detect a sequence of 30+ non-starter chars.
|
|
func decomposeSegment(rb *reorderBuffer, src []byte) int {
|
|
// Force one character to be consumed.
|
|
info := rb.f.info(src)
|
|
if info.size == 0 {
|
|
return 0
|
|
}
|
|
sp := 0
|
|
for rb.insert(src[sp:], info) {
|
|
sp += int(info.size)
|
|
if sp >= len(src) {
|
|
break
|
|
}
|
|
info = rb.f.info(src[sp:])
|
|
bound := rb.f.boundaryBefore(&rb.f, info)
|
|
if bound || info.size == 0 {
|
|
break
|
|
}
|
|
}
|
|
return sp
|
|
}
|
|
|
|
// lastRuneStart returns the runeInfo and position of the last
|
|
// rune in buf or the zero runeInfo and -1 if no rune was found.
|
|
func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
|
|
p := len(buf) - 1
|
|
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
|
|
}
|
|
if p < 0 {
|
|
return runeInfo{0, 0, 0, 0}, -1
|
|
}
|
|
return fd.info(buf[p:]), p
|
|
}
|
|
|
|
// decomposeToLastBoundary finds an open segment at the end of the buffer
|
|
// and scans it into rb. Returns the buffer minus the last segment.
|
|
func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
|
|
fd := &rb.f
|
|
info, i := lastRuneStart(fd, buf)
|
|
if int(info.size) != len(buf)-i {
|
|
// illegal trailing continuation bytes
|
|
return buf
|
|
}
|
|
if rb.f.boundaryAfter(fd, info) {
|
|
return buf
|
|
}
|
|
var add [maxBackRunes]runeInfo // stores runeInfo in reverse order
|
|
add[0] = info
|
|
padd := 1
|
|
n := 1
|
|
p := len(buf) - int(info.size)
|
|
for ; p >= 0 && !rb.f.boundaryBefore(fd, info); p -= int(info.size) {
|
|
info, i = lastRuneStart(fd, buf[:p])
|
|
if int(info.size) != p-i {
|
|
break
|
|
}
|
|
// Check that decomposition doesn't result in overflow.
|
|
if info.flags.hasDecomposition() {
|
|
dcomp := rb.f.decompose(buf[p-int(info.size):])
|
|
for i := 0; i < len(dcomp); {
|
|
inf := rb.f.info(dcomp[i:])
|
|
i += int(inf.size)
|
|
n++
|
|
}
|
|
} else {
|
|
n++
|
|
}
|
|
if n > maxBackRunes {
|
|
break
|
|
}
|
|
add[padd] = info
|
|
padd++
|
|
}
|
|
pp := p
|
|
for padd--; padd >= 0; padd-- {
|
|
info = add[padd]
|
|
rb.insert(buf[pp:], info)
|
|
pp += int(info.size)
|
|
}
|
|
return buf[:p]
|
|
}
|