exp/norm: Added Iter type for iterating over segment boundaries.

This type is mainly intended for use by other low-level libraries, such as collate. Extra care has been taken to optimize the performance of normalizing to NFD, as this is what the collator will use. The overhead of checking whether a string is normalized, versus simply decomposing it, is negligible. Assuming that most strings are in FCD form, this iterator can be used to decompose strings and normalize them with minimal overhead.

R=r
CC=golang-dev
https://golang.org/cl/5676057
commit ecd24f381e (parent 9666a959cf)
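A minimal sketch of how the new Iter is meant to be driven, based on the API added in iter.go below (SetInputString, Done, Next, MaxSegmentSize) and its use in iter_test.go; the import path and sample input are illustrative assumptions, not part of this change:

	package main

	import (
		"fmt"

		"exp/norm" // import path as of this CL; illustrative
	)

	func main() {
		// Next requires a buffer of at least MaxSegmentSize bytes.
		buf := make([]byte, norm.MaxSegmentSize)
		var out []byte

		var it norm.Iter
		it.SetInputString(norm.NFD, "Héllo") // sample input
		for !it.Done() {
			n := it.Next(buf) // writes the next segment(s), ending on a boundary
			out = append(out, buf[:n]...)
		}
		fmt.Printf("%+q\n", out)
	}

Each call to Next stops at a normalization boundary, so a caller such as the collator can process the input one segment at a time without buffering the whole string.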
@ -66,6 +66,18 @@ func (rb *reorderBuffer) flush(out []byte) []byte {
	return out
}

// flushCopy copies the normalized segment to buf and resets rb.
// It returns the number of bytes written to buf.
func (rb *reorderBuffer) flushCopy(buf []byte) int {
	p := 0
	for i := 0; i < rb.nrune; i++ {
		runep := rb.rune[i]
		p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
	}
	rb.reset()
	return p
}

// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
// It returns false if the buffer is not large enough to hold the rune.
// It is used internally by insert and insertString only.
@ -96,35 +108,44 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool {
// insert inserts the given rune in the buffer ordered by CCC.
// It returns true if the buffer was large enough to hold the decomposed rune.
func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
	if info.size == 3 {
		if rune := src.hangul(i); rune != 0 {
			return rb.decomposeHangul(rune)
		}
	if rune := src.hangul(i); rune != 0 {
		return rb.decomposeHangul(rune)
	}
	if info.hasDecomposition() {
		dcomp := info.decomposition()
		rb.tmpBytes = inputBytes(dcomp)
		for i := 0; i < len(dcomp); {
			info = rb.f.info(&rb.tmpBytes, i)
			pos := rb.nbyte
			if !rb.insertOrdered(info) {
				return false
			}
			end := i + int(info.size)
			copy(rb.byte[pos:], dcomp[i:end])
			i = end
		}
	} else {
		// insertOrder changes nbyte
		return rb.insertDecomposed(info.decomposition())
	}
	return rb.insertSingle(src, i, info)
}

// insertDecomposed inserts an entry into the reorderBuffer for each rune
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool {
	saveNrune, saveNbyte := rb.nrune, rb.nbyte
	rb.tmpBytes = inputBytes(dcomp)
	for i := 0; i < len(dcomp); {
		info := rb.f.info(&rb.tmpBytes, i)
		pos := rb.nbyte
		if !rb.insertOrdered(info) {
			rb.nrune, rb.nbyte = saveNrune, saveNbyte
			return false
		}
		src.copySlice(rb.byte[pos:], i, i+int(info.size))
		i += copy(rb.byte[pos:], dcomp[i:i+int(info.size)])
	}
	return true
}

// insertSingle inserts an entry in the reorderBuffer for the rune at
// position i. info is the runeInfo for the rune at position i.
func (rb *reorderBuffer) insertSingle(src input, i int, info runeInfo) bool {
	// insertOrder changes nbyte
	pos := rb.nbyte
	if !rb.insertOrdered(info) {
		return false
	}
	src.copySlice(rb.byte[pos:], i, i+int(info.size))
	return true
}

// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
func (rb *reorderBuffer) appendRune(r rune) {
	bn := rb.nbyte
@ -182,8 +203,12 @@ const (
	jamoLVTCount = 19 * 21 * 28
)

// Caller must verify that len(b) >= 3.
const hangulUTF8Size = 3

func isHangul(b []byte) bool {
	if len(b) < hangulUTF8Size {
		return false
	}
	b0 := b[0]
	if b0 < hangulBase0 {
		return false
@ -202,8 +227,10 @@ func isHangul(b []byte) bool {
	return b1 == hangulEnd1 && b[2] < hangulEnd2
}

// Caller must verify that len(b) >= 3.
func isHangulString(b string) bool {
	if len(b) < hangulUTF8Size {
		return false
	}
	b0 := b[0]
	if b0 < hangulBase0 {
		return false
@ -234,6 +261,22 @@ func isHangulWithoutJamoT(b []byte) bool {
	return c < jamoLVTCount && c%jamoTCount == 0
}

// decomposeHangul writes the decomposed Hangul to buf and returns the number
// of bytes written. len(buf) should be at least 9.
func decomposeHangul(buf []byte, r rune) int {
	const JamoUTF8Len = 3
	r -= hangulBase
	x := r % jamoTCount
	r /= jamoTCount
	utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
	utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
	if x != 0 {
		utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
		return 3 * JamoUTF8Len
	}
	return 2 * JamoUTF8Len
}

// decomposeHangul algorithmically decomposes a Hangul rune into
// its Jamo components.
// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
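For reference, a worked example of the arithmetic used in decomposeHangul above, as a standalone sketch. The constants restate the standard Unicode Hangul decomposition values from TR15; their names here are illustrative, not taken from this file:

	package main

	import "fmt"

	func main() {
		const (
			hangulBase = 0xAC00 // first precomposed Hangul syllable
			jamoLBase  = 0x1100 // leading consonant base
			jamoVBase  = 0x1161 // vowel base
			jamoTBase  = 0x11A7 // trailing consonant base (index 0 means "no T")
			jamoVCount = 21
			jamoTCount = 28
		)
		r := rune(0xAC01) // '각'
		x := (r - hangulBase) % jamoTCount
		r = (r - hangulBase) / jamoTCount
		l := jamoLBase + r/jamoVCount
		v := jamoVBase + r%jamoVCount
		// Prints U+1100 U+1161 U+11A8: the L, V, and T jamo of '각'.
		fmt.Printf("%U %U %U\n", l, v, jamoTBase+x)
	}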
@ -47,14 +47,14 @@ func runTests(t *testing.T, name string, fm Form, f insertFunc, tests []TestCase
	}
}

func TestFlush(t *testing.T) {
type flushFunc func(rb *reorderBuffer) []byte

func testFlush(t *testing.T, name string, fn flushFunc) {
	rb := reorderBuffer{}
	rb.init(NFC, nil)
	out := make([]byte, 0)

	out = rb.flush(out)
	out := fn(&rb)
	if len(out) != 0 {
		t.Errorf("wrote bytes on flush of empty buffer. (len(out) = %d)", len(out))
		t.Errorf("%s: wrote bytes on flush of empty buffer. (len(out) = %d)", name, len(out))
	}

	for _, r := range []rune("world!") {
@ -65,16 +65,32 @@ func TestFlush(t *testing.T) {
	out = rb.flush(out)
	want := "Hello world!"
	if string(out) != want {
		t.Errorf(`output after flush was "%s"; want "%s"`, string(out), want)
		t.Errorf(`%s: output after flush was "%s"; want "%s"`, name, string(out), want)
	}
	if rb.nrune != 0 {
		t.Errorf("flush: non-null size of info buffer (rb.nrune == %d)", rb.nrune)
		t.Errorf("%s: non-null size of info buffer (rb.nrune == %d)", name, rb.nrune)
	}
	if rb.nbyte != 0 {
		t.Errorf("flush: non-null size of byte buffer (rb.nbyte == %d)", rb.nbyte)
		t.Errorf("%s: non-null size of byte buffer (rb.nbyte == %d)", name, rb.nbyte)
	}
}

func flushF(rb *reorderBuffer) []byte {
	out := make([]byte, 0)
	return rb.flush(out)
}

func flushCopyF(rb *reorderBuffer) []byte {
	out := make([]byte, MaxSegmentSize)
	n := rb.flushCopy(out)
	return out[:n]
}

func TestFlush(t *testing.T) {
	testFlush(t, "flush", flushF)
	testFlush(t, "flushCopy", flushCopyF)
}

var insertTests = []TestCase{
	{[]rune{'a'}, []rune{'a'}},
	{[]rune{0x300}, []rune{0x300}},
@ -7,7 +7,7 @@ package norm
import "unicode/utf8"

type input interface {
	skipASCII(p int) int
	skipASCII(p, max int) int
	skipNonStarter(p int) int
	appendSlice(buf []byte, s, e int) []byte
	copySlice(buf []byte, s, e int)
@ -18,8 +18,8 @@ type input interface {

type inputString string

func (s inputString) skipASCII(p int) int {
	for ; p < len(s) && s[p] < utf8.RuneSelf; p++ {
func (s inputString) skipASCII(p, max int) int {
	for ; p < max && s[p] < utf8.RuneSelf; p++ {
	}
	return p
}
@ -59,8 +59,8 @@ func (s inputString) hangul(p int) rune {

type inputBytes []byte

func (s inputBytes) skipASCII(p int) int {
	for ; p < len(s) && s[p] < utf8.RuneSelf; p++ {
func (s inputBytes) skipASCII(p, max int) int {
	for ; p < max && s[p] < utf8.RuneSelf; p++ {
	}
	return p
}
src/pkg/exp/norm/iter.go (new file, 286 lines)
@ -0,0 +1,286 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

const MaxSegmentSize = maxByteBufferSize

// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
type Iter struct {
	rb   reorderBuffer
	info runeInfo // first character saved from previous iteration
	next iterFunc // implementation of next depends on form

	p        int // current position in input source
	outStart int // start of current segment in output buffer
	inStart  int // start of current segment in input source
	maxp     int // position in output buffer after which not to start a new segment
	maxseg   int // for tracking an excess of combining characters

	tccc uint8
	done bool
}

type iterFunc func(*Iter, []byte) int

// SetInput initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) SetInput(f Form, src []byte) {
	i.rb.init(f, src)
	if i.rb.f.composing {
		i.next = nextComposed
	} else {
		i.next = nextDecomposed
	}
	i.p = 0
	if i.done = len(src) == 0; !i.done {
		i.info = i.rb.f.info(i.rb.src, i.p)
	}
}

// SetInputString initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) SetInputString(f Form, src string) {
	i.rb.initString(f, src)
	if i.rb.f.composing {
		i.next = nextComposed
	} else {
		i.next = nextDecomposed
	}
	i.p = 0
	if i.done = len(src) == 0; !i.done {
		i.info = i.rb.f.info(i.rb.src, i.p)
	}
}

// Pos returns the byte position at which the next call to Next will commence processing.
func (i *Iter) Pos() int {
	return i.p
}

// Done returns true if there is no more input to process.
func (i *Iter) Done() bool {
	return i.done
}

// Next writes f(i.input[i.Pos():n]...) to buffer buf, where n is the
// largest boundary of i.input such that the result fits in buf.
// It returns the number of bytes written to buf.
// len(buf) should be at least MaxSegmentSize.
// Done must be false before calling Next.
func (i *Iter) Next(buf []byte) int {
	return i.next(i, buf)
}

func (i *Iter) initNext(outn, inStart int) {
	i.outStart = 0
	i.inStart = inStart
	i.maxp = outn - MaxSegmentSize
	i.maxseg = MaxSegmentSize
}

// setStart resets the start of the new segment to the given position.
// It returns true if there is not enough room for the new segment.
func (i *Iter) setStart(outp, inp int) bool {
	if outp > i.maxp {
		return true
	}
	i.outStart = outp
	i.inStart = inp
	i.maxseg = outp + MaxSegmentSize
	return false
}

func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// nextDecomposed is the implementation of Next for forms NFD and NFKD.
func nextDecomposed(i *Iter, out []byte) int {
	var outp int
	i.initNext(len(out), i.p)
doFast:
	inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart
	for {
		if sz := int(i.info.size); sz <= 1 {
			// ASCII or illegal byte. Either way, advance by 1.
			i.p++
			outp++
			max := min(i.rb.nsrc, len(out)-outp+i.p)
			if np := i.rb.src.skipASCII(i.p, max); np > i.p {
				outp += np - i.p
				i.p = np
				if i.p >= i.rb.nsrc {
					break
				}
				// ASCII may combine with consecutive runes.
				if i.setStart(outp-1, i.p-1) {
					i.p--
					outp--
					i.info.size = 1
					break
				}
			}
		} else if d := i.info.decomposition(); d != nil {
			i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
			p := outp + len(d)
			if p > i.maxseg && i.setStart(outp, i.p) {
				return outp
			}
			copy(out[outp:], d)
			outp = p
			i.p += sz
			inCopyStart, outCopyStart = i.p, outp
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
			for {
				outp += decomposeHangul(out[outp:], r)
				i.p += hangulUTF8Size
				if r = i.rb.src.hangul(i.p); r == 0 {
					break
				}
				if i.setStart(outp, i.p) {
					return outp
				}
			}
			inCopyStart, outCopyStart = i.p, outp
		} else {
			p := outp + sz
			if p > i.maxseg && i.setStart(outp, i.p) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if cc := i.info.ccc; cc == 0 {
			if i.setStart(outp, i.p) {
				break
			}
		} else if cc < prevCC {
			goto doNorm
		}
	}
	if inCopyStart != i.p {
		i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
	}
	i.done = i.p >= i.rb.nsrc
	return outp
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
	if !i.rb.insertDecomposed(out[i.outStart:outp]) {
		// Start over to prevent decompositions from crossing segment boundaries.
		// This is a rare occurrence.
		i.p = i.inStart
		i.info = i.rb.f.info(i.rb.src, i.p)
	}
	outp = i.outStart
	for {
		if !i.rb.insert(i.rb.src, i.p, i.info) {
			break
		}
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			outp += i.rb.flushCopy(out[outp:])
			i.done = true
			return outp
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if i.info.ccc == 0 {
			break
		}
	}
	// new segment or too many combining characters: exit normalization
	if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) {
		return outp
	}
	goto doFast
}

// nextComposed is the implementation of Next for forms NFC and NFKC.
func nextComposed(i *Iter, out []byte) int {
	var outp int
	i.initNext(len(out), i.p)
doFast:
	inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		if cc := i.info.ccc; cc == 0 {
			if i.setStart(outp, i.p) {
				break
			}
		} else if cc < prevCC {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > i.maxseg && i.setStart(outp, i.p) {
			break
		}
		outp = p
		i.p += sz
		max := min(i.rb.nsrc, len(out)-outp+i.p)
		if np := i.rb.src.skipASCII(i.p, max); np > i.p {
			outp += np - i.p
			i.p = np
			if i.p >= i.rb.nsrc {
				break
			}
			// ASCII may combine with consecutive runes.
			if i.setStart(outp-1, i.p-1) {
				i.p--
				outp--
				i.info = runeInfo{size: 1}
				break
			}
		}
		if i.p >= i.rb.nsrc {
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
	}
	if inCopyStart != i.p {
		i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
	}
	i.done = i.p >= i.rb.nsrc
	return outp
doNorm:
	i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.inStart)
	outp, i.p = i.outStart, i.inStart
	i.info = i.rb.f.info(i.rb.src, i.p)
	for {
		if !i.rb.insert(i.rb.src, i.p, i.info) {
			break
		}
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.rb.compose()
			outp += i.rb.flushCopy(out[outp:])
			i.done = true
			return outp
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if i.info.boundaryBefore() {
			break
		}
	}
	i.rb.compose()
	if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) {
		return outp
	}
	goto doFast
}
src/pkg/exp/norm/iter_test.go (new file, 186 lines)
@ -0,0 +1,186 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

import (
	"strings"
	"testing"
)

var iterBufSizes = []int{
	MaxSegmentSize,
	1.5 * MaxSegmentSize,
	2 * MaxSegmentSize,
	3 * MaxSegmentSize,
	100 * MaxSegmentSize,
}

func doIterNorm(f Form, buf []byte, s string) []byte {
	acc := []byte{}
	i := Iter{}
	i.SetInputString(f, s)
	for !i.Done() {
		n := i.Next(buf)
		acc = append(acc, buf[:n]...)
	}
	return acc
}

func runIterTests(t *testing.T, name string, f Form, tests []AppendTest, norm bool) {
	for i, test := range tests {
		in := test.left + test.right
		gold := test.out
		if norm {
			gold = string(f.AppendString(nil, test.out))
		}
		for _, sz := range iterBufSizes {
			buf := make([]byte, sz)
			out := string(doIterNorm(f, buf, in))
			if len(out) != len(gold) {
				const msg = "%s:%d:%d: length is %d; want %d"
				t.Errorf(msg, name, i, sz, len(out), len(gold))
			}
			if out != gold {
				// Find first rune that differs and show context.
				ir := []rune(out)
				ig := []rune(gold)
				for j := 0; j < len(ir) && j < len(ig); j++ {
					if ir[j] == ig[j] {
						continue
					}
					if j -= 3; j < 0 {
						j = 0
					}
					for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
						const msg = "%s:%d:%d: runeAt(%d) = %U; want %U"
						t.Errorf(msg, name, i, sz, j, ir[j], ig[j])
					}
					break
				}
			}
		}
	}
}

func rep(r rune, n int) string {
	return strings.Repeat(string(r), n)
}

var iterTests = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"", "a" + rep(0x0300, MaxSegmentSize/2), "a" + rep(0x0300, MaxSegmentSize/2)},
}

var iterTestsD = []AppendTest{
	{ // segment overflow on unchanged character
		"",
		"a" + rep(0x0300, MaxSegmentSize/2) + "\u0316",
		"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0316\u0300",
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars+4) + "\u0316",
		"a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars) + "\u0316" + rep(0x300, 4),
	},
	{ // segment overflow on decomposition
		"",
		"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340",
		"a" + rep(0x0300, MaxSegmentSize/2),
	},
	{ // segment overflow on decomposition + start value
		"",
		"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340" + rep(0x300, maxCombiningChars+4) + "\u0320",
		"a" + rep(0x0300, MaxSegmentSize/2-1) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars+2) + "\u0320",
		rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars) + "\u0320\u0300\u0300",
	},
	{ // start value after Hangul overflow
		"",
		rep(0xAC00, MaxSegmentSize/6) + rep(0x300, maxCombiningChars+2) + "\u0320",
		strings.Repeat("\u1100\u1161", MaxSegmentSize/6) + rep(0x300, maxCombiningChars-1) + "\u0320" + rep(0x300, 3),
	},
	{ // start value after cc=0
		"",
		"您您" + rep(0x300, maxCombiningChars+4) + "\u0320",
		"您您" + rep(0x300, maxCombiningChars) + "\u0320" + rep(0x300, 4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + rep(0x300, maxCombiningChars+4) + "\u0320",
		"\u0320\u0300a" + rep(0x300, maxCombiningChars) + "\u0320" + rep(0x300, 4),
	},
}

var iterTestsC = []AppendTest{
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, MaxSegmentSize/2+4) + "\u0316",
		"a" + rep(0x0305, MaxSegmentSize/2-1) + "\u0316" + rep(0x305, 5),
	},
}

func TestIterNextD(t *testing.T) {
	runIterTests(t, "IterNextD1", NFKD, appendTests, true)
	runIterTests(t, "IterNextD2", NFKD, iterTests, true)
	runIterTests(t, "IterNextD3", NFKD, iterTestsD, false)
}

func TestIterNextC(t *testing.T) {
	runIterTests(t, "IterNextC1", NFKC, appendTests, true)
	runIterTests(t, "IterNextC2", NFKC, iterTests, true)
	runIterTests(t, "IterNextC3", NFKC, iterTestsC, false)
}

type SegmentTest struct {
	in  string
	out []string
}

var segmentTests = []SegmentTest{
	{rep('a', MaxSegmentSize), []string{rep('a', MaxSegmentSize), ""}},
	{rep('a', MaxSegmentSize+2), []string{rep('a', MaxSegmentSize-1), "aaa", ""}},
	{rep('a', MaxSegmentSize) + "\u0300aa", []string{rep('a', MaxSegmentSize-1), "a\u0300", "aa", ""}},
}

// Note that, by design, segmentation is equal for composing and decomposing forms.
func TestIterSegmentation(t *testing.T) {
	segmentTest(t, "SegmentTestD", NFD, segmentTests)
	segmentTest(t, "SegmentTestC", NFC, segmentTests)
}

func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
	iter := Iter{}
	for i, tt := range segmentTests {
		buf := make([]byte, MaxSegmentSize)
		iter.SetInputString(f, tt.in)
		for j, seg := range tt.out {
			if seg == "" {
				if !iter.Done() {
					n := iter.Next(buf)
					res := string(buf[:n])
					t.Errorf(`%s:%d:%d: expected Done()==true, found segment "%s"`, name, i, j, res)
				}
				continue
			}
			if iter.Done() {
				t.Errorf("%s:%d:%d: Done()==true, want false", name, i, j)
			}
			n := iter.Next(buf)
			seg = f.String(seg)
			if res := string(buf[:n]); res != seg {
				t.Errorf(`%s:%d:%d: segment was "%s" (%d); want "%s" (%d)`, name, i, j, res, len(res), seg, len(seg))
			}
		}
	}
}
@ -243,7 +243,7 @@ func quickSpan(rb *reorderBuffer, i int) int {
	lastSegStart := i
	src, n := rb.src, rb.nsrc
	for i < n {
		if j := src.skipASCII(i); i != j {
		if j := src.skipASCII(i, n); i != j {
			i = j
			lastSegStart = i - 1
			lastCC = 0
@ -448,11 +448,16 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
	}
	// Check that decomposition doesn't result in overflow.
	if info.hasDecomposition() {
		dcomp := info.decomposition()
		for i := 0; i < len(dcomp); {
			inf := rb.f.info(inputBytes(dcomp), i)
			i += int(inf.size)
		if isHangul(buf) {
			i += int(info.size)
			n++
		} else {
			dcomp := info.decomposition()
			for i := 0; i < len(dcomp); {
				inf := rb.f.info(inputBytes(dcomp), i)
				i += int(inf.size)
				n++
			}
		}
	} else {
		n++
@ -5,6 +5,7 @@
package norm

import (
	"bytes"
	"strings"
	"testing"
)
@ -495,15 +496,40 @@ func TestAppend(t *testing.T) {
	runAppendTests(t, "TestString", NFKC, stringF, appendTests)
}

func appendBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		f.Append(buf, in...)
	}
}

func iterBench(f Form, in []byte) func() {
	buf := make([]byte, 4*len(in))
	iter := Iter{}
	return func() {
		iter.SetInput(f, in)
		for !iter.Done() {
			iter.Next(buf)
		}
	}
}

func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
	//bm = append(bm, appendBench(f, in))
	bm = append(bm, iterBench(f, in))
	return bm
}

func doFormBenchmark(b *testing.B, inf, f Form, s string) {
	b.StopTimer()
	in := inf.Bytes([]byte(s))
	buf := make([]byte, 2*len(in))
	b.SetBytes(int64(len(in)))
	bm := appendBenchmarks(nil, f, in)
	b.SetBytes(int64(len(in) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		buf = f.Append(buf[0:0], in...)
		buf = buf[0:0]
		for _, fn := range bm {
			fn()
		}
	}
}
@ -549,17 +575,21 @@ func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}

var forms = []Form{NFC, NFD, NFKC, NFKD}

func doTextBenchmark(b *testing.B, s string) {
	b.StopTimer()
	b.SetBytes(int64(len(s)) * 4)
	in := []byte(s)
	var buf = make([]byte, 0, 2*len(in))
	bm := []func(){}
	for _, f := range forms {
		bm = appendBenchmarks(bm, f, in)
	}
	b.SetBytes(int64(len(s) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		NFC.Append(buf, in...)
		NFD.Append(buf, in...)
		NFKC.Append(buf, in...)
		NFKD.Append(buf, in...)
		for _, f := range bm {
			f()
		}
	}
}
@ -584,6 +614,11 @@ func BenchmarkJapanese(b *testing.B) {
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}

var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"

// Tests sampled from the Canonical ordering tests (Part 2) of
// http://unicode.org/Public/UNIDATA/NormalizationTest.txt
@ -220,6 +220,17 @@ func cmpIsNormal(t *Test, name string, f norm.Form, test string, result, want bo
func doTest(t *Test, f norm.Form, gold, test string) {
	result := f.Bytes([]byte(test))
	cmpResult(t, "Bytes", f, gold, test, string(result))
	sresult := f.String(test)
	cmpResult(t, "String", f, gold, test, sresult)
	buf := make([]byte, norm.MaxSegmentSize)
	acc := []byte{}
	i := norm.Iter{}
	i.SetInputString(f, test)
	for !i.Done() {
		n := i.Next(buf)
		acc = append(acc, buf[:n]...)
	}
	cmpResult(t, "Iter.Next", f, gold, test, string(acc))
	for i := range test {
		out := f.Append(f.Bytes([]byte(test[:i])), []byte(test[i:])...)
		cmpResult(t, fmt.Sprintf(":Append:%d", i), f, gold, test, string(out))