mirror of
https://github.com/golang/go
synced 2024-11-12 07:30:25 -07:00
exp/norm: delete, part of moving to go.text
See also https://golang.org/cl/7520044 R=golang-dev, rsc CC=golang-dev https://golang.org/cl/7533044
This commit is contained in:
parent
a30bede5ef
commit
d07978a0f7
@ -1,30 +0,0 @@
|
||||
# Copyright 2011 The Go Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
maketables: maketables.go triegen.go
|
||||
go build $^
|
||||
|
||||
maketesttables: maketesttables.go triegen.go
|
||||
go build $^
|
||||
|
||||
normregtest: normregtest.go
|
||||
go build $^
|
||||
|
||||
tables: maketables
|
||||
./maketables > tables.go
|
||||
gofmt -w tables.go
|
||||
|
||||
trietesttables: maketesttables
|
||||
./maketesttables > triedata_test.go
|
||||
gofmt -w triedata_test.go
|
||||
|
||||
# Downloads from www.unicode.org, so not part
|
||||
# of standard test scripts.
|
||||
test: testtables regtest
|
||||
|
||||
testtables: maketables
|
||||
./maketables -test -tables=
|
||||
|
||||
regtest: normregtest
|
||||
./normregtest
|
@ -1,382 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
const (
|
||||
maxCombiningChars = 30
|
||||
maxBufferSize = maxCombiningChars + 2 // +1 to hold starter +1 to hold CGJ
|
||||
maxBackRunes = maxCombiningChars - 1
|
||||
maxNFCExpansion = 3 // NFC(0x1D160)
|
||||
maxNFKCExpansion = 18 // NFKC(0xFDFA)
|
||||
|
||||
maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
|
||||
)
|
||||
|
||||
// reorderBuffer is used to normalize a single segment. Characters inserted with
|
||||
// insert are decomposed and reordered based on CCC. The compose method can
|
||||
// be used to recombine characters. Note that the byte buffer does not hold
|
||||
// the UTF-8 characters in order. Only the rune array is maintained in sorted
|
||||
// order. flush writes the resulting segment to a byte array.
|
||||
type reorderBuffer struct {
|
||||
rune [maxBufferSize]Properties // Per character info.
|
||||
byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos.
|
||||
nrune int // Number of runeInfos.
|
||||
nbyte uint8 // Number or bytes.
|
||||
f formInfo
|
||||
|
||||
src input
|
||||
nsrc int
|
||||
tmpBytes input
|
||||
}
|
||||
|
||||
func (rb *reorderBuffer) init(f Form, src []byte) {
|
||||
rb.f = *formTable[f]
|
||||
rb.src.setBytes(src)
|
||||
rb.nsrc = len(src)
|
||||
}
|
||||
|
||||
func (rb *reorderBuffer) initString(f Form, src string) {
|
||||
rb.f = *formTable[f]
|
||||
rb.src.setString(src)
|
||||
rb.nsrc = len(src)
|
||||
}
|
||||
|
||||
// reset discards all characters from the buffer.
|
||||
func (rb *reorderBuffer) reset() {
|
||||
rb.nrune = 0
|
||||
rb.nbyte = 0
|
||||
}
|
||||
|
||||
// flush appends the normalized segment to out and resets rb.
|
||||
func (rb *reorderBuffer) flush(out []byte) []byte {
|
||||
for i := 0; i < rb.nrune; i++ {
|
||||
start := rb.rune[i].pos
|
||||
end := start + rb.rune[i].size
|
||||
out = append(out, rb.byte[start:end]...)
|
||||
}
|
||||
rb.reset()
|
||||
return out
|
||||
}
|
||||
|
||||
// flushCopy copies the normalized segment to buf and resets rb.
|
||||
// It returns the number of bytes written to buf.
|
||||
func (rb *reorderBuffer) flushCopy(buf []byte) int {
|
||||
p := 0
|
||||
for i := 0; i < rb.nrune; i++ {
|
||||
runep := rb.rune[i]
|
||||
p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
|
||||
}
|
||||
rb.reset()
|
||||
return p
|
||||
}
|
||||
|
||||
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
|
||||
// It returns false if the buffer is not large enough to hold the rune.
|
||||
// It is used internally by insert and insertString only.
|
||||
func (rb *reorderBuffer) insertOrdered(info Properties) bool {
|
||||
n := rb.nrune
|
||||
if n >= maxCombiningChars+1 {
|
||||
return false
|
||||
}
|
||||
b := rb.rune[:]
|
||||
cc := info.ccc
|
||||
if cc > 0 {
|
||||
// Find insertion position + move elements to make room.
|
||||
for ; n > 0; n-- {
|
||||
if b[n-1].ccc <= cc {
|
||||
break
|
||||
}
|
||||
b[n] = b[n-1]
|
||||
}
|
||||
}
|
||||
rb.nrune += 1
|
||||
pos := uint8(rb.nbyte)
|
||||
rb.nbyte += utf8.UTFMax
|
||||
info.pos = pos
|
||||
b[n] = info
|
||||
return true
|
||||
}
|
||||
|
||||
// insert inserts the given rune in the buffer ordered by CCC.
|
||||
// It returns true if the buffer was large enough to hold the decomposed rune.
|
||||
func (rb *reorderBuffer) insert(src input, i int, info Properties) bool {
|
||||
if rune := src.hangul(i); rune != 0 {
|
||||
return rb.decomposeHangul(rune)
|
||||
}
|
||||
if info.hasDecomposition() {
|
||||
return rb.insertDecomposed(info.Decomposition())
|
||||
}
|
||||
return rb.insertSingle(src, i, info)
|
||||
}
|
||||
|
||||
// insertDecomposed inserts an entry in to the reorderBuffer for each rune
|
||||
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
|
||||
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool {
|
||||
saveNrune, saveNbyte := rb.nrune, rb.nbyte
|
||||
rb.tmpBytes.setBytes(dcomp)
|
||||
for i := 0; i < len(dcomp); {
|
||||
info := rb.f.info(rb.tmpBytes, i)
|
||||
pos := rb.nbyte
|
||||
if !rb.insertOrdered(info) {
|
||||
rb.nrune, rb.nbyte = saveNrune, saveNbyte
|
||||
return false
|
||||
}
|
||||
i += copy(rb.byte[pos:], dcomp[i:i+int(info.size)])
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// insertSingle inserts an entry in the reorderBuffer for the rune at
|
||||
// position i. info is the runeInfo for the rune at position i.
|
||||
func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) bool {
|
||||
// insertOrder changes nbyte
|
||||
pos := rb.nbyte
|
||||
if !rb.insertOrdered(info) {
|
||||
return false
|
||||
}
|
||||
src.copySlice(rb.byte[pos:], i, i+int(info.size))
|
||||
return true
|
||||
}
|
||||
|
||||
// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
|
||||
func (rb *reorderBuffer) appendRune(r rune) {
|
||||
bn := rb.nbyte
|
||||
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
|
||||
rb.nbyte += utf8.UTFMax
|
||||
rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
|
||||
rb.nrune++
|
||||
}
|
||||
|
||||
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) assignRune(pos int, r rune) {
|
||||
bn := rb.rune[pos].pos
|
||||
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
|
||||
rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
|
||||
}
|
||||
|
||||
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) runeAt(n int) rune {
|
||||
inf := rb.rune[n]
|
||||
r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
|
||||
return r
|
||||
}
|
||||
|
||||
// bytesAt returns the UTF-8 encoding of the rune at position n.
|
||||
// It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) bytesAt(n int) []byte {
|
||||
inf := rb.rune[n]
|
||||
return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
|
||||
}
|
||||
|
||||
// For Hangul we combine algorithmically, instead of using tables.
|
||||
const (
|
||||
hangulBase = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
|
||||
hangulBase0 = 0xEA
|
||||
hangulBase1 = 0xB0
|
||||
hangulBase2 = 0x80
|
||||
|
||||
hangulEnd = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
|
||||
hangulEnd0 = 0xED
|
||||
hangulEnd1 = 0x9E
|
||||
hangulEnd2 = 0xA4
|
||||
|
||||
jamoLBase = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
|
||||
jamoLBase0 = 0xE1
|
||||
jamoLBase1 = 0x84
|
||||
jamoLEnd = 0x1113
|
||||
jamoVBase = 0x1161
|
||||
jamoVEnd = 0x1176
|
||||
jamoTBase = 0x11A7
|
||||
jamoTEnd = 0x11C3
|
||||
|
||||
jamoTCount = 28
|
||||
jamoVCount = 21
|
||||
jamoVTCount = 21 * 28
|
||||
jamoLVTCount = 19 * 21 * 28
|
||||
)
|
||||
|
||||
const hangulUTF8Size = 3
|
||||
|
||||
func isHangul(b []byte) bool {
|
||||
if len(b) < hangulUTF8Size {
|
||||
return false
|
||||
}
|
||||
b0 := b[0]
|
||||
if b0 < hangulBase0 {
|
||||
return false
|
||||
}
|
||||
b1 := b[1]
|
||||
switch {
|
||||
case b0 == hangulBase0:
|
||||
return b1 >= hangulBase1
|
||||
case b0 < hangulEnd0:
|
||||
return true
|
||||
case b0 > hangulEnd0:
|
||||
return false
|
||||
case b1 < hangulEnd1:
|
||||
return true
|
||||
}
|
||||
return b1 == hangulEnd1 && b[2] < hangulEnd2
|
||||
}
|
||||
|
||||
func isHangulString(b string) bool {
|
||||
if len(b) < hangulUTF8Size {
|
||||
return false
|
||||
}
|
||||
b0 := b[0]
|
||||
if b0 < hangulBase0 {
|
||||
return false
|
||||
}
|
||||
b1 := b[1]
|
||||
switch {
|
||||
case b0 == hangulBase0:
|
||||
return b1 >= hangulBase1
|
||||
case b0 < hangulEnd0:
|
||||
return true
|
||||
case b0 > hangulEnd0:
|
||||
return false
|
||||
case b1 < hangulEnd1:
|
||||
return true
|
||||
}
|
||||
return b1 == hangulEnd1 && b[2] < hangulEnd2
|
||||
}
|
||||
|
||||
// Caller must ensure len(b) >= 2.
|
||||
func isJamoVT(b []byte) bool {
|
||||
// True if (rune & 0xff00) == jamoLBase
|
||||
return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
|
||||
}
|
||||
|
||||
func isHangulWithoutJamoT(b []byte) bool {
|
||||
c, _ := utf8.DecodeRune(b)
|
||||
c -= hangulBase
|
||||
return c < jamoLVTCount && c%jamoTCount == 0
|
||||
}
|
||||
|
||||
// decomposeHangul writes the decomposed Hangul to buf and returns the number
|
||||
// of bytes written. len(buf) should be at least 9.
|
||||
func decomposeHangul(buf []byte, r rune) int {
|
||||
const JamoUTF8Len = 3
|
||||
r -= hangulBase
|
||||
x := r % jamoTCount
|
||||
r /= jamoTCount
|
||||
utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
|
||||
utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
|
||||
if x != 0 {
|
||||
utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
|
||||
return 3 * JamoUTF8Len
|
||||
}
|
||||
return 2 * JamoUTF8Len
|
||||
}
|
||||
|
||||
// decomposeHangul algorithmically decomposes a Hangul rune into
|
||||
// its Jamo components.
|
||||
// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
|
||||
func (rb *reorderBuffer) decomposeHangul(r rune) bool {
|
||||
b := rb.rune[:]
|
||||
n := rb.nrune
|
||||
if n+3 > len(b) {
|
||||
return false
|
||||
}
|
||||
r -= hangulBase
|
||||
x := r % jamoTCount
|
||||
r /= jamoTCount
|
||||
rb.appendRune(jamoLBase + r/jamoVCount)
|
||||
rb.appendRune(jamoVBase + r%jamoVCount)
|
||||
if x != 0 {
|
||||
rb.appendRune(jamoTBase + x)
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// combineHangul algorithmically combines Jamo character components into Hangul.
|
||||
// See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
|
||||
func (rb *reorderBuffer) combineHangul(s, i, k int) {
|
||||
b := rb.rune[:]
|
||||
bn := rb.nrune
|
||||
for ; i < bn; i++ {
|
||||
cccB := b[k-1].ccc
|
||||
cccC := b[i].ccc
|
||||
if cccB == 0 {
|
||||
s = k - 1
|
||||
}
|
||||
if s != k-1 && cccB >= cccC {
|
||||
// b[i] is blocked by greater-equal cccX below it
|
||||
b[k] = b[i]
|
||||
k++
|
||||
} else {
|
||||
l := rb.runeAt(s) // also used to compare to hangulBase
|
||||
v := rb.runeAt(i) // also used to compare to jamoT
|
||||
switch {
|
||||
case jamoLBase <= l && l < jamoLEnd &&
|
||||
jamoVBase <= v && v < jamoVEnd:
|
||||
// 11xx plus 116x to LV
|
||||
rb.assignRune(s, hangulBase+
|
||||
(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
|
||||
case hangulBase <= l && l < hangulEnd &&
|
||||
jamoTBase < v && v < jamoTEnd &&
|
||||
((l-hangulBase)%jamoTCount) == 0:
|
||||
// ACxx plus 11Ax to LVT
|
||||
rb.assignRune(s, l+v-jamoTBase)
|
||||
default:
|
||||
b[k] = b[i]
|
||||
k++
|
||||
}
|
||||
}
|
||||
}
|
||||
rb.nrune = k
|
||||
}
|
||||
|
||||
// compose recombines the runes in the buffer.
|
||||
// It should only be used to recompose a single segment, as it will not
|
||||
// handle alternations between Hangul and non-Hangul characters correctly.
|
||||
func (rb *reorderBuffer) compose() {
|
||||
// UAX #15, section X5 , including Corrigendum #5
|
||||
// "In any character sequence beginning with starter S, a character C is
|
||||
// blocked from S if and only if there is some character B between S
|
||||
// and C, and either B is a starter or it has the same or higher
|
||||
// combining class as C."
|
||||
bn := rb.nrune
|
||||
if bn == 0 {
|
||||
return
|
||||
}
|
||||
k := 1
|
||||
b := rb.rune[:]
|
||||
for s, i := 0, 1; i < bn; i++ {
|
||||
if isJamoVT(rb.bytesAt(i)) {
|
||||
// Redo from start in Hangul mode. Necessary to support
|
||||
// U+320E..U+321E in NFKC mode.
|
||||
rb.combineHangul(s, i, k)
|
||||
return
|
||||
}
|
||||
ii := b[i]
|
||||
// We can only use combineForward as a filter if we later
|
||||
// get the info for the combined character. This is more
|
||||
// expensive than using the filter. Using combinesBackward()
|
||||
// is safe.
|
||||
if ii.combinesBackward() {
|
||||
cccB := b[k-1].ccc
|
||||
cccC := ii.ccc
|
||||
blocked := false // b[i] blocked by starter or greater or equal CCC?
|
||||
if cccB == 0 {
|
||||
s = k - 1
|
||||
} else {
|
||||
blocked = s != k-1 && cccB >= cccC
|
||||
}
|
||||
if !blocked {
|
||||
combined := combine(rb.runeAt(s), rb.runeAt(i))
|
||||
if combined != 0 {
|
||||
rb.assignRune(s, combined)
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
b[k] = b[i]
|
||||
k++
|
||||
}
|
||||
rb.nrune = k
|
||||
}
|
@ -1,143 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestCase is used for most tests.
|
||||
type TestCase struct {
|
||||
in []rune
|
||||
out []rune
|
||||
}
|
||||
|
||||
type insertFunc func(rb *reorderBuffer, r rune) bool
|
||||
|
||||
func insert(rb *reorderBuffer, r rune) bool {
|
||||
src := inputString(string(r))
|
||||
return rb.insert(src, 0, rb.f.info(src, 0))
|
||||
}
|
||||
|
||||
func runTests(t *testing.T, name string, fm Form, f insertFunc, tests []TestCase) {
|
||||
rb := reorderBuffer{}
|
||||
rb.init(fm, nil)
|
||||
for i, test := range tests {
|
||||
rb.reset()
|
||||
for j, rune := range test.in {
|
||||
b := []byte(string(rune))
|
||||
src := inputBytes(b)
|
||||
if !rb.insert(src, 0, rb.f.info(src, 0)) {
|
||||
t.Errorf("%s:%d: insert failed for rune %d", name, i, j)
|
||||
}
|
||||
}
|
||||
if rb.f.composing {
|
||||
rb.compose()
|
||||
}
|
||||
if rb.nrune != len(test.out) {
|
||||
t.Errorf("%s:%d: length = %d; want %d", name, i, rb.nrune, len(test.out))
|
||||
continue
|
||||
}
|
||||
for j, want := range test.out {
|
||||
found := rune(rb.runeAt(j))
|
||||
if found != want {
|
||||
t.Errorf("%s:%d: runeAt(%d) = %U; want %U", name, i, j, found, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type flushFunc func(rb *reorderBuffer) []byte
|
||||
|
||||
func testFlush(t *testing.T, name string, fn flushFunc) {
|
||||
rb := reorderBuffer{}
|
||||
rb.init(NFC, nil)
|
||||
out := fn(&rb)
|
||||
if len(out) != 0 {
|
||||
t.Errorf("%s: wrote bytes on flush of empty buffer. (len(out) = %d)", name, len(out))
|
||||
}
|
||||
|
||||
for _, r := range []rune("world!") {
|
||||
insert(&rb, r)
|
||||
}
|
||||
|
||||
out = []byte("Hello ")
|
||||
out = rb.flush(out)
|
||||
want := "Hello world!"
|
||||
if string(out) != want {
|
||||
t.Errorf(`%s: output after flush was "%s"; want "%s"`, name, string(out), want)
|
||||
}
|
||||
if rb.nrune != 0 {
|
||||
t.Errorf("%s: non-null size of info buffer (rb.nrune == %d)", name, rb.nrune)
|
||||
}
|
||||
if rb.nbyte != 0 {
|
||||
t.Errorf("%s: non-null size of byte buffer (rb.nbyte == %d)", name, rb.nbyte)
|
||||
}
|
||||
}
|
||||
|
||||
func flushF(rb *reorderBuffer) []byte {
|
||||
out := make([]byte, 0)
|
||||
return rb.flush(out)
|
||||
}
|
||||
|
||||
func flushCopyF(rb *reorderBuffer) []byte {
|
||||
out := make([]byte, maxByteBufferSize)
|
||||
n := rb.flushCopy(out)
|
||||
return out[:n]
|
||||
}
|
||||
|
||||
func TestFlush(t *testing.T) {
|
||||
testFlush(t, "flush", flushF)
|
||||
testFlush(t, "flushCopy", flushCopyF)
|
||||
}
|
||||
|
||||
var insertTests = []TestCase{
|
||||
{[]rune{'a'}, []rune{'a'}},
|
||||
{[]rune{0x300}, []rune{0x300}},
|
||||
{[]rune{0x300, 0x316}, []rune{0x316, 0x300}}, // CCC(0x300)==230; CCC(0x316)==220
|
||||
{[]rune{0x316, 0x300}, []rune{0x316, 0x300}},
|
||||
{[]rune{0x41, 0x316, 0x300}, []rune{0x41, 0x316, 0x300}},
|
||||
{[]rune{0x41, 0x300, 0x316}, []rune{0x41, 0x316, 0x300}},
|
||||
{[]rune{0x300, 0x316, 0x41}, []rune{0x316, 0x300, 0x41}},
|
||||
{[]rune{0x41, 0x300, 0x40, 0x316}, []rune{0x41, 0x300, 0x40, 0x316}},
|
||||
}
|
||||
|
||||
func TestInsert(t *testing.T) {
|
||||
runTests(t, "TestInsert", NFD, insert, insertTests)
|
||||
}
|
||||
|
||||
var decompositionNFDTest = []TestCase{
|
||||
{[]rune{0xC0}, []rune{0x41, 0x300}},
|
||||
{[]rune{0xAC00}, []rune{0x1100, 0x1161}},
|
||||
{[]rune{0x01C4}, []rune{0x01C4}},
|
||||
{[]rune{0x320E}, []rune{0x320E}},
|
||||
{[]rune("음ẻ과"), []rune{0x110B, 0x1173, 0x11B7, 0x65, 0x309, 0x1100, 0x116A}},
|
||||
}
|
||||
|
||||
var decompositionNFKDTest = []TestCase{
|
||||
{[]rune{0xC0}, []rune{0x41, 0x300}},
|
||||
{[]rune{0xAC00}, []rune{0x1100, 0x1161}},
|
||||
{[]rune{0x01C4}, []rune{0x44, 0x5A, 0x030C}},
|
||||
{[]rune{0x320E}, []rune{0x28, 0x1100, 0x1161, 0x29}},
|
||||
}
|
||||
|
||||
func TestDecomposition(t *testing.T) {
|
||||
runTests(t, "TestDecompositionNFD", NFD, insert, decompositionNFDTest)
|
||||
runTests(t, "TestDecompositionNFKD", NFKD, insert, decompositionNFKDTest)
|
||||
}
|
||||
|
||||
var compositionTest = []TestCase{
|
||||
{[]rune{0x41, 0x300}, []rune{0xC0}},
|
||||
{[]rune{0x41, 0x316}, []rune{0x41, 0x316}},
|
||||
{[]rune{0x41, 0x300, 0x35D}, []rune{0xC0, 0x35D}},
|
||||
{[]rune{0x41, 0x316, 0x300}, []rune{0xC0, 0x316}},
|
||||
// blocking starter
|
||||
{[]rune{0x41, 0x316, 0x40, 0x300}, []rune{0x41, 0x316, 0x40, 0x300}},
|
||||
{[]rune{0x1100, 0x1161}, []rune{0xAC00}},
|
||||
// parenthesized Hangul, alternate between ASCII and Hangul.
|
||||
{[]rune{0x28, 0x1100, 0x1161, 0x29}, []rune{0x28, 0xAC00, 0x29}},
|
||||
}
|
||||
|
||||
func TestComposition(t *testing.T) {
|
||||
runTests(t, "TestComposition", NFC, insert, compositionTest)
|
||||
}
|
@ -1,81 +0,0 @@
|
||||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"exp/norm"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// EqualSimple uses a norm.Iter to compare two non-normalized
|
||||
// strings for equivalence.
|
||||
func EqualSimple(a, b string) bool {
|
||||
var ia, ib norm.Iter
|
||||
ia.InitString(norm.NFKD, a)
|
||||
ib.InitString(norm.NFKD, b)
|
||||
for !ia.Done() && !ib.Done() {
|
||||
if !bytes.Equal(ia.Next(), ib.Next()) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return ia.Done() && ib.Done()
|
||||
}
|
||||
|
||||
// FindPrefix finds the longest common prefix of ASCII characters
|
||||
// of a and b.
|
||||
func FindPrefix(a, b string) int {
|
||||
i := 0
|
||||
for ; i < len(a) && i < len(b) && a[i] < utf8.RuneSelf && a[i] == b[i]; i++ {
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// EqualOpt is like EqualSimple, but optimizes the special
|
||||
// case for ASCII characters.
|
||||
func EqualOpt(a, b string) bool {
|
||||
n := FindPrefix(a, b)
|
||||
a, b = a[n:], b[n:]
|
||||
var ia, ib norm.Iter
|
||||
ia.InitString(norm.NFKD, a)
|
||||
ib.InitString(norm.NFKD, b)
|
||||
for !ia.Done() && !ib.Done() {
|
||||
if !bytes.Equal(ia.Next(), ib.Next()) {
|
||||
return false
|
||||
}
|
||||
if n := int64(FindPrefix(a[ia.Pos():], b[ib.Pos():])); n != 0 {
|
||||
ia.Seek(n, 1)
|
||||
ib.Seek(n, 1)
|
||||
}
|
||||
}
|
||||
return ia.Done() && ib.Done()
|
||||
}
|
||||
|
||||
var compareTests = []struct{ a, b string }{
|
||||
{"aaa", "aaa"},
|
||||
{"aaa", "aab"},
|
||||
{"a\u0300a", "\u00E0a"},
|
||||
{"a\u0300\u0320b", "a\u0320\u0300b"},
|
||||
{"\u1E0A\u0323", "\x44\u0323\u0307"},
|
||||
// A character that decomposes into multiple segments
|
||||
// spans several iterations.
|
||||
{"\u3304", "\u30A4\u30CB\u30F3\u30AF\u3099"},
|
||||
}
|
||||
|
||||
func ExampleIter() {
|
||||
for i, t := range compareTests {
|
||||
r0 := EqualSimple(t.a, t.b)
|
||||
r1 := EqualOpt(t.a, t.b)
|
||||
fmt.Printf("%d: %v %v\n", i, r0, r1)
|
||||
}
|
||||
// Output:
|
||||
// 0: true true
|
||||
// 1: false false
|
||||
// 2: true true
|
||||
// 3: true true
|
||||
// 4: true true
|
||||
// 5: true true
|
||||
}
|
@ -1,229 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
// This file contains Form-specific logic and wrappers for data in tables.go.
|
||||
|
||||
// Rune info is stored in a separate trie per composing form. A composing form
|
||||
// and its corresponding decomposing form share the same trie. Each trie maps
|
||||
// a rune to a uint16. The values take two forms. For v >= 0x8000:
|
||||
// bits
|
||||
// 0..8: ccc
|
||||
// 9..12: qcInfo (see below). isYesD is always true (no decompostion).
|
||||
// 16: 1
|
||||
// For v < 0x8000, the respective rune has a decomposition and v is an index
|
||||
// into a byte array of UTF-8 decomposition sequences and additional info and
|
||||
// has the form:
|
||||
// <header> <decomp_byte>* [<tccc> [<lccc>]]
|
||||
// The header contains the number of bytes in the decomposition (excluding this
|
||||
// length byte). The two most significant bits of this length byte correspond
|
||||
// to bit 2 and 3 of qcIfo (see below). The byte sequence itself starts at v+1.
|
||||
// The byte sequence is followed by a trailing and leading CCC if the values
|
||||
// for these are not zero. The value of v determines which ccc are appended
|
||||
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
|
||||
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC
|
||||
// there is an additional leading ccc.
|
||||
|
||||
const (
|
||||
qcInfoMask = 0xF // to clear all but the relevant bits in a qcInfo
|
||||
headerLenMask = 0x3F // extract the length value from the header byte
|
||||
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
|
||||
)
|
||||
|
||||
// Properties provides access to normalization properties of a rune.
|
||||
type Properties struct {
|
||||
pos uint8 // start position in reorderBuffer; used in composition.go
|
||||
size uint8 // length of UTF-8 encoding of this rune
|
||||
ccc uint8 // leading canonical combining class (ccc if not decomposition)
|
||||
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
|
||||
flags qcInfo // quick check flags
|
||||
index uint16
|
||||
}
|
||||
|
||||
// functions dispatchable per form
|
||||
type lookupFunc func(b input, i int) Properties
|
||||
|
||||
// formInfo holds Form-specific functions and tables.
|
||||
type formInfo struct {
|
||||
form Form
|
||||
composing, compatibility bool // form type
|
||||
info lookupFunc
|
||||
nextMain iterFunc
|
||||
}
|
||||
|
||||
var formTable []*formInfo
|
||||
|
||||
func init() {
|
||||
formTable = make([]*formInfo, 4)
|
||||
|
||||
for i := range formTable {
|
||||
f := &formInfo{}
|
||||
formTable[i] = f
|
||||
f.form = Form(i)
|
||||
if Form(i) == NFKD || Form(i) == NFKC {
|
||||
f.compatibility = true
|
||||
f.info = lookupInfoNFKC
|
||||
} else {
|
||||
f.info = lookupInfoNFC
|
||||
}
|
||||
f.nextMain = nextDecomposed
|
||||
if Form(i) == NFC || Form(i) == NFKC {
|
||||
f.nextMain = nextComposed
|
||||
f.composing = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
|
||||
// unexpected behavior for the user. For example, in NFD, there is a boundary
|
||||
// after 'a'. However, 'a' might combine with modifiers, so from the application's
|
||||
// perspective it is not a good boundary. We will therefore always use the
|
||||
// boundaries for the combining variants.
|
||||
|
||||
// BoundaryBefore returns true if this rune starts a new segment and
|
||||
// cannot combine with any rune on the left.
|
||||
func (p Properties) BoundaryBefore() bool {
|
||||
if p.ccc == 0 && !p.combinesBackward() {
|
||||
return true
|
||||
}
|
||||
// We assume that the CCC of the first character in a decomposition
|
||||
// is always non-zero if different from info.ccc and that we can return
|
||||
// false at this point. This is verified by maketables.
|
||||
return false
|
||||
}
|
||||
|
||||
// BoundaryAfter returns true if this rune cannot combine with runes to the right
|
||||
// and always denotes the end of a segment.
|
||||
func (p Properties) BoundaryAfter() bool {
|
||||
return p.isInert()
|
||||
}
|
||||
|
||||
// We pack quick check data in 4 bits:
|
||||
// 0: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
|
||||
// 1..2: NFC_QC Yes(00), No (10), or Maybe (11)
|
||||
// 3: Combines forward (0 == false, 1 == true)
|
||||
//
|
||||
// When all 4 bits are zero, the character is inert, meaning it is never
|
||||
// influenced by normalization.
|
||||
type qcInfo uint8
|
||||
|
||||
func (p Properties) isYesC() bool { return p.flags&0x4 == 0 }
|
||||
func (p Properties) isYesD() bool { return p.flags&0x1 == 0 }
|
||||
|
||||
func (p Properties) combinesForward() bool { return p.flags&0x8 != 0 }
|
||||
func (p Properties) combinesBackward() bool { return p.flags&0x2 != 0 } // == isMaybe
|
||||
func (p Properties) hasDecomposition() bool { return p.flags&0x1 != 0 } // == isNoD
|
||||
|
||||
func (p Properties) isInert() bool {
|
||||
return p.flags&0xf == 0 && p.ccc == 0
|
||||
}
|
||||
|
||||
func (p Properties) multiSegment() bool {
|
||||
return p.index >= firstMulti && p.index < endMulti
|
||||
}
|
||||
|
||||
// Decomposition returns the decomposition for the underlying rune
|
||||
// or nil if there is none.
|
||||
func (p Properties) Decomposition() []byte {
|
||||
if p.index == 0 {
|
||||
return nil
|
||||
}
|
||||
i := p.index
|
||||
n := decomps[i] & headerLenMask
|
||||
i++
|
||||
return decomps[i : i+uint16(n)]
|
||||
}
|
||||
|
||||
// Size returns the length of UTF-8 encoding of the rune.
|
||||
func (p Properties) Size() int {
|
||||
return int(p.size)
|
||||
}
|
||||
|
||||
// CCC returns the canonical combining class of the underlying rune.
|
||||
func (p Properties) CCC() uint8 {
|
||||
if p.index > firstCCCZeroExcept {
|
||||
return 0
|
||||
}
|
||||
return p.ccc
|
||||
}
|
||||
|
||||
// LeadCCC returns the CCC of the first rune in the decomposition.
|
||||
// If there is no decomposition, LeadCCC equals CCC.
|
||||
func (p Properties) LeadCCC() uint8 {
|
||||
return p.ccc
|
||||
}
|
||||
|
||||
// TrailCCC returns the CCC of the last rune in the decomposition.
|
||||
// If there is no decomposition, TrailCCC equals CCC.
|
||||
func (p Properties) TrailCCC() uint8 {
|
||||
return p.tccc
|
||||
}
|
||||
|
||||
// Recomposition
|
||||
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
|
||||
// This clips off the bits of three entries, but we know this will not
|
||||
// result in a collision. In the unlikely event that changes to
|
||||
// UnicodeData.txt introduce collisions, the compiler will catch it.
|
||||
// Note that the recomposition map for NFC and NFKC are identical.
|
||||
|
||||
// combine returns the combined rune or 0 if it doesn't exist.
|
||||
func combine(a, b rune) rune {
|
||||
key := uint32(uint16(a))<<16 + uint32(uint16(b))
|
||||
return recompMap[key]
|
||||
}
|
||||
|
||||
func lookupInfoNFC(b input, i int) Properties {
|
||||
v, sz := b.charinfoNFC(i)
|
||||
return compInfo(v, sz)
|
||||
}
|
||||
|
||||
func lookupInfoNFKC(b input, i int) Properties {
|
||||
v, sz := b.charinfoNFKC(i)
|
||||
return compInfo(v, sz)
|
||||
}
|
||||
|
||||
// Properties returns properties for the first rune in s.
|
||||
func (f Form) Properties(s []byte) Properties {
|
||||
if f == NFC || f == NFD {
|
||||
return compInfo(nfcTrie.lookup(s))
|
||||
}
|
||||
return compInfo(nfkcTrie.lookup(s))
|
||||
}
|
||||
|
||||
// PropertiesString returns properties for the first rune in s.
|
||||
func (f Form) PropertiesString(s string) Properties {
|
||||
if f == NFC || f == NFD {
|
||||
return compInfo(nfcTrie.lookupString(s))
|
||||
}
|
||||
return compInfo(nfkcTrie.lookupString(s))
|
||||
}
|
||||
|
||||
// compInfo converts the information contained in v and sz
|
||||
// to a Properties. See the comment at the top of the file
|
||||
// for more information on the format.
|
||||
func compInfo(v uint16, sz int) Properties {
|
||||
if v == 0 {
|
||||
return Properties{size: uint8(sz)}
|
||||
} else if v >= 0x8000 {
|
||||
return Properties{
|
||||
size: uint8(sz),
|
||||
ccc: uint8(v),
|
||||
tccc: uint8(v),
|
||||
flags: qcInfo(v>>8) & qcInfoMask,
|
||||
}
|
||||
}
|
||||
// has decomposition
|
||||
h := decomps[v]
|
||||
f := (qcInfo(h&headerFlagsMask) >> 4) | 0x1
|
||||
ri := Properties{size: uint8(sz), flags: f, index: v}
|
||||
if v >= firstCCC {
|
||||
v += uint16(h&headerLenMask) + 1
|
||||
ri.tccc = decomps[v]
|
||||
if v >= firstLeadingCCC {
|
||||
ri.ccc = decomps[v+1]
|
||||
}
|
||||
}
|
||||
return ri
|
||||
}
|
@ -1,105 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
type input struct {
|
||||
str string
|
||||
bytes []byte
|
||||
}
|
||||
|
||||
func inputBytes(str []byte) input {
|
||||
return input{bytes: str}
|
||||
}
|
||||
|
||||
func inputString(str string) input {
|
||||
return input{str: str}
|
||||
}
|
||||
|
||||
func (in *input) setBytes(str []byte) {
|
||||
in.str = ""
|
||||
in.bytes = str
|
||||
}
|
||||
|
||||
func (in *input) setString(str string) {
|
||||
in.str = str
|
||||
in.bytes = nil
|
||||
}
|
||||
|
||||
func (in *input) _byte(p int) byte {
|
||||
if in.bytes == nil {
|
||||
return in.str[p]
|
||||
}
|
||||
return in.bytes[p]
|
||||
}
|
||||
|
||||
func (in *input) skipASCII(p, max int) int {
|
||||
if in.bytes == nil {
|
||||
for ; p < max && in.str[p] < utf8.RuneSelf; p++ {
|
||||
}
|
||||
} else {
|
||||
for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ {
|
||||
}
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
func (in *input) skipNonStarter(p int) int {
|
||||
if in.bytes == nil {
|
||||
for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
|
||||
}
|
||||
} else {
|
||||
for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
|
||||
}
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
func (in *input) appendSlice(buf []byte, b, e int) []byte {
|
||||
if in.bytes != nil {
|
||||
return append(buf, in.bytes[b:e]...)
|
||||
}
|
||||
for i := b; i < e; i++ {
|
||||
buf = append(buf, in.str[i])
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func (in *input) copySlice(buf []byte, b, e int) int {
|
||||
if in.bytes == nil {
|
||||
return copy(buf, in.str[b:e])
|
||||
}
|
||||
return copy(buf, in.bytes[b:e])
|
||||
}
|
||||
|
||||
func (in *input) charinfoNFC(p int) (uint16, int) {
|
||||
if in.bytes == nil {
|
||||
return nfcTrie.lookupString(in.str[p:])
|
||||
}
|
||||
return nfcTrie.lookup(in.bytes[p:])
|
||||
}
|
||||
|
||||
func (in *input) charinfoNFKC(p int) (uint16, int) {
|
||||
if in.bytes == nil {
|
||||
return nfkcTrie.lookupString(in.str[p:])
|
||||
}
|
||||
return nfkcTrie.lookup(in.bytes[p:])
|
||||
}
|
||||
|
||||
func (in *input) hangul(p int) (r rune) {
|
||||
if in.bytes == nil {
|
||||
if !isHangulString(in.str[p:]) {
|
||||
return 0
|
||||
}
|
||||
r, _ = utf8.DecodeRuneInString(in.str[p:])
|
||||
} else {
|
||||
if !isHangul(in.bytes[p:]) {
|
||||
return 0
|
||||
}
|
||||
r, _ = utf8.DecodeRune(in.bytes[p:])
|
||||
}
|
||||
return r
|
||||
}
|
@ -1,401 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
const MaxSegmentSize = maxByteBufferSize
|
||||
|
||||
// An Iter iterates over a string or byte slice, while normalizing it
|
||||
// to a given Form.
|
||||
type Iter struct {
|
||||
rb reorderBuffer
|
||||
buf [maxByteBufferSize]byte
|
||||
info Properties // first character saved from previous iteration
|
||||
next iterFunc // implementation of next depends on form
|
||||
asciiF iterFunc
|
||||
|
||||
p int // current position in input source
|
||||
multiSeg []byte // remainder of multi-segment decomposition
|
||||
}
|
||||
|
||||
type iterFunc func(*Iter) []byte
|
||||
|
||||
// Init initializes i to iterate over src after normalizing it to Form f.
|
||||
func (i *Iter) Init(f Form, src []byte) {
|
||||
i.p = 0
|
||||
if len(src) == 0 {
|
||||
i.setDone()
|
||||
i.rb.nsrc = 0
|
||||
return
|
||||
}
|
||||
i.multiSeg = nil
|
||||
i.rb.init(f, src)
|
||||
i.next = i.rb.f.nextMain
|
||||
i.asciiF = nextASCIIBytes
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
}
|
||||
|
||||
// InitString initializes i to iterate over src after normalizing it to Form f.
|
||||
func (i *Iter) InitString(f Form, src string) {
|
||||
i.p = 0
|
||||
if len(src) == 0 {
|
||||
i.setDone()
|
||||
i.rb.nsrc = 0
|
||||
return
|
||||
}
|
||||
i.multiSeg = nil
|
||||
i.rb.initString(f, src)
|
||||
i.next = i.rb.f.nextMain
|
||||
i.asciiF = nextASCIIString
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
}
|
||||
|
||||
// Seek sets the segment to be returned by the next call to Next to start
|
||||
// at position p. It is the responsibility of the caller to set p to the
|
||||
// start of a UTF8 rune.
|
||||
func (i *Iter) Seek(offset int64, whence int) (int64, error) {
|
||||
var abs int64
|
||||
switch whence {
|
||||
case 0:
|
||||
abs = offset
|
||||
case 1:
|
||||
abs = int64(i.p) + offset
|
||||
case 2:
|
||||
abs = int64(i.rb.nsrc) + offset
|
||||
default:
|
||||
return 0, fmt.Errorf("norm: invalid whence")
|
||||
}
|
||||
if abs < 0 {
|
||||
return 0, fmt.Errorf("norm: negative position")
|
||||
}
|
||||
if int(abs) >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
return int64(i.p), nil
|
||||
}
|
||||
i.p = int(abs)
|
||||
i.multiSeg = nil
|
||||
i.next = i.rb.f.nextMain
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
return abs, nil
|
||||
}
|
||||
|
||||
// returnSlice returns a slice of the underlying input type as a byte slice.
|
||||
// If the underlying is of type []byte, it will simply return a slice.
|
||||
// If the underlying is of type string, it will copy the slice to the buffer
|
||||
// and return that.
|
||||
func (i *Iter) returnSlice(a, b int) []byte {
|
||||
if i.rb.src.bytes == nil {
|
||||
return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
|
||||
}
|
||||
return i.rb.src.bytes[a:b]
|
||||
}
|
||||
|
||||
// Pos returns the byte position at which the next call to Next will commence processing.
|
||||
func (i *Iter) Pos() int {
|
||||
return i.p
|
||||
}
|
||||
|
||||
func (i *Iter) setDone() {
|
||||
i.next = nextDone
|
||||
i.p = i.rb.nsrc
|
||||
}
|
||||
|
||||
// Done returns true if there is no more input to process.
|
||||
func (i *Iter) Done() bool {
|
||||
return i.p >= i.rb.nsrc
|
||||
}
|
||||
|
||||
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
|
||||
// For any input a and b for which f(a) == f(b), subsequent calls
|
||||
// to Next will return the same segments.
|
||||
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
|
||||
// Although not guaranteed, n will typically be the smallest possible n.
|
||||
func (i *Iter) Next() []byte {
|
||||
return i.next(i)
|
||||
}
|
||||
|
||||
func nextASCIIBytes(i *Iter) []byte {
|
||||
p := i.p + 1
|
||||
if p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
return i.rb.src.bytes[i.p:p]
|
||||
}
|
||||
if i.rb.src.bytes[p] < utf8.RuneSelf {
|
||||
p0 := i.p
|
||||
i.p = p
|
||||
return i.rb.src.bytes[p0:p]
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
i.next = i.rb.f.nextMain
|
||||
return i.next(i)
|
||||
}
|
||||
|
||||
func nextASCIIString(i *Iter) []byte {
|
||||
p := i.p + 1
|
||||
if p >= i.rb.nsrc {
|
||||
i.buf[0] = i.rb.src.str[i.p]
|
||||
i.setDone()
|
||||
return i.buf[:1]
|
||||
}
|
||||
if i.rb.src.str[p] < utf8.RuneSelf {
|
||||
i.buf[0] = i.rb.src.str[i.p]
|
||||
i.p = p
|
||||
return i.buf[:1]
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
i.next = i.rb.f.nextMain
|
||||
return i.next(i)
|
||||
}
|
||||
|
||||
func nextHangul(i *Iter) []byte {
|
||||
if r := i.rb.src.hangul(i.p); r != 0 {
|
||||
i.p += hangulUTF8Size
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
}
|
||||
return i.buf[:decomposeHangul(i.buf[:], r)]
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
i.next = i.rb.f.nextMain
|
||||
return i.next(i)
|
||||
}
|
||||
|
||||
func nextDone(i *Iter) []byte {
|
||||
return nil
|
||||
}
|
||||
|
||||
// nextMulti is used for iterating over multi-segment decompositions
|
||||
// for decomposing normal forms.
|
||||
func nextMulti(i *Iter) []byte {
|
||||
j := 0
|
||||
d := i.multiSeg
|
||||
// skip first rune
|
||||
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
|
||||
}
|
||||
for j < len(d) {
|
||||
info := i.rb.f.info(input{bytes: d}, j)
|
||||
if info.ccc == 0 {
|
||||
i.multiSeg = d[j:]
|
||||
return d[:j]
|
||||
}
|
||||
j += int(info.size)
|
||||
}
|
||||
// treat last segment as normal decomposition
|
||||
i.next = i.rb.f.nextMain
|
||||
return i.next(i)
|
||||
}
|
||||
|
||||
// nextMultiNorm is used for iterating over multi-segment decompositions
|
||||
// for composing normal forms.
|
||||
func nextMultiNorm(i *Iter) []byte {
|
||||
j := 0
|
||||
d := i.multiSeg
|
||||
// skip first rune
|
||||
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
|
||||
}
|
||||
for j < len(d) {
|
||||
info := i.rb.f.info(input{bytes: d}, j)
|
||||
if info.ccc == 0 {
|
||||
i.multiSeg = d[j:]
|
||||
return d[:j]
|
||||
}
|
||||
j += int(info.size)
|
||||
}
|
||||
i.multiSeg = nil
|
||||
i.next = nextComposed
|
||||
i.p++ // restore old valud of i.p. See nextComposed.
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
|
||||
func nextDecomposed(i *Iter) (next []byte) {
|
||||
startp, outp := i.p, 0
|
||||
inCopyStart, outCopyStart := i.p, 0
|
||||
for {
|
||||
if sz := int(i.info.size); sz <= 1 {
|
||||
p := i.p
|
||||
i.p++ // ASCII or illegal byte. Either way, advance by 1.
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
return i.returnSlice(p, i.p)
|
||||
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
|
||||
i.next = i.asciiF
|
||||
return i.returnSlice(p, i.p)
|
||||
}
|
||||
outp++
|
||||
} else if d := i.info.Decomposition(); d != nil {
|
||||
// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
|
||||
// Case 1: there is a leftover to copy. In this case the decomposition
|
||||
// must begin with a modifier and should always be appended.
|
||||
// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
|
||||
p := outp + len(d)
|
||||
if outp > 0 {
|
||||
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
||||
if p > len(i.buf) {
|
||||
return i.buf[:outp]
|
||||
}
|
||||
} else if i.info.multiSegment() {
|
||||
// outp must be 0 as multi-segment decompositions always
|
||||
// start a new segment.
|
||||
if i.multiSeg == nil {
|
||||
i.multiSeg = d
|
||||
i.next = nextMulti
|
||||
return nextMulti(i)
|
||||
}
|
||||
// We are in the last segment. Treat as normal decomposition.
|
||||
d = i.multiSeg
|
||||
i.multiSeg = nil
|
||||
p = len(d)
|
||||
}
|
||||
prevCC := i.info.tccc
|
||||
if i.p += sz; i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
i.info = Properties{} // Force BoundaryBefore to succeed.
|
||||
} else {
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
}
|
||||
if i.info.BoundaryBefore() {
|
||||
if outp > 0 {
|
||||
copy(i.buf[outp:], d)
|
||||
return i.buf[:p]
|
||||
}
|
||||
return d
|
||||
}
|
||||
copy(i.buf[outp:], d)
|
||||
outp = p
|
||||
inCopyStart, outCopyStart = i.p, outp
|
||||
if i.info.ccc < prevCC {
|
||||
goto doNorm
|
||||
}
|
||||
continue
|
||||
} else if r := i.rb.src.hangul(i.p); r != 0 {
|
||||
i.next = nextHangul
|
||||
i.p += hangulUTF8Size
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
}
|
||||
return i.buf[:decomposeHangul(i.buf[:], r)]
|
||||
} else {
|
||||
p := outp + sz
|
||||
if p > len(i.buf) {
|
||||
break
|
||||
}
|
||||
outp = p
|
||||
i.p += sz
|
||||
}
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
break
|
||||
}
|
||||
prevCC := i.info.tccc
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
if i.info.BoundaryBefore() {
|
||||
break
|
||||
} else if i.info.ccc < prevCC {
|
||||
goto doNorm
|
||||
}
|
||||
}
|
||||
if outCopyStart == 0 {
|
||||
return i.returnSlice(inCopyStart, i.p)
|
||||
} else if inCopyStart < i.p {
|
||||
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
||||
}
|
||||
return i.buf[:outp]
|
||||
doNorm:
|
||||
// Insert what we have decomposed so far in the reorderBuffer.
|
||||
// As we will only reorder, there will always be enough room.
|
||||
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
||||
if !i.rb.insertDecomposed(i.buf[0:outp]) {
|
||||
// Start over to prevent decompositions from crossing segment boundaries.
|
||||
// This is a rare occurrence.
|
||||
i.p = startp
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
}
|
||||
for {
|
||||
if !i.rb.insert(i.rb.src, i.p, i.info) {
|
||||
break
|
||||
}
|
||||
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
break
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
if i.info.ccc == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
// new segment or too many combining characters: exit normalization
|
||||
return i.buf[:i.rb.flushCopy(i.buf[:])]
|
||||
}
|
||||
|
||||
// nextComposed is the implementation of Next for forms NFC and NFKC.
|
||||
func nextComposed(i *Iter) []byte {
|
||||
outp, startp := 0, i.p
|
||||
var prevCC uint8
|
||||
for {
|
||||
if !i.info.isYesC() {
|
||||
goto doNorm
|
||||
}
|
||||
if cc := i.info.ccc; cc == 0 && outp > 0 {
|
||||
break
|
||||
} else if cc < prevCC {
|
||||
goto doNorm
|
||||
}
|
||||
prevCC = i.info.tccc
|
||||
sz := int(i.info.size)
|
||||
if sz == 0 {
|
||||
sz = 1 // illegal rune: copy byte-by-byte
|
||||
}
|
||||
p := outp + sz
|
||||
if p > len(i.buf) {
|
||||
break
|
||||
}
|
||||
outp = p
|
||||
i.p += sz
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
break
|
||||
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
|
||||
i.next = i.asciiF
|
||||
break
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
}
|
||||
return i.returnSlice(startp, i.p)
|
||||
doNorm:
|
||||
multi := false
|
||||
i.p = startp
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
for {
|
||||
if !i.rb.insert(i.rb.src, i.p, i.info) {
|
||||
break
|
||||
}
|
||||
multi = multi || i.info.multiSegment()
|
||||
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
break
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
if i.info.BoundaryBefore() {
|
||||
break
|
||||
}
|
||||
}
|
||||
i.rb.compose()
|
||||
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
|
||||
if multi {
|
||||
i.p-- // fake not being done yet
|
||||
i.multiSeg = seg
|
||||
i.next = nextMultiNorm
|
||||
return nextMultiNorm(i)
|
||||
}
|
||||
return seg
|
||||
}
|
@ -1,188 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func doIterNorm(f Form, s string) []byte {
|
||||
acc := []byte{}
|
||||
i := Iter{}
|
||||
i.InitString(f, s)
|
||||
for !i.Done() {
|
||||
acc = append(acc, i.Next()...)
|
||||
}
|
||||
return acc
|
||||
}
|
||||
|
||||
func runIterTests(t *testing.T, name string, f Form, tests []AppendTest, norm bool) {
|
||||
for i, test := range tests {
|
||||
in := test.left + test.right
|
||||
gold := test.out
|
||||
if norm {
|
||||
gold = string(f.AppendString(nil, test.out))
|
||||
}
|
||||
out := string(doIterNorm(f, in))
|
||||
if len(out) != len(gold) {
|
||||
const msg = "%s:%d: length is %d; want %d"
|
||||
t.Errorf(msg, name, i, len(out), len(gold))
|
||||
}
|
||||
if out != gold {
|
||||
// Find first rune that differs and show context.
|
||||
ir := []rune(out)
|
||||
ig := []rune(gold)
|
||||
t.Errorf("\n%X != \n%X", ir, ig)
|
||||
for j := 0; j < len(ir) && j < len(ig); j++ {
|
||||
if ir[j] == ig[j] {
|
||||
continue
|
||||
}
|
||||
if j -= 3; j < 0 {
|
||||
j = 0
|
||||
}
|
||||
for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
|
||||
const msg = "%s:%d: runeAt(%d) = %U; want %U"
|
||||
t.Errorf(msg, name, i, j, ir[j], ig[j])
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func rep(r rune, n int) string {
|
||||
return strings.Repeat(string(r), n)
|
||||
}
|
||||
|
||||
const segSize = maxByteBufferSize
|
||||
|
||||
var iterTests = []AppendTest{
|
||||
{"", ascii, ascii},
|
||||
{"", txt_all, txt_all},
|
||||
{"", "a" + rep(0x0300, segSize/2), "a" + rep(0x0300, segSize/2)},
|
||||
}
|
||||
|
||||
var iterTestsD = []AppendTest{
|
||||
{ // segment overflow on unchanged character
|
||||
"",
|
||||
"a" + rep(0x0300, segSize/2) + "\u0316",
|
||||
"a" + rep(0x0300, segSize/2-1) + "\u0316\u0300",
|
||||
},
|
||||
{ // segment overflow on unchanged character + start value
|
||||
"",
|
||||
"a" + rep(0x0300, segSize/2+maxCombiningChars+4) + "\u0316",
|
||||
"a" + rep(0x0300, segSize/2+maxCombiningChars) + "\u0316" + rep(0x300, 4),
|
||||
},
|
||||
{ // segment overflow on decomposition
|
||||
"",
|
||||
"a" + rep(0x0300, segSize/2-1) + "\u0340",
|
||||
"a" + rep(0x0300, segSize/2),
|
||||
},
|
||||
{ // segment overflow on decomposition + start value
|
||||
"",
|
||||
"a" + rep(0x0300, segSize/2-1) + "\u0340" + rep(0x300, maxCombiningChars+4) + "\u0320",
|
||||
"a" + rep(0x0300, segSize/2-1) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 4),
|
||||
},
|
||||
{ // start value after ASCII overflow
|
||||
"",
|
||||
rep('a', segSize) + rep(0x300, maxCombiningChars+2) + "\u0320",
|
||||
rep('a', segSize) + rep(0x300, maxCombiningChars) + "\u0320\u0300\u0300",
|
||||
},
|
||||
{ // start value after Hangul overflow
|
||||
"",
|
||||
rep(0xAC00, segSize/6) + rep(0x300, maxCombiningChars+2) + "\u0320",
|
||||
strings.Repeat("\u1100\u1161", segSize/6) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 1),
|
||||
},
|
||||
{ // start value after cc=0
|
||||
"",
|
||||
"您您" + rep(0x300, maxCombiningChars+4) + "\u0320",
|
||||
"您您" + rep(0x300, maxCombiningChars) + "\u0320" + rep(0x300, 4),
|
||||
},
|
||||
{ // start value after normalization
|
||||
"",
|
||||
"\u0300\u0320a" + rep(0x300, maxCombiningChars+4) + "\u0320",
|
||||
"\u0320\u0300a" + rep(0x300, maxCombiningChars) + "\u0320" + rep(0x300, 4),
|
||||
},
|
||||
}
|
||||
|
||||
var iterTestsC = []AppendTest{
|
||||
{ // ordering of non-composing combining characters
|
||||
"",
|
||||
"\u0305\u0316",
|
||||
"\u0316\u0305",
|
||||
},
|
||||
{ // segment overflow
|
||||
"",
|
||||
"a" + rep(0x0305, segSize/2+4) + "\u0316",
|
||||
"a" + rep(0x0305, segSize/2-1) + "\u0316" + rep(0x305, 5),
|
||||
},
|
||||
}
|
||||
|
||||
func TestIterNextD(t *testing.T) {
|
||||
runIterTests(t, "IterNextD1", NFKD, appendTests, true)
|
||||
runIterTests(t, "IterNextD2", NFKD, iterTests, true)
|
||||
runIterTests(t, "IterNextD3", NFKD, iterTestsD, false)
|
||||
}
|
||||
|
||||
func TestIterNextC(t *testing.T) {
|
||||
runIterTests(t, "IterNextC1", NFKC, appendTests, true)
|
||||
runIterTests(t, "IterNextC2", NFKC, iterTests, true)
|
||||
runIterTests(t, "IterNextC3", NFKC, iterTestsC, false)
|
||||
}
|
||||
|
||||
type SegmentTest struct {
|
||||
in string
|
||||
out []string
|
||||
}
|
||||
|
||||
var segmentTests = []SegmentTest{
|
||||
{"\u1E0A\u0323a", []string{"\x44\u0323\u0307", "a", ""}},
|
||||
{rep('a', segSize), append(strings.Split(rep('a', segSize), ""), "")},
|
||||
{rep('a', segSize+2), append(strings.Split(rep('a', segSize+2), ""), "")},
|
||||
{rep('a', segSize) + "\u0300aa",
|
||||
append(strings.Split(rep('a', segSize-1), ""), "a\u0300", "a", "a", "")},
|
||||
}
|
||||
|
||||
var segmentTestsK = []SegmentTest{
|
||||
{"\u3332", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u3099", ""}},
|
||||
// last segment of multi-segment decomposition needs normalization
|
||||
{"\u3332\u093C", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u093C\u3099", ""}},
|
||||
// Hangul and Jamo are grouped togeter.
|
||||
{"\uAC00", []string{"\u1100\u1161", ""}},
|
||||
{"\uAC01", []string{"\u1100\u1161\u11A8", ""}},
|
||||
{"\u1100\u1161", []string{"\u1100\u1161", ""}},
|
||||
}
|
||||
|
||||
// Note that, by design, segmentation is equal for composing and decomposing forms.
|
||||
func TestIterSegmentation(t *testing.T) {
|
||||
segmentTest(t, "SegmentTestD", NFD, segmentTests)
|
||||
segmentTest(t, "SegmentTestC", NFC, segmentTests)
|
||||
segmentTest(t, "SegmentTestD", NFKD, segmentTestsK)
|
||||
segmentTest(t, "SegmentTestC", NFKC, segmentTestsK)
|
||||
}
|
||||
|
||||
func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
|
||||
iter := Iter{}
|
||||
for i, tt := range tests {
|
||||
iter.InitString(f, tt.in)
|
||||
for j, seg := range tt.out {
|
||||
if seg == "" {
|
||||
if !iter.Done() {
|
||||
res := string(iter.Next())
|
||||
t.Errorf(`%s:%d:%d: expected Done()==true, found segment "%s"`, name, i, j, res)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if iter.Done() {
|
||||
t.Errorf("%s:%d:%d: Done()==true, want false", name, i, j)
|
||||
}
|
||||
seg = f.String(seg)
|
||||
if res := string(iter.Next()); res != seg {
|
||||
t.Errorf(`%s:%d:%d" segment was "%s" (%d); want "%s" (%d) %X %X`, name, i, j, res, len(res), seg, len(seg), []rune(res), []rune(seg))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,923 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// Normalization table generator.
|
||||
// Data read from the web.
|
||||
// See forminfo.go for a description of the trie values associated with each rune.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
loadUnicodeData()
|
||||
loadCompositionExclusions()
|
||||
completeCharFields(FCanonical)
|
||||
completeCharFields(FCompatibility)
|
||||
verifyComputed()
|
||||
printChars()
|
||||
makeTables()
|
||||
testDerived()
|
||||
}
|
||||
|
||||
var url = flag.String("url",
|
||||
"http://www.unicode.org/Public/"+unicode.Version+"/ucd/",
|
||||
"URL of Unicode database directory")
|
||||
var tablelist = flag.String("tables",
|
||||
"all",
|
||||
"comma-separated list of which tables to generate; "+
|
||||
"can be 'decomp', 'recomp', 'info' and 'all'")
|
||||
var test = flag.Bool("test",
|
||||
false,
|
||||
"test existing tables; can be used to compare web data with package data")
|
||||
var verbose = flag.Bool("verbose",
|
||||
false,
|
||||
"write data to stdout as it is parsed")
|
||||
var localFiles = flag.Bool("local",
|
||||
false,
|
||||
"data files have been copied to the current directory; for debugging only")
|
||||
|
||||
var logger = log.New(os.Stderr, "", log.Lshortfile)
|
||||
|
||||
// UnicodeData.txt has form:
|
||||
// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
|
||||
// 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
|
||||
// See http://unicode.org/reports/tr44/ for full explanation
|
||||
// The fields:
|
||||
const (
|
||||
FCodePoint = iota
|
||||
FName
|
||||
FGeneralCategory
|
||||
FCanonicalCombiningClass
|
||||
FBidiClass
|
||||
FDecompMapping
|
||||
FDecimalValue
|
||||
FDigitValue
|
||||
FNumericValue
|
||||
FBidiMirrored
|
||||
FUnicode1Name
|
||||
FISOComment
|
||||
FSimpleUppercaseMapping
|
||||
FSimpleLowercaseMapping
|
||||
FSimpleTitlecaseMapping
|
||||
NumField
|
||||
|
||||
MaxChar = 0x10FFFF // anything above this shouldn't exist
|
||||
)
|
||||
|
||||
// Quick Check properties of runes allow us to quickly
|
||||
// determine whether a rune may occur in a normal form.
|
||||
// For a given normal form, a rune may be guaranteed to occur
|
||||
// verbatim (QC=Yes), may or may not combine with another
|
||||
// rune (QC=Maybe), or may not occur (QC=No).
|
||||
type QCResult int
|
||||
|
||||
const (
|
||||
QCUnknown QCResult = iota
|
||||
QCYes
|
||||
QCNo
|
||||
QCMaybe
|
||||
)
|
||||
|
||||
func (r QCResult) String() string {
|
||||
switch r {
|
||||
case QCYes:
|
||||
return "Yes"
|
||||
case QCNo:
|
||||
return "No"
|
||||
case QCMaybe:
|
||||
return "Maybe"
|
||||
}
|
||||
return "***UNKNOWN***"
|
||||
}
|
||||
|
||||
const (
|
||||
FCanonical = iota // NFC or NFD
|
||||
FCompatibility // NFKC or NFKD
|
||||
FNumberOfFormTypes
|
||||
)
|
||||
|
||||
const (
|
||||
MComposed = iota // NFC or NFKC
|
||||
MDecomposed // NFD or NFKD
|
||||
MNumberOfModes
|
||||
)
|
||||
|
||||
// This contains only the properties we're interested in.
|
||||
type Char struct {
|
||||
name string
|
||||
codePoint rune // if zero, this index is not a valid code point.
|
||||
ccc uint8 // canonical combining class
|
||||
excludeInComp bool // from CompositionExclusions.txt
|
||||
compatDecomp bool // it has a compatibility expansion
|
||||
|
||||
forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
|
||||
|
||||
state State
|
||||
}
|
||||
|
||||
var chars = make([]Char, MaxChar+1)
|
||||
|
||||
func (c Char) String() string {
|
||||
buf := new(bytes.Buffer)
|
||||
|
||||
fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name)
|
||||
fmt.Fprintf(buf, " ccc: %v\n", c.ccc)
|
||||
fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp)
|
||||
fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp)
|
||||
fmt.Fprintf(buf, " state: %v\n", c.state)
|
||||
fmt.Fprintf(buf, " NFC:\n")
|
||||
fmt.Fprint(buf, c.forms[FCanonical])
|
||||
fmt.Fprintf(buf, " NFKC:\n")
|
||||
fmt.Fprint(buf, c.forms[FCompatibility])
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// In UnicodeData.txt, some ranges are marked like this:
|
||||
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
|
||||
// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
|
||||
// parseCharacter keeps a state variable indicating the weirdness.
|
||||
type State int
|
||||
|
||||
const (
|
||||
SNormal State = iota // known to be zero for the type
|
||||
SFirst
|
||||
SLast
|
||||
SMissing
|
||||
)
|
||||
|
||||
var lastChar = rune('\u0000')
|
||||
|
||||
func (c Char) isValid() bool {
|
||||
return c.codePoint != 0 && c.state != SMissing
|
||||
}
|
||||
|
||||
type FormInfo struct {
|
||||
quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
|
||||
verified [MNumberOfModes]bool // index: MComposed or MDecomposed
|
||||
|
||||
combinesForward bool // May combine with rune on the right
|
||||
combinesBackward bool // May combine with rune on the left
|
||||
isOneWay bool // Never appears in result
|
||||
inDecomp bool // Some decompositions result in this char.
|
||||
decomp Decomposition
|
||||
expandedDecomp Decomposition
|
||||
}
|
||||
|
||||
func (f FormInfo) String() string {
|
||||
buf := bytes.NewBuffer(make([]byte, 0))
|
||||
|
||||
fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed])
|
||||
fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed])
|
||||
fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward)
|
||||
fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward)
|
||||
fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay)
|
||||
fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp)
|
||||
fmt.Fprintf(buf, " decomposition: %X\n", f.decomp)
|
||||
fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp)
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
type Decomposition []rune
|
||||
|
||||
func openReader(file string) (input io.ReadCloser) {
|
||||
if *localFiles {
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
logger.Fatal(err)
|
||||
}
|
||||
input = f
|
||||
} else {
|
||||
path := *url + file
|
||||
resp, err := http.Get(path)
|
||||
if err != nil {
|
||||
logger.Fatal(err)
|
||||
}
|
||||
if resp.StatusCode != 200 {
|
||||
logger.Fatal("bad GET status for "+file, resp.Status)
|
||||
}
|
||||
input = resp.Body
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func parseDecomposition(s string, skipfirst bool) (a []rune, e error) {
|
||||
decomp := strings.Split(s, " ")
|
||||
if len(decomp) > 0 && skipfirst {
|
||||
decomp = decomp[1:]
|
||||
}
|
||||
for _, d := range decomp {
|
||||
point, err := strconv.ParseUint(d, 16, 64)
|
||||
if err != nil {
|
||||
return a, err
|
||||
}
|
||||
a = append(a, rune(point))
|
||||
}
|
||||
return a, nil
|
||||
}
|
||||
|
||||
func parseCharacter(line string) {
|
||||
field := strings.Split(line, ";")
|
||||
if len(field) != NumField {
|
||||
logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField)
|
||||
}
|
||||
x, err := strconv.ParseUint(field[FCodePoint], 16, 64)
|
||||
point := int(x)
|
||||
if err != nil {
|
||||
logger.Fatalf("%.5s...: %s", line, err)
|
||||
}
|
||||
if point == 0 {
|
||||
return // not interesting and we use 0 as unset
|
||||
}
|
||||
if point > MaxChar {
|
||||
logger.Fatalf("%5s: Rune %X > MaxChar (%X)", line, point, MaxChar)
|
||||
return
|
||||
}
|
||||
state := SNormal
|
||||
switch {
|
||||
case strings.Index(field[FName], ", First>") > 0:
|
||||
state = SFirst
|
||||
case strings.Index(field[FName], ", Last>") > 0:
|
||||
state = SLast
|
||||
}
|
||||
firstChar := lastChar + 1
|
||||
lastChar = rune(point)
|
||||
if state != SLast {
|
||||
firstChar = lastChar
|
||||
}
|
||||
x, err = strconv.ParseUint(field[FCanonicalCombiningClass], 10, 64)
|
||||
if err != nil {
|
||||
logger.Fatalf("%U: bad ccc field: %s", int(x), err)
|
||||
}
|
||||
ccc := uint8(x)
|
||||
decmap := field[FDecompMapping]
|
||||
exp, e := parseDecomposition(decmap, false)
|
||||
isCompat := false
|
||||
if e != nil {
|
||||
if len(decmap) > 0 {
|
||||
exp, e = parseDecomposition(decmap, true)
|
||||
if e != nil {
|
||||
logger.Fatalf(`%U: bad decomp |%v|: "%s"`, int(x), decmap, e)
|
||||
}
|
||||
isCompat = true
|
||||
}
|
||||
}
|
||||
for i := firstChar; i <= lastChar; i++ {
|
||||
char := &chars[i]
|
||||
char.name = field[FName]
|
||||
char.codePoint = i
|
||||
char.forms[FCompatibility].decomp = exp
|
||||
if !isCompat {
|
||||
char.forms[FCanonical].decomp = exp
|
||||
} else {
|
||||
char.compatDecomp = true
|
||||
}
|
||||
if len(decmap) > 0 {
|
||||
char.forms[FCompatibility].decomp = exp
|
||||
}
|
||||
char.ccc = ccc
|
||||
char.state = SMissing
|
||||
if i == lastChar {
|
||||
char.state = state
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func loadUnicodeData() {
|
||||
f := openReader("UnicodeData.txt")
|
||||
defer f.Close()
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
parseCharacter(scanner.Text())
|
||||
}
|
||||
if scanner.Err() != nil {
|
||||
logger.Fatal(scanner.Err())
|
||||
}
|
||||
}
|
||||
|
||||
var singlePointRe = regexp.MustCompile(`^([0-9A-F]+) *$`)
|
||||
|
||||
// CompositionExclusions.txt has form:
|
||||
// 0958 # ...
|
||||
// See http://unicode.org/reports/tr44/ for full explanation
|
||||
func parseExclusion(line string) int {
|
||||
comment := strings.Index(line, "#")
|
||||
if comment >= 0 {
|
||||
line = line[0:comment]
|
||||
}
|
||||
if len(line) == 0 {
|
||||
return 0
|
||||
}
|
||||
matches := singlePointRe.FindStringSubmatch(line)
|
||||
if len(matches) != 2 {
|
||||
logger.Fatalf("%s: %d matches (expected 1)\n", line, len(matches))
|
||||
}
|
||||
point, err := strconv.ParseUint(matches[1], 16, 64)
|
||||
if err != nil {
|
||||
logger.Fatalf("%.5s...: %s", line, err)
|
||||
}
|
||||
return int(point)
|
||||
}
|
||||
|
||||
func loadCompositionExclusions() {
|
||||
f := openReader("CompositionExclusions.txt")
|
||||
defer f.Close()
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
point := parseExclusion(scanner.Text())
|
||||
if point == 0 {
|
||||
continue
|
||||
}
|
||||
c := &chars[point]
|
||||
if c.excludeInComp {
|
||||
logger.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
|
||||
}
|
||||
c.excludeInComp = true
|
||||
}
|
||||
if scanner.Err() != nil {
|
||||
log.Fatal(scanner.Err())
|
||||
}
|
||||
}
|
||||
|
||||
// hasCompatDecomp returns true if any of the recursive
|
||||
// decompositions contains a compatibility expansion.
|
||||
// In this case, the character may not occur in NFK*.
|
||||
func hasCompatDecomp(r rune) bool {
|
||||
c := &chars[r]
|
||||
if c.compatDecomp {
|
||||
return true
|
||||
}
|
||||
for _, d := range c.forms[FCompatibility].decomp {
|
||||
if hasCompatDecomp(d) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Hangul related constants.
|
||||
const (
|
||||
HangulBase = 0xAC00
|
||||
HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)
|
||||
|
||||
JamoLBase = 0x1100
|
||||
JamoLEnd = 0x1113
|
||||
JamoVBase = 0x1161
|
||||
JamoVEnd = 0x1176
|
||||
JamoTBase = 0x11A8
|
||||
JamoTEnd = 0x11C3
|
||||
)
|
||||
|
||||
func isHangul(r rune) bool {
|
||||
return HangulBase <= r && r < HangulEnd
|
||||
}
|
||||
|
||||
func ccc(r rune) uint8 {
|
||||
return chars[r].ccc
|
||||
}
|
||||
|
||||
// Insert a rune in a buffer, ordered by Canonical Combining Class.
|
||||
func insertOrdered(b Decomposition, r rune) Decomposition {
|
||||
n := len(b)
|
||||
b = append(b, 0)
|
||||
cc := ccc(r)
|
||||
if cc > 0 {
|
||||
// Use bubble sort.
|
||||
for ; n > 0; n-- {
|
||||
if ccc(b[n-1]) <= cc {
|
||||
break
|
||||
}
|
||||
b[n] = b[n-1]
|
||||
}
|
||||
}
|
||||
b[n] = r
|
||||
return b
|
||||
}
|
||||
|
||||
// Recursively decompose.
|
||||
func decomposeRecursive(form int, r rune, d Decomposition) Decomposition {
|
||||
if isHangul(r) {
|
||||
return d
|
||||
}
|
||||
dcomp := chars[r].forms[form].decomp
|
||||
if len(dcomp) == 0 {
|
||||
return insertOrdered(d, r)
|
||||
}
|
||||
for _, c := range dcomp {
|
||||
d = decomposeRecursive(form, c, d)
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func completeCharFields(form int) {
|
||||
// Phase 0: pre-expand decomposition.
|
||||
for i := range chars {
|
||||
f := &chars[i].forms[form]
|
||||
if len(f.decomp) == 0 {
|
||||
continue
|
||||
}
|
||||
exp := make(Decomposition, 0)
|
||||
for _, c := range f.decomp {
|
||||
exp = decomposeRecursive(form, c, exp)
|
||||
}
|
||||
f.expandedDecomp = exp
|
||||
}
|
||||
|
||||
// Phase 1: composition exclusion, mark decomposition.
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
f := &c.forms[form]
|
||||
|
||||
// Marks script-specific exclusions and version restricted.
|
||||
f.isOneWay = c.excludeInComp
|
||||
|
||||
// Singletons
|
||||
f.isOneWay = f.isOneWay || len(f.decomp) == 1
|
||||
|
||||
// Non-starter decompositions
|
||||
if len(f.decomp) > 1 {
|
||||
chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
|
||||
f.isOneWay = f.isOneWay || chk
|
||||
}
|
||||
|
||||
// Runes that decompose into more than two runes.
|
||||
f.isOneWay = f.isOneWay || len(f.decomp) > 2
|
||||
|
||||
if form == FCompatibility {
|
||||
f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
|
||||
}
|
||||
|
||||
for _, r := range f.decomp {
|
||||
chars[r].forms[form].inDecomp = true
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: forward and backward combining.
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
f := &c.forms[form]
|
||||
|
||||
if !f.isOneWay && len(f.decomp) == 2 {
|
||||
f0 := &chars[f.decomp[0]].forms[form]
|
||||
f1 := &chars[f.decomp[1]].forms[form]
|
||||
if !f0.isOneWay {
|
||||
f0.combinesForward = true
|
||||
}
|
||||
if !f1.isOneWay {
|
||||
f1.combinesBackward = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 3: quick check values.
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
f := &c.forms[form]
|
||||
|
||||
switch {
|
||||
case len(f.decomp) > 0:
|
||||
f.quickCheck[MDecomposed] = QCNo
|
||||
case isHangul(rune(i)):
|
||||
f.quickCheck[MDecomposed] = QCNo
|
||||
default:
|
||||
f.quickCheck[MDecomposed] = QCYes
|
||||
}
|
||||
switch {
|
||||
case f.isOneWay:
|
||||
f.quickCheck[MComposed] = QCNo
|
||||
case (i & 0xffff00) == JamoLBase:
|
||||
f.quickCheck[MComposed] = QCYes
|
||||
if JamoLBase <= i && i < JamoLEnd {
|
||||
f.combinesForward = true
|
||||
}
|
||||
if JamoVBase <= i && i < JamoVEnd {
|
||||
f.quickCheck[MComposed] = QCMaybe
|
||||
f.combinesBackward = true
|
||||
f.combinesForward = true
|
||||
}
|
||||
if JamoTBase <= i && i < JamoTEnd {
|
||||
f.quickCheck[MComposed] = QCMaybe
|
||||
f.combinesBackward = true
|
||||
}
|
||||
case !f.combinesBackward:
|
||||
f.quickCheck[MComposed] = QCYes
|
||||
default:
|
||||
f.quickCheck[MComposed] = QCMaybe
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func printBytes(b []byte, name string) {
|
||||
fmt.Printf("// %s: %d bytes\n", name, len(b))
|
||||
fmt.Printf("var %s = [...]byte {", name)
|
||||
for i, c := range b {
|
||||
switch {
|
||||
case i%64 == 0:
|
||||
fmt.Printf("\n// Bytes %x - %x\n", i, i+63)
|
||||
case i%8 == 0:
|
||||
fmt.Printf("\n")
|
||||
}
|
||||
fmt.Printf("0x%.2X, ", c)
|
||||
}
|
||||
fmt.Print("\n}\n\n")
|
||||
}
|
||||
|
||||
// See forminfo.go for format.
|
||||
func makeEntry(f *FormInfo) uint16 {
|
||||
e := uint16(0)
|
||||
if f.combinesForward {
|
||||
e |= 0x8
|
||||
}
|
||||
if f.quickCheck[MDecomposed] == QCNo {
|
||||
e |= 0x1
|
||||
}
|
||||
switch f.quickCheck[MComposed] {
|
||||
case QCYes:
|
||||
case QCNo:
|
||||
e |= 0x4
|
||||
case QCMaybe:
|
||||
e |= 0x6
|
||||
default:
|
||||
log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed])
|
||||
}
|
||||
return e
|
||||
}
|
||||
|
||||
// decompSet keeps track of unique decompositions, grouped by whether
|
||||
// the decomposition is followed by a trailing and/or leading CCC.
|
||||
type decompSet [6]map[string]bool
|
||||
|
||||
const (
|
||||
normalDecomp = iota
|
||||
firstMulti
|
||||
firstCCC
|
||||
endMulti
|
||||
firstLeadingCCC
|
||||
firstCCCZeroExcept
|
||||
lastDecomp
|
||||
)
|
||||
|
||||
var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "lastDecomp"}
|
||||
|
||||
func makeDecompSet() decompSet {
|
||||
m := decompSet{}
|
||||
for i := range m {
|
||||
m[i] = make(map[string]bool)
|
||||
}
|
||||
return m
|
||||
}
|
||||
func (m *decompSet) insert(key int, s string) {
|
||||
m[key][s] = true
|
||||
}
|
||||
|
||||
func printCharInfoTables() int {
|
||||
mkstr := func(r rune, f *FormInfo) (int, string) {
|
||||
d := f.expandedDecomp
|
||||
s := string([]rune(d))
|
||||
if max := 1 << 6; len(s) >= max {
|
||||
const msg = "%U: too many bytes in decomposition: %d >= %d"
|
||||
logger.Fatalf(msg, r, len(s), max)
|
||||
}
|
||||
head := uint8(len(s))
|
||||
if f.quickCheck[MComposed] != QCYes {
|
||||
head |= 0x40
|
||||
}
|
||||
if f.combinesForward {
|
||||
head |= 0x80
|
||||
}
|
||||
s = string([]byte{head}) + s
|
||||
|
||||
lccc := ccc(d[0])
|
||||
tccc := ccc(d[len(d)-1])
|
||||
cc := ccc(r)
|
||||
if cc != 0 && lccc == 0 && tccc == 0 {
|
||||
logger.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
|
||||
}
|
||||
if tccc < lccc && lccc != 0 {
|
||||
const msg = "%U: lccc (%d) must be <= tcc (%d)"
|
||||
logger.Fatalf(msg, r, lccc, tccc)
|
||||
}
|
||||
index := normalDecomp
|
||||
if tccc > 0 || lccc > 0 {
|
||||
s += string([]byte{tccc})
|
||||
index = endMulti
|
||||
for _, r := range d[1:] {
|
||||
if ccc(r) == 0 {
|
||||
index = firstCCC
|
||||
}
|
||||
}
|
||||
if lccc > 0 {
|
||||
s += string([]byte{lccc})
|
||||
if index == firstCCC {
|
||||
logger.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
|
||||
}
|
||||
index = firstLeadingCCC
|
||||
}
|
||||
if cc != lccc {
|
||||
if cc != 0 {
|
||||
logger.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
|
||||
}
|
||||
index = firstCCCZeroExcept
|
||||
}
|
||||
} else if len(d) > 1 {
|
||||
index = firstMulti
|
||||
}
|
||||
return index, s
|
||||
}
|
||||
|
||||
decompSet := makeDecompSet()
|
||||
|
||||
// Store the uniqued decompositions in a byte buffer,
|
||||
// preceded by their byte length.
|
||||
for _, c := range chars {
|
||||
for _, f := range c.forms {
|
||||
if len(f.expandedDecomp) == 0 {
|
||||
continue
|
||||
}
|
||||
if f.combinesBackward {
|
||||
logger.Fatalf("%U: combinesBackward and decompose", c.codePoint)
|
||||
}
|
||||
index, s := mkstr(c.codePoint, &f)
|
||||
decompSet.insert(index, s)
|
||||
}
|
||||
}
|
||||
|
||||
decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
|
||||
size := 0
|
||||
positionMap := make(map[string]uint16)
|
||||
decompositions.WriteString("\000")
|
||||
fmt.Println("const (")
|
||||
for i, m := range decompSet {
|
||||
sa := []string{}
|
||||
for s := range m {
|
||||
sa = append(sa, s)
|
||||
}
|
||||
sort.Strings(sa)
|
||||
for _, s := range sa {
|
||||
p := decompositions.Len()
|
||||
decompositions.WriteString(s)
|
||||
positionMap[s] = uint16(p)
|
||||
}
|
||||
if cname[i] != "" {
|
||||
fmt.Printf("%s = 0x%X\n", cname[i], decompositions.Len())
|
||||
}
|
||||
}
|
||||
fmt.Println("maxDecomp = 0x8000")
|
||||
fmt.Println(")")
|
||||
b := decompositions.Bytes()
|
||||
printBytes(b, "decomps")
|
||||
size += len(b)
|
||||
|
||||
varnames := []string{"nfc", "nfkc"}
|
||||
for i := 0; i < FNumberOfFormTypes; i++ {
|
||||
trie := newNode()
|
||||
for r, c := range chars {
|
||||
f := c.forms[i]
|
||||
d := f.expandedDecomp
|
||||
if len(d) != 0 {
|
||||
_, key := mkstr(c.codePoint, &f)
|
||||
trie.insert(rune(r), positionMap[key])
|
||||
if c.ccc != ccc(d[0]) {
|
||||
// We assume the lead ccc of a decomposition !=0 in this case.
|
||||
if ccc(d[0]) == 0 {
|
||||
logger.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
|
||||
}
|
||||
}
|
||||
} else if v := makeEntry(&f)<<8 | uint16(c.ccc); v != 0 {
|
||||
trie.insert(c.codePoint, 0x8000|v)
|
||||
}
|
||||
}
|
||||
size += trie.printTables(varnames[i])
|
||||
}
|
||||
return size
|
||||
}
|
||||
|
||||
func contains(sa []string, s string) bool {
|
||||
for _, a := range sa {
|
||||
if a == s {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Extract the version number from the URL.
|
||||
func version() string {
|
||||
// From http://www.unicode.org/standard/versions/#Version_Numbering:
|
||||
// for the later Unicode versions, data files are located in
|
||||
// versioned directories.
|
||||
fields := strings.Split(*url, "/")
|
||||
for _, f := range fields {
|
||||
if match, _ := regexp.MatchString(`[0-9]\.[0-9]\.[0-9]`, f); match {
|
||||
return f
|
||||
}
|
||||
}
|
||||
logger.Fatal("unknown version")
|
||||
return "Unknown"
|
||||
}
|
||||
|
||||
const fileHeader = `// Generated by running
|
||||
// maketables --tables=%s --url=%s
|
||||
// DO NOT EDIT
|
||||
|
||||
package norm
|
||||
|
||||
`
|
||||
|
||||
func makeTables() {
|
||||
size := 0
|
||||
if *tablelist == "" {
|
||||
return
|
||||
}
|
||||
list := strings.Split(*tablelist, ",")
|
||||
if *tablelist == "all" {
|
||||
list = []string{"recomp", "info"}
|
||||
}
|
||||
fmt.Printf(fileHeader, *tablelist, *url)
|
||||
|
||||
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
|
||||
fmt.Printf("const Version = %q\n\n", version())
|
||||
|
||||
if contains(list, "info") {
|
||||
size += printCharInfoTables()
|
||||
}
|
||||
|
||||
if contains(list, "recomp") {
|
||||
// Note that we use 32 bit keys, instead of 64 bit.
|
||||
// This clips the bits of three entries, but we know
|
||||
// this won't cause a collision. The compiler will catch
|
||||
// any changes made to UnicodeData.txt that introduces
|
||||
// a collision.
|
||||
// Note that the recomposition map for NFC and NFKC
|
||||
// are identical.
|
||||
|
||||
// Recomposition map
|
||||
nrentries := 0
|
||||
for _, c := range chars {
|
||||
f := c.forms[FCanonical]
|
||||
if !f.isOneWay && len(f.decomp) > 0 {
|
||||
nrentries++
|
||||
}
|
||||
}
|
||||
sz := nrentries * 8
|
||||
size += sz
|
||||
fmt.Printf("// recompMap: %d bytes (entries only)\n", sz)
|
||||
fmt.Println("var recompMap = map[uint32]rune{")
|
||||
for i, c := range chars {
|
||||
f := c.forms[FCanonical]
|
||||
d := f.decomp
|
||||
if !f.isOneWay && len(d) > 0 {
|
||||
key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1]))
|
||||
fmt.Printf("0x%.8X: 0x%.4X,\n", key, i)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
fmt.Printf("// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
|
||||
}
|
||||
|
||||
func printChars() {
|
||||
if *verbose {
|
||||
for _, c := range chars {
|
||||
if !c.isValid() || c.state == SMissing {
|
||||
continue
|
||||
}
|
||||
fmt.Println(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// verifyComputed does various consistency tests.
|
||||
func verifyComputed() {
|
||||
for i, c := range chars {
|
||||
for _, f := range c.forms {
|
||||
isNo := (f.quickCheck[MDecomposed] == QCNo)
|
||||
if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) {
|
||||
log.Fatalf("%U: NF*D must be no if rune decomposes", i)
|
||||
}
|
||||
|
||||
isMaybe := f.quickCheck[MComposed] == QCMaybe
|
||||
if f.combinesBackward != isMaybe {
|
||||
log.Fatalf("%U: NF*C must be maybe if combinesBackward", i)
|
||||
}
|
||||
}
|
||||
nfc := c.forms[FCanonical]
|
||||
nfkc := c.forms[FCompatibility]
|
||||
if nfc.combinesBackward != nfkc.combinesBackward {
|
||||
logger.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var qcRe = regexp.MustCompile(`([0-9A-F\.]+) *; (NF.*_QC); ([YNM]) #.*`)
|
||||
|
||||
// Use values in DerivedNormalizationProps.txt to compare against the
|
||||
// values we computed.
|
||||
// DerivedNormalizationProps.txt has form:
|
||||
// 00C0..00C5 ; NFD_QC; N # ...
|
||||
// 0374 ; NFD_QC; N # ...
|
||||
// See http://unicode.org/reports/tr44/ for full explanation
|
||||
func testDerived() {
|
||||
if !*test {
|
||||
return
|
||||
}
|
||||
f := openReader("DerivedNormalizationProps.txt")
|
||||
defer f.Close()
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
qc := qcRe.FindStringSubmatch(line)
|
||||
if qc == nil {
|
||||
continue
|
||||
}
|
||||
rng := strings.Split(qc[1], "..")
|
||||
i, err := strconv.ParseUint(rng[0], 16, 64)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
j := i
|
||||
if len(rng) > 1 {
|
||||
j, err = strconv.ParseUint(rng[1], 16, 64)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
var ftype, mode int
|
||||
qt := strings.TrimSpace(qc[2])
|
||||
switch qt {
|
||||
case "NFC_QC":
|
||||
ftype, mode = FCanonical, MComposed
|
||||
case "NFD_QC":
|
||||
ftype, mode = FCanonical, MDecomposed
|
||||
case "NFKC_QC":
|
||||
ftype, mode = FCompatibility, MComposed
|
||||
case "NFKD_QC":
|
||||
ftype, mode = FCompatibility, MDecomposed
|
||||
default:
|
||||
log.Fatalf(`Unexpected quick check type "%s"`, qt)
|
||||
}
|
||||
var qr QCResult
|
||||
switch qc[3] {
|
||||
case "Y":
|
||||
qr = QCYes
|
||||
case "N":
|
||||
qr = QCNo
|
||||
case "M":
|
||||
qr = QCMaybe
|
||||
default:
|
||||
log.Fatalf(`Unexpected quick check value "%s"`, qc[3])
|
||||
}
|
||||
var lastFailed bool
|
||||
// Verify current
|
||||
for ; i <= j; i++ {
|
||||
c := &chars[int(i)]
|
||||
c.forms[ftype].verified[mode] = true
|
||||
curqr := c.forms[ftype].quickCheck[mode]
|
||||
if curqr != qr {
|
||||
if !lastFailed {
|
||||
logger.Printf("%s: %.4X..%.4X -- %s\n",
|
||||
qt, int(i), int(j), line[0:50])
|
||||
}
|
||||
logger.Printf("%U: FAILED %s (was %v need %v)\n",
|
||||
int(i), qt, curqr, qr)
|
||||
lastFailed = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if scanner.Err() != nil {
|
||||
logger.Fatal(scanner.Err())
|
||||
}
|
||||
// Any unspecified value must be QCYes. Verify this.
|
||||
for i, c := range chars {
|
||||
for j, fd := range c.forms {
|
||||
for k, qr := range fd.quickCheck {
|
||||
if !fd.verified[k] && qr != QCYes {
|
||||
m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
|
||||
logger.Printf(m, i, j, k, qr, c.name)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,45 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// Generate test data for trie code.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
func main() {
|
||||
printTestTables()
|
||||
}
|
||||
|
||||
// We take the smallest, largest and an arbitrary value for each
|
||||
// of the UTF-8 sequence lengths.
|
||||
var testRunes = []rune{
|
||||
0x01, 0x0C, 0x7F, // 1-byte sequences
|
||||
0x80, 0x100, 0x7FF, // 2-byte sequences
|
||||
0x800, 0x999, 0xFFFF, // 3-byte sequences
|
||||
0x10000, 0x10101, 0x10FFFF, // 4-byte sequences
|
||||
0x200, 0x201, 0x202, 0x210, 0x215, // five entries in one sparse block
|
||||
}
|
||||
|
||||
const fileHeader = `// Generated by running
|
||||
// maketesttables
|
||||
// DO NOT EDIT
|
||||
|
||||
package norm
|
||||
|
||||
`
|
||||
|
||||
func printTestTables() {
|
||||
fmt.Print(fileHeader)
|
||||
fmt.Printf("var testRunes = %#v\n\n", testRunes)
|
||||
t := newNode()
|
||||
for i, r := range testRunes {
|
||||
t.insert(r, uint16(i))
|
||||
}
|
||||
t.printTables("testdata")
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPlaceHolder(t *testing.T) {
|
||||
// Does nothing, just allows the Makefile to be canonical
|
||||
// while waiting for the package itself to be written.
|
||||
}
|
@ -1,478 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package norm contains types and functions for normalizing Unicode strings.
|
||||
package norm
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// A Form denotes a canonical representation of Unicode code points.
|
||||
// The Unicode-defined normalization and equivalence forms are:
|
||||
//
|
||||
// NFC Unicode Normalization Form C
|
||||
// NFD Unicode Normalization Form D
|
||||
// NFKC Unicode Normalization Form KC
|
||||
// NFKD Unicode Normalization Form KD
|
||||
//
|
||||
// For a Form f, this documentation uses the notation f(x) to mean
|
||||
// the bytes or string x converted to the given form.
|
||||
// A position n in x is called a boundary if conversion to the form can
|
||||
// proceed independently on both sides:
|
||||
// f(x) == append(f(x[0:n]), f(x[n:])...)
|
||||
//
|
||||
// References: http://unicode.org/reports/tr15/ and
|
||||
// http://unicode.org/notes/tn5/.
|
||||
type Form int
|
||||
|
||||
const (
|
||||
NFC Form = iota
|
||||
NFD
|
||||
NFKC
|
||||
NFKD
|
||||
)
|
||||
|
||||
// Bytes returns f(b). May return b if f(b) = b.
|
||||
func (f Form) Bytes(b []byte) []byte {
|
||||
rb := reorderBuffer{}
|
||||
rb.init(f, b)
|
||||
n := quickSpan(&rb, 0)
|
||||
if n == len(b) {
|
||||
return b
|
||||
}
|
||||
out := make([]byte, n, len(b))
|
||||
copy(out, b[0:n])
|
||||
return doAppend(&rb, out, n)
|
||||
}
|
||||
|
||||
// String returns f(s).
|
||||
func (f Form) String(s string) string {
|
||||
rb := reorderBuffer{}
|
||||
rb.initString(f, s)
|
||||
n := quickSpan(&rb, 0)
|
||||
if n == len(s) {
|
||||
return s
|
||||
}
|
||||
out := make([]byte, n, len(s))
|
||||
copy(out, s[0:n])
|
||||
return string(doAppend(&rb, out, n))
|
||||
}
|
||||
|
||||
// IsNormal returns true if b == f(b).
|
||||
func (f Form) IsNormal(b []byte) bool {
|
||||
rb := reorderBuffer{}
|
||||
rb.init(f, b)
|
||||
bp := quickSpan(&rb, 0)
|
||||
if bp == len(b) {
|
||||
return true
|
||||
}
|
||||
for bp < len(b) {
|
||||
decomposeSegment(&rb, bp)
|
||||
if rb.f.composing {
|
||||
rb.compose()
|
||||
}
|
||||
for i := 0; i < rb.nrune; i++ {
|
||||
info := rb.rune[i]
|
||||
if bp+int(info.size) > len(b) {
|
||||
return false
|
||||
}
|
||||
p := info.pos
|
||||
pe := p + info.size
|
||||
for ; p < pe; p++ {
|
||||
if b[bp] != rb.byte[p] {
|
||||
return false
|
||||
}
|
||||
bp++
|
||||
}
|
||||
}
|
||||
rb.reset()
|
||||
bp = quickSpan(&rb, bp)
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// IsNormalString returns true if s == f(s).
|
||||
func (f Form) IsNormalString(s string) bool {
|
||||
rb := reorderBuffer{}
|
||||
rb.initString(f, s)
|
||||
bp := quickSpan(&rb, 0)
|
||||
if bp == len(s) {
|
||||
return true
|
||||
}
|
||||
for bp < len(s) {
|
||||
decomposeSegment(&rb, bp)
|
||||
if rb.f.composing {
|
||||
rb.compose()
|
||||
}
|
||||
for i := 0; i < rb.nrune; i++ {
|
||||
info := rb.rune[i]
|
||||
if bp+int(info.size) > len(s) {
|
||||
return false
|
||||
}
|
||||
p := info.pos
|
||||
pe := p + info.size
|
||||
for ; p < pe; p++ {
|
||||
if s[bp] != rb.byte[p] {
|
||||
return false
|
||||
}
|
||||
bp++
|
||||
}
|
||||
}
|
||||
rb.reset()
|
||||
bp = quickSpan(&rb, bp)
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// patchTail fixes a case where a rune may be incorrectly normalized
|
||||
// if it is followed by illegal continuation bytes. It returns the
|
||||
// patched buffer and whether there were trailing continuation bytes.
|
||||
func patchTail(rb *reorderBuffer, buf []byte) ([]byte, bool) {
|
||||
info, p := lastRuneStart(&rb.f, buf)
|
||||
if p == -1 || info.size == 0 {
|
||||
return buf, false
|
||||
}
|
||||
end := p + int(info.size)
|
||||
extra := len(buf) - end
|
||||
if extra > 0 {
|
||||
// Potentially allocating memory. However, this only
|
||||
// happens with ill-formed UTF-8.
|
||||
x := make([]byte, 0)
|
||||
x = append(x, buf[len(buf)-extra:]...)
|
||||
buf = decomposeToLastBoundary(rb, buf[:end])
|
||||
if rb.f.composing {
|
||||
rb.compose()
|
||||
}
|
||||
buf = rb.flush(buf)
|
||||
return append(buf, x...), true
|
||||
}
|
||||
return buf, false
|
||||
}
|
||||
|
||||
func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {
|
||||
if rb.nsrc == i {
|
||||
return dst, i
|
||||
}
|
||||
end := quickSpan(rb, i)
|
||||
return rb.src.appendSlice(dst, i, end), end
|
||||
}
|
||||
|
||||
// Append returns f(append(out, b...)).
|
||||
// The buffer out must be nil, empty, or equal to f(out).
|
||||
func (f Form) Append(out []byte, src ...byte) []byte {
|
||||
if len(src) == 0 {
|
||||
return out
|
||||
}
|
||||
rb := reorderBuffer{}
|
||||
rb.init(f, src)
|
||||
return doAppend(&rb, out, 0)
|
||||
}
|
||||
|
||||
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
|
||||
src, n := rb.src, rb.nsrc
|
||||
doMerge := len(out) > 0
|
||||
if q := src.skipNonStarter(p); q > p {
|
||||
// Move leading non-starters to destination.
|
||||
out = src.appendSlice(out, p, q)
|
||||
buf, endsInError := patchTail(rb, out)
|
||||
if endsInError {
|
||||
out = buf
|
||||
doMerge = false // no need to merge, ends with illegal UTF-8
|
||||
} else {
|
||||
out = decomposeToLastBoundary(rb, buf) // force decomposition
|
||||
}
|
||||
p = q
|
||||
}
|
||||
fd := &rb.f
|
||||
if doMerge {
|
||||
var info Properties
|
||||
if p < n {
|
||||
info = fd.info(src, p)
|
||||
if p == 0 && !info.BoundaryBefore() {
|
||||
out = decomposeToLastBoundary(rb, out)
|
||||
}
|
||||
}
|
||||
if info.size == 0 || info.BoundaryBefore() {
|
||||
if fd.composing {
|
||||
rb.compose()
|
||||
}
|
||||
out = rb.flush(out)
|
||||
if info.size == 0 {
|
||||
// Append incomplete UTF-8 encoding.
|
||||
return src.appendSlice(out, p, n)
|
||||
}
|
||||
}
|
||||
}
|
||||
if rb.nrune == 0 {
|
||||
out, p = appendQuick(rb, out, p)
|
||||
}
|
||||
for p < n {
|
||||
p = decomposeSegment(rb, p)
|
||||
if fd.composing {
|
||||
rb.compose()
|
||||
}
|
||||
out = rb.flush(out)
|
||||
out, p = appendQuick(rb, out, p)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// AppendString returns f(append(out, []byte(s))).
|
||||
// The buffer out must be nil, empty, or equal to f(out).
|
||||
func (f Form) AppendString(out []byte, src string) []byte {
|
||||
if len(src) == 0 {
|
||||
return out
|
||||
}
|
||||
rb := reorderBuffer{}
|
||||
rb.initString(f, src)
|
||||
return doAppend(&rb, out, 0)
|
||||
}
|
||||
|
||||
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
|
||||
// It is not guaranteed to return the largest such n.
|
||||
func (f Form) QuickSpan(b []byte) int {
|
||||
rb := reorderBuffer{}
|
||||
rb.init(f, b)
|
||||
n := quickSpan(&rb, 0)
|
||||
return n
|
||||
}
|
||||
|
||||
func quickSpan(rb *reorderBuffer, i int) int {
|
||||
var lastCC uint8
|
||||
var nc int
|
||||
lastSegStart := i
|
||||
src, n := rb.src, rb.nsrc
|
||||
for i < n {
|
||||
if j := src.skipASCII(i, n); i != j {
|
||||
i = j
|
||||
lastSegStart = i - 1
|
||||
lastCC = 0
|
||||
nc = 0
|
||||
continue
|
||||
}
|
||||
info := rb.f.info(src, i)
|
||||
if info.size == 0 {
|
||||
// include incomplete runes
|
||||
return n
|
||||
}
|
||||
cc := info.ccc
|
||||
if rb.f.composing {
|
||||
if !info.isYesC() {
|
||||
break
|
||||
}
|
||||
} else {
|
||||
if !info.isYesD() {
|
||||
break
|
||||
}
|
||||
}
|
||||
if cc == 0 {
|
||||
lastSegStart = i
|
||||
nc = 0
|
||||
} else {
|
||||
if nc >= maxCombiningChars {
|
||||
lastSegStart = i
|
||||
lastCC = cc
|
||||
nc = 1
|
||||
} else {
|
||||
if lastCC > cc {
|
||||
return lastSegStart
|
||||
}
|
||||
nc++
|
||||
}
|
||||
}
|
||||
lastCC = cc
|
||||
i += int(info.size)
|
||||
}
|
||||
if i == n {
|
||||
return n
|
||||
}
|
||||
if rb.f.composing {
|
||||
return lastSegStart
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
|
||||
// It is not guaranteed to return the largest such n.
|
||||
func (f Form) QuickSpanString(s string) int {
|
||||
rb := reorderBuffer{}
|
||||
rb.initString(f, s)
|
||||
return quickSpan(&rb, 0)
|
||||
}
|
||||
|
||||
// FirstBoundary returns the position i of the first boundary in b
|
||||
// or -1 if b contains no boundary.
|
||||
func (f Form) FirstBoundary(b []byte) int {
|
||||
rb := reorderBuffer{}
|
||||
rb.init(f, b)
|
||||
return firstBoundary(&rb)
|
||||
}
|
||||
|
||||
func firstBoundary(rb *reorderBuffer) int {
|
||||
src, nsrc := rb.src, rb.nsrc
|
||||
i := src.skipNonStarter(0)
|
||||
if i >= nsrc {
|
||||
return -1
|
||||
}
|
||||
fd := &rb.f
|
||||
info := fd.info(src, i)
|
||||
for n := 0; info.size != 0 && !info.BoundaryBefore(); {
|
||||
i += int(info.size)
|
||||
if n++; n >= maxCombiningChars {
|
||||
return i
|
||||
}
|
||||
if i >= nsrc {
|
||||
if !info.BoundaryAfter() {
|
||||
return -1
|
||||
}
|
||||
return nsrc
|
||||
}
|
||||
info = fd.info(src, i)
|
||||
}
|
||||
if info.size == 0 {
|
||||
return -1
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// FirstBoundaryInString returns the position i of the first boundary in s
|
||||
// or -1 if s contains no boundary.
|
||||
func (f Form) FirstBoundaryInString(s string) int {
|
||||
rb := reorderBuffer{}
|
||||
rb.initString(f, s)
|
||||
return firstBoundary(&rb)
|
||||
}
|
||||
|
||||
// LastBoundary returns the position i of the last boundary in b
|
||||
// or -1 if b contains no boundary.
|
||||
func (f Form) LastBoundary(b []byte) int {
|
||||
return lastBoundary(formTable[f], b)
|
||||
}
|
||||
|
||||
func lastBoundary(fd *formInfo, b []byte) int {
|
||||
i := len(b)
|
||||
info, p := lastRuneStart(fd, b)
|
||||
if p == -1 {
|
||||
return -1
|
||||
}
|
||||
if info.size == 0 { // ends with incomplete rune
|
||||
if p == 0 { // starts with incomplete rune
|
||||
return -1
|
||||
}
|
||||
i = p
|
||||
info, p = lastRuneStart(fd, b[:i])
|
||||
if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
|
||||
return i
|
||||
}
|
||||
}
|
||||
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
|
||||
return i
|
||||
}
|
||||
if info.BoundaryAfter() {
|
||||
return i
|
||||
}
|
||||
i = p
|
||||
for n := 0; i >= 0 && !info.BoundaryBefore(); {
|
||||
info, p = lastRuneStart(fd, b[:i])
|
||||
if n++; n >= maxCombiningChars {
|
||||
return len(b)
|
||||
}
|
||||
if p+int(info.size) != i {
|
||||
if p == -1 { // no boundary found
|
||||
return -1
|
||||
}
|
||||
return i // boundary after an illegal UTF-8 encoding
|
||||
}
|
||||
i = p
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// decomposeSegment scans the first segment in src into rb.
|
||||
// It returns the number of bytes consumed from src.
|
||||
// TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)
|
||||
// when we detect a sequence of 30+ non-starter chars.
|
||||
func decomposeSegment(rb *reorderBuffer, sp int) int {
|
||||
// Force one character to be consumed.
|
||||
info := rb.f.info(rb.src, sp)
|
||||
if info.size == 0 {
|
||||
return 0
|
||||
}
|
||||
for rb.insert(rb.src, sp, info) {
|
||||
sp += int(info.size)
|
||||
if sp >= rb.nsrc {
|
||||
break
|
||||
}
|
||||
info = rb.f.info(rb.src, sp)
|
||||
bound := info.BoundaryBefore()
|
||||
if bound || info.size == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
return sp
|
||||
}
|
||||
|
||||
// lastRuneStart returns the runeInfo and position of the last
|
||||
// rune in buf or the zero runeInfo and -1 if no rune was found.
|
||||
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
|
||||
p := len(buf) - 1
|
||||
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
|
||||
}
|
||||
if p < 0 {
|
||||
return Properties{}, -1
|
||||
}
|
||||
return fd.info(inputBytes(buf), p), p
|
||||
}
|
||||
|
||||
// decomposeToLastBoundary finds an open segment at the end of the buffer
|
||||
// and scans it into rb. Returns the buffer minus the last segment.
|
||||
func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
|
||||
fd := &rb.f
|
||||
info, i := lastRuneStart(fd, buf)
|
||||
if int(info.size) != len(buf)-i {
|
||||
// illegal trailing continuation bytes
|
||||
return buf
|
||||
}
|
||||
if info.BoundaryAfter() {
|
||||
return buf
|
||||
}
|
||||
var add [maxBackRunes]Properties // stores runeInfo in reverse order
|
||||
add[0] = info
|
||||
padd := 1
|
||||
n := 1
|
||||
p := len(buf) - int(info.size)
|
||||
for ; p >= 0 && !info.BoundaryBefore(); p -= int(info.size) {
|
||||
info, i = lastRuneStart(fd, buf[:p])
|
||||
if int(info.size) != p-i {
|
||||
break
|
||||
}
|
||||
// Check that decomposition doesn't result in overflow.
|
||||
if info.hasDecomposition() {
|
||||
if isHangul(buf) {
|
||||
i += int(info.size)
|
||||
n++
|
||||
} else {
|
||||
dcomp := info.Decomposition()
|
||||
for i := 0; i < len(dcomp); {
|
||||
inf := rb.f.info(inputBytes(dcomp), i)
|
||||
i += int(inf.size)
|
||||
n++
|
||||
}
|
||||
}
|
||||
} else {
|
||||
n++
|
||||
}
|
||||
if n > maxBackRunes {
|
||||
break
|
||||
}
|
||||
add[padd] = info
|
||||
padd++
|
||||
}
|
||||
pp := p
|
||||
for padd--; padd >= 0; padd-- {
|
||||
info = add[padd]
|
||||
rb.insert(inputBytes(buf), pp, info)
|
||||
pp += int(info.size)
|
||||
}
|
||||
return buf[:p]
|
||||
}
|
@ -1,750 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
type PositionTest struct {
|
||||
input string
|
||||
pos int
|
||||
buffer string // expected contents of reorderBuffer, if applicable
|
||||
}
|
||||
|
||||
type positionFunc func(rb *reorderBuffer, s string) int
|
||||
|
||||
func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
|
||||
rb := reorderBuffer{}
|
||||
rb.init(f, nil)
|
||||
for i, test := range tests {
|
||||
rb.reset()
|
||||
rb.src = inputString(test.input)
|
||||
rb.nsrc = len(test.input)
|
||||
pos := fn(&rb, test.input)
|
||||
if pos != test.pos {
|
||||
t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
|
||||
}
|
||||
runes := []rune(test.buffer)
|
||||
if rb.nrune != len(runes) {
|
||||
t.Errorf("%s:%d: reorder buffer length is %d; want %d", name, i, rb.nrune, len(runes))
|
||||
continue
|
||||
}
|
||||
for j, want := range runes {
|
||||
found := rune(rb.runeAt(j))
|
||||
if found != want {
|
||||
t.Errorf("%s:%d: rune at %d is %U; want %U", name, i, j, found, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var decomposeSegmentTests = []PositionTest{
|
||||
// illegal runes
|
||||
{"\xC0", 0, ""},
|
||||
{"\u00E0\x80", 2, "\u0061\u0300"},
|
||||
// starter
|
||||
{"a", 1, "a"},
|
||||
{"ab", 1, "a"},
|
||||
// starter + composing
|
||||
{"a\u0300", 3, "a\u0300"},
|
||||
{"a\u0300b", 3, "a\u0300"},
|
||||
// with decomposition
|
||||
{"\u00C0", 2, "A\u0300"},
|
||||
{"\u00C0b", 2, "A\u0300"},
|
||||
// long
|
||||
{strings.Repeat("\u0300", 31), 62, strings.Repeat("\u0300", 31)},
|
||||
// ends with incomplete UTF-8 encoding
|
||||
{"\xCC", 0, ""},
|
||||
{"\u0300\xCC", 2, "\u0300"},
|
||||
}
|
||||
|
||||
func decomposeSegmentF(rb *reorderBuffer, s string) int {
|
||||
rb.src = inputString(s)
|
||||
rb.nsrc = len(s)
|
||||
return decomposeSegment(rb, 0)
|
||||
}
|
||||
|
||||
func TestDecomposeSegment(t *testing.T) {
|
||||
runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
|
||||
}
|
||||
|
||||
var firstBoundaryTests = []PositionTest{
|
||||
// no boundary
|
||||
{"", -1, ""},
|
||||
{"\u0300", -1, ""},
|
||||
{"\x80\x80", -1, ""},
|
||||
// illegal runes
|
||||
{"\xff", 0, ""},
|
||||
{"\u0300\xff", 2, ""},
|
||||
{"\u0300\xc0\x80\x80", 2, ""},
|
||||
// boundaries
|
||||
{"a", 0, ""},
|
||||
{"\u0300a", 2, ""},
|
||||
// Hangul
|
||||
{"\u1103\u1161", 0, ""},
|
||||
{"\u110B\u1173\u11B7", 0, ""},
|
||||
{"\u1161\u110B\u1173\u11B7", 3, ""},
|
||||
{"\u1173\u11B7\u1103\u1161", 6, ""},
|
||||
// too many combining characters.
|
||||
{strings.Repeat("\u0300", maxCombiningChars-1), -1, ""},
|
||||
{strings.Repeat("\u0300", maxCombiningChars), 60, ""},
|
||||
{strings.Repeat("\u0300", maxCombiningChars+1), 60, ""},
|
||||
}
|
||||
|
||||
func firstBoundaryF(rb *reorderBuffer, s string) int {
|
||||
return rb.f.form.FirstBoundary([]byte(s))
|
||||
}
|
||||
|
||||
func firstBoundaryStringF(rb *reorderBuffer, s string) int {
|
||||
return rb.f.form.FirstBoundaryInString(s)
|
||||
}
|
||||
|
||||
func TestFirstBoundary(t *testing.T) {
|
||||
runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
|
||||
runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
|
||||
}
|
||||
|
||||
var decomposeToLastTests = []PositionTest{
|
||||
// ends with inert character
|
||||
{"Hello!", 6, ""},
|
||||
{"\u0632", 2, ""},
|
||||
{"a\u0301\u0635", 5, ""},
|
||||
// ends with non-inert starter
|
||||
{"a", 0, "a"},
|
||||
{"a\u0301a", 3, "a"},
|
||||
{"a\u0301\u03B9", 3, "\u03B9"},
|
||||
{"a\u0327", 0, "a\u0327"},
|
||||
// illegal runes
|
||||
{"\xFF", 1, ""},
|
||||
{"aa\xFF", 3, ""},
|
||||
{"\xC0\x80\x80", 3, ""},
|
||||
{"\xCC\x80\x80", 3, ""},
|
||||
// ends with incomplete UTF-8 encoding
|
||||
{"a\xCC", 2, ""},
|
||||
// ends with combining characters
|
||||
{"\u0300\u0301", 0, "\u0300\u0301"},
|
||||
{"a\u0300\u0301", 0, "a\u0300\u0301"},
|
||||
{"a\u0301\u0308", 0, "a\u0301\u0308"},
|
||||
{"a\u0308\u0301", 0, "a\u0308\u0301"},
|
||||
{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
|
||||
{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
|
||||
{"\u00C0", 0, "A\u0300"},
|
||||
{"a\u00C0", 1, "A\u0300"},
|
||||
// decomposing
|
||||
{"a\u0300\uFDC0", 3, "\u0645\u062C\u064A"},
|
||||
{"\uFDC0" + strings.Repeat("\u0300", 26), 0, "\u0645\u062C\u064A" + strings.Repeat("\u0300", 26)},
|
||||
// Hangul
|
||||
{"a\u1103", 1, "\u1103"},
|
||||
{"a\u110B", 1, "\u110B"},
|
||||
{"a\u110B\u1173", 1, "\u110B\u1173"},
|
||||
// See comment in composition.go:compBoundaryAfter.
|
||||
{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
|
||||
{"a\uC73C", 1, "\u110B\u1173"},
|
||||
{"다음", 3, "\u110B\u1173\u11B7"},
|
||||
{"다", 0, "\u1103\u1161"},
|
||||
{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
|
||||
{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
|
||||
{"다음음", 6, "\u110B\u1173\u11B7"},
|
||||
{"음다다", 6, "\u1103\u1161"},
|
||||
// buffer overflow
|
||||
{"a" + strings.Repeat("\u0300", 30), 3, strings.Repeat("\u0300", 29)},
|
||||
{"\uFDFA" + strings.Repeat("\u0300", 14), 3, strings.Repeat("\u0300", 14)},
|
||||
// weird UTF-8
|
||||
{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
|
||||
}
|
||||
|
||||
func decomposeToLast(rb *reorderBuffer, s string) int {
|
||||
buf := decomposeToLastBoundary(rb, []byte(s))
|
||||
return len(buf)
|
||||
}
|
||||
|
||||
func TestDecomposeToLastBoundary(t *testing.T) {
|
||||
runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
|
||||
}
|
||||
|
||||
var lastBoundaryTests = []PositionTest{
|
||||
// ends with inert character
|
||||
{"Hello!", 6, ""},
|
||||
{"\u0632", 2, ""},
|
||||
// ends with non-inert starter
|
||||
{"a", 0, ""},
|
||||
// illegal runes
|
||||
{"\xff", 1, ""},
|
||||
{"aa\xff", 3, ""},
|
||||
{"a\xff\u0300", 1, ""},
|
||||
{"\xc0\x80\x80", 3, ""},
|
||||
{"\xc0\x80\x80\u0300", 3, ""},
|
||||
// ends with incomplete UTF-8 encoding
|
||||
{"\xCC", -1, ""},
|
||||
{"\xE0\x80", -1, ""},
|
||||
{"\xF0\x80\x80", -1, ""},
|
||||
{"a\xCC", 0, ""},
|
||||
{"\x80\xCC", 1, ""},
|
||||
{"\xCC\xCC", 1, ""},
|
||||
// ends with combining characters
|
||||
{"a\u0300\u0301", 0, ""},
|
||||
{"aaaa\u0300\u0301", 3, ""},
|
||||
{"\u0300a\u0300\u0301", 2, ""},
|
||||
{"\u00C0", 0, ""},
|
||||
{"a\u00C0", 1, ""},
|
||||
// decomposition may recombine
|
||||
{"\u0226", 0, ""},
|
||||
// no boundary
|
||||
{"", -1, ""},
|
||||
{"\u0300\u0301", -1, ""},
|
||||
{"\u0300", -1, ""},
|
||||
{"\x80\x80", -1, ""},
|
||||
{"\x80\x80\u0301", -1, ""},
|
||||
// Hangul
|
||||
{"다음", 3, ""},
|
||||
{"다", 0, ""},
|
||||
{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
|
||||
{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
|
||||
// too many combining characters.
|
||||
{strings.Repeat("\u0300", maxCombiningChars-1), -1, ""},
|
||||
{strings.Repeat("\u0300", maxCombiningChars), 60, ""},
|
||||
{strings.Repeat("\u0300", maxCombiningChars+1), 62, ""},
|
||||
}
|
||||
|
||||
func lastBoundaryF(rb *reorderBuffer, s string) int {
|
||||
return rb.f.form.LastBoundary([]byte(s))
|
||||
}
|
||||
|
||||
func TestLastBoundary(t *testing.T) {
|
||||
runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
|
||||
}
|
||||
|
||||
var quickSpanTests = []PositionTest{
|
||||
{"", 0, ""},
|
||||
// starters
|
||||
{"a", 1, ""},
|
||||
{"abc", 3, ""},
|
||||
{"\u043Eb", 3, ""},
|
||||
// incomplete last rune.
|
||||
{"\xCC", 1, ""},
|
||||
{"a\xCC", 2, ""},
|
||||
// incorrectly ordered combining characters
|
||||
{"\u0300\u0316", 0, ""},
|
||||
{"\u0300\u0316cd", 0, ""},
|
||||
// have a maximum number of combining characters.
|
||||
{strings.Repeat("\u035D", 30) + "\u035B", 62, ""},
|
||||
{"a" + strings.Repeat("\u035D", 30) + "\u035B", 63, ""},
|
||||
{"Ɵ" + strings.Repeat("\u035D", 30) + "\u035B", 64, ""},
|
||||
{"aa" + strings.Repeat("\u035D", 30) + "\u035B", 64, ""},
|
||||
}
|
||||
|
||||
var quickSpanNFDTests = []PositionTest{
|
||||
// needs decomposing
|
||||
{"\u00C0", 0, ""},
|
||||
{"abc\u00C0", 3, ""},
|
||||
// correctly ordered combining characters
|
||||
{"\u0300", 2, ""},
|
||||
{"ab\u0300", 4, ""},
|
||||
{"ab\u0300cd", 6, ""},
|
||||
{"\u0300cd", 4, ""},
|
||||
{"\u0316\u0300", 4, ""},
|
||||
{"ab\u0316\u0300", 6, ""},
|
||||
{"ab\u0316\u0300cd", 8, ""},
|
||||
{"ab\u0316\u0300\u00C0", 6, ""},
|
||||
{"\u0316\u0300cd", 6, ""},
|
||||
{"\u043E\u0308b", 5, ""},
|
||||
// incorrectly ordered combining characters
|
||||
{"ab\u0300\u0316", 1, ""}, // TODO: we could skip 'b' as well.
|
||||
{"ab\u0300\u0316cd", 1, ""},
|
||||
// Hangul
|
||||
{"같은", 0, ""},
|
||||
}
|
||||
|
||||
var quickSpanNFCTests = []PositionTest{
|
||||
// okay composed
|
||||
{"\u00C0", 2, ""},
|
||||
{"abc\u00C0", 5, ""},
|
||||
// correctly ordered combining characters
|
||||
{"ab\u0300", 1, ""},
|
||||
{"ab\u0300cd", 1, ""},
|
||||
{"ab\u0316\u0300", 1, ""},
|
||||
{"ab\u0316\u0300cd", 1, ""},
|
||||
{"\u00C0\u035D", 4, ""},
|
||||
// we do not special case leading combining characters
|
||||
{"\u0300cd", 0, ""},
|
||||
{"\u0300", 0, ""},
|
||||
{"\u0316\u0300", 0, ""},
|
||||
{"\u0316\u0300cd", 0, ""},
|
||||
// incorrectly ordered combining characters
|
||||
{"ab\u0300\u0316", 1, ""},
|
||||
{"ab\u0300\u0316cd", 1, ""},
|
||||
// Hangul
|
||||
{"같은", 6, ""},
|
||||
}
|
||||
|
||||
func doQuickSpan(rb *reorderBuffer, s string) int {
|
||||
return rb.f.form.QuickSpan([]byte(s))
|
||||
}
|
||||
|
||||
func doQuickSpanString(rb *reorderBuffer, s string) int {
|
||||
return rb.f.form.QuickSpanString(s)
|
||||
}
|
||||
|
||||
func TestQuickSpan(t *testing.T) {
|
||||
runPosTests(t, "TestQuickSpanNFD1", NFD, doQuickSpan, quickSpanTests)
|
||||
runPosTests(t, "TestQuickSpanNFD2", NFD, doQuickSpan, quickSpanNFDTests)
|
||||
runPosTests(t, "TestQuickSpanNFC1", NFC, doQuickSpan, quickSpanTests)
|
||||
runPosTests(t, "TestQuickSpanNFC2", NFC, doQuickSpan, quickSpanNFCTests)
|
||||
|
||||
runPosTests(t, "TestQuickSpanStringNFD1", NFD, doQuickSpanString, quickSpanTests)
|
||||
runPosTests(t, "TestQuickSpanStringNFD2", NFD, doQuickSpanString, quickSpanNFDTests)
|
||||
runPosTests(t, "TestQuickSpanStringNFC1", NFC, doQuickSpanString, quickSpanTests)
|
||||
runPosTests(t, "TestQuickSpanStringNFC2", NFC, doQuickSpanString, quickSpanNFCTests)
|
||||
}
|
||||
|
||||
var isNormalTests = []PositionTest{
|
||||
{"", 1, ""},
|
||||
// illegal runes
|
||||
{"\xff", 1, ""},
|
||||
// starters
|
||||
{"a", 1, ""},
|
||||
{"abc", 1, ""},
|
||||
{"\u043Eb", 1, ""},
|
||||
// incorrectly ordered combining characters
|
||||
{"\u0300\u0316", 0, ""},
|
||||
{"ab\u0300\u0316", 0, ""},
|
||||
{"ab\u0300\u0316cd", 0, ""},
|
||||
{"\u0300\u0316cd", 0, ""},
|
||||
}
|
||||
var isNormalNFDTests = []PositionTest{
|
||||
// needs decomposing
|
||||
{"\u00C0", 0, ""},
|
||||
{"abc\u00C0", 0, ""},
|
||||
// correctly ordered combining characters
|
||||
{"\u0300", 1, ""},
|
||||
{"ab\u0300", 1, ""},
|
||||
{"ab\u0300cd", 1, ""},
|
||||
{"\u0300cd", 1, ""},
|
||||
{"\u0316\u0300", 1, ""},
|
||||
{"ab\u0316\u0300", 1, ""},
|
||||
{"ab\u0316\u0300cd", 1, ""},
|
||||
{"\u0316\u0300cd", 1, ""},
|
||||
{"\u043E\u0308b", 1, ""},
|
||||
// Hangul
|
||||
{"같은", 0, ""},
|
||||
}
|
||||
var isNormalNFCTests = []PositionTest{
|
||||
// okay composed
|
||||
{"\u00C0", 1, ""},
|
||||
{"abc\u00C0", 1, ""},
|
||||
// need reordering
|
||||
{"a\u0300", 0, ""},
|
||||
{"a\u0300cd", 0, ""},
|
||||
{"a\u0316\u0300", 0, ""},
|
||||
{"a\u0316\u0300cd", 0, ""},
|
||||
// correctly ordered combining characters
|
||||
{"ab\u0300", 1, ""},
|
||||
{"ab\u0300cd", 1, ""},
|
||||
{"ab\u0316\u0300", 1, ""},
|
||||
{"ab\u0316\u0300cd", 1, ""},
|
||||
{"\u00C0\u035D", 1, ""},
|
||||
{"\u0300", 1, ""},
|
||||
{"\u0316\u0300cd", 1, ""},
|
||||
// Hangul
|
||||
{"같은", 1, ""},
|
||||
}
|
||||
|
||||
func isNormalF(rb *reorderBuffer, s string) int {
|
||||
if rb.f.form.IsNormal([]byte(s)) {
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func TestIsNormal(t *testing.T) {
|
||||
runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
|
||||
runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
|
||||
runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
|
||||
runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
|
||||
}
|
||||
|
||||
type AppendTest struct {
|
||||
left string
|
||||
right string
|
||||
out string
|
||||
}
|
||||
|
||||
type appendFunc func(f Form, out []byte, s string) []byte
|
||||
|
||||
func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
|
||||
for i, test := range tests {
|
||||
out := []byte(test.left)
|
||||
out = fn(f, out, test.right)
|
||||
outs := string(out)
|
||||
if len(outs) != len(test.out) {
|
||||
t.Errorf("%s:%d: length is %d; want %d", name, i, len(outs), len(test.out))
|
||||
}
|
||||
if outs != test.out {
|
||||
// Find first rune that differs and show context.
|
||||
ir := []rune(outs)
|
||||
ig := []rune(test.out)
|
||||
for j := 0; j < len(ir) && j < len(ig); j++ {
|
||||
if ir[j] == ig[j] {
|
||||
continue
|
||||
}
|
||||
if j -= 3; j < 0 {
|
||||
j = 0
|
||||
}
|
||||
for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
|
||||
t.Errorf("%s:%d: runeAt(%d) = %U; want %U", name, i, j, ir[j], ig[j])
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var appendTests = []AppendTest{
|
||||
// empty buffers
|
||||
{"", "", ""},
|
||||
{"a", "", "a"},
|
||||
{"", "a", "a"},
|
||||
{"", "\u0041\u0307\u0304", "\u01E0"},
|
||||
// segment split across buffers
|
||||
{"", "a\u0300b", "\u00E0b"},
|
||||
{"a", "\u0300b", "\u00E0b"},
|
||||
{"a", "\u0300\u0316", "\u00E0\u0316"},
|
||||
{"a", "\u0316\u0300", "\u00E0\u0316"},
|
||||
{"a", "\u0300a\u0300", "\u00E0\u00E0"},
|
||||
{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
|
||||
{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
|
||||
{"a\u0300", "\u0327", "\u00E0\u0327"},
|
||||
{"a\u0327", "\u0300", "\u00E0\u0327"},
|
||||
{"a\u0316", "\u0300", "\u00E0\u0316"},
|
||||
{"\u0041\u0307", "\u0304", "\u01E0"},
|
||||
// Hangul
|
||||
{"", "\u110B\u1173", "\uC73C"},
|
||||
{"", "\u1103\u1161", "\uB2E4"},
|
||||
{"", "\u110B\u1173\u11B7", "\uC74C"},
|
||||
{"", "\u320E", "\x28\uAC00\x29"},
|
||||
{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
|
||||
{"\u1103", "\u1161", "\uB2E4"},
|
||||
{"\u110B", "\u1173\u11B7", "\uC74C"},
|
||||
{"\u110B\u1173", "\u11B7", "\uC74C"},
|
||||
{"\uC73C", "\u11B7", "\uC74C"},
|
||||
// UTF-8 encoding split across buffers
|
||||
{"a\xCC", "\x80", "\u00E0"},
|
||||
{"a\xCC", "\x80b", "\u00E0b"},
|
||||
{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
|
||||
{"a\xCC", "\x80\x80", "\u00E0\x80"},
|
||||
{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
|
||||
{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
|
||||
// ending in incomplete UTF-8 encoding
|
||||
{"", "\xCC", "\xCC"},
|
||||
{"a", "\xCC", "a\xCC"},
|
||||
{"a", "b\xCC", "ab\xCC"},
|
||||
{"\u0226", "\xCC", "\u0226\xCC"},
|
||||
// illegal runes
|
||||
{"", "\x80", "\x80"},
|
||||
{"", "\x80\x80\x80", "\x80\x80\x80"},
|
||||
{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
|
||||
{"", "a\x80", "a\x80"},
|
||||
{"", "a\x80\x80\x80", "a\x80\x80\x80"},
|
||||
{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
|
||||
{"a", "\x80\x80\x80", "a\x80\x80\x80"},
|
||||
// overflow
|
||||
{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
|
||||
{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
|
||||
{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
|
||||
// overflow of combining characters
|
||||
{strings.Repeat("\u0300", 33), "", strings.Repeat("\u0300", 33)},
|
||||
// weird UTF-8
|
||||
{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
|
||||
{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
|
||||
{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
|
||||
{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
|
||||
{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
|
||||
{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
|
||||
{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
|
||||
{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
|
||||
}
|
||||
|
||||
func appendF(f Form, out []byte, s string) []byte {
|
||||
return f.Append(out, []byte(s)...)
|
||||
}
|
||||
|
||||
func appendStringF(f Form, out []byte, s string) []byte {
|
||||
return f.AppendString(out, s)
|
||||
}
|
||||
|
||||
func bytesF(f Form, out []byte, s string) []byte {
|
||||
buf := []byte{}
|
||||
buf = append(buf, out...)
|
||||
buf = append(buf, s...)
|
||||
return f.Bytes(buf)
|
||||
}
|
||||
|
||||
func stringF(f Form, out []byte, s string) []byte {
|
||||
outs := string(out) + s
|
||||
return []byte(f.String(outs))
|
||||
}
|
||||
|
||||
func TestAppend(t *testing.T) {
|
||||
runAppendTests(t, "TestAppend", NFKC, appendF, appendTests)
|
||||
runAppendTests(t, "TestAppendString", NFKC, appendStringF, appendTests)
|
||||
runAppendTests(t, "TestBytes", NFKC, bytesF, appendTests)
|
||||
runAppendTests(t, "TestString", NFKC, stringF, appendTests)
|
||||
}
|
||||
|
||||
func appendBench(f Form, in []byte) func() {
|
||||
buf := make([]byte, 0, 4*len(in))
|
||||
return func() {
|
||||
f.Append(buf, in...)
|
||||
}
|
||||
}
|
||||
|
||||
func iterBench(f Form, in []byte) func() {
|
||||
iter := Iter{}
|
||||
return func() {
|
||||
iter.Init(f, in)
|
||||
for !iter.Done() {
|
||||
iter.Next()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func readerBench(f Form, in []byte) func() {
|
||||
buf := make([]byte, 4*len(in))
|
||||
return func() {
|
||||
r := f.Reader(bytes.NewReader(in))
|
||||
var err error
|
||||
for err == nil {
|
||||
_, err = r.Read(buf)
|
||||
}
|
||||
if err != io.EOF {
|
||||
panic("")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func writerBench(f Form, in []byte) func() {
|
||||
buf := make([]byte, 0, 4*len(in))
|
||||
return func() {
|
||||
r := f.Writer(bytes.NewBuffer(buf))
|
||||
if _, err := r.Write(in); err != nil {
|
||||
panic("")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
|
||||
//bm = append(bm, appendBench(f, in))
|
||||
bm = append(bm, iterBench(f, in))
|
||||
//bm = append(bm, readerBench(f, in))
|
||||
//bm = append(bm, writerBench(f, in))
|
||||
return bm
|
||||
}
|
||||
|
||||
func doFormBenchmark(b *testing.B, inf, f Form, s string) {
|
||||
b.StopTimer()
|
||||
in := inf.Bytes([]byte(s))
|
||||
bm := appendBenchmarks(nil, f, in)
|
||||
b.SetBytes(int64(len(in) * len(bm)))
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, fn := range bm {
|
||||
fn()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var ascii = strings.Repeat("There is nothing to change here! ", 500)
|
||||
|
||||
func BenchmarkNormalizeAsciiNFC(b *testing.B) {
|
||||
doFormBenchmark(b, NFC, NFC, ascii)
|
||||
}
|
||||
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
|
||||
doFormBenchmark(b, NFC, NFD, ascii)
|
||||
}
|
||||
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
|
||||
doFormBenchmark(b, NFC, NFKC, ascii)
|
||||
}
|
||||
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
|
||||
doFormBenchmark(b, NFC, NFKD, ascii)
|
||||
}
|
||||
|
||||
func BenchmarkNormalizeNFC2NFC(b *testing.B) {
|
||||
doFormBenchmark(b, NFC, NFC, txt_all)
|
||||
}
|
||||
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
|
||||
doFormBenchmark(b, NFC, NFD, txt_all)
|
||||
}
|
||||
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
|
||||
doFormBenchmark(b, NFD, NFC, txt_all)
|
||||
}
|
||||
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
|
||||
doFormBenchmark(b, NFD, NFD, txt_all)
|
||||
}
|
||||
|
||||
// Hangul is often special-cased, so we test it separately.
|
||||
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
|
||||
doFormBenchmark(b, NFC, NFC, txt_kr)
|
||||
}
|
||||
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
|
||||
doFormBenchmark(b, NFC, NFD, txt_kr)
|
||||
}
|
||||
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
|
||||
doFormBenchmark(b, NFD, NFC, txt_kr)
|
||||
}
|
||||
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
|
||||
doFormBenchmark(b, NFD, NFD, txt_kr)
|
||||
}
|
||||
|
||||
var forms = []Form{NFC, NFD, NFKC, NFKD}
|
||||
|
||||
func doTextBenchmark(b *testing.B, s string) {
|
||||
b.StopTimer()
|
||||
in := []byte(s)
|
||||
bm := []func(){}
|
||||
for _, f := range forms {
|
||||
bm = appendBenchmarks(bm, f, in)
|
||||
}
|
||||
b.SetBytes(int64(len(s) * len(bm)))
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, f := range bm {
|
||||
f()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCanonicalOrdering(b *testing.B) {
|
||||
doTextBenchmark(b, txt_canon)
|
||||
}
|
||||
func BenchmarkExtendedLatin(b *testing.B) {
|
||||
doTextBenchmark(b, txt_vn)
|
||||
}
|
||||
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
|
||||
doTextBenchmark(b, twoByteUtf8)
|
||||
}
|
||||
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
|
||||
doTextBenchmark(b, threeByteUtf8)
|
||||
}
|
||||
func BenchmarkHangul(b *testing.B) {
|
||||
doTextBenchmark(b, txt_kr)
|
||||
}
|
||||
func BenchmarkJapanese(b *testing.B) {
|
||||
doTextBenchmark(b, txt_jp)
|
||||
}
|
||||
func BenchmarkChinese(b *testing.B) {
|
||||
doTextBenchmark(b, txt_cn)
|
||||
}
|
||||
func BenchmarkOverflow(b *testing.B) {
|
||||
doTextBenchmark(b, overflow)
|
||||
}
|
||||
|
||||
var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"
|
||||
|
||||
// Tests sampled from the Canonical ordering tests (Part 2) of
|
||||
// http://unicode.org/Public/UNIDATA/NormalizationTest.txt
|
||||
const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
|
||||
\u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
|
||||
\u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
|
||||
\u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062
|
||||
\u0061\u059A\u0316\u302A\u0339 \u0061\u0341\u0315\u0300\u05AE\u0062
|
||||
\u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
|
||||
\u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
|
||||
\u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
|
||||
\u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
|
||||
\u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
|
||||
\u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
|
||||
\u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
|
||||
\u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
|
||||
\u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
|
||||
\u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
|
||||
\u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
|
||||
\u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
|
||||
\u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`
|
||||
|
||||
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
|
||||
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
|
||||
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
|
||||
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
|
||||
một giấy phép khác có các điều khoản tương tự như giấy phép này
|
||||
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
|
||||
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
|
||||
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
|
||||
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
|
||||
chúng theo quy định của pháp luật thì tình trạng của nó không
|
||||
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
|
||||
|
||||
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
|
||||
const txt_ru = `При обязательном соблюдении следующих условий:
|
||||
Attribution — Вы должны атрибутировать произведение (указывать
|
||||
автора и источник) в порядке, предусмотренном автором или
|
||||
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
|
||||
что они поддерживают вас или использование вами данного произведения).
|
||||
Υπό τις ακόλουθες προϋποθέσεις:`
|
||||
|
||||
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
|
||||
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
|
||||
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
|
||||
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
|
||||
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
|
||||
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
|
||||
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
|
||||
παρόμοια άδεια.`
|
||||
|
||||
// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
|
||||
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
|
||||
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
|
||||
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
|
||||
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
|
||||
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
|
||||
لهذا الترخيص.`
|
||||
|
||||
// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
|
||||
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
|
||||
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
|
||||
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
|
||||
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
|
||||
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`
|
||||
|
||||
const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il
|
||||
|
||||
// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
|
||||
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
|
||||
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
|
||||
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
|
||||
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
|
||||
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
|
||||
라이선스와 동일한 라이선스를 적용해야 합니다.`
|
||||
|
||||
// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
|
||||
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
|
||||
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
|
||||
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
|
||||
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
|
||||
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
|
||||
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`
|
||||
|
||||
const threeByteUtf8 = txt_th
|
||||
|
||||
// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
|
||||
const txt_jp = `あなたの従うべき条件は以下の通りです。
|
||||
表示 — あなたは原著作者のクレジットを表示しなければなりません。
|
||||
継承 — もしあなたがこの作品を改変、変形または加工した場合、
|
||||
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
|
||||
頒布することができます。`
|
||||
|
||||
// http://creativecommons.org/licenses/by-sa/2.5/cn/
|
||||
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
|
||||
广播或通过信息网络传播本作品 创作演绎作品
|
||||
对本作品进行商业性使用 惟须遵守下列条件:
|
||||
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
|
||||
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
|
||||
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
|
||||
|
||||
const txt_cjk = txt_cn + txt_jp + txt_kr
|
||||
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk
|
@ -1,304 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"exp/norm"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
loadTestData()
|
||||
CharacterByCharacterTests()
|
||||
StandardTests()
|
||||
PerformanceTest()
|
||||
if errorCount == 0 {
|
||||
fmt.Println("PASS")
|
||||
}
|
||||
}
|
||||
|
||||
const file = "NormalizationTest.txt"
|
||||
|
||||
var url = flag.String("url",
|
||||
"http://www.unicode.org/Public/"+unicode.Version+"/ucd/"+file,
|
||||
"URL of Unicode database directory")
|
||||
var localFiles = flag.Bool("local",
|
||||
false,
|
||||
"data files have been copied to the current directory; for debugging only")
|
||||
|
||||
var logger = log.New(os.Stderr, "", log.Lshortfile)
|
||||
|
||||
// This regression test runs the test set in NormalizationTest.txt
|
||||
// (taken from http://www.unicode.org/Public/<unicode.Version>/ucd/).
|
||||
//
|
||||
// NormalizationTest.txt has form:
|
||||
// @Part0 # Specific cases
|
||||
// #
|
||||
// 1E0A;1E0A;0044 0307;1E0A;0044 0307; # (Ḋ; Ḋ; D◌̇; Ḋ; D◌̇; ) LATIN CAPITAL LETTER D WITH DOT ABOVE
|
||||
// 1E0C;1E0C;0044 0323;1E0C;0044 0323; # (Ḍ; Ḍ; D◌̣; Ḍ; D◌̣; ) LATIN CAPITAL LETTER D WITH DOT BELOW
|
||||
//
|
||||
// Each test has 5 columns (c1, c2, c3, c4, c5), where
|
||||
// (c1, c2, c3, c4, c5) == (c1, NFC(c1), NFD(c1), NFKC(c1), NFKD(c1))
|
||||
//
|
||||
// CONFORMANCE:
|
||||
// 1. The following invariants must be true for all conformant implementations
|
||||
//
|
||||
// NFC
|
||||
// c2 == NFC(c1) == NFC(c2) == NFC(c3)
|
||||
// c4 == NFC(c4) == NFC(c5)
|
||||
//
|
||||
// NFD
|
||||
// c3 == NFD(c1) == NFD(c2) == NFD(c3)
|
||||
// c5 == NFD(c4) == NFD(c5)
|
||||
//
|
||||
// NFKC
|
||||
// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
|
||||
//
|
||||
// NFKD
|
||||
// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
|
||||
//
|
||||
// 2. For every code point X assigned in this version of Unicode that is not
|
||||
// specifically listed in Part 1, the following invariants must be true
|
||||
// for all conformant implementations:
|
||||
//
|
||||
// X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
|
||||
//
|
||||
|
||||
// Column types.
|
||||
const (
|
||||
cRaw = iota
|
||||
cNFC
|
||||
cNFD
|
||||
cNFKC
|
||||
cNFKD
|
||||
cMaxColumns
|
||||
)
|
||||
|
||||
// Holds data from NormalizationTest.txt
|
||||
var part []Part
|
||||
|
||||
type Part struct {
|
||||
name string
|
||||
number int
|
||||
tests []Test
|
||||
}
|
||||
|
||||
type Test struct {
|
||||
name string
|
||||
partnr int
|
||||
number int
|
||||
r rune // used for character by character test
|
||||
cols [cMaxColumns]string // Each has 5 entries, see below.
|
||||
}
|
||||
|
||||
func (t Test) Name() string {
|
||||
if t.number < 0 {
|
||||
return part[t.partnr].name
|
||||
}
|
||||
return fmt.Sprintf("%s:%d", part[t.partnr].name, t.number)
|
||||
}
|
||||
|
||||
var partRe = regexp.MustCompile(`@Part(\d) # (.*)$`)
|
||||
var testRe = regexp.MustCompile(`^` + strings.Repeat(`([\dA-F ]+);`, 5) + ` # (.*)$`)
|
||||
|
||||
var counter int
|
||||
|
||||
// Load the data form NormalizationTest.txt
|
||||
func loadTestData() {
|
||||
if *localFiles {
|
||||
pwd, _ := os.Getwd()
|
||||
*url = "file://" + path.Join(pwd, file)
|
||||
}
|
||||
t := &http.Transport{}
|
||||
t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
|
||||
c := &http.Client{Transport: t}
|
||||
resp, err := c.Get(*url)
|
||||
if err != nil {
|
||||
logger.Fatal(err)
|
||||
}
|
||||
if resp.StatusCode != 200 {
|
||||
logger.Fatal("bad GET status for "+file, resp.Status)
|
||||
}
|
||||
f := resp.Body
|
||||
defer f.Close()
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if len(line) == 0 || line[0] == '#' {
|
||||
continue
|
||||
}
|
||||
m := partRe.FindStringSubmatch(line)
|
||||
if m != nil {
|
||||
if len(m) < 3 {
|
||||
logger.Fatal("Failed to parse Part: ", line)
|
||||
}
|
||||
i, err := strconv.Atoi(m[1])
|
||||
if err != nil {
|
||||
logger.Fatal(err)
|
||||
}
|
||||
name := m[2]
|
||||
part = append(part, Part{name: name[:len(name)-1], number: i})
|
||||
continue
|
||||
}
|
||||
m = testRe.FindStringSubmatch(line)
|
||||
if m == nil || len(m) < 7 {
|
||||
logger.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
|
||||
}
|
||||
test := Test{name: m[6], partnr: len(part) - 1, number: counter}
|
||||
counter++
|
||||
for j := 1; j < len(m)-1; j++ {
|
||||
for _, split := range strings.Split(m[j], " ") {
|
||||
r, err := strconv.ParseUint(split, 16, 64)
|
||||
if err != nil {
|
||||
logger.Fatal(err)
|
||||
}
|
||||
if test.r == 0 {
|
||||
// save for CharacterByCharacterTests
|
||||
test.r = rune(r)
|
||||
}
|
||||
var buf [utf8.UTFMax]byte
|
||||
sz := utf8.EncodeRune(buf[:], rune(r))
|
||||
test.cols[j-1] += string(buf[:sz])
|
||||
}
|
||||
}
|
||||
part := &part[len(part)-1]
|
||||
part.tests = append(part.tests, test)
|
||||
}
|
||||
if scanner.Err() != nil {
|
||||
logger.Fatal(scanner.Err())
|
||||
}
|
||||
}
|
||||
|
||||
var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
|
||||
|
||||
var errorCount int
|
||||
|
||||
func cmpResult(t *Test, name string, f norm.Form, gold, test, result string) {
|
||||
if gold != result {
|
||||
errorCount++
|
||||
if errorCount > 20 {
|
||||
return
|
||||
}
|
||||
st, sr, sg := []rune(test), []rune(result), []rune(gold)
|
||||
logger.Printf("%s:%s: %s(%X)=%X; want:%X: %s",
|
||||
t.Name(), name, fstr[f], st, sr, sg, t.name)
|
||||
}
|
||||
}
|
||||
|
||||
func cmpIsNormal(t *Test, name string, f norm.Form, test string, result, want bool) {
|
||||
if result != want {
|
||||
errorCount++
|
||||
if errorCount > 20 {
|
||||
return
|
||||
}
|
||||
logger.Printf("%s:%s: %s(%X)=%v; want: %v", t.Name(), name, fstr[f], []rune(test), result, want)
|
||||
}
|
||||
}
|
||||
|
||||
func doTest(t *Test, f norm.Form, gold, test string) {
|
||||
result := f.Bytes([]byte(test))
|
||||
cmpResult(t, "Bytes", f, gold, test, string(result))
|
||||
sresult := f.String(test)
|
||||
cmpResult(t, "String", f, gold, test, sresult)
|
||||
acc := []byte{}
|
||||
i := norm.Iter{}
|
||||
i.InitString(f, test)
|
||||
for !i.Done() {
|
||||
acc = append(acc, i.Next()...)
|
||||
}
|
||||
cmpResult(t, "Iter.Next", f, gold, test, string(acc))
|
||||
for i := range test {
|
||||
out := f.Append(f.Bytes([]byte(test[:i])), []byte(test[i:])...)
|
||||
cmpResult(t, fmt.Sprintf(":Append:%d", i), f, gold, test, string(out))
|
||||
}
|
||||
cmpIsNormal(t, "IsNormal", f, test, f.IsNormal([]byte(test)), test == gold)
|
||||
}
|
||||
|
||||
func doConformanceTests(t *Test, partn int) {
|
||||
for i := 0; i <= 2; i++ {
|
||||
doTest(t, norm.NFC, t.cols[1], t.cols[i])
|
||||
doTest(t, norm.NFD, t.cols[2], t.cols[i])
|
||||
doTest(t, norm.NFKC, t.cols[3], t.cols[i])
|
||||
doTest(t, norm.NFKD, t.cols[4], t.cols[i])
|
||||
}
|
||||
for i := 3; i <= 4; i++ {
|
||||
doTest(t, norm.NFC, t.cols[3], t.cols[i])
|
||||
doTest(t, norm.NFD, t.cols[4], t.cols[i])
|
||||
doTest(t, norm.NFKC, t.cols[3], t.cols[i])
|
||||
doTest(t, norm.NFKD, t.cols[4], t.cols[i])
|
||||
}
|
||||
}
|
||||
|
||||
func CharacterByCharacterTests() {
|
||||
tests := part[1].tests
|
||||
var last rune = 0
|
||||
for i := 0; i <= len(tests); i++ { // last one is special case
|
||||
var r rune
|
||||
if i == len(tests) {
|
||||
r = 0x2FA1E // Don't have to go to 0x10FFFF
|
||||
} else {
|
||||
r = tests[i].r
|
||||
}
|
||||
for last++; last < r; last++ {
|
||||
// Check all characters that were not explicitly listed in the test.
|
||||
t := &Test{partnr: 1, number: -1}
|
||||
char := string(last)
|
||||
doTest(t, norm.NFC, char, char)
|
||||
doTest(t, norm.NFD, char, char)
|
||||
doTest(t, norm.NFKC, char, char)
|
||||
doTest(t, norm.NFKD, char, char)
|
||||
}
|
||||
if i < len(tests) {
|
||||
doConformanceTests(&tests[i], 1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func StandardTests() {
|
||||
for _, j := range []int{0, 2, 3} {
|
||||
for _, test := range part[j].tests {
|
||||
doConformanceTests(&test, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PerformanceTest verifies that normalization is O(n). If any of the
|
||||
// code does not properly check for maxCombiningChars, normalization
|
||||
// may exhibit O(n**2) behavior.
|
||||
func PerformanceTest() {
|
||||
runtime.GOMAXPROCS(2)
|
||||
success := make(chan bool, 1)
|
||||
go func() {
|
||||
buf := bytes.Repeat([]byte("\u035D"), 1024*1024)
|
||||
buf = append(buf, "\u035B"...)
|
||||
norm.NFC.Append(nil, buf...)
|
||||
success <- true
|
||||
}()
|
||||
timeout := time.After(1 * time.Second)
|
||||
select {
|
||||
case <-success:
|
||||
// test completed before the timeout
|
||||
case <-timeout:
|
||||
errorCount++
|
||||
logger.Printf(`unexpectedly long time to complete PerformanceTest`)
|
||||
}
|
||||
}
|
@ -1,126 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import "io"
|
||||
|
||||
type normWriter struct {
|
||||
rb reorderBuffer
|
||||
w io.Writer
|
||||
buf []byte
|
||||
}
|
||||
|
||||
// Write implements the standard write interface. If the last characters are
|
||||
// not at a normalization boundary, the bytes will be buffered for the next
|
||||
// write. The remaining bytes will be written on close.
|
||||
func (w *normWriter) Write(data []byte) (n int, err error) {
|
||||
// Process data in pieces to keep w.buf size bounded.
|
||||
const chunk = 4000
|
||||
|
||||
for len(data) > 0 {
|
||||
// Normalize into w.buf.
|
||||
m := len(data)
|
||||
if m > chunk {
|
||||
m = chunk
|
||||
}
|
||||
w.rb.src = inputBytes(data[:m])
|
||||
w.rb.nsrc = m
|
||||
w.buf = doAppend(&w.rb, w.buf, 0)
|
||||
data = data[m:]
|
||||
n += m
|
||||
|
||||
// Write out complete prefix, save remainder.
|
||||
// Note that lastBoundary looks back at most 30 runes.
|
||||
i := lastBoundary(&w.rb.f, w.buf)
|
||||
if i == -1 {
|
||||
i = 0
|
||||
}
|
||||
if i > 0 {
|
||||
if _, err = w.w.Write(w.buf[:i]); err != nil {
|
||||
break
|
||||
}
|
||||
bn := copy(w.buf, w.buf[i:])
|
||||
w.buf = w.buf[:bn]
|
||||
}
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
// Close forces data that remains in the buffer to be written.
|
||||
func (w *normWriter) Close() error {
|
||||
if len(w.buf) > 0 {
|
||||
_, err := w.w.Write(w.buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Writer returns a new writer that implements Write(b)
|
||||
// by writing f(b) to w. The returned writer may use an
|
||||
// an internal buffer to maintain state across Write calls.
|
||||
// Calling its Close method writes any buffered data to w.
|
||||
func (f Form) Writer(w io.Writer) io.WriteCloser {
|
||||
wr := &normWriter{rb: reorderBuffer{}, w: w}
|
||||
wr.rb.init(f, nil)
|
||||
return wr
|
||||
}
|
||||
|
||||
type normReader struct {
|
||||
rb reorderBuffer
|
||||
r io.Reader
|
||||
inbuf []byte
|
||||
outbuf []byte
|
||||
bufStart int
|
||||
lastBoundary int
|
||||
err error
|
||||
}
|
||||
|
||||
// Read implements the standard read interface.
|
||||
func (r *normReader) Read(p []byte) (int, error) {
|
||||
for {
|
||||
if r.lastBoundary-r.bufStart > 0 {
|
||||
n := copy(p, r.outbuf[r.bufStart:r.lastBoundary])
|
||||
r.bufStart += n
|
||||
if r.lastBoundary-r.bufStart > 0 {
|
||||
return n, nil
|
||||
}
|
||||
return n, r.err
|
||||
}
|
||||
if r.err != nil {
|
||||
return 0, r.err
|
||||
}
|
||||
outn := copy(r.outbuf, r.outbuf[r.lastBoundary:])
|
||||
r.outbuf = r.outbuf[0:outn]
|
||||
r.bufStart = 0
|
||||
|
||||
n, err := r.r.Read(r.inbuf)
|
||||
r.rb.src = inputBytes(r.inbuf[0:n])
|
||||
r.rb.nsrc, r.err = n, err
|
||||
if n > 0 {
|
||||
r.outbuf = doAppend(&r.rb, r.outbuf, 0)
|
||||
}
|
||||
if err == io.EOF {
|
||||
r.lastBoundary = len(r.outbuf)
|
||||
} else {
|
||||
r.lastBoundary = lastBoundary(&r.rb.f, r.outbuf)
|
||||
if r.lastBoundary == -1 {
|
||||
r.lastBoundary = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
panic("should not reach here")
|
||||
}
|
||||
|
||||
// Reader returns a new reader that implements Read
|
||||
// by reading data from r and returning f(data).
|
||||
func (f Form) Reader(r io.Reader) io.Reader {
|
||||
const chunk = 4000
|
||||
buf := make([]byte, chunk)
|
||||
rr := &normReader{rb: reorderBuffer{}, r: r, inbuf: buf}
|
||||
rr.rb.init(f, buf)
|
||||
return rr
|
||||
}
|
@ -1,68 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
var ioTests = []AppendTest{
|
||||
{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
|
||||
{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
|
||||
{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
|
||||
{"", "\u0041\u0307\u0304", "\u01E0"},
|
||||
}
|
||||
|
||||
var bufSizes = []int{1, 2, 3, 4, 5, 6, 7, 8, 100, 101, 102, 103, 4000, 4001, 4002, 4003}
|
||||
|
||||
func readFunc(size int) appendFunc {
|
||||
return func(f Form, out []byte, s string) []byte {
|
||||
out = append(out, s...)
|
||||
r := f.Reader(bytes.NewBuffer(out))
|
||||
buf := make([]byte, size)
|
||||
result := []byte{}
|
||||
for n, err := 0, error(nil); err == nil; {
|
||||
n, err = r.Read(buf)
|
||||
result = append(result, buf[:n]...)
|
||||
}
|
||||
return result
|
||||
}
|
||||
}
|
||||
|
||||
func TestReader(t *testing.T) {
|
||||
for _, s := range bufSizes {
|
||||
name := fmt.Sprintf("TestReader%da", s)
|
||||
runAppendTests(t, name, NFKC, readFunc(s), appendTests)
|
||||
name = fmt.Sprintf("TestReader%db", s)
|
||||
runAppendTests(t, name, NFKC, readFunc(s), ioTests)
|
||||
}
|
||||
}
|
||||
|
||||
func writeFunc(size int) appendFunc {
|
||||
return func(f Form, out []byte, s string) []byte {
|
||||
in := append(out, s...)
|
||||
result := new(bytes.Buffer)
|
||||
w := f.Writer(result)
|
||||
buf := make([]byte, size)
|
||||
for n := 0; len(in) > 0; in = in[n:] {
|
||||
n = copy(buf, in)
|
||||
_, _ = w.Write(buf[:n])
|
||||
}
|
||||
w.Close()
|
||||
return result.Bytes()
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriter(t *testing.T) {
|
||||
for _, s := range bufSizes {
|
||||
name := fmt.Sprintf("TestWriter%da", s)
|
||||
runAppendTests(t, name, NFKC, writeFunc(s), appendTests)
|
||||
name = fmt.Sprintf("TestWriter%db", s)
|
||||
runAppendTests(t, name, NFKC, writeFunc(s), ioTests)
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1,232 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
type valueRange struct {
|
||||
value uint16 // header: value:stride
|
||||
lo, hi byte // header: lo:n
|
||||
}
|
||||
|
||||
type trie struct {
|
||||
index []uint8
|
||||
values []uint16
|
||||
sparse []valueRange
|
||||
sparseOffset []uint16
|
||||
cutoff uint8 // indices >= cutoff are sparse
|
||||
}
|
||||
|
||||
// lookupValue determines the type of block n and looks up the value for b.
|
||||
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
|
||||
// is a list of ranges with an accompanying value. Given a matching range r,
|
||||
// the value for b is by r.value + (b - r.lo) * stride.
|
||||
func (t *trie) lookupValue(n uint8, b byte) uint16 {
|
||||
if n < t.cutoff {
|
||||
return t.values[uint16(n)<<6+uint16(b)]
|
||||
}
|
||||
offset := t.sparseOffset[n-t.cutoff]
|
||||
header := t.sparse[offset]
|
||||
lo := offset + 1
|
||||
hi := lo + uint16(header.lo)
|
||||
for lo < hi {
|
||||
m := lo + (hi-lo)/2
|
||||
r := t.sparse[m]
|
||||
if r.lo <= b && b <= r.hi {
|
||||
return r.value + uint16(b-r.lo)*header.value
|
||||
}
|
||||
if b < r.lo {
|
||||
hi = m
|
||||
} else {
|
||||
lo = m + 1
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
const (
|
||||
t1 = 0x00 // 0000 0000
|
||||
tx = 0x80 // 1000 0000
|
||||
t2 = 0xC0 // 1100 0000
|
||||
t3 = 0xE0 // 1110 0000
|
||||
t4 = 0xF0 // 1111 0000
|
||||
t5 = 0xF8 // 1111 1000
|
||||
t6 = 0xFC // 1111 1100
|
||||
te = 0xFE // 1111 1110
|
||||
)
|
||||
|
||||
// lookup returns the trie value for the first UTF-8 encoding in s and
|
||||
// the width in bytes of this encoding. The size will be 0 if s does not
|
||||
// hold enough bytes to complete the encoding. len(s) must be greater than 0.
|
||||
func (t *trie) lookup(s []byte) (v uint16, sz int) {
|
||||
c0 := s[0]
|
||||
switch {
|
||||
case c0 < tx:
|
||||
return t.values[c0], 1
|
||||
case c0 < t2:
|
||||
return 0, 1
|
||||
case c0 < t3:
|
||||
if len(s) < 2 {
|
||||
return 0, 0
|
||||
}
|
||||
i := t.index[c0]
|
||||
c1 := s[1]
|
||||
if c1 < tx || t2 <= c1 {
|
||||
return 0, 1
|
||||
}
|
||||
return t.lookupValue(i, c1), 2
|
||||
case c0 < t4:
|
||||
if len(s) < 3 {
|
||||
return 0, 0
|
||||
}
|
||||
i := t.index[c0]
|
||||
c1 := s[1]
|
||||
if c1 < tx || t2 <= c1 {
|
||||
return 0, 1
|
||||
}
|
||||
o := uint16(i)<<6 + uint16(c1)
|
||||
i = t.index[o]
|
||||
c2 := s[2]
|
||||
if c2 < tx || t2 <= c2 {
|
||||
return 0, 2
|
||||
}
|
||||
return t.lookupValue(i, c2), 3
|
||||
case c0 < t5:
|
||||
if len(s) < 4 {
|
||||
return 0, 0
|
||||
}
|
||||
i := t.index[c0]
|
||||
c1 := s[1]
|
||||
if c1 < tx || t2 <= c1 {
|
||||
return 0, 1
|
||||
}
|
||||
o := uint16(i)<<6 + uint16(c1)
|
||||
i = t.index[o]
|
||||
c2 := s[2]
|
||||
if c2 < tx || t2 <= c2 {
|
||||
return 0, 2
|
||||
}
|
||||
o = uint16(i)<<6 + uint16(c2)
|
||||
i = t.index[o]
|
||||
c3 := s[3]
|
||||
if c3 < tx || t2 <= c3 {
|
||||
return 0, 3
|
||||
}
|
||||
return t.lookupValue(i, c3), 4
|
||||
}
|
||||
// Illegal rune
|
||||
return 0, 1
|
||||
}
|
||||
|
||||
// lookupString returns the trie value for the first UTF-8 encoding in s and
|
||||
// the width in bytes of this encoding. The size will be 0 if s does not
|
||||
// hold enough bytes to complete the encoding. len(s) must be greater than 0.
|
||||
func (t *trie) lookupString(s string) (v uint16, sz int) {
|
||||
c0 := s[0]
|
||||
switch {
|
||||
case c0 < tx:
|
||||
return t.values[c0], 1
|
||||
case c0 < t2:
|
||||
return 0, 1
|
||||
case c0 < t3:
|
||||
if len(s) < 2 {
|
||||
return 0, 0
|
||||
}
|
||||
i := t.index[c0]
|
||||
c1 := s[1]
|
||||
if c1 < tx || t2 <= c1 {
|
||||
return 0, 1
|
||||
}
|
||||
return t.lookupValue(i, c1), 2
|
||||
case c0 < t4:
|
||||
if len(s) < 3 {
|
||||
return 0, 0
|
||||
}
|
||||
i := t.index[c0]
|
||||
c1 := s[1]
|
||||
if c1 < tx || t2 <= c1 {
|
||||
return 0, 1
|
||||
}
|
||||
o := uint16(i)<<6 + uint16(c1)
|
||||
i = t.index[o]
|
||||
c2 := s[2]
|
||||
if c2 < tx || t2 <= c2 {
|
||||
return 0, 2
|
||||
}
|
||||
return t.lookupValue(i, c2), 3
|
||||
case c0 < t5:
|
||||
if len(s) < 4 {
|
||||
return 0, 0
|
||||
}
|
||||
i := t.index[c0]
|
||||
c1 := s[1]
|
||||
if c1 < tx || t2 <= c1 {
|
||||
return 0, 1
|
||||
}
|
||||
o := uint16(i)<<6 + uint16(c1)
|
||||
i = t.index[o]
|
||||
c2 := s[2]
|
||||
if c2 < tx || t2 <= c2 {
|
||||
return 0, 2
|
||||
}
|
||||
o = uint16(i)<<6 + uint16(c2)
|
||||
i = t.index[o]
|
||||
c3 := s[3]
|
||||
if c3 < tx || t2 <= c3 {
|
||||
return 0, 3
|
||||
}
|
||||
return t.lookupValue(i, c3), 4
|
||||
}
|
||||
// Illegal rune
|
||||
return 0, 1
|
||||
}
|
||||
|
||||
// lookupUnsafe returns the trie value for the first UTF-8 encoding in s.
|
||||
// s must hold a full encoding.
|
||||
func (t *trie) lookupUnsafe(s []byte) uint16 {
|
||||
c0 := s[0]
|
||||
if c0 < tx {
|
||||
return t.values[c0]
|
||||
}
|
||||
if c0 < t2 {
|
||||
return 0
|
||||
}
|
||||
i := t.index[c0]
|
||||
if c0 < t3 {
|
||||
return t.lookupValue(i, s[1])
|
||||
}
|
||||
i = t.index[uint16(i)<<6+uint16(s[1])]
|
||||
if c0 < t4 {
|
||||
return t.lookupValue(i, s[2])
|
||||
}
|
||||
i = t.index[uint16(i)<<6+uint16(s[2])]
|
||||
if c0 < t5 {
|
||||
return t.lookupValue(i, s[3])
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// lookupStringUnsafe returns the trie value for the first UTF-8 encoding in s.
|
||||
// s must hold a full encoding.
|
||||
func (t *trie) lookupStringUnsafe(s string) uint16 {
|
||||
c0 := s[0]
|
||||
if c0 < tx {
|
||||
return t.values[c0]
|
||||
}
|
||||
if c0 < t2 {
|
||||
return 0
|
||||
}
|
||||
i := t.index[c0]
|
||||
if c0 < t3 {
|
||||
return t.lookupValue(i, s[1])
|
||||
}
|
||||
i = t.index[uint16(i)<<6+uint16(s[1])]
|
||||
if c0 < t4 {
|
||||
return t.lookupValue(i, s[2])
|
||||
}
|
||||
i = t.index[uint16(i)<<6+uint16(s[2])]
|
||||
if c0 < t5 {
|
||||
return t.lookupValue(i, s[3])
|
||||
}
|
||||
return 0
|
||||
}
|
@ -1,152 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Test data is located in triedata_test.go; generated by maketesttables.
|
||||
var testdata = testdataTrie
|
||||
|
||||
type rangeTest struct {
|
||||
block uint8
|
||||
lookup byte
|
||||
result uint16
|
||||
table []valueRange
|
||||
offsets []uint16
|
||||
}
|
||||
|
||||
var range1Off = []uint16{0, 2}
|
||||
var range1 = []valueRange{
|
||||
{0, 1, 0},
|
||||
{1, 0x80, 0x80},
|
||||
{0, 2, 0},
|
||||
{1, 0x80, 0x80},
|
||||
{9, 0xff, 0xff},
|
||||
}
|
||||
|
||||
var rangeTests = []rangeTest{
|
||||
{10, 0x80, 1, range1, range1Off},
|
||||
{10, 0x00, 0, range1, range1Off},
|
||||
{11, 0x80, 1, range1, range1Off},
|
||||
{11, 0xff, 9, range1, range1Off},
|
||||
{11, 0x00, 0, range1, range1Off},
|
||||
}
|
||||
|
||||
func TestLookupSparse(t *testing.T) {
|
||||
for i, test := range rangeTests {
|
||||
n := trie{sparse: test.table, sparseOffset: test.offsets, cutoff: 10}
|
||||
v := n.lookupValue(test.block, test.lookup)
|
||||
if v != test.result {
|
||||
t.Errorf("LookupSparse:%d: found %X; want %X", i, v, test.result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test cases for illegal runes.
|
||||
type trietest struct {
|
||||
size int
|
||||
bytes []byte
|
||||
}
|
||||
|
||||
var tests = []trietest{
|
||||
// illegal runes
|
||||
{1, []byte{0x80}},
|
||||
{1, []byte{0xFF}},
|
||||
{1, []byte{t2, tx - 1}},
|
||||
{1, []byte{t2, t2}},
|
||||
{2, []byte{t3, tx, tx - 1}},
|
||||
{2, []byte{t3, tx, t2}},
|
||||
{1, []byte{t3, tx - 1, tx}},
|
||||
{3, []byte{t4, tx, tx, tx - 1}},
|
||||
{3, []byte{t4, tx, tx, t2}},
|
||||
{1, []byte{t4, t2, tx, tx - 1}},
|
||||
{2, []byte{t4, tx, t2, tx - 1}},
|
||||
|
||||
// short runes
|
||||
{0, []byte{t2}},
|
||||
{0, []byte{t3, tx}},
|
||||
{0, []byte{t4, tx, tx}},
|
||||
|
||||
// we only support UTF-8 up to utf8.UTFMax bytes (4 bytes)
|
||||
{1, []byte{t5, tx, tx, tx, tx}},
|
||||
{1, []byte{t6, tx, tx, tx, tx, tx}},
|
||||
}
|
||||
|
||||
func mkUTF8(r rune) ([]byte, int) {
|
||||
var b [utf8.UTFMax]byte
|
||||
sz := utf8.EncodeRune(b[:], r)
|
||||
return b[:sz], sz
|
||||
}
|
||||
|
||||
func TestLookup(t *testing.T) {
|
||||
for i, tt := range testRunes {
|
||||
b, szg := mkUTF8(tt)
|
||||
v, szt := testdata.lookup(b)
|
||||
if int(v) != i {
|
||||
t.Errorf("lookup(%U): found value %#x, expected %#x", tt, v, i)
|
||||
}
|
||||
if szt != szg {
|
||||
t.Errorf("lookup(%U): found size %d, expected %d", tt, szt, szg)
|
||||
}
|
||||
}
|
||||
for i, tt := range tests {
|
||||
v, sz := testdata.lookup(tt.bytes)
|
||||
if v != 0 {
|
||||
t.Errorf("lookup of illegal rune, case %d: found value %#x, expected 0", i, v)
|
||||
}
|
||||
if sz != tt.size {
|
||||
t.Errorf("lookup of illegal rune, case %d: found size %d, expected %d", i, sz, tt.size)
|
||||
}
|
||||
}
|
||||
// Verify defaults.
|
||||
if v, _ := testdata.lookup([]byte{0xC1, 0x8C}); v != 0 {
|
||||
t.Errorf("lookup of non-existing rune should be 0; found %X", v)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLookupUnsafe(t *testing.T) {
|
||||
for i, tt := range testRunes {
|
||||
b, _ := mkUTF8(tt)
|
||||
v := testdata.lookupUnsafe(b)
|
||||
if int(v) != i {
|
||||
t.Errorf("lookupUnsafe(%U): found value %#x, expected %#x", i, v, i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLookupString(t *testing.T) {
|
||||
for i, tt := range testRunes {
|
||||
b, szg := mkUTF8(tt)
|
||||
v, szt := testdata.lookupString(string(b))
|
||||
if int(v) != i {
|
||||
t.Errorf("lookup(%U): found value %#x, expected %#x", i, v, i)
|
||||
}
|
||||
if szt != szg {
|
||||
t.Errorf("lookup(%U): found size %d, expected %d", i, szt, szg)
|
||||
}
|
||||
}
|
||||
for i, tt := range tests {
|
||||
v, sz := testdata.lookupString(string(tt.bytes))
|
||||
if int(v) != 0 {
|
||||
t.Errorf("lookup of illegal rune, case %d: found value %#x, expected 0", i, v)
|
||||
}
|
||||
if sz != tt.size {
|
||||
t.Errorf("lookup of illegal rune, case %d: found size %d, expected %d", i, sz, tt.size)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLookupStringUnsafe(t *testing.T) {
|
||||
for i, tt := range testRunes {
|
||||
b, _ := mkUTF8(tt)
|
||||
v := testdata.lookupStringUnsafe(string(b))
|
||||
if int(v) != i {
|
||||
t.Errorf("lookupUnsafe(%U): found value %#x, expected %#x", i, v, i)
|
||||
}
|
||||
}
|
||||
}
|
@ -1,85 +0,0 @@
|
||||
// Generated by running
|
||||
// maketesttables
|
||||
// DO NOT EDIT
|
||||
|
||||
package norm
|
||||
|
||||
var testRunes = []int32{1, 12, 127, 128, 256, 2047, 2048, 2457, 65535, 65536, 65793, 1114111, 512, 513, 514, 528, 533}
|
||||
|
||||
// testdataValues: 192 entries, 384 bytes
|
||||
// Block 2 is the null block.
|
||||
var testdataValues = [192]uint16{
|
||||
// Block 0x0, offset 0x0
|
||||
0x000c: 0x0001,
|
||||
// Block 0x1, offset 0x40
|
||||
0x007f: 0x0002,
|
||||
// Block 0x2, offset 0x80
|
||||
}
|
||||
|
||||
// testdataSparseOffset: 10 entries, 20 bytes
|
||||
var testdataSparseOffset = []uint16{0x0, 0x2, 0x4, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14}
|
||||
|
||||
// testdataSparseValues: 22 entries, 88 bytes
|
||||
var testdataSparseValues = [22]valueRange{
|
||||
// Block 0x0, offset 0x1
|
||||
{value: 0x0000, lo: 0x01},
|
||||
{value: 0x0003, lo: 0x80, hi: 0x80},
|
||||
// Block 0x1, offset 0x2
|
||||
{value: 0x0000, lo: 0x01},
|
||||
{value: 0x0004, lo: 0x80, hi: 0x80},
|
||||
// Block 0x2, offset 0x3
|
||||
{value: 0x0001, lo: 0x03},
|
||||
{value: 0x000c, lo: 0x80, hi: 0x82},
|
||||
{value: 0x000f, lo: 0x90, hi: 0x90},
|
||||
{value: 0x0010, lo: 0x95, hi: 0x95},
|
||||
// Block 0x3, offset 0x4
|
||||
{value: 0x0000, lo: 0x01},
|
||||
{value: 0x0005, lo: 0xbf, hi: 0xbf},
|
||||
// Block 0x4, offset 0x5
|
||||
{value: 0x0000, lo: 0x01},
|
||||
{value: 0x0006, lo: 0x80, hi: 0x80},
|
||||
// Block 0x5, offset 0x6
|
||||
{value: 0x0000, lo: 0x01},
|
||||
{value: 0x0007, lo: 0x99, hi: 0x99},
|
||||
// Block 0x6, offset 0x7
|
||||
{value: 0x0000, lo: 0x01},
|
||||
{value: 0x0008, lo: 0xbf, hi: 0xbf},
|
||||
// Block 0x7, offset 0x8
|
||||
{value: 0x0000, lo: 0x01},
|
||||
{value: 0x0009, lo: 0x80, hi: 0x80},
|
||||
// Block 0x8, offset 0x9
|
||||
{value: 0x0000, lo: 0x01},
|
||||
{value: 0x000a, lo: 0x81, hi: 0x81},
|
||||
// Block 0x9, offset 0xa
|
||||
{value: 0x0000, lo: 0x01},
|
||||
{value: 0x000b, lo: 0xbf, hi: 0xbf},
|
||||
}
|
||||
|
||||
// testdataLookup: 640 bytes
|
||||
// Block 0 is the null block.
|
||||
var testdataLookup = [640]uint8{
|
||||
// Block 0x0, offset 0x0
|
||||
// Block 0x1, offset 0x40
|
||||
// Block 0x2, offset 0x80
|
||||
// Block 0x3, offset 0xc0
|
||||
0x0c2: 0x01, 0x0c4: 0x02,
|
||||
0x0c8: 0x03,
|
||||
0x0df: 0x04,
|
||||
0x0e0: 0x02,
|
||||
0x0ef: 0x03,
|
||||
0x0f0: 0x05, 0x0f4: 0x07,
|
||||
// Block 0x4, offset 0x100
|
||||
0x120: 0x05, 0x126: 0x06,
|
||||
// Block 0x5, offset 0x140
|
||||
0x17f: 0x07,
|
||||
// Block 0x6, offset 0x180
|
||||
0x180: 0x08, 0x184: 0x09,
|
||||
// Block 0x7, offset 0x1c0
|
||||
0x1d0: 0x04,
|
||||
// Block 0x8, offset 0x200
|
||||
0x23f: 0x0a,
|
||||
// Block 0x9, offset 0x240
|
||||
0x24f: 0x06,
|
||||
}
|
||||
|
||||
var testdataTrie = trie{testdataLookup[:], testdataValues[:], testdataSparseValues[:], testdataSparseOffset[:], 1}
|
@ -1,317 +0,0 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// Trie table generator.
|
||||
// Used by make*tables tools to generate a go file with trie data structures
|
||||
// for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte
|
||||
// sequence are used to lookup offsets in the index table to be used for the
|
||||
// next byte. The last byte is used to index into a table with 16-bit values.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"hash/crc32"
|
||||
"log"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
const (
|
||||
blockSize = 64
|
||||
blockOffset = 2 // Subtract two blocks to compensate for the 0x80 added to continuation bytes.
|
||||
maxSparseEntries = 16
|
||||
)
|
||||
|
||||
// Intermediate trie structure
|
||||
type trieNode struct {
|
||||
table [256]*trieNode
|
||||
value int
|
||||
b byte
|
||||
leaf bool
|
||||
}
|
||||
|
||||
func newNode() *trieNode {
|
||||
return new(trieNode)
|
||||
}
|
||||
|
||||
func (n trieNode) String() string {
|
||||
s := fmt.Sprint("trieNode{table: { non-nil at index: ")
|
||||
for i, v := range n.table {
|
||||
if v != nil {
|
||||
s += fmt.Sprintf("%d, ", i)
|
||||
}
|
||||
}
|
||||
s += fmt.Sprintf("}, value:%#x, b:%#x leaf:%v}", n.value, n.b, n.leaf)
|
||||
return s
|
||||
}
|
||||
|
||||
func (n trieNode) isInternal() bool {
|
||||
internal := true
|
||||
for i := 0; i < 256; i++ {
|
||||
if nn := n.table[i]; nn != nil {
|
||||
if !internal && !nn.leaf {
|
||||
log.Fatalf("triegen: isInternal: node contains both leaf and non-leaf children (%v)", n)
|
||||
}
|
||||
internal = internal && !nn.leaf
|
||||
}
|
||||
}
|
||||
return internal
|
||||
}
|
||||
|
||||
func (n trieNode) mostFrequentStride() int {
|
||||
counts := make(map[int]int)
|
||||
v := 0
|
||||
for _, t := range n.table[0x80 : 0x80+blockSize] {
|
||||
if t != nil {
|
||||
if stride := t.value - v; v != 0 && stride >= 0 {
|
||||
counts[stride]++
|
||||
}
|
||||
v = t.value
|
||||
} else {
|
||||
v = 0
|
||||
}
|
||||
}
|
||||
var maxs, maxc int
|
||||
for stride, cnt := range counts {
|
||||
if cnt > maxc || (cnt == maxc && stride < maxs) {
|
||||
maxs, maxc = stride, cnt
|
||||
}
|
||||
}
|
||||
return maxs
|
||||
}
|
||||
|
||||
func (n trieNode) countSparseEntries() int {
|
||||
stride := n.mostFrequentStride()
|
||||
var count, v int
|
||||
for _, t := range n.table[0x80 : 0x80+blockSize] {
|
||||
tv := 0
|
||||
if t != nil {
|
||||
tv = t.value
|
||||
}
|
||||
if tv-v != stride {
|
||||
if tv != 0 {
|
||||
count++
|
||||
}
|
||||
}
|
||||
v = tv
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
func (n *trieNode) insert(r rune, value uint16) {
|
||||
var p [utf8.UTFMax]byte
|
||||
sz := utf8.EncodeRune(p[:], r)
|
||||
|
||||
for i := 0; i < sz; i++ {
|
||||
if n.leaf {
|
||||
log.Fatalf("triegen: insert: node (%#v) should not be a leaf", n)
|
||||
}
|
||||
nn := n.table[p[i]]
|
||||
if nn == nil {
|
||||
nn = newNode()
|
||||
nn.b = p[i]
|
||||
n.table[p[i]] = nn
|
||||
}
|
||||
n = nn
|
||||
}
|
||||
n.value = int(value)
|
||||
n.leaf = true
|
||||
}
|
||||
|
||||
type nodeIndex struct {
|
||||
lookupBlocks []*trieNode
|
||||
valueBlocks []*trieNode
|
||||
sparseBlocks []*trieNode
|
||||
sparseOffset []uint16
|
||||
sparseCount int
|
||||
|
||||
lookupBlockIdx map[uint32]int
|
||||
valueBlockIdx map[uint32]int
|
||||
}
|
||||
|
||||
func newIndex() *nodeIndex {
|
||||
index := &nodeIndex{}
|
||||
index.lookupBlocks = make([]*trieNode, 0)
|
||||
index.valueBlocks = make([]*trieNode, 0)
|
||||
index.sparseBlocks = make([]*trieNode, 0)
|
||||
index.sparseOffset = make([]uint16, 1)
|
||||
index.lookupBlockIdx = make(map[uint32]int)
|
||||
index.valueBlockIdx = make(map[uint32]int)
|
||||
return index
|
||||
}
|
||||
|
||||
func computeOffsets(index *nodeIndex, n *trieNode) int {
|
||||
if n.leaf {
|
||||
return n.value
|
||||
}
|
||||
hasher := crc32.New(crc32.MakeTable(crc32.IEEE))
|
||||
// We only index continuation bytes.
|
||||
for i := 0; i < blockSize; i++ {
|
||||
v := 0
|
||||
if nn := n.table[0x80+i]; nn != nil {
|
||||
v = computeOffsets(index, nn)
|
||||
}
|
||||
hasher.Write([]byte{uint8(v >> 8), uint8(v)})
|
||||
}
|
||||
h := hasher.Sum32()
|
||||
if n.isInternal() {
|
||||
v, ok := index.lookupBlockIdx[h]
|
||||
if !ok {
|
||||
v = len(index.lookupBlocks) - blockOffset
|
||||
index.lookupBlocks = append(index.lookupBlocks, n)
|
||||
index.lookupBlockIdx[h] = v
|
||||
}
|
||||
n.value = v
|
||||
} else {
|
||||
v, ok := index.valueBlockIdx[h]
|
||||
if !ok {
|
||||
if c := n.countSparseEntries(); c > maxSparseEntries {
|
||||
v = len(index.valueBlocks) - blockOffset
|
||||
index.valueBlocks = append(index.valueBlocks, n)
|
||||
index.valueBlockIdx[h] = v
|
||||
} else {
|
||||
v = -len(index.sparseOffset)
|
||||
index.sparseBlocks = append(index.sparseBlocks, n)
|
||||
index.sparseOffset = append(index.sparseOffset, uint16(index.sparseCount))
|
||||
index.sparseCount += c + 1
|
||||
index.valueBlockIdx[h] = v
|
||||
}
|
||||
}
|
||||
n.value = v
|
||||
}
|
||||
return n.value
|
||||
}
|
||||
|
||||
func printValueBlock(nr int, n *trieNode, offset int) {
|
||||
boff := nr * blockSize
|
||||
fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
|
||||
var printnewline bool
|
||||
for i := 0; i < blockSize; i++ {
|
||||
if i%6 == 0 {
|
||||
printnewline = true
|
||||
}
|
||||
v := 0
|
||||
if nn := n.table[i+offset]; nn != nil {
|
||||
v = nn.value
|
||||
}
|
||||
if v != 0 {
|
||||
if printnewline {
|
||||
fmt.Printf("\n")
|
||||
printnewline = false
|
||||
}
|
||||
fmt.Printf("%#04x:%#04x, ", boff+i, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func printSparseBlock(nr int, n *trieNode) {
|
||||
boff := -n.value
|
||||
fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
|
||||
v := 0
|
||||
//stride := f(n)
|
||||
stride := n.mostFrequentStride()
|
||||
c := n.countSparseEntries()
|
||||
fmt.Printf("\n{value:%#04x,lo:%#02x},", stride, uint8(c))
|
||||
for i, nn := range n.table[0x80 : 0x80+blockSize] {
|
||||
nv := 0
|
||||
if nn != nil {
|
||||
nv = nn.value
|
||||
}
|
||||
if nv-v != stride {
|
||||
if v != 0 {
|
||||
fmt.Printf(",hi:%#02x},", 0x80+i-1)
|
||||
}
|
||||
if nv != 0 {
|
||||
fmt.Printf("\n{value:%#04x,lo:%#02x", nv, nn.b)
|
||||
}
|
||||
}
|
||||
v = nv
|
||||
}
|
||||
if v != 0 {
|
||||
fmt.Printf(",hi:%#02x},", 0x80+blockSize-1)
|
||||
}
|
||||
}
|
||||
|
||||
func printLookupBlock(nr int, n *trieNode, offset, cutoff int) {
|
||||
boff := nr * blockSize
|
||||
fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
|
||||
var printnewline bool
|
||||
for i := 0; i < blockSize; i++ {
|
||||
if i%8 == 0 {
|
||||
printnewline = true
|
||||
}
|
||||
v := 0
|
||||
if nn := n.table[i+offset]; nn != nil {
|
||||
v = nn.value
|
||||
}
|
||||
if v != 0 {
|
||||
if v < 0 {
|
||||
v = -v - 1 + cutoff
|
||||
}
|
||||
if printnewline {
|
||||
fmt.Printf("\n")
|
||||
printnewline = false
|
||||
}
|
||||
fmt.Printf("%#03x:%#02x, ", boff+i, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// printTables returns the size in bytes of the generated tables.
|
||||
func (t *trieNode) printTables(name string) int {
|
||||
index := newIndex()
|
||||
// Values for 7-bit ASCII are stored in first two block, followed by nil block.
|
||||
index.valueBlocks = append(index.valueBlocks, nil, nil, nil)
|
||||
// First byte of multi-byte UTF-8 codepoints are indexed in 4th block.
|
||||
index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil, nil)
|
||||
// Index starter bytes of multi-byte UTF-8.
|
||||
for i := 0xC0; i < 0x100; i++ {
|
||||
if t.table[i] != nil {
|
||||
computeOffsets(index, t.table[i])
|
||||
}
|
||||
}
|
||||
|
||||
nv := len(index.valueBlocks) * blockSize
|
||||
fmt.Printf("// %sValues: %d entries, %d bytes\n", name, nv, nv*2)
|
||||
fmt.Printf("// Block 2 is the null block.\n")
|
||||
fmt.Printf("var %sValues = [%d]uint16 {", name, nv)
|
||||
printValueBlock(0, t, 0)
|
||||
printValueBlock(1, t, 64)
|
||||
printValueBlock(2, newNode(), 0)
|
||||
for i := 3; i < len(index.valueBlocks); i++ {
|
||||
printValueBlock(i, index.valueBlocks[i], 0x80)
|
||||
}
|
||||
fmt.Print("\n}\n\n")
|
||||
|
||||
ls := len(index.sparseBlocks)
|
||||
fmt.Printf("// %sSparseOffset: %d entries, %d bytes\n", name, ls, ls*2)
|
||||
fmt.Printf("var %sSparseOffset = %#v\n\n", name, index.sparseOffset[1:])
|
||||
|
||||
ns := index.sparseCount
|
||||
fmt.Printf("// %sSparseValues: %d entries, %d bytes\n", name, ns, ns*4)
|
||||
fmt.Printf("var %sSparseValues = [%d]valueRange {", name, ns)
|
||||
for i, n := range index.sparseBlocks {
|
||||
printSparseBlock(i, n)
|
||||
}
|
||||
fmt.Print("\n}\n\n")
|
||||
|
||||
cutoff := len(index.valueBlocks) - blockOffset
|
||||
ni := len(index.lookupBlocks) * blockSize
|
||||
fmt.Printf("// %sLookup: %d bytes\n", name, ni)
|
||||
fmt.Printf("// Block 0 is the null block.\n")
|
||||
fmt.Printf("var %sLookup = [%d]uint8 {", name, ni)
|
||||
printLookupBlock(0, newNode(), 0, cutoff)
|
||||
printLookupBlock(1, newNode(), 0, cutoff)
|
||||
printLookupBlock(2, newNode(), 0, cutoff)
|
||||
printLookupBlock(3, t, 0xC0, cutoff)
|
||||
for i := 4; i < len(index.lookupBlocks); i++ {
|
||||
printLookupBlock(i, index.lookupBlocks[i], 0x80, cutoff)
|
||||
}
|
||||
fmt.Print("\n}\n\n")
|
||||
fmt.Printf("var %sTrie = trie{ %sLookup[:], %sValues[:], %sSparseValues[:], %sSparseOffset[:], %d}\n\n",
|
||||
name, name, name, name, name, cutoff)
|
||||
return nv*2 + ns*4 + ni + ls*2
|
||||
}
|
Loading…
Reference in New Issue
Block a user