mirror of
https://github.com/golang/go
synced 2024-11-22 09:24:41 -07:00
exp/norm: fixed two unrelated bugs in normalization library.
1) incorrect length given for out buffer in String.
2) patchTail bug that could cause characters to be lost
   when crossing into the out-buffer boundary.

Added tests to expose these bugs. Also slightly improved
performance of Bytes() and String() by sharing the reorderBuffer
across operations.

Fixes #2567.

R=r
CC=golang-dev
https://golang.org/cl/5502069
This commit is contained in:
parent
335c5db76a
commit
cadbd3ea49
@ -8,7 +8,7 @@ import "unicode/utf8"
|
|||||||
|
|
||||||
type input interface {
|
type input interface {
|
||||||
skipASCII(p int) int
|
skipASCII(p int) int
|
||||||
skipNonStarter() int
|
skipNonStarter(p int) int
|
||||||
appendSlice(buf []byte, s, e int) []byte
|
appendSlice(buf []byte, s, e int) []byte
|
||||||
copySlice(buf []byte, s, e int)
|
copySlice(buf []byte, s, e int)
|
||||||
charinfo(p int) (uint16, int)
|
charinfo(p int) (uint16, int)
|
||||||
@ -25,8 +25,7 @@ func (s inputString) skipASCII(p int) int {
|
|||||||
return p
|
return p
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s inputString) skipNonStarter() int {
|
func (s inputString) skipNonStarter(p int) int {
|
||||||
p := 0
|
|
||||||
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
|
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
|
||||||
}
|
}
|
||||||
return p
|
return p
|
||||||
@ -71,8 +70,7 @@ func (s inputBytes) skipASCII(p int) int {
|
|||||||
return p
|
return p
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s inputBytes) skipNonStarter() int {
|
func (s inputBytes) skipNonStarter(p int) int {
|
||||||
p := 0
|
|
||||||
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
|
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
|
||||||
}
|
}
|
||||||
return p
|
return p
|
||||||
|
@ -34,24 +34,28 @@ const (
|
|||||||
|
|
||||||
// Bytes returns f(b). May return b if f(b) = b.
|
// Bytes returns f(b). May return b if f(b) = b.
|
||||||
func (f Form) Bytes(b []byte) []byte {
|
func (f Form) Bytes(b []byte) []byte {
|
||||||
n := f.QuickSpan(b)
|
rb := reorderBuffer{}
|
||||||
|
rb.init(f, b)
|
||||||
|
n := quickSpan(&rb, 0)
|
||||||
if n == len(b) {
|
if n == len(b) {
|
||||||
return b
|
return b
|
||||||
}
|
}
|
||||||
out := make([]byte, n, len(b))
|
out := make([]byte, n, len(b))
|
||||||
copy(out, b[0:n])
|
copy(out, b[0:n])
|
||||||
return f.Append(out, b[n:]...)
|
return doAppend(&rb, out, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
// String returns f(s).
|
// String returns f(s).
|
||||||
func (f Form) String(s string) string {
|
func (f Form) String(s string) string {
|
||||||
n := f.QuickSpanString(s)
|
rb := reorderBuffer{}
|
||||||
|
rb.initString(f, s)
|
||||||
|
n := quickSpan(&rb, 0)
|
||||||
if n == len(s) {
|
if n == len(s) {
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
out := make([]byte, 0, len(s))
|
out := make([]byte, n, len(s))
|
||||||
copy(out, s[0:n])
|
copy(out, s[0:n])
|
||||||
return string(f.AppendString(out, s[n:]))
|
return string(doAppend(&rb, out, n))
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsNormal returns true if b == f(b).
|
// IsNormal returns true if b == f(b).
|
||||||
@ -122,23 +126,27 @@ func (f Form) IsNormalString(s string) bool {
|
|||||||
|
|
||||||
// patchTail fixes a case where a rune may be incorrectly normalized
|
// patchTail fixes a case where a rune may be incorrectly normalized
|
||||||
// if it is followed by illegal continuation bytes. It returns the
|
// if it is followed by illegal continuation bytes. It returns the
|
||||||
// patched buffer and the number of trailing continuation bytes that
|
// patched buffer and whether there were trailing continuation bytes.
|
||||||
// have been dropped.
|
func patchTail(rb *reorderBuffer, buf []byte) ([]byte, bool) {
|
||||||
func patchTail(rb *reorderBuffer, buf []byte) ([]byte, int) {
|
|
||||||
info, p := lastRuneStart(&rb.f, buf)
|
info, p := lastRuneStart(&rb.f, buf)
|
||||||
if p == -1 || info.size == 0 {
|
if p == -1 || info.size == 0 {
|
||||||
return buf, 0
|
return buf, false
|
||||||
}
|
}
|
||||||
end := p + int(info.size)
|
end := p + int(info.size)
|
||||||
extra := len(buf) - end
|
extra := len(buf) - end
|
||||||
if extra > 0 {
|
if extra > 0 {
|
||||||
|
// Potentially allocating memory. However, this only
|
||||||
|
// happens with ill-formed UTF-8.
|
||||||
|
x := make([]byte, 0)
|
||||||
|
x = append(x, buf[len(buf)-extra:]...)
|
||||||
buf = decomposeToLastBoundary(rb, buf[:end])
|
buf = decomposeToLastBoundary(rb, buf[:end])
|
||||||
if rb.f.composing {
|
if rb.f.composing {
|
||||||
rb.compose()
|
rb.compose()
|
||||||
}
|
}
|
||||||
return rb.flush(buf), extra
|
buf = rb.flush(buf)
|
||||||
|
return append(buf, x...), true
|
||||||
}
|
}
|
||||||
return buf, 0
|
return buf, false
|
||||||
}
|
}
|
||||||
|
|
||||||
func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {
|
func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {
|
||||||
@ -157,23 +165,23 @@ func (f Form) Append(out []byte, src ...byte) []byte {
|
|||||||
}
|
}
|
||||||
rb := reorderBuffer{}
|
rb := reorderBuffer{}
|
||||||
rb.init(f, src)
|
rb.init(f, src)
|
||||||
return doAppend(&rb, out)
|
return doAppend(&rb, out, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
func doAppend(rb *reorderBuffer, out []byte) []byte {
|
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
|
||||||
src, n := rb.src, rb.nsrc
|
src, n := rb.src, rb.nsrc
|
||||||
doMerge := len(out) > 0
|
doMerge := len(out) > 0
|
||||||
p := 0
|
if q := src.skipNonStarter(p); q > p {
|
||||||
if p = src.skipNonStarter(); p > 0 {
|
|
||||||
// Move leading non-starters to destination.
|
// Move leading non-starters to destination.
|
||||||
out = src.appendSlice(out, 0, p)
|
out = src.appendSlice(out, p, q)
|
||||||
buf, ndropped := patchTail(rb, out)
|
buf, endsInError := patchTail(rb, out)
|
||||||
if ndropped > 0 {
|
if endsInError {
|
||||||
out = src.appendSlice(buf, p-ndropped, p)
|
out = buf
|
||||||
doMerge = false // no need to merge, ends with illegal UTF-8
|
doMerge = false // no need to merge, ends with illegal UTF-8
|
||||||
} else {
|
} else {
|
||||||
out = decomposeToLastBoundary(rb, buf) // force decomposition
|
out = decomposeToLastBoundary(rb, buf) // force decomposition
|
||||||
}
|
}
|
||||||
|
p = q
|
||||||
}
|
}
|
||||||
fd := &rb.f
|
fd := &rb.f
|
||||||
if doMerge {
|
if doMerge {
|
||||||
@ -217,7 +225,7 @@ func (f Form) AppendString(out []byte, src string) []byte {
|
|||||||
}
|
}
|
||||||
rb := reorderBuffer{}
|
rb := reorderBuffer{}
|
||||||
rb.initString(f, src)
|
rb.initString(f, src)
|
||||||
return doAppend(&rb, out)
|
return doAppend(&rb, out, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
|
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
|
||||||
@ -225,7 +233,8 @@ func (f Form) AppendString(out []byte, src string) []byte {
|
|||||||
func (f Form) QuickSpan(b []byte) int {
|
func (f Form) QuickSpan(b []byte) int {
|
||||||
rb := reorderBuffer{}
|
rb := reorderBuffer{}
|
||||||
rb.init(f, b)
|
rb.init(f, b)
|
||||||
return quickSpan(&rb, 0)
|
n := quickSpan(&rb, 0)
|
||||||
|
return n
|
||||||
}
|
}
|
||||||
|
|
||||||
func quickSpan(rb *reorderBuffer, i int) int {
|
func quickSpan(rb *reorderBuffer, i int) int {
|
||||||
@ -301,7 +310,7 @@ func (f Form) FirstBoundary(b []byte) int {
|
|||||||
|
|
||||||
func firstBoundary(rb *reorderBuffer) int {
|
func firstBoundary(rb *reorderBuffer) int {
|
||||||
src, nsrc := rb.src, rb.nsrc
|
src, nsrc := rb.src, rb.nsrc
|
||||||
i := src.skipNonStarter()
|
i := src.skipNonStarter(0)
|
||||||
if i >= nsrc {
|
if i >= nsrc {
|
||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
|
@ -253,7 +253,7 @@ var quickSpanNFDTests = []PositionTest{
|
|||||||
{"\u0316\u0300cd", 6, ""},
|
{"\u0316\u0300cd", 6, ""},
|
||||||
{"\u043E\u0308b", 5, ""},
|
{"\u043E\u0308b", 5, ""},
|
||||||
// incorrectly ordered combining characters
|
// incorrectly ordered combining characters
|
||||||
{"ab\u0300\u0316", 1, ""}, // TODO(mpvl): we could skip 'b' as well.
|
{"ab\u0300\u0316", 1, ""}, // TODO: we could skip 'b' as well.
|
||||||
{"ab\u0300\u0316cd", 1, ""},
|
{"ab\u0300\u0316cd", 1, ""},
|
||||||
// Hangul
|
// Hangul
|
||||||
{"같은", 0, ""},
|
{"같은", 0, ""},
|
||||||
@ -465,6 +465,7 @@ var appendTests = []AppendTest{
|
|||||||
{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
|
{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
|
||||||
{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
|
{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
|
||||||
{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
|
{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
|
||||||
|
{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
|
||||||
}
|
}
|
||||||
|
|
||||||
func appendF(f Form, out []byte, s string) []byte {
|
func appendF(f Form, out []byte, s string) []byte {
|
||||||
@ -475,9 +476,23 @@ func appendStringF(f Form, out []byte, s string) []byte {
|
|||||||
return f.AppendString(out, s)
|
return f.AppendString(out, s)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func bytesF(f Form, out []byte, s string) []byte {
|
||||||
|
buf := []byte{}
|
||||||
|
buf = append(buf, out...)
|
||||||
|
buf = append(buf, s...)
|
||||||
|
return f.Bytes(buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
func stringF(f Form, out []byte, s string) []byte {
|
||||||
|
outs := string(out) + s
|
||||||
|
return []byte(f.String(outs))
|
||||||
|
}
|
||||||
|
|
||||||
func TestAppend(t *testing.T) {
|
func TestAppend(t *testing.T) {
|
||||||
runAppendTests(t, "TestAppend", NFKC, appendF, appendTests)
|
runAppendTests(t, "TestAppend", NFKC, appendF, appendTests)
|
||||||
runAppendTests(t, "TestAppendString", NFKC, appendStringF, appendTests)
|
runAppendTests(t, "TestAppendString", NFKC, appendStringF, appendTests)
|
||||||
|
runAppendTests(t, "TestBytes", NFKC, bytesF, appendTests)
|
||||||
|
runAppendTests(t, "TestString", NFKC, stringF, appendTests)
|
||||||
}
|
}
|
||||||
|
|
||||||
func doFormBenchmark(b *testing.B, f Form, s string) {
|
func doFormBenchmark(b *testing.B, f Form, s string) {
|
||||||
|
@ -27,7 +27,7 @@ func (w *normWriter) Write(data []byte) (n int, err error) {
|
|||||||
}
|
}
|
||||||
w.rb.src = inputBytes(data[:m])
|
w.rb.src = inputBytes(data[:m])
|
||||||
w.rb.nsrc = m
|
w.rb.nsrc = m
|
||||||
w.buf = doAppend(&w.rb, w.buf)
|
w.buf = doAppend(&w.rb, w.buf, 0)
|
||||||
data = data[m:]
|
data = data[m:]
|
||||||
n += m
|
n += m
|
||||||
|
|
||||||
@ -101,7 +101,7 @@ func (r *normReader) Read(p []byte) (int, error) {
|
|||||||
r.rb.src = inputBytes(r.inbuf[0:n])
|
r.rb.src = inputBytes(r.inbuf[0:n])
|
||||||
r.rb.nsrc, r.err = n, err
|
r.rb.nsrc, r.err = n, err
|
||||||
if n > 0 {
|
if n > 0 {
|
||||||
r.outbuf = doAppend(&r.rb, r.outbuf)
|
r.outbuf = doAppend(&r.rb, r.outbuf, 0)
|
||||||
}
|
}
|
||||||
if err == io.EOF {
|
if err == io.EOF {
|
||||||
r.lastBoundary = len(r.outbuf)
|
r.lastBoundary = len(r.outbuf)
|
||||||
|
Loading…
Reference in New Issue
Block a user