1
0
mirror of https://github.com/golang/go synced 2024-11-22 04:24:39 -07:00

encoding/json: coerce invalid UTF-8 to valid UTF-8 during Marshal

In practice, rejecting an entire structure due to a single invalid byte
in a string is just too picky, and too hard to track down.
Be consistent with the bulk of the standard library by converting
invalid UTF-8 into UTF-8 with replacement runes.

R=golang-dev, crawshaw
CC=golang-dev
https://golang.org/cl/11211045
This commit is contained in:
Russ Cox 2013-07-12 17:37:10 -04:00
parent cfefe6a763
commit 64054a40ad
3 changed files with 17 additions and 11 deletions

View File

@ -17,6 +17,7 @@ crypto/sha1: Sum function to simplify hashing (CL 10571043).
crypto/sha256: Sum256 and Sum224 functions to simplify hashing (CL 10629043).
crypto/sha512: Sum512 and Sum384 functions to simplify hashing (CL 10630043).
crypto/tls: add support for TLS 1.1. (CL 7872043).
encoding/json: accept but correct invalid UTF-8 in Marshal (CL 11211045).
flag: add Getter interface (CL 10472043).
fmt: indexed access to arguments in Printf etc. (CL 9680043).
go/build: support including C++ code with cgo (CL 8248043).

View File

@ -393,15 +393,10 @@ func TestMarshal(t *testing.T) {
func TestMarshalBadUTF8(t *testing.T) {
s := "hello\xffworld"
const enc = `"hello\ufffdworld"`
b, err := Marshal(s)
if err == nil {
t.Fatal("Marshal bad UTF8: no error")
}
if len(b) != 0 {
t.Fatal("Marshal returned data")
}
if _, ok := err.(*InvalidUTF8Error); !ok {
t.Fatalf("Marshal did not return InvalidUTF8Error: %T %v", err, err)
if string(b) != enc || err != nil {
t.Errorf("Marshal(%q) = %#q, %v, want %#q, nil", s, b, err, enc)
}
}

View File

@ -209,8 +209,12 @@ func (e *UnsupportedValueError) Error() string {
return "json: unsupported value: " + e.Str
}
// An InvalidUTF8Error is returned by Marshal when attempting
// to encode a string value with invalid UTF-8 sequences.
// Before Go 1.2, an InvalidUTF8Error was returned by Marshal when
// attempting to encode a string value with invalid UTF-8 sequences.
// As of Go 1.2, Marshal instead coerces the string to valid UTF-8 by
// replacing invalid bytes with the Unicode replacement rune U+FFFD.
// This error is no longer generated but is kept for backwards compatibility
// with programs that might mention it.
type InvalidUTF8Error struct {
S string // the whole string value that caused the error
}
@ -555,7 +559,13 @@ func (e *encodeState) string(s string) (int, error) {
}
c, size := utf8.DecodeRuneInString(s[i:])
if c == utf8.RuneError && size == 1 {
e.error(&InvalidUTF8Error{s})
if start < i {
e.WriteString(s[start:i])
}
e.WriteString(`\ufffd`)
i += size
start = i
continue
}
// U+2028 is LINE SEPARATOR.
// U+2029 is PARAGRAPH SEPARATOR.