From 27f2d5ce8cda3734200300d06889729e3c517d15 Mon Sep 17 00:00:00 2001 From: Nigel Kerr Date: Thu, 9 Dec 2010 14:51:01 -0500 Subject: [PATCH] xml: disallow invalid Unicode code points Fixes #1259. R=rsc CC=golang-dev https://golang.org/cl/2967041 --- src/pkg/xml/xml.go | 28 ++++++++++++++++++++++++++++ src/pkg/xml/xml_test.go | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/src/pkg/xml/xml.go b/src/pkg/xml/xml.go index eed9355547c..4d9c672d273 100644 --- a/src/pkg/xml/xml.go +++ b/src/pkg/xml/xml.go @@ -16,6 +16,7 @@ package xml import ( "bufio" "bytes" + "fmt" "io" "os" "strconv" @@ -871,6 +872,21 @@ Input: data := p.buf.Bytes() data = data[0 : len(data)-trunc] + // Inspect each rune for being a disallowed character. + buf := data + for len(buf) > 0 { + r, size := utf8.DecodeRune(buf) + if r == utf8.RuneError && size == 1 { + p.err = p.syntaxError("invalid UTF-8") + return nil + } + buf = buf[size:] + if !isInCharacterRange(r) { + p.err = p.syntaxError(fmt.Sprintf("illegal character code %U", r)) + return nil + } + } + // Must rewrite \r and \r\n into \n. w := 0 for r := 0; r < len(data); r++ { @@ -887,6 +903,18 @@ Input: return data[0:w] } +// Decide whether the given rune is in the XML Character Range, per +// the Char production of http://www.xml.com/axml/testaxml.htm, +// Section 2.2 Characters. +func isInCharacterRange(rune int) (inrange bool) { + return rune == 0x09 || + rune == 0x0A || + rune == 0x0D || + rune >= 0x20 && rune <= 0xDF77 || + rune >= 0xE000 && rune <= 0xFFFD || + rune >= 0x10000 && rune <= 0x10FFFF +} + // Get name space name: name with a : stuck in the middle. // The part before the : is the name space identifier. func (p *Parser) nsname() (name Name, ok bool) { diff --git a/src/pkg/xml/xml_test.go b/src/pkg/xml/xml_test.go index 2c73fcc803e..9ab199a30e7 100644 --- a/src/pkg/xml/xml_test.go +++ b/src/pkg/xml/xml_test.go @@ -398,3 +398,44 @@ func TestEntityInsideCDATA(t *testing.T) { t.Fatalf("p.Token() = _, %v, want _, os.EOF", err) } } + + +// The last three tests (respectively one for characters in attribute +// names and two for character entities) pass not because of code +// changed for issue 1259, but instead pass with the given messages +// from other parts of xml.Parser. I provide these to note the +// current behavior of situations where one might think that character +// range checking would detect the error, but it does not in fact. + +var characterTests = []struct { + in string + err string +}{ + {"\x12", "illegal character code U+0012"}, + {"\x0b", "illegal character code U+000B"}, + {"\xef\xbf\xbe", "illegal character code U+FFFE"}, + {"\r\n\x07", "illegal character code U+0007"}, + {"what's up", "expected attribute name in element"}, + {"&\x01;", "invalid character entity &;"}, + {"&\xef\xbf\xbe;", "invalid character entity &;"}, +} + + +func TestDisallowedCharacters(t *testing.T) { + + for i, tt := range characterTests { + p := NewParser(StringReader(tt.in)) + var err os.Error + + for err == nil { + _, err = p.Token() + } + synerr, ok := err.(*SyntaxError) + if !ok { + t.Fatalf("input %d p.Token() = _, %v, want _, *SyntaxError", i, err) + } + if synerr.Msg != tt.err { + t.Fatalf("input %d synerr.Msg wrong: want '%s', got '%s'", i, tt.err, synerr.Msg) + } + } +}