From 1887907fee0b51502a3ef10a63a96543d5dd3e23 Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Sun, 16 Oct 2011 20:50:11 +1100 Subject: [PATCH] html: tokenize "a < b" as one whole text token. R=andybalholm CC=golang-dev https://golang.org/cl/5284042 --- src/pkg/html/token.go | 47 ++++++++++++----- src/pkg/html/token_test.go | 104 +++++++++++++++++++++++++++++++++---- 2 files changed, 128 insertions(+), 23 deletions(-) diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index a02b968dc28..2105cc6f561 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -379,15 +379,16 @@ func (z *Tokenizer) nextTag() { z.nextBogusComment() return default: - z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags") + z.nextText() return } // Read the tag name and attribute key/value pairs. z.readTagName() + if z.skipWhiteSpace(); z.err != nil { + z.tt = ErrorToken + return + } for { - if z.skipWhiteSpace(); z.err != nil { - break - } c := z.readByte() if z.err != nil || c == '>' { break @@ -399,6 +400,9 @@ func (z *Tokenizer) nextTag() { if z.pendingAttr[0].start != z.pendingAttr[0].end { z.attr = append(z.attr, z.pendingAttr) } + if z.skipWhiteSpace(); z.err != nil { + break + } } // Check for a self-closing token. if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' { @@ -510,21 +514,40 @@ func (z *Tokenizer) readTagAttrVal() { } } -// nextText reads all text up until an '<'. -// Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end. +// nextText reads all text up until a start tag "bd", "$b$$d$", }, + // Angle brackets that aren't a tag. + { + "not a tag #0", + "<", + "<", + }, + { + "not a tag #1", + "", + "", + }, + { + "not a tag #3", + "ab", + "a$b", + }, + */ + { + "not a tag #4", + "", + "", + }, + { + "not a tag #5", + "a < b", + "a < b", + }, + { + "not a tag #6", + "<.>", + "<.>", + }, + { + "not a tag #7", + "a<<>>c", + "a<<$$>>c", + }, + { + "not a tag #8", + "if x<0 and y < 0 then x*y>0", + "if x<0 and y < 0 then x*y>0", + }, + // EOF in a tag name. + { + "tag name eof #0", + "", + }, + { + "tag name eof #4", + ``, + }, // Some malformed tags that are missing a '>'. { "malformed tag #0", @@ -257,8 +337,8 @@ var tokenTests = []tokenTest{ }, { "Attributes with a solitary single quote", - "

", - "

$

", + `

`, + `

$

`, }, } @@ -267,15 +347,17 @@ loop: for _, tt := range tokenTests { z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) z.ReturnComments = true - for i, s := range strings.Split(tt.golden, "$") { - if z.Next() == ErrorToken { - t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) - continue loop - } - actual := z.Token().String() - if s != actual { - t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) - continue loop + if tt.golden != "" { + for i, s := range strings.Split(tt.golden, "$") { + if z.Next() == ErrorToken { + t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) + continue loop + } + actual := z.Token().String() + if s != actual { + t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) + continue loop + } } } z.Next()