commit e5f3dc8bc5
parent d2b73730b7

html: refactor the tokenizer; parse "</>" correctly.

Previously, Next would call either nextText or nextTag, but nextTag could
also call nextText. Both nextText and nextTag were responsible for detecting
"</a" end tags and "<!" comments. This change simplifies the call chain and
puts that responsibility in a single place.

R=andybalholm
CC=golang-dev
https://golang.org/cl/5263050
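The parsing fix is easiest to see from the resulting token stream. Below is a minimal sketch using today's golang.org/x/net/html descendant of this package; NewTokenizer, Next, and Raw are the current API, which may differ in detail from the 2011 tree this commit patched, but the "</>" behavior it demonstrates is what this commit establishes.

	package main

	import (
		"fmt"
		"strings"

		"golang.org/x/net/html"
	)

	func main() {
		// "</>" generates no token at all, so the text on either side
		// comes back as two separate TextTokens: "a", then "b".
		z := html.NewTokenizer(strings.NewReader("a</>b"))
		for {
			tt := z.Next()
			if tt == html.ErrorToken {
				break // io.EOF on clean input
			}
			fmt.Printf("%-10v %q\n", tt, z.Raw())
		}
	}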
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -225,11 +225,9 @@ func (z *Tokenizer) skipWhiteSpace() {
 	}
 }
 
-// nextComment reads the next token starting with "<!--".
-// The opening "<!--" has already been consumed.
-// Pre-condition: z.tt == CommentToken && z.err == nil &&
-// z.raw.start + 4 <= z.raw.end.
-func (z *Tokenizer) nextComment() {
+// readComment reads the next comment token starting with "<!--". The opening
+// "<!--" has already been consumed.
+func (z *Tokenizer) readComment() {
 	z.data.start = z.raw.end
 	defer func() {
 		if z.data.end < z.data.start {
@@ -269,10 +267,8 @@ func (z *Tokenizer) nextComment() {
 	}
 }
 
-// nextBogusComment reads text until the next ">" and treats it as a comment.
-// Pre-condition: z.err == nil && z.raw.end is before the first comment byte.
-func (z *Tokenizer) nextBogusComment() {
-	z.tt = CommentToken
+// readUntilCloseAngle reads until the next ">".
+func (z *Tokenizer) readUntilCloseAngle() {
 	z.data.start = z.raw.end
 	for {
 		c := z.readByte()
@@ -287,24 +283,22 @@ func (z *Tokenizer) nextBogusComment() {
 	}
 }
 
-// nextMarkupDeclaration reads the next token starting with "<!".
-// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
-// The opening "<!" has already been consumed.
-// Pre-condition: z.err == nil && z.raw.start + 2 <= z.raw.end.
-func (z *Tokenizer) nextMarkupDeclaration() {
-	z.tt = CommentToken
+// readMarkupDeclaration reads the next token starting with "<!". It might be
+// a "<!--comment-->", a "<!DOCTYPE foo>", or "<!a bogus comment". The opening
+// "<!" has already been consumed.
+func (z *Tokenizer) readMarkupDeclaration() TokenType {
 	z.data.start = z.raw.end
 	var c [2]byte
 	for i := 0; i < 2; i++ {
 		c[i] = z.readByte()
 		if z.err != nil {
 			z.data.end = z.raw.end
-			return
+			return CommentToken
 		}
 	}
 	if c[0] == '-' && c[1] == '-' {
-		z.nextComment()
-		return
+		z.readComment()
+		return CommentToken
 	}
 	z.raw.end -= 2
 	const s = "DOCTYPE"
@@ -312,81 +306,33 @@ func (z *Tokenizer) nextMarkupDeclaration() {
 		c := z.readByte()
 		if z.err != nil {
 			z.data.end = z.raw.end
-			return
+			return CommentToken
 		}
 		if c != s[i] && c != s[i]+('a'-'A') {
 			// Back up to read the fragment of "DOCTYPE" again.
 			z.raw.end = z.data.start
-			z.nextBogusComment()
-			return
+			z.readUntilCloseAngle()
+			return CommentToken
 		}
 	}
-	z.tt = DoctypeToken
 	if z.skipWhiteSpace(); z.err != nil {
 		z.data.start = z.raw.end
 		z.data.end = z.raw.end
-		return
-	}
-	z.data.start = z.raw.end
-	for {
-		c := z.readByte()
-		if z.err != nil {
-			z.data.end = z.raw.end
-			return
-		}
-		if c == '>' {
-			z.data.end = z.raw.end - len(">")
-			return
-		}
+		return DoctypeToken
 	}
+	z.readUntilCloseAngle()
+	return DoctypeToken
 }
 
-// nextTag reads the next token starting with "<". It might be a "<startTag>",
-// an "</endTag>", a "<!markup declaration>", or "<malformed text".
-// The opening "<" has already been consumed.
-// Pre-condition: z.tt == TextToken && z.err == nil &&
-// z.raw.start + 1 <= z.raw.end.
-func (z *Tokenizer) nextTag() {
-	c := z.readByte()
-	if z.err != nil {
-		z.data = z.raw
-		return
-	}
-	switch {
-	case c == '/':
-		// Check that the "</" is followed by something in A-Za-z.
-		c = z.readByte()
-		if z.err != nil {
-			z.data = z.raw
-			return
-		}
-		z.raw.end--
-		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
-			z.nextBogusComment()
-			return
-		}
-		z.tt = EndTagToken
-		z.data.start += len("</")
-	// Lower-cased characters are more common in tag names, so we check for them first.
-	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
-		z.tt = StartTagToken
-		z.data.start += len("<")
-	case c == '!':
-		z.nextMarkupDeclaration()
-		return
-	case c == '?':
-		z.raw.end--
-		z.nextBogusComment()
-		return
-	default:
-		z.nextText()
-		return
-	}
+// readStartTag reads the next start tag token. The opening "<a" has already
+// been consumed, where 'a' means anything in [A-Za-z].
+func (z *Tokenizer) readStartTag() TokenType {
+	z.attr = z.attr[:0]
+	z.nAttrReturned = 0
 	// Read the tag name and attribute key/value pairs.
 	z.readTagName()
 	if z.skipWhiteSpace(); z.err != nil {
-		z.tt = ErrorToken
-		return
+		return ErrorToken
 	}
 	for {
 		c := z.readByte()
@@ -404,14 +350,31 @@ func (z *Tokenizer) nextTag() {
 			break
 		}
 	}
 	// Check for a self-closing token.
-	if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
-		z.tt = SelfClosingTagToken
+	if z.err == nil && z.buf[z.raw.end-2] == '/' {
+		return SelfClosingTagToken
 	}
+	return StartTagToken
+}
+
+// readEndTag reads the next end tag token. The opening "</a" has already
+// been consumed, where 'a' means anything in [A-Za-z].
+func (z *Tokenizer) readEndTag() {
+	z.attr = z.attr[:0]
+	z.nAttrReturned = 0
+	z.readTagName()
+	for {
+		c := z.readByte()
+		if z.err != nil || c == '>' {
+			return
+		}
+	}
 }
 
-// readTagName sets z.data to the "p" in "<p k=v>".
+// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
+// is positioned such that the first byte of the tag name (the "d" in "<div")
+// has already been consumed.
 func (z *Tokenizer) readTagName() {
 	z.data.start = z.raw.end - 1
 	for {
 		c := z.readByte()
@@ -430,7 +393,7 @@ func (z *Tokenizer) readTagName() {
 	}
 }
 
-// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<p k=v>".
+// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
 // Precondition: z.err == nil.
 func (z *Tokenizer) readTagAttrKey() {
 	z.pendingAttr[0].start = z.raw.end
@@ -452,7 +415,7 @@ func (z *Tokenizer) readTagAttrKey() {
 	}
 }
 
-// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<p k=v>".
+// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
func (z *Tokenizer) readTagAttrVal() {
 	z.pendingAttr[1].start = z.raw.end
 	z.pendingAttr[1].end = z.raw.end
@@ -514,69 +477,100 @@ func (z *Tokenizer) readTagAttrVal() {
 	}
 }
 
-// nextText reads all text up until a start tag "<a", end tag "</a", comment
-// "<!" or XML processing instruction "<?".
-// Pre-condition: z.tt == TextToken && z.err == nil &&
-// z.raw.start + 1 <= z.raw.end.
-func (z *Tokenizer) nextText() {
+// next scans the next token and returns its type.
+func (z *Tokenizer) next() TokenType {
+	if z.err != nil {
+		return ErrorToken
+	}
+	z.raw.start = z.raw.end
+	z.data.start = z.raw.end
+	z.data.end = z.raw.end
+
+loop:
 	for {
 		c := z.readByte()
 		if z.err != nil {
-			break
+			break loop
 		}
 		if c != '<' {
-			continue
+			continue loop
 		}
 
 		// Check if the '<' we have just read is part of a tag, comment
 		// or doctype. If not, it's part of the accumulated text token.
 		c = z.readByte()
 		if z.err != nil {
-			break
+			break loop
 		}
-		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
-			z.raw.end -= 2
-			break
-		}
-		if c != '/' {
+		var tokenType TokenType
+		switch {
+		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
+			tokenType = StartTagToken
+		case c == '/':
+			tokenType = EndTagToken
+		case c == '!' || c == '?':
+			// We use CommentToken to mean any of "<!--actual comments-->",
+			// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
+			tokenType = CommentToken
+		default:
 			continue
 		}
-		c = z.readByte()
-		if z.err != nil {
-			break
+
+		// We have a non-text token, but we might have accumulated some text
+		// before that. If so, we return the text first, and return the non-
+		// text token on the subsequent call to Next.
+		if x := z.raw.end - len("<a"); z.raw.start < x {
+			z.raw.end = x
+			z.data.end = x
+			return TextToken
 		}
-		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
-			z.raw.end -= 3
-			break
+		switch tokenType {
+		case StartTagToken:
+			return z.readStartTag()
+		case EndTagToken:
+			c = z.readByte()
+			if z.err != nil {
+				break loop
+			}
+			if c == '>' {
+				// "</>" does not generate a token at all.
+				// Reset the tokenizer state and start again.
+				z.raw.start = z.raw.end
+				z.data.start = z.raw.end
+				z.data.end = z.raw.end
+				continue loop
+			}
+			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+				z.readEndTag()
+				return EndTagToken
+			}
+			z.raw.end--
+			z.readUntilCloseAngle()
+			return CommentToken
+		case CommentToken:
+			if c == '!' {
+				return z.readMarkupDeclaration()
+			}
+			z.raw.end--
+			z.readUntilCloseAngle()
+			return CommentToken
 		}
 	}
-	z.data = z.raw
+	if z.raw.start < z.raw.end {
+		z.data.end = z.raw.end
+		return TextToken
+	}
+	return ErrorToken
 }
 
 // Next scans the next token and returns its type.
 func (z *Tokenizer) Next() TokenType {
 	for {
-		if z.err != nil {
-			z.tt = ErrorToken
-			return z.tt
-		}
-		z.raw.start = z.raw.end
-		z.data.start = z.raw.end
-		z.data.end = z.raw.end
-		z.attr = z.attr[:0]
-		z.nAttrReturned = 0
-
-		c := z.readByte()
-		if z.err != nil {
-			z.tt = ErrorToken
-			return z.tt
-		}
-		// We assume that the next token is text unless proven otherwise.
-		z.tt = TextToken
-		if c != '<' {
-			z.nextText()
-		} else {
-			z.nextTag()
-			if z.tt == CommentToken && !z.ReturnComments {
-				continue
-			}
+		z.tt = z.next()
+		// TODO: remove the ReturnComments option. A tokenizer should
+		// always return comment tags.
+		if z.tt == CommentToken && !z.ReturnComments {
+			continue
 		}
 		return z.tt
 	}
@@ -606,12 +600,14 @@ func (z *Tokenizer) Text() []byte {
 // `<IMG SRC="foo">`) and whether the tag has attributes.
 // The contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
-	switch z.tt {
-	case StartTagToken, EndTagToken, SelfClosingTagToken:
-		s := z.buf[z.data.start:z.data.end]
-		z.data.start = z.raw.end
-		z.data.end = z.raw.end
-		return lower(s), z.nAttrReturned < len(z.attr)
+	if z.data.start < z.data.end {
+		switch z.tt {
+		case StartTagToken, EndTagToken, SelfClosingTagToken:
+			s := z.buf[z.data.start:z.data.end]
+			z.data.start = z.raw.end
+			z.data.end = z.raw.end
+			return lower(s), z.nAttrReturned < len(z.attr)
+		}
 	}
 	return nil, false
 }
@@ -622,7 +618,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
 func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
 	if z.nAttrReturned < len(z.attr) {
 		switch z.tt {
-		case StartTagToken, EndTagToken, SelfClosingTagToken:
+		case StartTagToken, SelfClosingTagToken:
 			x := z.attr[z.nAttrReturned]
 			z.nAttrReturned++
 			key = z.buf[x[0].start:x[0].end]
@@ -640,7 +636,7 @@ func (z *Tokenizer) Token() Token {
 	switch z.tt {
 	case TextToken, CommentToken, DoctypeToken:
 		t.Data = string(z.Text())
-	case StartTagToken, EndTagToken, SelfClosingTagToken:
+	case StartTagToken, SelfClosingTagToken:
 		var attr []Attribute
 		name, moreAttr := z.TagName()
 		for moreAttr {
@@ -650,6 +646,9 @@ func (z *Tokenizer) Token() Token {
 		}
 		t.Data = string(name)
 		t.Attr = attr
+	case EndTagToken:
+		name, _ := z.TagName()
+		t.Data = string(name)
 	}
 	return t
 }
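One behavioral consequence of the readStartTag/readEndTag split is visible in Token() above: attributes are only collected for start and self-closing tags, so an EndTagToken carries just its name. A minimal sketch of that behavior, hedged in that it runs against today's golang.org/x/net/html descendant of this package rather than the 2011 tree:

	package main

	import (
		"fmt"
		"strings"

		"golang.org/x/net/html"
	)

	func main() {
		// Attribute-like text on an end tag is consumed by the tokenizer
		// but not reported: Token() keeps only the tag name.
		z := html.NewTokenizer(strings.NewReader(`</div class="x">`))
		z.Next() // EndTagToken
		t := z.Token()
		fmt.Println(t.Type, t.Data, len(t.Attr)) // EndTag div 0
	}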
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -57,19 +57,16 @@ var tokenTests = []tokenTest{
 		"</",
 		"</",
 	},
-	/*
-		// TODO: re-enable these tests when we tokenize them correctly.
-		{
-			"not a tag #2",
-			"</>",
-			"",
-		},
-		{
-			"not a tag #3",
-			"a</>b",
-			"a$b",
-		},
-	*/
+	{
+		"not a tag #2",
+		"</>",
+		"",
+	},
+	{
+		"not a tag #3",
+		"a</>b",
+		"a$b",
+	},
 	{
 		"not a tag #4",
 		"</ >",
@@ -77,21 +74,31 @@ var tokenTests = []tokenTest{
 	},
 	{
 		"not a tag #5",
+		"</.",
+		"<!--.-->",
+	},
+	{
+		"not a tag #6",
+		"</.>",
+		"<!--.-->",
+	},
+	{
+		"not a tag #7",
 		"a < b",
 		"a < b",
 	},
 	{
-		"not a tag #6",
+		"not a tag #8",
 		"<.>",
 		"<.>",
 	},
 	{
-		"not a tag #7",
+		"not a tag #9",
 		"a<<<b>>>c",
 		"a<<$<b>$>>c",
 	},
 	{
-		"not a tag #8",
+		"not a tag #10",
 		"if x<0 and y < 0 then x*y>0",
 		"if x<0 and y < 0 then x*y>0",
 	},
@@ -345,7 +352,7 @@ var tokenTests = []tokenTest{
 func TestTokenizer(t *testing.T) {
 loop:
 	for _, tt := range tokenTests {
-		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
+		z := NewTokenizer(strings.NewReader(tt.html))
 		z.ReturnComments = true
 		if tt.golden != "" {
 			for i, s := range strings.Split(tt.golden, "$") {