html: tokenize "a < b" as one whole text token.

R=andybalholm CC=golang-dev https://golang.org/cl/5284042
2024-11-22 03:24:41 -07:00 · 2011-10-16 20:50:11 +11:00 · 2011-10-16 20:50:11 +11:00 · 1887907fee
commit 1887907fee
parent 1135fc3978
2 changed files with 128 additions and 23 deletions
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -379,15 +379,16 @@ func (z *Tokenizer) nextTag() {
 		z.nextBogusComment()
 		return
 	default:
-		z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
+		z.nextText()
 		return
 	}
 	// Read the tag name and attribute key/value pairs.
 	z.readTagName()
+	if z.skipWhiteSpace(); z.err != nil {
+		z.tt = ErrorToken
+		return
+	}
 	for {
-		if z.skipWhiteSpace(); z.err != nil {
-			break
-		}
 		c := z.readByte()
 		if z.err != nil || c == '>' {
 			break
@@ -399,6 +400,9 @@ func (z *Tokenizer) nextTag() {
 		if z.pendingAttr[0].start != z.pendingAttr[0].end {
 			z.attr = append(z.attr, z.pendingAttr)
 		}
+		if z.skipWhiteSpace(); z.err != nil {
+			break
+		}
 	}
 	// Check for a self-closing token.
 	if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
@@ -510,21 +514,40 @@ func (z *Tokenizer) readTagAttrVal() {
 	}
 }
-// nextText reads all text up until an '<'.
+// nextText reads all text up until a start tag "<a", end tag "</a", comment
-// Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end.
+// "<!" or XML processing instruction "<?".
+// Pre-condition: z.tt == TextToken && z.err == nil &&
+//   z.raw.start + 1 <= z.raw.end.
 func (z *Tokenizer) nextText() {
 	for {
 		c := z.readByte()
 		if z.err != nil {
-			z.data = z.raw
+			break
-			return
 		}
-		if c == '<' {
+		if c != '<' {
-			z.raw.end--
+			continue
-			z.data = z.raw
+		}
-			return
+		c = z.readByte()
+		if z.err != nil {
+			break
+		}
+		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
+			z.raw.end -= 2
+			break
+		}
+		if c != '/' {
+			continue
+		}
+		c = z.readByte()
+		if z.err != nil {
+			break
+		}
+		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+			z.raw.end -= 3
+			break
 		}
 	}
+	z.data = z.raw
 }
 // Next scans the next token and returns its type.
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -21,6 +21,11 @@ type tokenTest struct {
 }
 var tokenTests = []tokenTest{
+	{
+		"empty",
+		"",
+		"",
+	},
 	// A single text node. The tokenizer should not break text nodes on whitespace,
 	// nor should it normalize whitespace within a text node.
 	{
@@ -41,6 +46,81 @@ var tokenTests = []tokenTest{
 		"<a>b<c/>d</e>",
 		"<a>$b$<c/>$d$</e>",
 	},
+	// Angle brackets that aren't a tag.
+	{
+		"not a tag #0",
+		"<",
+		"&lt;",
+	},
+	{
+		"not a tag #1",
+		"</",
+		"&lt;/",
+	},
+	/*
+		// TODO: re-enable these tests when we tokenize them correctly.
+		{
+			"not a tag #2",
+			"</>",
+			"",
+		},
+		{
+			"not a tag #3",
+			"a</>b",
+			"a$b",
+		},
+	*/
+	{
+		"not a tag #4",
+		"</ >",
+		"<!-- -->",
+	},
+	{
+		"not a tag #5",
+		"a < b",
+		"a &lt; b",
+	},
+	{
+		"not a tag #6",
+		"<.>",
+		"&lt;.&gt;",
+	},
+	{
+		"not a tag #7",
+		"a<<<b>>>c",
+		"a&lt;&lt;$<b>$&gt;&gt;c",
+	},
+	{
+		"not a tag #8",
+		"if x<0 and y < 0 then x*y>0",
+		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
+	},
+	// EOF in a tag name.
+	{
+		"tag name eof #0",
+		"<a",
+		"",
+	},
+	{
+		"tag name eof #1",
+		"<a ",
+		"",
+	},
+	{
+		"tag name eof #2",
+		"a<b",
+		"a",
+	},
+	{
+		"tag name eof #3",
+		"<a><b",
+		"<a>",
+	},
+	{
+		"tag name eof #4",
+		`<a x`,
+		`<a x="">`,
+	},
 	// Some malformed tags that are missing a '>'.
 	{
 		"malformed tag #0",
@@ -257,8 +337,8 @@ var tokenTests = []tokenTest{
 	},
 	{
 		"Attributes with a solitary single quote",
-		"<p id=can't><p id=won't>",
+		`<p id=can't><p id=won't>`,
-		"<p id=\"can&apos;t\">$<p id=\"won&apos;t\">",
+		`<p id="can&apos;t">$<p id="won&apos;t">`,
 	},
 }
@@ -267,15 +347,17 @@ loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
 		z.ReturnComments = true
-		for i, s := range strings.Split(tt.golden, "$") {
+		if tt.golden != "" {
-			if z.Next() == ErrorToken {
+			for i, s := range strings.Split(tt.golden, "$") {
-				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
+				if z.Next() == ErrorToken {
-				continue loop
+					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
-			}
+					continue loop
-			actual := z.Token().String()
+				}
-			if s != actual {
+				actual := z.Token().String()
-				t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+				if s != actual {
-				continue loop
+					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+					continue loop
+				}
 			}
 		}
 		z.Next()