html: tokenize HTML comments.

I'm not sure if it's 100% correct wrt the HTML5 specification, but the
test suite has plenty of HTML comment test cases, and we'll shake out
any tokenization bugs as the parser improves its coverage.

R=gri
CC=golang-dev
https://golang.org/cl/4186055
commit a5ff8ad9db (parent 3a2d64789b)
src/pkg/html/doc.go
@@ -69,6 +69,9 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}
 
+A Tokenizer typically skips over HTML comments. To return comment tokens, set
+Tokenizer.ReturnComments to true before looping over calls to Next.
+
 Parsing is done by calling Parse with an io.Reader, which returns the root of
 the parse tree (the document element) as a *Node. It is the caller's
 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
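As a usage sketch of the behavior documented above (illustrative only: it
assumes the "html" import path of the src/pkg tree at this time, and the
ReturnComments knob does not exist in the modern golang.org/x/net/html
package):

	package main

	import (
		"bytes"
		"fmt"
		"html" // import path as of this CL
	)

	func main() {
		z := html.NewTokenizer(bytes.NewBufferString("a<!-- note -->b"))
		z.ReturnComments = true // with the default false, Next skips the comment
		for {
			tt := z.Next()
			if tt == html.ErrorToken {
				break // end of input (os.EOF) or a real read error; see z.Error()
			}
			if tt == html.CommentToken {
				fmt.Printf("comment: %q\n", z.Text()) // prints comment: " note "
			}
		}
	}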
src/pkg/html/token.go
@@ -25,6 +25,8 @@ const (
 	EndTagToken
 	// A SelfClosingTagToken tag looks like <br/>.
 	SelfClosingTagToken
+	// A CommentToken looks like <!--x-->.
+	CommentToken
 )
 
 // String returns a string representation of the TokenType.
@@ -40,6 +42,8 @@ func (t TokenType) String() string {
 		return "EndTag"
 	case SelfClosingTagToken:
 		return "SelfClosingTag"
+	case CommentToken:
+		return "Comment"
 	}
 	return "Invalid(" + strconv.Itoa(int(t)) + ")"
 }
@@ -52,8 +56,8 @@ type Attribute struct {
 }
 
 // A Token consists of a TokenType and some Data (tag name for start and end
-// tags, content for text). A tag Token may also contain a slice of Attributes.
-// Data is unescaped for both tag and text Tokens (it looks like "a<b" rather
+// tags, content for text and comments). A tag Token may also contain a slice
+// of Attributes. Data is unescaped for all Tokens (it looks like "a<b" rather
 // than "a&lt;b").
 type Token struct {
 	Type TokenType
@@ -91,12 +95,18 @@ func (t Token) String() string {
 		return "</" + t.tagString() + ">"
 	case SelfClosingTagToken:
 		return "<" + t.tagString() + "/>"
+	case CommentToken:
+		return "<!--" + EscapeString(t.Data) + "-->"
 	}
 	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
 }
 
 // A Tokenizer returns a stream of HTML Tokens.
 type Tokenizer struct {
+	// If ReturnComments is set, Next returns comment tokens;
+	// otherwise it skips over comments (default).
+	ReturnComments bool
+
 	// r is the source of the HTML text.
 	r io.Reader
 	// tt is the TokenType of the most recently read token. If tt == Error
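A quick round-trip check of the new CommentToken case in String (a fragment,
not a full program; it assumes it runs inside package html with "fmt"
imported, since Token, CommentToken and EscapeString are the package's own):

	t := Token{Type: CommentToken, Data: "x"}
	fmt.Println(t.String()) // prints <!--x-->
	// Data holds unescaped text, so String re-escapes it on the way out:
	t.Data = "a<b"
	fmt.Println(t.String()) // prints <!--a&lt;b-->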
@@ -176,6 +186,39 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
 	panic("unreachable")
 }
 
+// nextMarkupDeclaration returns the next TokenType starting with "<!".
+func (z *Tokenizer) nextMarkupDeclaration() (TokenType, os.Error) {
+	// TODO: check for <!DOCTYPE ... >, don't just assume that it's a comment.
+	for i := 0; i < 2; i++ {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		if c != '-' {
+			return z.nextText(), nil
+		}
+	}
+	// <!--> is a valid comment.
+	for dashCount := 2; ; {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		switch c {
+		case '-':
+			dashCount++
+		case '>':
+			if dashCount >= 2 {
+				return CommentToken, nil
+			}
+			fallthrough
+		default:
+			dashCount = 0
+		}
+	}
+	panic("unreachable")
+}
+
 // nextTag returns the next TokenType starting from the tag open state.
 func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	c, err := z.readByte()
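To make the dash-counting loop concrete: dashCount starts at 2 because the
opening "<!--" already ends in two dashes (which is why "<!-->" is a complete
comment), and a '>' terminates the comment only when at least two '-'
immediately precede it. The following standalone sketch (a hypothetical
helper, not part of this CL) applies the same rule to a byte slice:

	package main

	import "fmt"

	// commentEnd mimics nextMarkupDeclaration's dash-counting loop. It scans s,
	// the input just after the leading "<!--", and returns the index just past
	// the terminating '>', or -1 if the comment never closes.
	func commentEnd(s []byte) int {
		dashCount := 2
		for i := 0; i < len(s); i++ {
			switch s[i] {
			case '-':
				dashCount++
			case '>':
				if dashCount >= 2 {
					return i + 1
				}
				fallthrough
			default:
				dashCount = 0
			}
		}
		return -1
	}

	func main() {
		// ">" and "->" close immediately; in "x>-->" the first '>' does not
		// close (no two preceding dashes); "-<>z" never closes (cf. comment7).
		for _, s := range []string{">", "->", "x>-->", "-<>z"} {
			fmt.Printf("<!--%s ends at %d\n", s, commentEnd([]byte(s)))
		}
	}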
@@ -189,7 +232,7 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
 		tt = StartTagToken
 	case c == '!':
-		return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
+		return z.nextMarkupDeclaration()
 	case c == '?':
 		return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
 	default:
@@ -221,22 +264,8 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	panic("unreachable")
 }
 
-// Next scans the next token and returns its type.
-func (z *Tokenizer) Next() TokenType {
-	if z.err != nil {
-		z.tt = ErrorToken
-		return z.tt
-	}
-	z.p0 = z.p1
-	c, err := z.readByte()
-	if err != nil {
-		z.tt, z.err = ErrorToken, err
-		return z.tt
-	}
-	if c == '<' {
-		z.tt, z.err = z.nextTag()
-		return z.tt
-	}
+// nextText reads all text up until an '<'.
+func (z *Tokenizer) nextText() TokenType {
 	for {
 		c, err := z.readByte()
 		if err != nil {
@@ -255,6 +284,31 @@ func (z *Tokenizer) Next() TokenType {
 	panic("unreachable")
 }
 
+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
+	for {
+		if z.err != nil {
+			z.tt = ErrorToken
+			return z.tt
+		}
+		z.p0 = z.p1
+		c, err := z.readByte()
+		if err != nil {
+			z.tt, z.err = ErrorToken, err
+			return z.tt
+		}
+		if c == '<' {
+			z.tt, z.err = z.nextTag()
+			if z.tt == CommentToken && !z.ReturnComments {
+				continue
+			}
+			return z.tt
+		}
+		return z.nextText()
+	}
+	panic("unreachable")
+}
+
 // trim returns the largest j such that z.buf[i:j] contains only white space,
 // or only white space plus the final ">" or "/>" of the raw data.
 func (z *Tokenizer) trim(i int) int {
@@ -299,12 +353,27 @@ loop:
 	return z.buf[i0:i], z.trim(i)
 }
 
-// Text returns the raw data after unescaping.
+// Text returns the unescaped text of a TextToken or a CommentToken.
 // The contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
-	s := unescape(z.Raw())
-	z.p0 = z.p1
-	return s
+	switch z.tt {
+	case TextToken:
+		s := unescape(z.Raw())
+		z.p0 = z.p1
+		return s
+	case CommentToken:
+		// We trim the "<!--" from the left and the "-->" from the right.
+		// "<!-->" is a valid comment, so the adjusted endpoints might overlap.
+		i0 := z.p0 + 4
+		i1 := z.p1 - 3
+		z.p0 = z.p1
+		var s []byte
+		if i0 < i1 {
+			s = unescape(z.buf[i0:i1])
+		}
+		return s
+	}
+	return nil
 }
 
 // TagName returns the lower-cased name of a tag token (the `img` out of
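The endpoint arithmetic in the CommentToken case is easiest to see on the
shortest inputs: the raw token "<!-->" is five bytes, so i0 = p0+4 lands past
i1 = p1-3 and Text returns nil, while for "<!--x-->" the slice [p0+4 : p1-3]
is exactly "x". A tiny sketch of the same arithmetic (hypothetical fragment,
taking p0 = 0 and p1 = len(raw) for simplicity):

	for _, raw := range []string{"<!-->", "<!--->", "<!--x-->"} {
		i0, i1 := 0+4, len(raw)-3 // trim "<!--" and "-->"
		if i0 < i1 {
			fmt.Printf("%q -> text %q\n", raw, raw[i0:i1])
		} else {
			fmt.Printf("%q -> no text (endpoints overlap)\n", raw)
		}
	}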
@@ -372,7 +441,7 @@ loop:
 func (z *Tokenizer) Token() Token {
 	t := Token{Type: z.tt}
 	switch z.tt {
-	case TextToken:
+	case TextToken, CommentToken:
 		t.Data = string(z.Text())
 	case StartTagToken, EndTagToken, SelfClosingTagToken:
 		var attr []Attribute
src/pkg/html/token_test.go
@@ -7,6 +7,7 @@ package html
 import (
 	"bytes"
 	"os"
+	"strings"
 	"testing"
 )
 
@@ -15,8 +16,8 @@ type tokenTest struct {
 	desc string
 	// The HTML to parse.
 	html string
-	// The string representations of the expected tokens.
-	tokens []string
+	// The string representations of the expected tokens, joined by '$'.
+	golden string
 }
 
 var tokenTests = []tokenTest{
@@ -25,61 +26,86 @@ var tokenTests = []tokenTest{
 	{
 		"text",
 		"foo bar",
-		[]string{
-			"foo bar",
-		},
+		"foo bar",
 	},
 	// An entity.
 	{
 		"entity",
 		"one &lt; two",
-		[]string{
-			"one &lt; two",
-		},
+		"one &lt; two",
 	},
 	// A start, self-closing and end tag. The tokenizer does not care if the start
 	// and end tokens don't match; that is the job of the parser.
 	{
 		"tags",
 		"<a>b<c/>d</e>",
-		[]string{
-			"<a>",
-			"b",
-			"<c/>",
-			"d",
-			"</e>",
-		},
+		"<a>$b$<c/>$d$</e>",
 	},
+	// Comments.
+	{
+		"comment0",
+		"abc<b><!-- skipme --></b>def",
+		"abc$<b>$</b>$def",
+	},
+	{
+		"comment1",
+		"a<!-->z",
+		"a$z",
+	},
+	{
+		"comment2",
+		"a<!--->z",
+		"a$z",
+	},
+	{
+		"comment3",
+		"a<!--x>-->z",
+		"a$z",
+	},
+	{
+		"comment4",
+		"a<!--x->-->z",
+		"a$z",
+	},
+	{
+		"comment5",
+		"a<!>z",
+		"a$<!>z",
+	},
+	{
+		"comment6",
+		"a<!->z",
+		"a$<!->z",
+	},
+	{
+		"comment7",
+		"a<!---<>z",
+		"a$<!---<>z",
+	},
+	{
+		"comment8",
+		"a<!--z",
+		"a$<!--z",
+	},
 	// An attribute with a backslash.
 	{
 		"backslash",
 		`<p id="a\"b">`,
-		[]string{
-			`<p id="a&quot;b">`,
-		},
+		`<p id="a&quot;b">`,
 	},
 	// Entities, tag name and attribute key lower-casing, and whitespace
 	// normalization within a tag.
 	{
 		"tricky",
 		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
-		[]string{
-			`<p id="a&quot;B" foo="bar">`,
-			"<em>",
-			"te&lt;&amp;;xt",
-			"</em>",
-			"</p>",
-		},
+		`<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
 	},
 	// A non-existant entity. Tokenizing and converting back to a string should
 	// escape the "&" to become "&amp;".
 	{
 		"noSuchEntity",
 		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
-		[]string{
-			`<a b="c&amp;noSuchEntity;d">`,
-			"&lt;&alsoDoesntExist;&amp;",
-		},
+		`<a b="c&amp;noSuchEntity;d">$&lt;&alsoDoesntExist;&amp;`,
 	},
 }
 
@@ -87,7 +113,7 @@ func TestTokenizer(t *testing.T) {
 loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
-		for i, s := range tt.tokens {
+		for i, s := range strings.Split(tt.golden, "$", -1) {
 			if z.Next() == ErrorToken {
 				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
 				continue loop
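Note that strings.Split(tt.golden, "$", -1) is the pre-Go 1 three-argument
form, where a count of -1 means "split on every separator"; today's
two-argument strings.Split behaves the same way. A minimal sketch of how a
golden string decomposes, using the comment0 case above (with ReturnComments
left at its false default, the comment is dropped and four tokens remain):

	golden := "abc$<b>$</b>$def"
	for i, s := range strings.Split(golden, "$") { // modern two-argument form
		fmt.Printf("token %d: %q\n", i, s)
	}
	// token 0: "abc"
	// token 1: "<b>"
	// token 2: "</b>"
	// token 3: "def"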