| // Copyright 2010 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package html |
| |
| import ( |
| "bytes" |
| "io" |
| "io/ioutil" |
| "reflect" |
| "runtime" |
| "strings" |
| "testing" |
| ) |
| |
| type tokenTest struct { |
| // A short description of the test case. |
| desc string |
| // The HTML to parse. |
| html string |
| // The string representations of the expected tokens, joined by '$'. |
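	// For example, the "tags" case below expects "<a>b<c/>d</e>" to yield
	// five tokens, "<a>", "b", "<c/>", "d" and "</e>", written as
	// "<a>$b$<c/>$d$</e>".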
| golden string |
| } |
| |
| var tokenTests = []tokenTest{ |
| { |
| "empty", |
| "", |
| "", |
| }, |
| // A single text node. The tokenizer should not break text nodes on whitespace, |
| // nor should it normalize whitespace within a text node. |
| { |
| "text", |
| "foo bar", |
| "foo bar", |
| }, |
| // An entity. |
| { |
| "entity", |
| "one < two", |
| "one < two", |
| }, |
| // A start, self-closing and end tag. The tokenizer does not care if the start |
| // and end tokens don't match; that is the job of the parser. |
| { |
| "tags", |
| "<a>b<c/>d</e>", |
| "<a>$b$<c/>$d$</e>", |
| }, |
| // Angle brackets that aren't a tag. |
| { |
| "not a tag #0", |
| "<", |
| "<", |
| }, |
| { |
| "not a tag #1", |
| "</", |
| "</", |
| }, |
| { |
| "not a tag #2", |
| "</>", |
| "<!---->", |
| }, |
| { |
| "not a tag #3", |
| "a</>b", |
| "a$<!---->$b", |
| }, |
| { |
| "not a tag #4", |
| "</ >", |
| "<!-- -->", |
| }, |
| { |
| "not a tag #5", |
| "</.", |
| "<!--.-->", |
| }, |
| { |
| "not a tag #6", |
| "</.>", |
| "<!--.-->", |
| }, |
| { |
| "not a tag #7", |
| "a < b", |
| "a < b", |
| }, |
| { |
| "not a tag #8", |
| "<.>", |
| "<.>", |
| }, |
| { |
| "not a tag #9", |
| "a<<<b>>>c", |
| "a<<$<b>$>>c", |
| }, |
| { |
| "not a tag #10", |
| "if x<0 and y < 0 then x*y>0", |
| "if x<0 and y < 0 then x*y>0", |
| }, |
| { |
| "not a tag #11", |
| "<<p>", |
| "<$<p>", |
| }, |
| // EOF in a tag name. |
| { |
| "tag name eof #0", |
| "<a", |
| "", |
| }, |
| { |
| "tag name eof #1", |
| "<a ", |
| "", |
| }, |
| { |
| "tag name eof #2", |
| "a<b", |
| "a", |
| }, |
| { |
| "tag name eof #3", |
| "<a><b", |
| "<a>", |
| }, |
| { |
| "tag name eof #4", |
| `<a x`, |
| ``, |
| }, |
| // Some malformed tags that are missing a '>'. |
| { |
| "malformed tag #0", |
| `<p</p>`, |
| `<p< p="">`, |
| }, |
| { |
| "malformed tag #1", |
| `<p </p>`, |
| `<p <="" p="">`, |
| }, |
| { |
| "malformed tag #2", |
| `<p id`, |
| ``, |
| }, |
| { |
| "malformed tag #3", |
| `<p id=`, |
| ``, |
| }, |
| { |
| "malformed tag #4", |
| `<p id=>`, |
| `<p id="">`, |
| }, |
| { |
| "malformed tag #5", |
| `<p id=0`, |
| ``, |
| }, |
| { |
| "malformed tag #6", |
| `<p id=0</p>`, |
| `<p id="0</p">`, |
| }, |
| { |
| "malformed tag #7", |
| `<p id="0</p>`, |
| ``, |
| }, |
| { |
| "malformed tag #8", |
| `<p id="0"</p>`, |
| `<p id="0" <="" p="">`, |
| }, |
| { |
| "malformed tag #9", |
| `<p></p id`, |
| `<p>`, |
| }, |
| // Raw text and RCDATA. |
| { |
| "basic raw text", |
| "<script><a></b></script>", |
| "<script>$<a></b>$</script>", |
| }, |
| { |
| "unfinished script end tag", |
| "<SCRIPT>a</SCR", |
| "<script>$a</SCR", |
| }, |
| { |
| "broken script end tag", |
| "<SCRIPT>a</SCR ipt>", |
| "<script>$a</SCR ipt>", |
| }, |
| { |
| "EOF in script end tag", |
| "<SCRIPT>a</SCRipt", |
| "<script>$a</SCRipt", |
| }, |
| { |
| "scriptx end tag", |
| "<SCRIPT>a</SCRiptx", |
| "<script>$a</SCRiptx", |
| }, |
| { |
| "' ' completes script end tag", |
| "<SCRIPT>a</SCRipt ", |
| "<script>$a", |
| }, |
| { |
| "'>' completes script end tag", |
| "<SCRIPT>a</SCRipt>", |
| "<script>$a$</script>", |
| }, |
| { |
| "self-closing script end tag", |
| "<SCRIPT>a</SCRipt/>", |
| "<script>$a$</script>", |
| }, |
| { |
| "nested script tag", |
| "<SCRIPT>a</SCRipt<script>", |
| "<script>$a</SCRipt<script>", |
| }, |
| { |
| "script end tag after unfinished", |
| "<SCRIPT>a</SCRipt</script>", |
| "<script>$a</SCRipt$</script>", |
| }, |
| { |
| "script/style mismatched tags", |
| "<script>a</style>", |
| "<script>$a</style>", |
| }, |
| { |
| "style element with entity", |
| "<style>'", |
| "<style>$&apos;", |
| }, |
| { |
| "textarea with tag", |
| "<textarea><div></textarea>", |
| "<textarea>$<div>$</textarea>", |
| }, |
| { |
| "title with tag and entity", |
| "<title><b>K&R C</b></title>", |
| "<title>$<b>K&R C</b>$</title>", |
| }, |
| { |
| "title with trailing '<' entity", |
| "<title>foobar<</title>", |
| "<title>$foobar<$</title>", |
| }, |
| // DOCTYPE tests. |
| { |
| "Proper DOCTYPE", |
| "<!DOCTYPE html>", |
| "<!DOCTYPE html>", |
| }, |
| { |
| "DOCTYPE with no space", |
| "<!doctypehtml>", |
| "<!DOCTYPE html>", |
| }, |
| { |
| "DOCTYPE with two spaces", |
| "<!doctype html>", |
| "<!DOCTYPE html>", |
| }, |
| { |
| "looks like DOCTYPE but isn't", |
| "<!DOCUMENT html>", |
| "<!--DOCUMENT html-->", |
| }, |
| { |
| "DOCTYPE at EOF", |
| "<!DOCtype", |
| "<!DOCTYPE >", |
| }, |
| // XML processing instructions. |
| { |
| "XML processing instruction", |
| "<?xml?>", |
| "<!--?xml?-->", |
| }, |
| // Comments. |
| { |
| "comment0", |
| "abc<b><!-- skipme --></b>def", |
| "abc$<b>$<!-- skipme -->$</b>$def", |
| }, |
| { |
| "comment1", |
| "a<!-->z", |
| "a$<!---->$z", |
| }, |
| { |
| "comment2", |
| "a<!--->z", |
| "a$<!---->$z", |
| }, |
| { |
| "comment3", |
| "a<!--x>-->z", |
| "a$<!--x>-->$z", |
| }, |
| { |
| "comment4", |
| "a<!--x->-->z", |
| "a$<!--x->-->$z", |
| }, |
| { |
| "comment5", |
| "a<!>z", |
| "a$<!---->$z", |
| }, |
| { |
| "comment6", |
| "a<!->z", |
| "a$<!----->$z", |
| }, |
| { |
| "comment7", |
| "a<!---<>z", |
| "a$<!---<>z-->", |
| }, |
| { |
| "comment8", |
| "a<!--z", |
| "a$<!--z-->", |
| }, |
| { |
| "comment9", |
| "a<!--z-", |
| "a$<!--z-->", |
| }, |
| { |
| "comment10", |
| "a<!--z--", |
| "a$<!--z-->", |
| }, |
| { |
| "comment11", |
| "a<!--z---", |
| "a$<!--z--->", |
| }, |
| { |
| "comment12", |
| "a<!--z----", |
| "a$<!--z---->", |
| }, |
| { |
| "comment13", |
| "a<!--x--!>z", |
| "a$<!--x-->$z", |
| }, |
| // An attribute with a backslash. |
| { |
| "backslash", |
| `<p id="a\"b">`, |
| `<p id="a\" b"="">`, |
| }, |
| // Entities, tag name and attribute key lower-casing, and whitespace |
| // normalization within a tag. |
| { |
| "tricky", |
| "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>", |
| `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`, |
| }, |
| // A nonexistent entity. Tokenizing and converting back to a string should |
| // escape the "&" to become "&". |
| { |
| "noSuchEntity", |
| `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`, |
| `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`, |
| }, |
| { |
| "entity without semicolon", |
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
| }, |
| { |
| "entity with digits", |
| "½", |
| "½", |
| }, |
| // Attribute tests: |
| // http://dev.w3.org/html5/pf-summary/Overview.html#attributes |
| { |
| "Empty attribute", |
| `<input disabled FOO>`, |
| `<input disabled="" foo="">`, |
| }, |
| { |
| "Empty attribute, whitespace", |
| `<input disabled FOO >`, |
| `<input disabled="" foo="">`, |
| }, |
| { |
| "Unquoted attribute value", |
| `<input value=yes FOO=BAR>`, |
| `<input value="yes" foo="BAR">`, |
| }, |
| { |
| "Unquoted attribute value, spaces", |
| `<input value = yes FOO = BAR>`, |
| `<input value="yes" foo="BAR">`, |
| }, |
| { |
| "Unquoted attribute value, trailing space", |
| `<input value=yes FOO=BAR >`, |
| `<input value="yes" foo="BAR">`, |
| }, |
| { |
| "Single-quoted attribute value", |
| `<input value='yes' FOO='BAR'>`, |
| `<input value="yes" foo="BAR">`, |
| }, |
| { |
| "Single-quoted attribute value, trailing space", |
| `<input value='yes' FOO='BAR' >`, |
| `<input value="yes" foo="BAR">`, |
| }, |
| { |
| "Double-quoted attribute value", |
| `<input value="I'm an attribute" FOO="BAR">`, |
| `<input value="I'm an attribute" foo="BAR">`, |
| }, |
| { |
| "Attribute name characters", |
| `<meta http-equiv="content-type">`, |
| `<meta http-equiv="content-type">`, |
| }, |
| { |
| "Mixed attributes", |
| `a<P V="0 1" w='2' X=3 y>z`, |
| `a$<p v="0 1" w="2" x="3" y="">$z`, |
| }, |
| { |
| "Attributes with a solitary single quote", |
| `<p id=can't><p id=won't>`, |
| `<p id="can't">$<p id="won't">`, |
| }, |
| } |
| |
| func TestTokenizer(t *testing.T) { |
| loop: |
| for _, tt := range tokenTests { |
| z := NewTokenizer(strings.NewReader(tt.html)) |
| if tt.golden != "" { |
| for i, s := range strings.Split(tt.golden, "$") { |
| if z.Next() == ErrorToken { |
| t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err()) |
| continue loop |
| } |
| actual := z.Token().String() |
| if s != actual { |
| t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) |
| continue loop |
| } |
| } |
| } |
| z.Next() |
| if z.Err() != io.EOF { |
| t.Errorf("%s: want EOF got %q", tt.desc, z.Err()) |
| } |
| } |
| } |
| |
| func TestMaxBuffer(t *testing.T) { |
| // Exceeding the maximum buffer size generates ErrBufferExceeded. |
| z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10))) |
| z.SetMaxBuf(5) |
| tt := z.Next() |
| if got, want := tt, ErrorToken; got != want { |
| t.Fatalf("token type: got: %v want: %v", got, want) |
| } |
| if got, want := z.Err(), ErrBufferExceeded; got != want { |
| t.Errorf("error type: got: %v want: %v", got, want) |
| } |
| if got, want := string(z.Raw()), "<tttt"; got != want { |
| t.Fatalf("buffered before overflow: got: %q want: %q", got, want) |
| } |
| } |
| |
| func TestMaxBufferReconstruction(t *testing.T) { |
| // Exceeding the maximum buffer size at any point while tokenizing permits |
| // reconstructing the original input. |
| tests: |
| for _, test := range tokenTests { |
| for maxBuf := 1; ; maxBuf++ { |
| r := strings.NewReader(test.html) |
| z := NewTokenizer(r) |
| z.SetMaxBuf(maxBuf) |
| var tokenized bytes.Buffer |
| for { |
| tt := z.Next() |
| tokenized.Write(z.Raw()) |
| if tt == ErrorToken { |
| if err := z.Err(); err != io.EOF && err != ErrBufferExceeded { |
| t.Errorf("%s: unexpected error: %v", test.desc, err) |
| } |
| break |
| } |
| } |
			// Reassemble the input: everything tokenized so far, anything still
			// buffered inside the tokenizer, and anything left unread in the reader.
| assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r)) |
| if err != nil { |
| t.Errorf("%s: ReadAll: %v", test.desc, err) |
| continue tests |
| } |
| if got, want := string(assembled), test.html; got != want { |
| t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want) |
| continue tests |
| } |
			// io.EOF means this maxBuf was large enough to tokenize the entire
			// input, so the previous value was the largest maxBuf that generates
			// ErrBufferExceeded; continue to the next test.
| if z.Err() == io.EOF { |
| break |
| } |
| } // buffer sizes |
| } // tests |
| } |
| |
| func TestPassthrough(t *testing.T) { |
| // Accumulating the raw output for each parse event should reconstruct the |
| // original input. |
| for _, test := range tokenTests { |
| z := NewTokenizer(strings.NewReader(test.html)) |
| var parsed bytes.Buffer |
| for { |
| tt := z.Next() |
| parsed.Write(z.Raw()) |
| if tt == ErrorToken { |
| break |
| } |
| } |
| if got, want := parsed.String(), test.html; got != want { |
| t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want) |
| } |
| } |
| } |
| |
| func TestBufAPI(t *testing.T) { |
| s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9" |
| z := NewTokenizer(bytes.NewBufferString(s)) |
| var result bytes.Buffer |
| depth := 0 |
| loop: |
| for { |
| tt := z.Next() |
| switch tt { |
| case ErrorToken: |
| if z.Err() != io.EOF { |
| t.Error(z.Err()) |
| } |
| break loop |
| case TextToken: |
| if depth > 0 { |
| result.Write(z.Text()) |
| } |
| case StartTagToken, EndTagToken: |
| tn, _ := z.TagName() |
| if len(tn) == 1 && tn[0] == 'a' { |
| if tt == StartTagToken { |
| depth++ |
| } else { |
| depth-- |
| } |
| } |
| } |
| } |
| u := "14567" |
| v := string(result.Bytes()) |
| if u != v { |
| t.Errorf("TestBufAPI: want %q got %q", u, v) |
| } |
| } |
| |
| func TestConvertNewlines(t *testing.T) { |
| testCases := map[string]string{ |
| "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n", |
| "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n", |
| "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n", |
| "": "", |
| "\n": "\n", |
| "\n\r": "\n\n", |
| "\r": "\n", |
| "\r\n": "\n", |
| "\r\n\n": "\n\n", |
| "\r\n\r": "\n\n", |
| "\r\n\r\n": "\n\n", |
| "\r\r": "\n\n", |
| "\r\r\n": "\n\n", |
| "\r\r\n\n": "\n\n\n", |
| "\r\r\r\n": "\n\n\n", |
| "\r \n": "\n \n", |
| "xyz": "xyz", |
| } |
| for in, want := range testCases { |
| if got := string(convertNewlines([]byte(in))); got != want { |
| t.Errorf("input %q: got %q, want %q", in, got, want) |
| } |
| } |
| } |
| |
| func TestReaderEdgeCases(t *testing.T) { |
| const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>" |
| testCases := []io.Reader{ |
| &zeroOneByteReader{s: s}, |
| &eofStringsReader{s: s}, |
| &stuckReader{}, |
| } |
| for i, tc := range testCases { |
| got := []TokenType{} |
| z := NewTokenizer(tc) |
| for { |
| tt := z.Next() |
| if tt == ErrorToken { |
| break |
| } |
| got = append(got, tt) |
| } |
| if err := z.Err(); err != nil && err != io.EOF { |
| if err != io.ErrNoProgress { |
| t.Errorf("i=%d: %v", i, err) |
| } |
| continue |
| } |
| want := []TokenType{ |
| StartTagToken, |
| TextToken, |
| EndTagToken, |
| } |
| if !reflect.DeepEqual(got, want) { |
| t.Errorf("i=%d: got %v, want %v", i, got, want) |
| continue |
| } |
| } |
| } |
| |
| // zeroOneByteReader is like a strings.Reader that alternates between |
| // returning 0 bytes and 1 byte at a time. |
| type zeroOneByteReader struct { |
| s string |
| n int |
| } |
| |
| func (r *zeroOneByteReader) Read(p []byte) (int, error) { |
| if len(p) == 0 { |
| return 0, nil |
| } |
| if len(r.s) == 0 { |
| return 0, io.EOF |
| } |
| r.n++ |
| if r.n%2 != 0 { |
| return 0, nil |
| } |
| p[0], r.s = r.s[0], r.s[1:] |
| return 1, nil |
| } |
| |
| // eofStringsReader is like a strings.Reader but can return an (n, err) where |
| // n > 0 && err != nil. |
| type eofStringsReader struct { |
| s string |
| } |
| |
| func (r *eofStringsReader) Read(p []byte) (int, error) { |
| n := copy(p, r.s) |
| r.s = r.s[n:] |
| if r.s != "" { |
| return n, nil |
| } |
| return n, io.EOF |
| } |
| |
| // stuckReader is an io.Reader that always returns no data and no error. |
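// Tokenizing from such a reader is expected to end with io.ErrNoProgress
// rather than looping forever; TestReaderEdgeCases above accepts that error.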
| type stuckReader struct{} |
| |
| func (*stuckReader) Read(p []byte) (int, error) { |
| return 0, nil |
| } |
| |
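// rawLevel, lowLevel and highLevel select progressively more expensive ways
// for benchmarkTokenizer to consume tokens: raw bytes only, the low-level
// []byte API (Text, TagName, TagAttr), or the high-level Token API, which
// allocates strings.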
| const ( |
| rawLevel = iota |
| lowLevel |
| highLevel |
| ) |
| |
| func benchmarkTokenizer(b *testing.B, level int) { |
| buf, err := ioutil.ReadFile("testdata/go1.html") |
| if err != nil { |
| b.Fatalf("could not read testdata/go1.html: %v", err) |
| } |
| b.SetBytes(int64(len(buf))) |
| runtime.GC() |
| b.ReportAllocs() |
| b.ResetTimer() |
| for i := 0; i < b.N; i++ { |
| z := NewTokenizer(bytes.NewBuffer(buf)) |
| for { |
| tt := z.Next() |
| if tt == ErrorToken { |
| if err := z.Err(); err != nil && err != io.EOF { |
| b.Fatalf("tokenizer error: %v", err) |
| } |
| break |
| } |
| switch level { |
| case rawLevel: |
| // Calling z.Raw just returns the raw bytes of the token. It does |
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
| z.Raw() |
| case lowLevel: |
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
| // whose contents may change on the next call to z.Next. |
| switch tt { |
| case TextToken, CommentToken, DoctypeToken: |
| z.Text() |
| case StartTagToken, SelfClosingTagToken: |
| _, more := z.TagName() |
| for more { |
| _, _, more = z.TagAttr() |
| } |
| case EndTagToken: |
| z.TagName() |
| } |
| case highLevel: |
				// Calling z.Token converts []byte values to strings whose validity
				// extends beyond the next call to z.Next.
| z.Token() |
| } |
| } |
| } |
| } |
| |
| func BenchmarkRawLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, rawLevel) } |
| func BenchmarkLowLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, lowLevel) } |
| func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) } |