| // Copyright 2010 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package html |
| |
| import ( |
| "bytes" |
| "os" |
| "strings" |
| "testing" |
| ) |
| |
// tokenTest describes a single tokenizer case: a label, the markup to
// tokenize, and the tokens expected back. Field order matters because the
// test table is written with positional composite literals.
type tokenTest struct {
	desc   string // short label, printed in failure messages
	html   string // input HTML handed to the tokenizer
	golden string // expected token strings, joined by '$'
}
| |
| var tokenTests = []tokenTest{ |
| // A single text node. The tokenizer should not break text nodes on whitespace, |
| // nor should it normalize whitespace within a text node. |
| { |
| "text", |
| "foo bar", |
| "foo bar", |
| }, |
| // An entity. |
| { |
| "entity", |
| "one < two", |
| "one < two", |
| }, |
| // A start, self-closing and end tag. The tokenizer does not care if the start |
| // and end tokens don't match; that is the job of the parser. |
| { |
| "tags", |
| "<a>b<c/>d</e>", |
| "<a>$b$<c/>$d$</e>", |
| }, |
| // Comments. |
| { |
| "comment0", |
| "abc<b><!-- skipme --></b>def", |
| "abc$<b>$</b>$def", |
| }, |
| { |
| "comment1", |
| "a<!-->z", |
| "a$z", |
| }, |
| { |
| "comment2", |
| "a<!--->z", |
| "a$z", |
| }, |
| { |
| "comment3", |
| "a<!--x>-->z", |
| "a$z", |
| }, |
| { |
| "comment4", |
| "a<!--x->-->z", |
| "a$z", |
| }, |
| { |
| "comment5", |
| "a<!>z", |
| "a$<!>z", |
| }, |
| { |
| "comment6", |
| "a<!->z", |
| "a$<!->z", |
| }, |
| { |
| "comment7", |
| "a<!---<>z", |
| "a$<!---<>z", |
| }, |
| { |
| "comment8", |
| "a<!--z", |
| "a$<!--z", |
| }, |
| // An attribute with a backslash. |
| { |
| "backslash", |
| `<p id="a\"b">`, |
| `<p id="a"b">`, |
| }, |
| // Entities, tag name and attribute key lower-casing, and whitespace |
| // normalization within a tag. |
| { |
| "tricky", |
| "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>", |
| `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`, |
| }, |
| // A non-existant entity. Tokenizing and converting back to a string should |
| // escape the "&" to become "&". |
| { |
| "noSuchEntity", |
| `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`, |
| `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`, |
| }, |
| } |
| |
| func TestTokenizer(t *testing.T) { |
| loop: |
| for _, tt := range tokenTests { |
| z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) |
| for i, s := range strings.Split(tt.golden, "$", -1) { |
| if z.Next() == ErrorToken { |
| t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) |
| continue loop |
| } |
| actual := z.Token().String() |
| if s != actual { |
| t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) |
| continue loop |
| } |
| } |
| z.Next() |
| if z.Error() != os.EOF { |
| t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String()) |
| } |
| } |
| } |
| |
// unescapeTest pairs an HTML text input with the string expected after
// unescaping it. Field order matters because the test table is written with
// positional composite literals.
type unescapeTest struct {
	desc      string // short label, printed in failure messages
	html      string // HTML text passed to UnescapeString
	unescaped string // expected unescaped result
}
| |
| var unescapeTests = []unescapeTest{ |
| // Handle no entities. |
| { |
| "copy", |
| "A\ttext\nstring", |
| "A\ttext\nstring", |
| }, |
| // Handle simple named entities. |
| { |
| "simple", |
| "& > <", |
| "& > <", |
| }, |
| // Handle hitting the end of the string. |
| { |
| "stringEnd", |
| "& &", |
| "& &", |
| }, |
| // Handle entities with two codepoints. |
| { |
| "multiCodepoint", |
| "text ⋛︀ blah", |
| "text \u22db\ufe00 blah", |
| }, |
| // Handle decimal numeric entities. |
| { |
| "decimalEntity", |
| "Delta = Δ ", |
| "Delta = Δ ", |
| }, |
| // Handle hexadecimal numeric entities. |
| { |
| "hexadecimalEntity", |
| "Lambda = λ = λ ", |
| "Lambda = λ = λ ", |
| }, |
| // Handle numeric early termination. |
| { |
| "numericEnds", |
| "&# &#x €43 © = ©f = ©", |
| "&# &#x €43 © = ©f = ©", |
| }, |
| // Handle numeric ISO-8859-1 entity replacements. |
| { |
| "numericReplacements", |
| "Footnote‡", |
| "Footnote‡", |
| }, |
| } |
| |
| func TestUnescape(t *testing.T) { |
| for _, tt := range unescapeTests { |
| unescaped := UnescapeString(tt.html) |
| if unescaped != tt.unescaped { |
| t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped) |
| } |
| } |
| } |
| |
| func TestUnescapeEscape(t *testing.T) { |
| ss := []string{ |
| ``, |
| `abc def`, |
| `a & b`, |
| `a&b`, |
| `a & b`, |
| `"`, |
| `"`, |
| `"<&>"`, |
| `"<&>"`, |
| `3&5==1 && 0<1, "0<1", a+acute=á`, |
| } |
| for _, s := range ss { |
| if s != UnescapeString(EscapeString(s)) { |
| t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s) |
| } |
| } |
| } |
| |
| func TestBufAPI(t *testing.T) { |
| s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9" |
| z := NewTokenizer(bytes.NewBuffer([]byte(s))) |
| result := bytes.NewBuffer(nil) |
| depth := 0 |
| loop: |
| for { |
| tt := z.Next() |
| switch tt { |
| case ErrorToken: |
| if z.Error() != os.EOF { |
| t.Error(z.Error()) |
| } |
| break loop |
| case TextToken: |
| if depth > 0 { |
| result.Write(z.Text()) |
| } |
| case StartTagToken, EndTagToken: |
| tn, _ := z.TagName() |
| if len(tn) == 1 && tn[0] == 'a' { |
| if tt == StartTagToken { |
| depth++ |
| } else { |
| depth-- |
| } |
| } |
| } |
| } |
| u := "14567" |
| v := string(result.Bytes()) |
| if u != v { |
| t.Errorf("TestBufAPI: want %q got %q", u, v) |
| } |
| } |