blob: a5664dbb64a15086e02cf13b31b553ad608bd903 [file]
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bytes"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"reflect"
"regexp"
"runtime"
"slices"
"strconv"
"strings"
"testing"
"golang.org/x/net/html/atom"
)
// issue58246 is a Microsoft Outlook conditional comment that wraps
// angle-bracket-rich XML. The "issue58246MicrosoftOutlookComment2" entry in
// tokenTests embeds it and expects the comment token to render back unchanged.
// https://github.com/golang/go/issues/58246
const issue58246 = `<!--[if gte mso 12]>
<xml>
<o:OfficeDocumentSettings>
<o:AllowPNG/>
<o:PixelsPerInch>96</o:PixelsPerInch>
</o:OfficeDocumentSettings>
</xml>
<![endif]-->`
// tokenTest is one table-driven tokenizer case: an HTML input together with
// the string forms of the tokens it is expected to produce.
type tokenTest struct {
// A short description of the test case. TestTokenizer uses it as the
// subtest name and the other table-driven tests use it in failure messages.
desc string
// The HTML to parse.
html string
// The string representations of the expected tokens, joined by '$'.
// An empty string means that no tokens are expected.
golden string
}
// tokenTests is the table shared by TestTokenizer, TestMaxBufferReconstruction
// and TestPassthrough. Each entry pairs an HTML input with the '$'-joined
// string forms of the tokens expected from it. An empty golden string means
// that no tokens are expected before the tokenizer reports io.EOF.
var tokenTests = []tokenTest{
{
"empty",
"",
"",
},
// A single text node. The tokenizer should not break text nodes on whitespace,
// nor should it normalize whitespace within a text node.
{
"text",
"foo bar",
"foo bar",
},
// An entity.
{
"entity",
"one &lt; two",
"one &lt; two",
},
// A start, self-closing and end tag. The tokenizer does not care if the start
// and end tokens don't match; that is the job of the parser.
{
"tags",
"<a>b<c/>d</e>",
"<a>$b$<c/>$d$</e>",
},
// Angle brackets that aren't a tag.
{
"not a tag #0",
"<",
"&lt;",
},
{
"not a tag #1",
"</",
"&lt;/",
},
{
"not a tag #2",
"</>",
"<!---->",
},
{
"not a tag #3",
"a</>b",
"a$<!---->$b",
},
{
"not a tag #4",
"</ >",
"<!-- -->",
},
{
"not a tag #5",
"</.",
"<!--.-->",
},
{
"not a tag #6",
"</.>",
"<!--.-->",
},
{
"not a tag #7",
"a < b",
"a &lt; b",
},
{
"not a tag #8",
"<.>",
"&lt;.&gt;",
},
{
"not a tag #9",
"a<<<b>>>c",
"a&lt;&lt;$<b>$&gt;&gt;c",
},
{
"not a tag #10",
"if x<0 and y < 0 then x*y>0",
"if x&lt;0 and y &lt; 0 then x*y&gt;0",
},
{
"not a tag #11",
"<<p>",
"&lt;$<p>",
},
// EOF in a tag name.
{
"tag name eof #0",
"<a",
"",
},
{
"tag name eof #1",
"<a ",
"",
},
{
"tag name eof #2",
"a<b",
"a",
},
{
"tag name eof #3",
"<a><b",
"<a>",
},
{
"tag name eof #4",
`<a x`,
``,
},
// Some malformed tags that are missing a '>'.
{
"malformed tag #0",
`<p</p>`,
`<p< p="">`,
},
{
"malformed tag #1",
`<p </p>`,
`<p <="" p="">`,
},
{
"malformed tag #2",
`<p id`,
``,
},
{
"malformed tag #3",
`<p id=`,
``,
},
{
"malformed tag #4",
`<p id=>`,
`<p id="">`,
},
{
"malformed tag #5",
`<p id=0`,
``,
},
{
"malformed tag #6",
`<p id=0</p>`,
`<p id="0&lt;/p">`,
},
{
"malformed tag #7",
`<p id="0</p>`,
``,
},
{
"malformed tag #8",
`<p id="0"</p>`,
`<p id="0" <="" p="">`,
},
{
"malformed tag #9",
`<p></p id`,
`<p>`,
},
// Raw text and RCDATA.
{
"basic raw text",
"<script><a></b></script>",
"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
},
{
"unfinished script end tag",
"<SCRIPT>a</SCR",
"<script>$a&lt;/SCR",
},
{
"broken script end tag",
"<SCRIPT>a</SCR ipt>",
"<script>$a&lt;/SCR ipt&gt;",
},
{
"EOF in script end tag",
"<SCRIPT>a</SCRipt",
"<script>$a&lt;/SCRipt",
},
{
"scriptx end tag",
"<SCRIPT>a</SCRiptx",
"<script>$a&lt;/SCRiptx",
},
{
"' ' completes script end tag",
"<SCRIPT>a</SCRipt ",
"<script>$a",
},
{
"'>' completes script end tag",
"<SCRIPT>a</SCRipt>",
"<script>$a$</script>",
},
{
"self-closing script end tag",
"<SCRIPT>a</SCRipt/>",
"<script>$a$</script>",
},
{
"nested script tag",
"<SCRIPT>a</SCRipt<script>",
"<script>$a&lt;/SCRipt&lt;script&gt;",
},
{
"script end tag after unfinished",
"<SCRIPT>a</SCRipt</script>",
"<script>$a&lt;/SCRipt$</script>",
},
{
"script/style mismatched tags",
"<script>a</style>",
"<script>$a&lt;/style&gt;",
},
{
"style element with entity",
"<style>&apos;",
"<style>$&amp;apos;",
},
{
"textarea with tag",
"<textarea><div></textarea>",
"<textarea>$&lt;div&gt;$</textarea>",
},
{
"title with tag and entity",
"<title><b>K&amp;R C</b></title>",
"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
},
{
"title with trailing '&lt;' entity",
"<title>foobar<</title>",
"<title>$foobar&lt;$</title>",
},
// DOCTYPE tests.
{
"Proper DOCTYPE",
"<!DOCTYPE html>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with no space",
"<!doctypehtml>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with two spaces",
"<!doctype html>",
"<!DOCTYPE html>",
},
{
"looks like DOCTYPE but isn't",
"<!DOCUMENT html>",
"<!--DOCUMENT html-->",
},
{
"DOCTYPE at EOF",
"<!DOCtype",
"<!DOCTYPE >",
},
// XML processing instructions.
{
"XML processing instruction",
"<?xml?>",
"<!--?xml?-->",
},
// Comments. See also func TestComments.
{
"comment0",
"abc<b><!-- skipme --></b>def",
"abc$<b>$<!-- skipme -->$</b>$def",
},
{
"comment1",
"a<!-->z",
"a$<!---->$z",
},
{
"comment2",
"a<!--->z",
"a$<!---->$z",
},
{
"comment3",
"a<!--x>-->z",
"a$<!--x>-->$z",
},
{
"comment4",
"a<!--x->-->z",
"a$<!--x-&gt;-->$z",
},
{
"comment5",
"a<!>z",
"a$<!---->$z",
},
{
"comment6",
"a<!->z",
"a$<!----->$z",
},
{
"comment7",
"a<!---<>z",
"a$<!---<>z-->",
},
{
"comment8",
"a<!--z",
"a$<!--z-->",
},
{
"comment9",
"a<!--z-",
"a$<!--z-->",
},
{
"comment10",
"a<!--z--",
"a$<!--z-->",
},
{
"comment11",
"a<!--z---",
"a$<!--z--->",
},
{
"comment12",
"a<!--z----",
"a$<!--z---->",
},
{
"comment13",
"a<!--x--!>z",
"a$<!--x-->$z",
},
{
"comment14",
"a<!--!-->z",
"a$<!--!-->$z",
},
{
"comment15",
"a<!-- !-->z",
"a$<!-- !-->$z",
},
{
"comment16",
"a<!--i\x00j-->z",
"a$<!--i\uFFFDj-->$z",
},
{
"comment17",
"a<!--\x00",
"a$<!--\uFFFD-->",
},
{
"comment18",
"a<!--<!-->z",
"a$<!--<!-->$z",
},
{
"comment19",
"a<!--<!--",
"a$<!--<!-->",
},
{
"comment20",
"a<!--ij--kl-->z",
"a$<!--ij--kl-->$z",
},
{
"comment21",
"a<!--ij--kl--!>z",
"a$<!--ij--kl-->$z",
},
{
"comment22",
"a<!--!--!<--!-->z",
"a$<!--!--!<--!-->$z",
},
{
"comment23",
"a<!--&gt;-->z",
"a$<!--&gt;-->$z",
},
{
"comment24",
"a<!--&gt;>x",
"a$<!--&gt;>x-->",
},
{
"comment25",
"a<!--&gt;&gt;",
"a$<!--&gt;>-->",
},
{
"comment26",
"a<!--&gt;&gt;-",
"a$<!--&gt;>-->",
},
{
"comment27",
"a<!--&gt;&gt;-->z",
"a$<!--&gt;>-->$z",
},
{
"comment28",
"a<!--&amp;&gt;-->z",
"a$<!--&amp;>-->$z",
},
{
"comment29",
"a<!--&amp;gt;-->z",
"a$<!--&amp;gt;-->$z",
},
{
"comment30",
"a<!--&nosuchentity;-->z",
"a$<!--&amp;nosuchentity;-->$z",
},
{
"comment31",
"a<!--i>>j-->z",
"a$<!--i>>j-->$z",
},
{
"comment32",
"a<!--i!>>j-->z",
"a$<!--i!&gt;>j-->$z",
},
// https://stackoverflow.design/email/base/mso/#targeting-specific-outlook-versions
// says "[For] Windows Outlook 2003 and above... conditional comments allow
// us to add bits of HTML that are only read by the Word-based versions of
// Outlook". These comments (with angle brackets) should pass through
// unchanged (by this Go package) when rendering.
//
// We should also still escape ">" as "&gt;" when necessary.
// https://github.com/golang/go/issues/48237
//
// The "your code" example below comes from that stackoverflow.design link
// above but note that it can contain angle-bracket-rich XML.
// https://github.com/golang/go/issues/58246
{
"issue48237CommentWithAmpgtsemi1",
"a<!--<p></p>&lt;!--[video]--&gt;-->z",
"a$<!--<p></p><!--[video]--&gt;-->$z",
},
{
"issue48237CommentWithAmpgtsemi2",
"a<!--<p></p>&lt;!--[video]--!&gt;-->z",
"a$<!--<p></p><!--[video]--!&gt;-->$z",
},
{
"issue58246MicrosoftOutlookComment1",
"a<!--[if mso]> your code <![endif]-->z",
"a$<!--[if mso]> your code <![endif]-->$z",
},
{
"issue58246MicrosoftOutlookComment2",
"a" + issue58246 + "z",
"a$" + issue58246 + "$z",
},
// An attribute with a backslash.
{
"backslash",
`<p id="a\"b">`,
`<p id="a\" b"="">`,
},
// Entities, tag name and attribute key lower-casing, and whitespace
// normalization within a tag.
{
"tricky",
"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
},
// A nonexistent entity. Tokenizing and converting back to a string should
// escape the "&" to become "&amp;".
{
"noSuchEntity",
`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
},
{
"entity without semicolon",
`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
},
{
"entity with digits",
"&frac12;",
"½",
},
// Attribute tests:
// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
{
"Empty attribute",
`<input disabled FOO>`,
`<input disabled="" foo="">`,
},
{
"Empty attribute, whitespace",
`<input disabled FOO >`,
`<input disabled="" foo="">`,
},
{
"Unquoted attribute value",
`<input value=yes FOO=BAR>`,
`<input value="yes" foo="BAR">`,
},
{
"Unquoted attribute value, spaces",
`<input value = yes FOO = BAR>`,
`<input value="yes" foo="BAR">`,
},
{
"Unquoted attribute value, trailing space",
`<input value=yes FOO=BAR >`,
`<input value="yes" foo="BAR">`,
},
{
"Single-quoted attribute value",
`<input value='yes' FOO='BAR'>`,
`<input value="yes" foo="BAR">`,
},
{
"Single-quoted attribute value, trailing space",
`<input value='yes' FOO='BAR' >`,
`<input value="yes" foo="BAR">`,
},
{
"Double-quoted attribute value",
`<input value="I'm an attribute" FOO="BAR">`,
`<input value="I&#39;m an attribute" foo="BAR">`,
},
{
"Attribute name characters",
`<meta http-equiv="content-type">`,
`<meta http-equiv="content-type">`,
},
{
"Mixed attributes",
`a<P V="0 1" w='2' X=3 y>z`,
`a$<p v="0 1" w="2" x="3" y="">$z`,
},
{
"Attributes with a solitary single quote",
`<p id=can't><p id=won't>`,
`<p id="can&#39;t">$<p id="won&#39;t">`,
},
// WHATWG 13.2.5.32 equals sign before attribute name state
{
"equals sign before attribute name",
`<p =>`,
`<p =="">`,
},
{
"equals sign before attribute name, extra cruft",
`<p =asd>`,
`<p =asd="">`,
},
{
"forward slash before attribute name",
`<p/=">`,
`<p ="="">`,
},
{
"forward slash before attribute name with spaces around",
`<p / =">`,
`<p ="="">`,
},
{
"forward slash after attribute name followed by a character",
`<p a/ ="">`,
`<p a="" =""="">`,
},
{
"slash at end of unquoted attribute value",
`<p a="\">`,
`<p a="\">`,
},
// A trailing "/" directly after "a=" is expected to become the attribute
// value rather than to mark the tag as self-closing.
{
"self-closing tag with attribute",
`<p a=/>`,
`<p a="/">`,
},
// Only the first occurrence of a duplicated attribute is expected in the
// output, and ASCII case differences in the key still count as duplicates.
{
"duplicate attributes",
`<p foo="bar" foo="baz">`,
`<p foo="bar">`,
},
{
"duplicate attributes, different case",
`<p FOO="bar" foo="baz">`,
`<p foo="bar">`,
},
// Markup declarations cut off by EOF are expected to come out as comments.
{
"partial doctype",
`<!doc`,
`<!--doc-->`,
},
{
"partial cdata",
`<![CDA`,
`<!--[CDA-->`,
},
{
"partial comment",
`<!comment`,
`<!--comment-->`,
},
}
// TestTokenizer runs every tokenTests entry as a subtest. It compares the
// string form of each emitted token with the '$'-separated golden output and
// then checks that the tokenizer finishes with io.EOF.
func TestTokenizer(t *testing.T) {
	for _, tc := range tokenTests {
		t.Run(tc.desc, func(t *testing.T) {
			tokenizer := NewTokenizer(strings.NewReader(tc.html))
			tokenizer.AllowCDATA(true)

			// An empty golden string means no tokens at all; splitting it
			// would instead yield one empty element, so leave the list nil.
			var expected []string
			if tc.golden != "" {
				expected = strings.Split(tc.golden, "$")
			}
			for idx, want := range expected {
				if tokenizer.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tc.desc, idx, want, tokenizer.Err())
					return
				}
				if got := tokenizer.Token().String(); got != want {
					t.Errorf("%s token %d: want %q got %q", tc.desc, idx, want, got)
					return
				}
			}

			// Once the golden tokens are consumed, the next step must hit EOF.
			tokenizer.Next()
			if err := tokenizer.Err(); err != io.EOF {
				t.Errorf("%s: want EOF got %q", tc.desc, err)
			}
		})
	}
}
// TestMaxBuffer checks that a token longer than the configured maximum buffer
// size yields ErrorToken with ErrBufferExceeded, and that Raw exposes what was
// buffered before the overflow.
func TestMaxBuffer(t *testing.T) {
	input := "<" + strings.Repeat("t", 10)
	tokenizer := NewTokenizer(strings.NewReader(input))
	tokenizer.SetMaxBuf(5)

	if got, want := tokenizer.Next(), ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := tokenizer.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
	if got, want := string(tokenizer.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}
func TestMaxBufferReconstruction(t *testing.T) {
// Exceeding the maximum buffer size at any point while tokenizing permits
// reconstructing the original input.
tests:
for _, test := range tokenTests {
for maxBuf := 1; ; maxBuf++ {
r := strings.NewReader(test.html)
z := NewTokenizer(r)
z.SetMaxBuf(maxBuf)
var tokenized bytes.Buffer
for {
tt := z.Next()
tokenized.Write(z.Raw())
if tt == ErrorToken {
if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
t.Errorf("%s: unexpected error: %v", test.desc, err)
}
break
}
}
// Anything tokenized along with untokenized input or data left in the reader.
assembled, err := io.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
if err != nil {
t.Errorf("%s: ReadAll: %v", test.desc, err)
continue tests
}
if got, want := string(assembled), test.html; got != want {
t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
continue tests
}
// EOF indicates that we completed tokenization and hence found the max
// maxBuf that generates ErrBufferExceeded, so continue to the next test.
if z.Err() == io.EOF {
break
}
} // buffer sizes
} // tests
}
// TestPassthrough checks that concatenating Raw for every token, including the
// final ErrorToken, reproduces the original input exactly.
func TestPassthrough(t *testing.T) {
	for _, tc := range tokenTests {
		z := NewTokenizer(strings.NewReader(tc.html))
		var raw bytes.Buffer
		for done := false; !done; {
			// Record the raw bytes even for the terminating ErrorToken.
			done = z.Next() == ErrorToken
			raw.Write(z.Raw())
		}
		if got, want := raw.String(), tc.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", tc.desc, got, want)
		}
	}
}
// TestBufAPI exercises the []byte-returning accessors (Text and TagName) by
// collecting the text that appears inside at least one open <a> element.
func TestBufAPI(t *testing.T) {
	const (
		input = "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
		want  = "14567"
	)
	z := NewTokenizer(bytes.NewBufferString(input))
	var collected bytes.Buffer
	nesting := 0 // number of currently open <a> elements

	for done := false; !done; {
		switch z.Next() {
		case ErrorToken:
			if err := z.Err(); err != io.EOF {
				t.Error(err)
			}
			done = true
		case TextToken:
			if nesting > 0 {
				collected.Write(z.Text())
			}
		case StartTagToken:
			if name, _ := z.TagName(); string(name) == "a" {
				nesting++
			}
		case EndTagToken:
			if name, _ := z.TagName(); string(name) == "a" {
				nesting--
			}
		}
	}

	if got := collected.String(); got != want {
		t.Errorf("TestBufAPI: want %q got %q", want, got)
	}
}
// TestConvertNewlines checks that convertNewlines rewrites "\r\n" and lone "\r"
// to "\n" for a range of line-ending combinations.
func TestConvertNewlines(t *testing.T) {
	// Maps each input to the output expected from convertNewlines.
	expectations := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":                      "",
		"\n":                    "\n",
		"\n\r":                  "\n\n",
		"\r":                    "\n",
		"\r\n":                  "\n",
		"\r\n\n":                "\n\n",
		"\r\n\r":                "\n\n",
		"\r\n\r\n":              "\n\n",
		"\r\r":                  "\n\n",
		"\r\r\n":                "\n\n",
		"\r\r\n\n":              "\n\n\n",
		"\r\r\r\n":              "\n\n\n",
		"\r \n":                 "\n \n",
		"xyz":                   "xyz",
	}
	for input, want := range expectations {
		got := string(convertNewlines([]byte(input)))
		if got == want {
			continue
		}
		t.Errorf("input %q: got %q, want %q", input, got, want)
	}
}
func TestReaderEdgeCases(t *testing.T) {
const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
testCases := []io.Reader{
&zeroOneByteReader{s: s},
&eofStringsReader{s: s},
&stuckReader{},
}
for i, tc := range testCases {
got := []TokenType{}
z := NewTokenizer(tc)
for {
tt := z.Next()
if tt == ErrorToken {
break
}
got = append(got, tt)
}
if err := z.Err(); err != nil && err != io.EOF {
if err != io.ErrNoProgress {
t.Errorf("i=%d: %v", i, err)
}
continue
}
want := []TokenType{
StartTagToken,
TextToken,
EndTagToken,
}
if !reflect.DeepEqual(got, want) {
t.Errorf("i=%d: got %v, want %v", i, got, want)
continue
}
}
}
// TestSelfClosingTagValueConfusion checks that `<p a=/>` is reported as a
// start tag, not a self-closing tag: the "/" belongs to the attribute value.
func TestSelfClosingTagValueConfusion(t *testing.T) {
	z := NewTokenizer(strings.NewReader(`<p a=/>`))
	if got, want := z.Next(), StartTagToken; got != want {
		t.Fatalf("unexpected token type: got %s, want %s", got, want)
	}
}
// zeroOneByteReader behaves like a strings.Reader that hands out its data one
// byte at a time, with a (0, nil) result before every byte.
type zeroOneByteReader struct {
	s string // data not yet returned
	n int    // count of Read calls that had both buffer space and data left
}

// Read returns (0, nil) for an empty p and (0, io.EOF) once the data is used
// up. Otherwise calls alternate between returning nothing and copying a single
// byte into p, starting with nothing.
func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	switch {
	case len(p) == 0:
		return 0, nil
	case r.s == "":
		return 0, io.EOF
	}
	r.n++
	if r.n%2 == 0 {
		// Even-numbered call: deliver exactly one byte.
		p[0] = r.s[0]
		r.s = r.s[1:]
		return 1, nil
	}
	return 0, nil
}
// eofStringsReader behaves like a strings.Reader, except that it reports
// io.EOF on the same call that returns the final bytes, so callers can see
// n > 0 together with a non-nil error.
type eofStringsReader struct {
	s string // data not yet returned
}

// Read copies as much of the remaining data as fits into p and returns io.EOF
// as soon as nothing is left, including on the call that drains the data.
func (r *eofStringsReader) Read(p []byte) (int, error) {
	copied := copy(p, r.s)
	r.s = r.s[copied:]
	if len(r.s) == 0 {
		return copied, io.EOF
	}
	return copied, nil
}
// stuckReader is an io.Reader that never makes progress: every Read reports
// zero bytes and a nil error.
type stuckReader struct{}

// Read ignores its buffer and always returns (0, nil).
func (*stuckReader) Read(_ []byte) (int, error) {
	return 0, nil
}
// Levels of the tokenizer API that benchmarkTokenizer can exercise.
const (
// rawLevel: only z.Raw is called for each token.
rawLevel = iota
// lowLevel: z.Text, z.TagName and z.TagAttr are called, depending on the
// token type.
lowLevel
// highLevel: z.Token is called for each token.
highLevel
)
// benchmarkTokenizer tokenizes testdata/go1.html b.N times. For every token it
// touches the API selected by level (rawLevel, lowLevel or highLevel). Any
// tokenizer error other than io.EOF aborts the benchmark.
func benchmarkTokenizer(b *testing.B, level int) {
	page, err := os.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(page)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()

	for iter := 0; iter < b.N; iter++ {
		z := NewTokenizer(bytes.NewBuffer(page))
		for tt := z.Next(); tt != ErrorToken; tt = z.Next() {
			if level == rawLevel {
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			} else if level == highLevel {
				// Calling z.Token converts []byte values to strings whose validity
				// extend beyond the next call to z.Next.
				z.Token()
			} else if level == lowLevel {
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case EndTagToken:
					z.TagName()
				case StartTagToken, SelfClosingTagToken:
					for _, more := z.TagName(); more; {
						_, _, more = z.TagAttr()
					}
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				}
			}
		}
		// The token loop stops at the first ErrorToken; only EOF is acceptable.
		if err := z.Err(); err != nil && err != io.EOF {
			b.Fatalf("tokenizer error: %v", err)
		}
	}
}
// BenchmarkRawLevelTokenizer benchmarks tokenization while only calling z.Raw.
func BenchmarkRawLevelTokenizer(b *testing.B) {
	benchmarkTokenizer(b, rawLevel)
}
// BenchmarkLowLevelTokenizer benchmarks tokenization using the []byte-returning
// accessors z.Text, z.TagName and z.TagAttr.
func BenchmarkLowLevelTokenizer(b *testing.B) {
	benchmarkTokenizer(b, lowLevel)
}
// BenchmarkHighLevelTokenizer benchmarks tokenization while building a Token
// value with z.Token for every token.
func BenchmarkHighLevelTokenizer(b *testing.B) {
	benchmarkTokenizer(b, highLevel)
}
// h5libTest is a single tokenizer test case decoded from an html5lib-tests
// JSON file. It is populated by its UnmarshalJSON method, which converts the
// JSON token arrays into Token values.
type h5libTest struct {
// Description names the case; TestHTML5LibTests uses it in the subtest name.
Description string
// Input is the HTML to tokenize, already unescaped if the case was marked
// as double-escaped.
Input string
// InitialStates is copied from the JSON as is; TestHTML5LibTests skips
// cases where it is non-empty.
InitialStates []string
// Output holds the expected tokens.
Output []Token
// Errors is copied from the JSON as is and only printed in failure messages.
Errors []struct{ Code string }
}
// unicodeRegexp matches a literal backslash-u escape with exactly four hex
// digits, such as `\u00e9`.
var unicodeRegexp = regexp.MustCompile(`\\u([0-9a-fA-F]{4})`)

// UTF-8 encoding constants used by unescapeUnicode, mirroring the unexported
// constants of the same names in unicode/utf8.
const (
	tx = 0b10000000 // continuation byte marker
	t2 = 0b11000000 // lead byte of a two-byte sequence
	t3 = 0b11100000 // lead byte of a three-byte sequence
	t4 = 0b11110000 // lead byte of a four-byte sequence

	maskx = 0b00111111 // payload bits of a continuation byte

	rune1Max = 1<<7 - 1  // largest value encoded in one byte
	rune2Max = 1<<11 - 1 // largest value encoded in two bytes
	rune3Max = 1<<16 - 1 // largest value encoded in three bytes

	surrogateMin = 0xD800
	surrogateMax = 0xDFFF
)

// unescapeUnicode replaces every `\uXXXX` escape in s with the UTF-8 style
// encoding of the value XXXX and leaves all other text untouched.
//
// The encoding is a loose copy of unicode/utf8.AppendRune that deliberately
// skips validity checks, so values in the surrogate range are encoded as
// three raw bytes instead of being replaced by U+FFFD. Some of the test
// characters rely on this.
func unescapeUnicode(s string) string {
	expand := func(escape string) string {
		// escape looks like `\u0000`; drop the two-byte `\u` prefix and parse
		// the four hex digits that remain.
		value, err := strconv.ParseInt(escape[2:], 16, 32)
		if err != nil {
			panic(err)
		}
		var encoded []byte
		switch u := uint32(value); {
		case u <= rune1Max:
			encoded = append(encoded, byte(value))
		case u <= rune2Max:
			encoded = append(encoded,
				t2|byte(value>>6),
				tx|byte(value)&maskx)
		case u <= rune3Max:
			encoded = append(encoded,
				t3|byte(value>>12),
				tx|byte(value>>6)&maskx,
				tx|byte(value)&maskx)
		case u > rune3Max: // && u <= MaxRune:
			encoded = append(encoded,
				t4|byte(value>>18),
				tx|byte(value>>12)&maskx,
				tx|byte(value>>6)&maskx,
				tx|byte(value)&maskx)
		default:
			panic(fmt.Sprintf("unsupported rune %x", value))
		}
		return string(encoded)
	}
	return unicodeRegexp.ReplaceAllStringFunc(s, expand)
}
// UnmarshalJSON decodes one html5lib tokenizer test case. Each entry of the
// JSON "output" array is itself an array whose first element names the token
// kind ("DOCTYPE", "StartTag", "EndTag", "Comment" or "Character"), followed by
// kind-specific fields. When the case is flagged as double-escaped, `\uXXXX`
// sequences in the input and in the token data are expanded first.
//
// An unknown token kind is reported as an error. Malformed entries (missing
// fields or fields of an unexpected JSON type) panic via failed type
// assertions or out-of-range indexing.
func (t *h5libTest) UnmarshalJSON(data []byte) error {
	var raw struct {
		Description   string
		Input         string
		DoubleEscaped bool
		InitialStates []string
		Output        [][]any
		Errors        []struct{ Code string }
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	input := raw.Input
	if raw.DoubleEscaped {
		input = unescapeUnicode(input)
	}
	*t = h5libTest{
		Description:   raw.Description,
		Input:         input,
		InitialStates: raw.InitialStates,
		Errors:        raw.Errors,
	}

	for _, fields := range raw.Output {
		kind := fields[0].(string)
		var tok Token
		// isTag marks start and end tags, the only kinds that carry attributes.
		isTag := false
		switch kind {
		case "DOCTYPE":
			tok.Type = DoctypeToken
			// The doctype name may be JSON null.
			if fields[1] != nil {
				tok.Data = fields[1].(string)
			}
			// TODO: public/system id, we don't really support this?
		case "StartTag":
			tok.Type = StartTagToken
			// An optional fourth element set to true marks a self-closing tag.
			if len(fields) == 4 && fields[3].(bool) {
				tok.Type = SelfClosingTagToken
			}
			tok.Data = fields[1].(string)
			isTag = true
		case "EndTag":
			tok.Type = EndTagToken
			tok.Data = fields[1].(string)
			isTag = true
		case "Comment":
			tok.Type = CommentToken
			tok.Data = fields[1].(string)
		case "Character":
			tok.Type = TextToken
			tok.Data = fields[1].(string)
		default:
			return fmt.Errorf("unknown token type %s", fields[0])
		}

		if raw.DoubleEscaped {
			tok.Data = unescapeUnicode(tok.Data)
		}
		// Look the atom up only after unescaping, so it matches the final name.
		if isTag || kind == "DOCTYPE" {
			tok.DataAtom = atom.Lookup([]byte(tok.Data))
		}
		if isTag && len(fields) > 2 {
			// Map iteration order is random; callers sort attributes before
			// comparing tokens.
			for key, val := range fields[2].(map[string]any) {
				tok.Attr = append(tok.Attr, Attribute{Key: key, Val: val.(string)})
			}
		}
		t.Output = append(t.Output, tok)
	}
	return nil
}
// TestHTML5LibTests runs the tokenizer against the html5lib-tests tokenizer
// suite found under testdata/html5lib-tests/tokenizer. Every case becomes a
// subtest named "<file>/<description>". Cases are skipped when they are listed
// in skipTests, request specific initial tokenizer states, or contain
// "<!DOCTYPE" in their input.
func TestHTML5LibTests(t *testing.T) {
	skipTests := map[string]bool{
		// We emit a comment token here, instead of no token. This is a specification
		// divergence that we may want to fix.
		"test1.test/Empty end tag":                           true,
		"test2.test/Empty end tag with following characters": true,
		"test2.test/Empty end tag with following tag":        true,
		"test2.test/Empty end tag with following comment":    true,
		"test2.test/Empty end tag with following end tag":    true,
		"test3.test/</>":                                     true,
		// Doctype-related cases that are also skipped as known failures.
		"test4.test/CR EOF after doctype name":           true,
		"test4.test/Doctype public case-sensitivity (1)": true,
		"test4.test/Doctype public case-sensitivity (2)": true,
		"test4.test/Doctype system case-sensitivity (1)": true,
		"test4.test/Doctype system case-sensitivity (2)": true,
	}

	testFiles, err := filepath.Glob("testdata/html5lib-tests/tokenizer/*.test")
	if err != nil {
		t.Fatal(err)
	}
	for _, testFile := range testFiles {
		data, err := os.ReadFile(testFile)
		if err != nil {
			t.Fatalf("%s: %v", testFile, err)
		}
		// Decode into a fresh value for every file. json.Unmarshal leaves a
		// struct field untouched when its key is absent, so with a shared
		// variable a file without a top-level "tests" key would silently
		// re-run the previous file's cases under this file's name.
		var tests struct {
			Tests []h5libTest
		}
		if err := json.Unmarshal(data, &tests); err != nil {
			t.Fatalf("%s: %v", testFile, err)
		}
		base := filepath.Base(testFile)
		for _, tc := range tests.Tests {
			name := fmt.Sprintf("%s/%s", base, tc.Description)
			t.Run(name, func(t *testing.T) {
				if skipTests[name] {
					t.Skip("skipping, known failure")
				}
				if len(tc.InitialStates) > 0 {
					t.Skip("Initial states not supported yet")
				}
				if strings.Contains(tc.Input, "<!DOCTYPE") {
					t.Skip("Skipping DOCTYPE")
				}
				z := NewTokenizer(strings.NewReader(tc.Input))
				var tokens []Token
				for {
					if z.Next() == ErrorToken {
						if z.Err() == io.EOF {
							break
						}
						t.Fatalf("Error: %v", z.Err())
					}
					tokens = append(tokens, z.Token())
				}
				// Attribute order is not significant (the expected attributes
				// come from a JSON object), so sort both sides before comparing.
				sortTokenAttributes(tokens)
				sortTokenAttributes(tc.Output)
				if !reflect.DeepEqual(tokens, tc.Output) {
					t.Errorf("\nInput: %s\nGot:\t%#v\nWant:\t%#v\nParse Errors: %s\n", tc.Input, tokens, tc.Output, tc.Errors)
				}
			})
		}
	}
}
// sortTokenAttributes sorts the attributes of every token in place, ordering
// them by Namespace, then Key, then Val, so that two token lists can be
// compared without regard to attribute order.
//
// The fields are compared one at a time rather than as one concatenated
// string. Concatenation makes distinct attributes such as {Key: "a", Val: "bc"}
// and {Key: "ab", Val: "c"} compare equal, which leaves their relative order
// dependent on the input order.
func sortTokenAttributes(tokens []Token) {
	for i := range tokens {
		slices.SortFunc(tokens[i].Attr, func(a, b Attribute) int {
			if c := strings.Compare(a.Namespace, b.Namespace); c != 0 {
				return c
			}
			if c := strings.Compare(a.Key, b.Key); c != 0 {
				return c
			}
			return strings.Compare(a.Val, b.Val)
		})
	}
}
// TestUnicodeAttributeCase checks that attribute-name deduplication only folds
// ASCII case. <div a="1" A="1"> collapses to a single attribute because a and A
// count as the same name, but ä and Ä are distinct names, so both attributes
// of <div ä="1" Ä="1"> must survive with their original spelling.
func TestUnicodeAttributeCase(t *testing.T) {
	const markup = `<div ä="1" Ä="1">`
	z := NewTokenizer(strings.NewReader(markup))
	if tt := z.Next(); tt != StartTagToken {
		t.Fatalf("expected StartTagToken, got %s", tt)
	}
	attrs := z.Token().Attr
	if n := len(attrs); n != 2 {
		t.Fatalf("expected 2 attributes, got %d", n)
	}
	if key := attrs[0].Key; key != "ä" {
		t.Errorf("expected attribute key to be 'ä', got %s", key)
	}
	if key := attrs[1].Key; key != "Ä" {
		t.Errorf("expected attribute key to be 'Ä', got %s", key)
	}
}