// src/pkg/html/token_test.go - go - Git at Google

 // Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package html

 import (
 	"bytes"
 	"os"
 	"strings"
 	"testing"
 )

 // tokenTest is a single tokenizer test case. Its fields, in positional order:
 //   desc   - a short description of the test case;
 //   html   - the HTML to parse;
 //   golden - the string representations of the expected tokens, joined by '$'.
 type tokenTest struct {
 	desc, html, golden string
 }

 // tokenTests is the table of tokenizer cases. Each golden string lists the
 // expected token strings joined by '$'.
 var tokenTests = []tokenTest{
 	// A single text node. The tokenizer should not break text nodes on whitespace,
 	// nor should it normalize whitespace within a text node.
 	{desc: "text", html: "foo  bar", golden: "foo  bar"},
 	// An entity.
 	{desc: "entity", html: "one &lt; two", golden: "one &lt; two"},
 	// A start, self-closing and end tag. The tokenizer does not care if the start
 	// and end tokens don't match; that is the job of the parser.
 	{desc: "tags", html: "<a>b<c/>d</e>", golden: "<a>$b$<c/>$d$</e>"},
 	// Some malformed tags that are missing a '>'.
 	{desc: "malformed tag #0", html: `<p</p>`, golden: `<p< p="">`},
 	{desc: "malformed tag #1", html: `<p </p>`, golden: `<p <="" p="">`},
 	{desc: "malformed tag #2", html: `<p id=0</p>`, golden: `<p id="0&lt;/p">`},
 	{desc: "malformed tag #3", html: `<p id="0</p>`, golden: `<p id="0&lt;/p&gt;">`},
 	{desc: "malformed tag #4", html: `<p id="0"</p>`, golden: `<p id="0" <="" p="">`},
 	// Comments.
 	{desc: "comment0", html: "abc<b><!-- skipme --></b>def", golden: "abc$<b>$</b>$def"},
 	{desc: "comment1", html: "a<!-->z", golden: "a$z"},
 	{desc: "comment2", html: "a<!--->z", golden: "a$z"},
 	{desc: "comment3", html: "a<!--x>-->z", golden: "a$z"},
 	{desc: "comment4", html: "a<!--x->-->z", golden: "a$z"},
 	{desc: "comment5", html: "a<!>z", golden: "a$&lt;!&gt;z"},
 	{desc: "comment6", html: "a<!->z", golden: "a$&lt;!-&gt;z"},
 	{desc: "comment7", html: "a<!---<>z", golden: "a$&lt;!---&lt;&gt;z"},
 	{desc: "comment8", html: "a<!--z", golden: "a$&lt;!--z"},
 	// An attribute with a backslash.
 	{desc: "backslash", html: `<p id="a\"b">`, golden: `<p id="a&quot;b">`},
 	// Entities, tag name and attribute key lower-casing, and whitespace
 	// normalization within a tag.
 	{
 		desc:   "tricky",
 		html:   "<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
 		golden: `<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
 	},
 	// A nonexistent entity. Tokenizing and converting back to a string should
 	// escape the "&" to become "&amp;".
 	{
 		desc:   "noSuchEntity",
 		html:   `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
 		golden: `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
 	},
 	{
 		desc:   "entity without semicolon",
 		html:   `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
 		golden: `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
 	},
 	{desc: "entity with digits", html: "&frac12;", golden: "½"},
 	// Attribute tests:
 	// http://dev.w3.org/html5/spec/Overview.html#attributes-0
 	{desc: "Empty attribute", html: `<input disabled FOO>`, golden: `<input disabled="" foo="">`},
 	{desc: "Empty attribute, whitespace", html: `<input disabled FOO >`, golden: `<input disabled="" foo="">`},
 	{desc: "Unquoted attribute value", html: `<input value=yes FOO=BAR>`, golden: `<input value="yes" foo="BAR">`},
 	{desc: "Unquoted attribute value, spaces", html: `<input value = yes FOO = BAR>`, golden: `<input value="yes" foo="BAR">`},
 	{desc: "Unquoted attribute value, trailing space", html: `<input value=yes FOO=BAR >`, golden: `<input value="yes" foo="BAR">`},
 	{desc: "Single-quoted attribute value", html: `<input value='yes' FOO='BAR'>`, golden: `<input value="yes" foo="BAR">`},
 	{desc: "Single-quoted attribute value, trailing space", html: `<input value='yes' FOO='BAR' >`, golden: `<input value="yes" foo="BAR">`},
 	{
 		desc:   "Double-quoted attribute value",
 		html:   `<input value="I'm an attribute" FOO="BAR">`,
 		golden: `<input value="I&apos;m an attribute" foo="BAR">`,
 	},
 	{desc: "Attribute name characters", html: `<meta http-equiv="content-type">`, golden: `<meta http-equiv="content-type">`},
 }

 // TestTokenizer runs each tokenTests entry through a Tokenizer and compares
 // the string form of every token against the matching piece of the
 // '$'-separated golden string. A case is abandoned at its first error or
 // mismatch. After all expected tokens match, one further Next call must
 // leave the tokenizer reporting os.EOF.
 func TestTokenizer(t *testing.T) {
 nextCase:
 	for _, tc := range tokenTests {
 		tok := NewTokenizer(bytes.NewBufferString(tc.html))
 		want := strings.Split(tc.golden, "$")
 		for idx := 0; idx < len(want); idx++ {
 			if tok.Next() == ErrorToken {
 				t.Errorf("%s token %d: want %q got error %v", tc.desc, idx, want[idx], tok.Error())
 				continue nextCase
 			}
 			if got := tok.Token().String(); got != want[idx] {
 				t.Errorf("%s token %d: want %q got %q", tc.desc, idx, want[idx], got)
 				continue nextCase
 			}
 		}
 		// Advance once more: nothing but EOF should remain.
 		if tok.Next(); tok.Error() != os.EOF {
 			t.Errorf("%s: want EOF got %q", tc.desc, tok.Token().String())
 		}
 	}
 }

 // unescapeTest is a single unescaping test case. Its fields, in positional
 // order:
 //   desc      - a short description of the test case;
 //   html      - the HTML text;
 //   unescaped - the unescaped text.
 type unescapeTest struct {
 	desc, html, unescaped string
 }

 // unescapeTests is the table of unescaping cases: each entry pairs an HTML
 // input with the text expected after unescaping.
 var unescapeTests = []unescapeTest{
 	// Handle no entities.
 	{desc: "copy", html: "A\ttext\nstring", unescaped: "A\ttext\nstring"},
 	// Handle simple named entities.
 	{desc: "simple", html: "&amp; &gt; &lt;", unescaped: "& > <"},
 	// Handle hitting the end of the string.
 	{desc: "stringEnd", html: "&amp &amp", unescaped: "& &"},
 	// Handle entities with two codepoints.
 	{desc: "multiCodepoint", html: "text &gesl; blah", unescaped: "text \u22db\ufe00 blah"},
 	// Handle decimal numeric entities.
 	{desc: "decimalEntity", html: "Delta = &#916; ", unescaped: "Delta = Δ "},
 	// Handle hexadecimal numeric entities.
 	{desc: "hexadecimalEntity", html: "Lambda = &#x3bb; = &#X3Bb ", unescaped: "Lambda = λ = λ "},
 	// Handle numeric early termination.
 	{
 		desc:      "numericEnds",
 		html:      "&# &#x &#128;43 &copy = &#169f = &#xa9",
 		unescaped: "&# &#x €43 © = ©f = ©",
 	},
 	// Handle numeric ISO-8859-1 entity replacements.
 	{desc: "numericReplacements", html: "Footnote&#x87;", unescaped: "Footnote‡"},
 }

 // TestUnescape passes the html field of every unescapeTests entry to
 // UnescapeString and reports each case whose result differs from the
 // expected unescaped text.
 func TestUnescape(t *testing.T) {
 	for i := range unescapeTests {
 		tc := &unescapeTests[i]
 		got := UnescapeString(tc.html)
 		if got == tc.unescaped {
 			continue
 		}
 		t.Errorf("TestUnescape %s: want %q, got %q", tc.desc, tc.unescaped, got)
 	}
 }

 // TestUnescapeEscape checks that escaping a string and then unescaping the
 // result gives back the original string. The inputs include strings that
 // already contain entity text.
 func TestUnescapeEscape(t *testing.T) {
 	inputs := []string{
 		``,
 		`abc def`,
 		`a & b`,
 		`a&amp;b`,
 		`a &amp b`,
 		`&quot;`,
 		`"`,
 		`"<&>"`,
 		`&quot;&lt;&amp;&gt;&quot;`,
 		`3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
 	}
 	for i := 0; i < len(inputs); i++ {
 		in := inputs[i]
 		roundTrip := UnescapeString(EscapeString(in))
 		if roundTrip == in {
 			continue
 		}
 		t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", in)
 	}
 }

 // TestBufAPI drives the tokenizer through Next, Text and TagName rather than
 // Token. It tracks the nesting depth of <a> start/end tags and collects the
 // bytes of every text token seen while that depth is positive. The collected
 // text must equal "14567", and tokenizing must end with os.EOF.
 func TestBufAPI(t *testing.T) {
 	const input = "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
 	const want = "14567"
 	z := NewTokenizer(bytes.NewBufferString(input))
 	var collected bytes.Buffer
 	nesting := 0
 	for kind := z.Next(); kind != ErrorToken; kind = z.Next() {
 		if kind == TextToken {
 			if nesting > 0 {
 				collected.Write(z.Text())
 			}
 			continue
 		}
 		if kind != StartTagToken && kind != EndTagToken {
 			continue
 		}
 		name, _ := z.TagName()
 		if len(name) != 1 || name[0] != 'a' {
 			continue
 		}
 		if kind == StartTagToken {
 			nesting++
 		} else {
 			nesting--
 		}
 	}
 	// The loop stops on ErrorToken; anything other than EOF is a failure.
 	if err := z.Error(); err != os.EOF {
 		t.Error(err)
 	}
 	if got := collected.String(); got != want {
 		t.Errorf("TestBufAPI: want %q got %q", want, got)
 	}
 }
	// Copyright 2010 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package html

	import (
	"bytes"
	"os"
	"strings"
	"testing"
	)

	// tokenTest is a single tokenizer test case. Its fields, in positional order:
	//   desc   - a short description of the test case;
	//   html   - the HTML to parse;
	//   golden - the string representations of the expected tokens, joined by '$'.
	type tokenTest struct {
		desc, html, golden string
	}

	var tokenTests = []tokenTest{
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
	"text",
	"foo bar",
	"foo bar",
	},
	// An entity.
	{
	"entity",
	"one < two",
	"one < two",
	},
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
	{
	"tags",
	"<a>b<c/>d</e>",
	"<a>$b$<c/>$d$</e>",
	},
	// Some malformed tags that are missing a '>'.
	{
	"malformed tag #0",
	`<p</p>`,
	`<p< p="">`,
	},
	{
	"malformed tag #1",
	`<p </p>`,
	`<p <="" p="">`,
	},
	{
	"malformed tag #2",
	`<p id=0</p>`,
	`<p id="0</p">`,
	},
	{
	"malformed tag #3",
	`<p id="0</p>`,
	`<p id="0</p>">`,
	},
	{
	"malformed tag #4",
	`<p id="0"</p>`,
	`<p id="0" <="" p="">`,
	},
	// Comments.
	{
	"comment0",
	"abc<b><!-- skipme --></b>def",
	"abc$<b>$</b>$def",
	},
	{
	"comment1",
	"a<!-->z",
	"a$z",
	},
	{
	"comment2",
	"a<!--->z",
	"a$z",
	},
	{
	"comment3",
	"a<!--x>-->z",
	"a$z",
	},
	{
	"comment4",
	"a<!--x->-->z",
	"a$z",
	},
	{
	"comment5",
	"a<!>z",
	"a$<!>z",
	},
	{
	"comment6",
	"a<!->z",
	"a$<!->z",
	},
	{
	"comment7",
	"a<!---<>z",
	"a$<!---<>z",
	},
	{
	"comment8",
	"a<!--z",
	"a$<!--z",
	},
	// An attribute with a backslash.
	{
	"backslash",
	`<p id="a\"b">`,
	`<p id="a"b">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
	"tricky",
	"<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>",
	`<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&".
	{
	"noSuchEntity",
	`<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
	`<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
	},
	{
	"entity without semicolon",
	`&notit;∉<a b="q=z&amp=5&notice=hello¬=world">`,
	`¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`,
	},
	{
	"entity with digits",
	"½",
	"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/spec/Overview.html#attributes-0
	{
	"Empty attribute",
	`<input disabled FOO>`,
	`<input disabled="" foo="">`,
	},
	{
	"Empty attribute, whitespace",
	`<input disabled FOO >`,
	`<input disabled="" foo="">`,
	},
	{
	"Unquoted attribute value",
	`<input value=yes FOO=BAR>`,
	`<input value="yes" foo="BAR">`,
	},
	{
	"Unquoted attribute value, spaces",
	`<input value = yes FOO = BAR>`,
	`<input value="yes" foo="BAR">`,
	},
	{
	"Unquoted attribute value, trailing space",
	`<input value=yes FOO=BAR >`,
	`<input value="yes" foo="BAR">`,
	},
	{
	"Single-quoted attribute value",
	`<input value='yes' FOO='BAR'>`,
	`<input value="yes" foo="BAR">`,
	},
	{
	"Single-quoted attribute value, trailing space",
	`<input value='yes' FOO='BAR' >`,
	`<input value="yes" foo="BAR">`,
	},
	{
	"Double-quoted attribute value",
	`<input value="I'm an attribute" FOO="BAR">`,
	`<input value="I'm an attribute" foo="BAR">`,
	},
	{
	"Attribute name characters",
	`<meta http-equiv="content-type">`,
	`<meta http-equiv="content-type">`,
	},
	}

	// TestTokenizer runs each tokenTests entry through a Tokenizer and compares
	// the string form of every token against the matching piece of the
	// '$'-separated golden string. A case is abandoned at its first error or
	// mismatch. After all expected tokens match, one further Next call must
	// leave the tokenizer reporting os.EOF.
	func TestTokenizer(t *testing.T) {
	nextCase:
		for _, tc := range tokenTests {
			tok := NewTokenizer(bytes.NewBufferString(tc.html))
			want := strings.Split(tc.golden, "$")
			for idx := 0; idx < len(want); idx++ {
				if tok.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tc.desc, idx, want[idx], tok.Error())
					continue nextCase
				}
				if got := tok.Token().String(); got != want[idx] {
					t.Errorf("%s token %d: want %q got %q", tc.desc, idx, want[idx], got)
					continue nextCase
				}
			}
			// Advance once more: nothing but EOF should remain.
			if tok.Next(); tok.Error() != os.EOF {
				t.Errorf("%s: want EOF got %q", tc.desc, tok.Token().String())
			}
		}
	}

	// unescapeTest is a single unescaping test case. Its fields, in positional
	// order:
	//   desc      - a short description of the test case;
	//   html      - the HTML text;
	//   unescaped - the unescaped text.
	type unescapeTest struct {
		desc, html, unescaped string
	}

	// unescapeTests is the table of unescaping cases: each entry pairs an HTML
	// input with the text expected after unescaping.
	//
	// The html inputs must keep their entities in escaped form (&amp;, &#916;,
	// &#x87;, ...). An already-decoded input would leave nothing to unescape.
	var unescapeTests = []unescapeTest{
		// Handle no entities.
		{desc: "copy", html: "A\ttext\nstring", unescaped: "A\ttext\nstring"},
		// Handle simple named entities.
		{desc: "simple", html: "&amp; &gt; &lt;", unescaped: "& > <"},
		// Handle hitting the end of the string.
		{desc: "stringEnd", html: "&amp &amp", unescaped: "& &"},
		// Handle entities with two codepoints.
		{desc: "multiCodepoint", html: "text &gesl; blah", unescaped: "text \u22db\ufe00 blah"},
		// Handle decimal numeric entities.
		{desc: "decimalEntity", html: "Delta = &#916; ", unescaped: "Delta = Δ "},
		// Handle hexadecimal numeric entities.
		{desc: "hexadecimalEntity", html: "Lambda = &#x3bb; = &#X3Bb ", unescaped: "Lambda = λ = λ "},
		// Handle numeric early termination.
		{
			desc:      "numericEnds",
			html:      "&# &#x &#128;43 &copy = &#169f = &#xa9",
			unescaped: "&# &#x €43 © = ©f = ©",
		},
		// Handle numeric ISO-8859-1 entity replacements.
		{desc: "numericReplacements", html: "Footnote&#x87;", unescaped: "Footnote‡"},
	}

	// TestUnescape passes the html field of every unescapeTests entry to
	// UnescapeString and reports each case whose result differs from the
	// expected unescaped text.
	func TestUnescape(t *testing.T) {
		for i := range unescapeTests {
			tc := &unescapeTests[i]
			got := UnescapeString(tc.html)
			if got == tc.unescaped {
				continue
			}
			t.Errorf("TestUnescape %s: want %q, got %q", tc.desc, tc.unescaped, got)
		}
	}

	// TestUnescapeEscape checks that escaping a string and then unescaping the
	// result gives back the original string.
	//
	// Several inputs deliberately contain entity text (&amp;, &quot;, ...) so
	// that the round trip is also exercised on strings that look pre-escaped.
	// They must stay in escaped form; decoding them would only duplicate the
	// plain-text inputs.
	func TestUnescapeEscape(t *testing.T) {
		ss := []string{
			``,
			`abc def`,
			`a & b`,
			`a&amp;b`,
			`a &amp b`,
			`&quot;`,
			`"`,
			`"<&>"`,
			`&quot;&lt;&amp;&gt;&quot;`,
			`3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
		}
		for _, s := range ss {
			if s != UnescapeString(EscapeString(s)) {
				t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
			}
		}
	}

	// TestBufAPI drives the tokenizer through Next, Text and TagName rather than
	// Token. It tracks the nesting depth of <a> start/end tags and collects the
	// bytes of every text token seen while that depth is positive. The collected
	// text must equal "14567", and tokenizing must end with os.EOF.
	func TestBufAPI(t *testing.T) {
		const input = "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
		const want = "14567"
		z := NewTokenizer(bytes.NewBufferString(input))
		var collected bytes.Buffer
		nesting := 0
		for kind := z.Next(); kind != ErrorToken; kind = z.Next() {
			if kind == TextToken {
				if nesting > 0 {
					collected.Write(z.Text())
				}
				continue
			}
			if kind != StartTagToken && kind != EndTagToken {
				continue
			}
			name, _ := z.TagName()
			if len(name) != 1 || name[0] != 'a' {
				continue
			}
			if kind == StartTagToken {
				nesting++
			} else {
				nesting--
			}
		}
		// The loop stops on ErrorToken; anything other than EOF is a failure.
		if err := z.Error(); err != os.EOF {
			t.Error(err)
		}
		if got := collected.String(); got != want {
			t.Errorf("TestBufAPI: want %q got %q", want, got)
		}
	}