html: avoid using raw text mode if there are raw tags to be ignored in the "in select" insertion mode
This follows up on https://golang.org/cl/264977
Change-Id: I5d0e2f39173a8bbd07ca53de4df2a7e8772d4197
Reviewed-on: https://go-review.googlesource.com/c/net/+/265960
Trust: Kunpei Sakai <namusyaka@gmail.com>
Trust: Nigel Tao <nigeltao@golang.org>
Run-TryBot: Kunpei Sakai <namusyaka@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/html/parse.go b/html/parse.go
index 2cd12fc..392327a 100644
--- a/html/parse.go
+++ b/html/parse.go
@@ -1790,6 +1790,13 @@
return true
case a.Script, a.Template:
return inHeadIM(p)
+ case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:
+ // Don't let the tokenizer go into raw text mode when there are raw tags
+ // to be ignored. The tokenizer must treat these tags as ordinary tags so
+ // that the content following them is still tokenized normally.
+ p.tokenizer.NextIsNotRawText()
+ // Ignore the token.
+ return true
}
case EndTagToken:
switch p.tok.DataAtom {
diff --git a/html/parse_test.go b/html/parse_test.go
index f1eba07..333dd59 100644
--- a/html/parse_test.go
+++ b/html/parse_test.go
@@ -289,10 +289,6 @@
// text is the HTML to be parsed, want is a dump of the correct parse tree,
// and context is the name of the context node, if any.
func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
- if parserTestBlacklist[text] {
- return nil
- }
-
defer func() {
if x := recover(); x != nil {
switch e := x.(type) {
@@ -370,18 +366,6 @@
return nil
}
-// Some tests of html5lib-tests are beyond the scope of the parsing algorithm
-// and are out of scope for the go's parser. The items listed here are limited
-// to testing for behavior outside the whatwg parsing algorithm.
-var parserTestBlacklist = map[string]bool{
- // Even if there is a <plaintext> tag inside a <select> tag, the tokenizer
- // should not go into the PLAINTEXT state, but it is not mentioned in the
- // parsing algorithm.
- // See: https://github.com/whatwg/html/issues/2252
- `<!doctype html><select><plaintext></plaintext>X`: true,
- `<!doctype html><table><select><plaintext>a<caption>b`: true,
-}
-
// Some test input result in parse trees are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree. We blacklist such
diff --git a/html/testdata/go/raw_tags_to_be_ignored.dat b/html/testdata/go/raw_tags_to_be_ignored.dat
new file mode 100644
index 0000000..50bac59
--- /dev/null
+++ b/html/testdata/go/raw_tags_to_be_ignored.dat
@@ -0,0 +1,97 @@
+#data
+<!doctype html><table><select><iframe>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+| <head>
+| <body>
+| <select>
+| "a"
+| <table>
+| <caption>
+| "b"
+
+#data
+<!doctype html><table><select><noembed>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+| <head>
+| <body>
+| <select>
+| "a"
+| <table>
+| <caption>
+| "b"
+
+#data
+<!doctype html><table><select><noframes>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+| <head>
+| <body>
+| <select>
+| "a"
+| <table>
+| <caption>
+| "b"
+
+#data
+<!doctype html><table><select><noscript>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+| <head>
+| <body>
+| <select>
+| "a"
+| <table>
+| <caption>
+| "b"
+
+#data
+<!doctype html><table><select><style>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+| <head>
+| <body>
+| <select>
+| "a"
+| <table>
+| <caption>
+| "b"
+
+#data
+<!doctype html><table><select><title>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+| <head>
+| <body>
+| <select>
+| "a"
+| <table>
+| <caption>
+| "b"
+
+#data
+<!doctype html><table><select><xmp>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+| <head>
+| <body>
+| <select>
+| "a"
+| <table>
+| <caption>
+| "b"