html: avoid using raw text mode if there are raw tags to be ignored in select IM

This follows up on https://golang.org/cl/264977

Change-Id: I5d0e2f39173a8bbd07ca53de4df2a7e8772d4197
Reviewed-on: https://go-review.googlesource.com/c/net/+/265960
Trust: Kunpei Sakai <namusyaka@gmail.com>
Trust: Nigel Tao <nigeltao@golang.org>
Run-TryBot: Kunpei Sakai <namusyaka@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>

commit: 8adf50f3fefd6f2b13b632624b9f5eb13ce6c001 [log] [tgz]
author: Kunpei Sakai <namusyaka@gmail.com> Thu Oct 29 13:07:43 2020 +0900
committer: Nigel Tao <nigeltao@golang.org> Thu Oct 29 05:33:32 2020 +0000
tree: 32eeb9bf29d9af82dce8ddb0ce5e548d891df6df
parent: e0495509cf55f38c29ab7df4182e0a06de64c055 [diff]
diff --git a/html/parse.go b/html/parse.go
index 2cd12fc..392327a 100644
--- a/html/parse.go
+++ b/html/parse.go

@@ -1790,6 +1790,13 @@
 			return true
 		case a.Script, a.Template:
 			return inHeadIM(p)
+		case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:
+			// Don't let the tokenizer go into raw text mode for a raw tag that
+			// is ignored here; otherwise the input following the ignored tag
+			// would be tokenized as raw text.
+			p.tokenizer.NextIsNotRawText()
+			// Ignore the token.
+			return true
 		}
 	case EndTagToken:
 		switch p.tok.DataAtom {

diff --git a/html/parse_test.go b/html/parse_test.go
index f1eba07..333dd59 100644
--- a/html/parse_test.go
+++ b/html/parse_test.go

@@ -289,10 +289,6 @@
 // text is the HTML to be parsed, want is a dump of the correct parse tree,
 // and context is the name of the context node, if any.
 func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
-	if parserTestBlacklist[text] {
-		return nil
-	}
-
 	defer func() {
 		if x := recover(); x != nil {
 			switch e := x.(type) {
@@ -370,18 +366,6 @@
 	return nil
 }
 
-// Some tests of html5lib-tests are beyond the scope of the parsing algorithm
-// and are out of scope for the go's parser. The items listed here are limited
-// to testing for behavior outside the whatwg parsing algorithm.
-var parserTestBlacklist = map[string]bool{
-	// Even if there is a <plaintext> tag inside a <select> tag, the tokenizer
-	// should not go into the PLAINTEXT state, but it is not mentioned in the
-	// parsing algorithm.
-	// See: https://github.com/whatwg/html/issues/2252
-	`<!doctype html><select><plaintext></plaintext>X`:      true,
-	`<!doctype html><table><select><plaintext>a<caption>b`: true,
-}
-
 // Some test input result in parse trees are not 'well-formed' despite
 // following the HTML5 recovery algorithms. Rendering and re-parsing such a
 // tree will not result in an exact clone of that tree. We blacklist such

diff --git a/html/testdata/go/raw_tags_to_be_ignored.dat b/html/testdata/go/raw_tags_to_be_ignored.dat
new file mode 100644
index 0000000..50bac59
--- /dev/null
+++ b/html/testdata/go/raw_tags_to_be_ignored.dat

@@ -0,0 +1,97 @@
+#data
+<!doctype html><table><select><iframe>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+|     <select>
+|       "a"
+|     <table>
+|       <caption>
+|         "b"
+
+#data
+<!doctype html><table><select><noembed>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+|     <select>
+|       "a"
+|     <table>
+|       <caption>
+|         "b"
+
+#data
+<!doctype html><table><select><noframes>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+|     <select>
+|       "a"
+|     <table>
+|       <caption>
+|         "b"
+
+#data
+<!doctype html><table><select><noscript>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+|     <select>
+|       "a"
+|     <table>
+|       <caption>
+|         "b"
+
+#data
+<!doctype html><table><select><style>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+|     <select>
+|       "a"
+|     <table>
+|       <caption>
+|         "b"
+
+#data
+<!doctype html><table><select><title>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+|     <select>
+|       "a"
+|     <table>
+|       <caption>
+|         "b"
+
+#data
+<!doctype html><table><select><xmp>a<caption>b
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+|     <select>
+|       "a"
+|     <table>
+|       <caption>
+|         "b"
commit	8adf50f3fefd6f2b13b632624b9f5eb13ce6c001	[log] [tgz]
author	Kunpei Sakai <namusyaka@gmail.com>	Thu Oct 29 13:07:43 2020 +0900
committer	Nigel Tao <nigeltao@golang.org>	Thu Oct 29 05:33:32 2020 +0000
tree	32eeb9bf29d9af82dce8ddb0ce5e548d891df6df
parent	e0495509cf55f38c29ab7df4182e0a06de64c055 [diff]