html: parse comments per HTML spec
Updates golang/go#58246
Change-Id: Iaba5ed65f5d244fd47372ef0c08fc4cdb5ed90f9
Reviewed-on: https://go-review.googlesource.com/c/net/+/466776
TryBot-Result: Gopher Robot <gobot@golang.org>
Auto-Submit: Nigel Tao <nigeltao@golang.org>
Reviewed-by: Damien Neil <dneil@google.com>
Run-TryBot: Nigel Tao <nigeltao@golang.org>
Reviewed-by: Nigel Tao (INACTIVE; USE @golang.org INSTEAD) <nigeltao@google.com>
diff --git a/html/comment_test.go b/html/comment_test.go
new file mode 100644
index 0000000..2c80bc7
--- /dev/null
+++ b/html/comment_test.go
@@ -0,0 +1,270 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+ "bytes"
+ "testing"
+)
+
+// TestComments exhaustively tests every 'interesting' N-byte string is
+// correctly parsed as a comment. N ranges from 4+1 to 4+suffixLen inclusive,
+// where 4 is the length of the "<!--" prefix that starts an HTML comment.
+//
+// 'Interesting' means that the N-4 byte suffix consists entirely of bytes
+// sampled from the interestingCommentBytes const string, below. These cover
+// all of the possible state transitions from comment-related parser states, as
+// listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state
+// and subsequent sections).
+//
+// The spec is written as an explicit state machine that, as a side effect,
+// accumulates "the comment token's data" to a separate buffer.
+// Tokenizer.readComment in this package does not have an explicit state
+// machine and usually returns the comment text as a sub-slice of the input,
+// between the opening '<' and closing '>' or EOF. This test confirms that the
+// two algorithms match.
+func TestComments(t *testing.T) {
+ const prefix = "<!--"
+ const suffixLen = 6
+ buffer := make([]byte, 0, len(prefix)+suffixLen)
+ testAllComments(t, append(buffer, prefix...))
+}
+
+// NUL isn't in this list, even though the HTML spec sections 13.2.5.43 -
+// 13.2.5.52 mentions it. It's not interesting in terms of state transitions.
+// It's equivalent to any other non-interesting byte (other than being replaced
+// by U+FFFD REPLACEMENT CHARACTER).
+//
+// EOF isn't in this list. The HTML spec treats EOF as "an input character" but
+// testOneComment below breaks the loop instead.
+//
+// 'x' represents all other "non-interesting" comment bytes.
+var interestingCommentBytes = [...]byte{
+ '!', '-', '<', '>', 'x',
+}
+
+// testAllComments recursively fills in buffer[len(buffer):cap(buffer)] with
+// interesting bytes and then tests that this package's tokenization matches
+// the HTML spec.
+//
+// Precondition: len(buffer) < cap(buffer)
+// Precondition: string(buffer[:4]) == "<!--"
+func testAllComments(t *testing.T, buffer []byte) {
+ for _, interesting := range interestingCommentBytes {
+ b := append(buffer, interesting)
+ testOneComment(t, b)
+ if len(b) < cap(b) {
+ testAllComments(t, b)
+ }
+ }
+}
+
+func testOneComment(t *testing.T, b []byte) {
+ z := NewTokenizer(bytes.NewReader(b))
+ if next := z.Next(); next != CommentToken {
+ t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken)
+ }
+ gotRemainder := string(b[len(z.Raw()):])
+ gotComment := string(z.Text())
+
+ i := len("<!--")
+ wantBuffer := []byte(nil)
+loop:
+ for state := 43; ; {
+ // Consume the next input character, handling EOF.
+ if i >= len(b) {
+ break
+ }
+ nextInputCharacter := b[i]
+ i++
+
+ switch state {
+ case 43: // 13.2.5.43 Comment start state.
+ switch nextInputCharacter {
+ case '-':
+ state = 44
+ case '>':
+ break loop
+ default:
+ i-- // Reconsume.
+ state = 45
+ }
+
+ case 44: // 13.2.5.44 Comment start dash state.
+ switch nextInputCharacter {
+ case '-':
+ state = 51
+ case '>':
+ break loop
+ default:
+ wantBuffer = append(wantBuffer, '-')
+ i-- // Reconsume.
+ state = 45
+ }
+
+ case 45: // 13.2.5.45 Comment state.
+ switch nextInputCharacter {
+ case '-':
+ state = 50
+ case '<':
+ wantBuffer = append(wantBuffer, '<')
+ state = 46
+ default:
+ wantBuffer = append(wantBuffer, nextInputCharacter)
+ }
+
+ case 46: // 13.2.5.46 Comment less-than sign state.
+ switch nextInputCharacter {
+ case '!':
+ wantBuffer = append(wantBuffer, '!')
+ state = 47
+ case '<':
+ wantBuffer = append(wantBuffer, '<')
+ state = 46
+ default:
+ i-- // Reconsume.
+ state = 45
+ }
+
+ case 47: // 13.2.5.47 Comment less-than sign bang state.
+ switch nextInputCharacter {
+ case '-':
+ state = 48
+ default:
+ i-- // Reconsume.
+ state = 45
+ }
+
+ case 48: // 13.2.5.48 Comment less-than sign bang dash state.
+ switch nextInputCharacter {
+ case '-':
+ state = 49
+ default:
+ i-- // Reconsume.
+ state = 50
+ }
+
+ case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
+ switch nextInputCharacter {
+ case '>':
+ break loop
+ default:
+ i-- // Reconsume.
+ state = 51
+ }
+
+ case 50: // 13.2.5.50 Comment end dash state.
+ switch nextInputCharacter {
+ case '-':
+ state = 51
+ default:
+ wantBuffer = append(wantBuffer, '-')
+ i-- // Reconsume.
+ state = 45
+ }
+
+ case 51: // 13.2.5.51 Comment end state.
+ switch nextInputCharacter {
+ case '!':
+ state = 52
+ case '-':
+ wantBuffer = append(wantBuffer, '-')
+ case '>':
+ break loop
+ default:
+ wantBuffer = append(wantBuffer, "--"...)
+ i-- // Reconsume.
+ state = 45
+ }
+
+ case 52: // 13.2.5.52 Comment end bang state.
+ switch nextInputCharacter {
+ case '-':
+ wantBuffer = append(wantBuffer, "--!"...)
+ state = 50
+ case '>':
+ break loop
+ default:
+ wantBuffer = append(wantBuffer, "--!"...)
+ i-- // Reconsume.
+ state = 45
+ }
+
+ default:
+ t.Fatalf("input=%q: unexpected state %d", b, state)
+ }
+ }
+
+ wantRemainder := ""
+ if i < len(b) {
+ wantRemainder = string(b[i:])
+ }
+ wantComment := string(wantBuffer)
+ if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
+ t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q",
+ b, gotComment, gotRemainder, wantComment, wantRemainder)
+ }
+}
+
+// This table below summarizes the HTML-comment-related state machine from
+// 13.2.5.43 "Comment start state" and subsequent sections.
+// https://html.spec.whatwg.org/#comment-start-state
+//
+// Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
+// initial 13.2.5.1 "Data state":
+// - "<" moves to 13.2.5.6 "Tag open state",
+// - "!" moves to 13.2.5.42 "Markup declaration open state",
+// - "--" moves to 13.2.5.43 "Comment start state".
+// Each of these transitions are the only way to get to the 6/42/43 states.
+//
+// State ! - < > NUL EOF default HTML spec section
+// 43 ... s44 ... s01.T.E0 ... ... r45 13.2.5.43 Comment start state
+// 44 ... s51 ... s01.T.E0 ... T.Z.E1 r45.A- 13.2.5.44 Comment start dash state
+// 45 ... s50 s46.A< ... t45.A?.E2 T.Z.E1 t45.Ax 13.2.5.45 Comment state
+// 46 s47.A! ... t46.A< ... ... ... r45 13.2.5.46 Comment less-than sign state
+// 47 ... s48 ... ... ... ... r45 13.2.5.47 Comment less-than sign bang state
+// 48 ... s49 ... ... ... ... r50 13.2.5.48 Comment less-than sign bang dash state
+// 49 ... ... ... s01.T ... T.Z.E1 r51.E3 13.2.5.49 Comment less-than sign bang dash dash state
+// 50 ... s51 ... ... ... T.Z.E1 r45.A- 13.2.5.50 Comment end dash state
+// 51 s52 t51.A- ... s01.T ... T.Z.E1 r45.A-- 13.2.5.51 Comment end state
+// 52 ... s50.A--! ... s01.T.E4 ... T.Z.E1 r45.A--! 13.2.5.52 Comment end bang state
+//
+// State 43 is the "Comment start state" meaning that we've only seen "<!--"
+// and nothing else. Similarly, state 44 means that we've only seen "<!---",
+// with three dashes, and nothing else. For the other states, we deduce
+// (working backwards) that the immediate prior input must be:
+// - 45 something that's not '-'
+// - 46 "<"
+// - 47 "<!"
+// - 48 "<!-"
+// - 49 "<!--" not including the opening "<!--"
+// - 50 "-" not including the opening "<!--" and also not "--"
+// - 51 "--" not including the opening "<!--"
+// - 52 "--!"
+//
+// The table cell actions:
+// - ... do the default action
+// - A! append "!" to the comment token's data.
+// - A- append "-" to the comment token's data.
+// - A-- append "--" to the comment token's data.
+// - A--! append "--!" to the comment token's data.
+// - A< append "<" to the comment token's data.
+// - A? append "\uFFFD" to the comment token's data.
+// - Ax append the current input character to the comment token's data.
+// - E0 parse error (abrupt-closing-of-empty-comment).
+// - E1 parse error (eof-in-comment).
+// - E2 parse error (unexpected-null-character).
+// - E3 parse error (nested-comment).
+// - E4 parse error (incorrectly-closed-comment).
+// - T emit the current comment token.
+// - Z emit an end-of-file token.
+// - rNN reconsume in the 13.2.5.NN state (after any A* or E* operations).
+// - s01 switch to the 13.2.5.1 Data state (after any A* or E* operations).
+// - sNN switch to the 13.2.5.NN state (after any A* or E* operations).
+// - tNN stay in the 13.2.5.NN state (after any A* or E* operations).
+//
+// The E* actions are called errors in the HTML spec but they are not fatal
+// (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
+// the parser"). They are warnings that, in practice, browsers simply ignore.
diff --git a/html/token.go b/html/token.go
index ae24a6f..50f7c6a 100644
--- a/html/token.go
+++ b/html/token.go
@@ -598,6 +598,11 @@
// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
+ // When modifying this function, consider manually increasing the suffixLen
+ // constant in func TestComments, from 6 to e.g. 9 or more. That increase
+ // should only be temporary, not committed, as it exponentially affects the
+ // test running time.
+
z.data.start = z.raw.end
defer func() {
if z.data.end < z.data.start {
@@ -611,11 +616,7 @@
for {
c := z.readByte()
if z.err != nil {
- // Ignore up to two dashes at EOF.
- if dashCount > 2 {
- dashCount = 2
- }
- z.data.end = z.raw.end - dashCount
+ z.data.end = z.calculateAbruptCommentDataEnd()
return
}
switch c {
@@ -631,12 +632,15 @@
if dashCount >= 2 {
c = z.readByte()
if z.err != nil {
- z.data.end = z.raw.end
+ z.data.end = z.calculateAbruptCommentDataEnd()
return
- }
- if c == '>' {
+ } else if c == '>' {
z.data.end = z.raw.end - len("--!>")
return
+ } else if c == '-' {
+ dashCount = 1
+ beginning = false
+ continue
}
}
}
@@ -645,6 +649,35 @@
}
}
+func (z *Tokenizer) calculateAbruptCommentDataEnd() int {
+ raw := z.Raw()
+ const prefixLen = len("<!--")
+ if len(raw) >= prefixLen {
+ raw = raw[prefixLen:]
+ if hasSuffix(raw, "--!") {
+ return z.raw.end - 3
+ } else if hasSuffix(raw, "--") {
+ return z.raw.end - 2
+ } else if hasSuffix(raw, "-") {
+ return z.raw.end - 1
+ }
+ }
+ return z.raw.end
+}
+
+func hasSuffix(b []byte, suffix string) bool {
+ if len(b) < len(suffix) {
+ return false
+ }
+ b = b[len(b)-len(suffix):]
+ for i := range b {
+ if b[i] != suffix[i] {
+ return false
+ }
+ }
+ return true
+}
+
// readUntilCloseAngle reads until the next ">".
func (z *Tokenizer) readUntilCloseAngle() {
z.data.start = z.raw.end
diff --git a/html/token_test.go b/html/token_test.go
index 0b9a947..3d20e8b 100644
--- a/html/token_test.go
+++ b/html/token_test.go
@@ -295,7 +295,7 @@
"<?xml?>",
"<!--?xml?-->",
},
- // Comments.
+ // Comments. See also func TestComments.
{
"comment0",
"abc<b><!-- skipme --></b>def",
@@ -376,6 +376,41 @@
"a<!-- !-->z",
"a$<!-- !-->$z",
},
+ {
+ "comment16",
+ "a<!--i\x00j-->z",
+ "a$<!--i\uFFFDj-->$z",
+ },
+ {
+ "comment17",
+ "a<!--\x00",
+ "a$<!--\uFFFD-->",
+ },
+ {
+ "comment18",
+ "a<!--<!-->z",
+ "a$<!--<!-->$z",
+ },
+ {
+ "comment19",
+ "a<!--<!--",
+ "a$<!--<!-->",
+ },
+ {
+ "comment20",
+ "a<!--ij--kl-->z",
+ "a$<!--ij--kl-->$z",
+ },
+ {
+ "comment21",
+ "a<!--ij--kl--!>z",
+ "a$<!--ij--kl-->$z",
+ },
+ {
+ "comment22",
+ "a<!--!--!<--!-->z",
+ "a$<!--!--!<--!-->$z",
+ },
// An attribute with a backslash.
{
"backslash",