html/comment_test.go - net - Git at Google

 // Copyright 2023 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package html

 import (
 	"bytes"
 	"strings"
 	"testing"
 )

 // TestComments checks, exhaustively, that every 'interesting' N-byte input is
 // tokenized as a comment exactly as the HTML spec prescribes. N runs from 4+1
 // up to and including 4+maxSuffixLen, where 4 is the length of the "<!--"
 // prefix that opens an HTML comment.
 //
 // An input is 'interesting' when every byte of its N-4 byte suffix is drawn
 // from the interestingCommentBytes array, below. Those bytes cover all of the
 // possible state transitions from comment-related parser states, as listed in
 // the HTML spec (https://html.spec.whatwg.org/#comment-start-state and
 // subsequent sections).
 //
 // The spec describes an explicit state machine that, as a side effect,
 // accumulates "the comment token's data" in a separate buffer.
 // Tokenizer.readComment in this package has no explicit state machine and
 // usually returns the comment text as a sub-slice of the input, between the
 // opening '<' and closing '>' or EOF. This test confirms that the two
 // algorithms agree.
 func TestComments(t *testing.T) {
 	const (
 		prefix       = "<!--"
 		maxSuffixLen = 6
 	)
 	// The length covers only the prefix; the spare capacity is the room that
 	// testAllComments fills in, one suffix byte at a time.
 	buffer := make([]byte, len(prefix), len(prefix)+maxSuffixLen)
 	copy(buffer, prefix)
 	testAllComments(t, buffer)
 }

 // interestingCommentBytes holds the bytes from which testAllComments builds
 // every comment suffix.
 //
 // NUL is not listed, even though the HTML spec sections 13.2.5.43 -
 // 13.2.5.52 mention it. It is not interesting in terms of state transitions:
 // it is equivalent to any other non-interesting byte (other than being
 // replaced by U+FFFD REPLACEMENT CHARACTER).
 //
 // EOF is not listed either. The HTML spec treats EOF as "an input character"
 // but testOneComment below breaks the loop instead.
 //
 // 'x' stands in for all other "non-interesting" comment bytes.
 var interestingCommentBytes = [...]byte{'!', '-', '<', '>', 'x'}

 // testAllComments extends buffer by one interesting byte at a time, testing
 // each extended input with testOneComment and recursing until the spare
 // capacity buffer[len(buffer):cap(buffer)] is used up. Together the calls
 // check that this package's tokenization matches the HTML spec.
 //
 // Precondition: len(buffer) < cap(buffer)
 // Precondition: string(buffer[:4]) == "<!--"
 func testAllComments(t *testing.T, buffer []byte) {
 	for k := 0; k < len(interestingCommentBytes); k++ {
 		// Given the first precondition, append writes into buffer's spare
 		// capacity, so every candidate at this depth reuses the same
 		// backing array.
 		candidate := append(buffer, interestingCommentBytes[k])
 		testOneComment(t, candidate)
 		if len(candidate) >= cap(candidate) {
 			// No room left for a longer suffix.
 			continue
 		}
 		testAllComments(t, candidate)
 	}
 }

 // testOneComment tokenizes b with this package's Tokenizer and compares the
 // outcome with a direct transcription of the HTML spec's comment states
 // (sections 13.2.5.43 to 13.2.5.52), which starts reading just past the
 // leading "<!--". Both the comment text and the input left over after the
 // comment token have to match.
 //
 // If they do, it also checks that rendering the bytes after "<!--" as the
 // Data of a comment Token and tokenizing that rendering gives those bytes
 // back.
 func testOneComment(t *testing.T, b []byte) {
 	// What the tokenizer under test produces.
 	tokenizer := NewTokenizer(bytes.NewReader(b))
 	if tt := tokenizer.Next(); tt != CommentToken {
 		t.Fatalf("Next(%q): got %v, want %v", b, tt, CommentToken)
 	}
 	gotRemainder := string(b[len(tokenizer.Raw()):])
 	gotComment := string(tokenizer.Text())
 
 	// What the spec's state machine produces. The state values are the
 	// section numbers within 13.2.5 of the HTML spec; data plays the role of
 	// "the comment token's data".
 	var data []byte
 	pos := len("<!--")
 	state := 43
 	// reconsumeIn un-reads the current input character and moves to next.
 	reconsumeIn := func(next int) {
 		pos--
 		state = next
 	}
 
 scan:
 	// Running out of input stands in for the spec's EOF handling: the
 	// comment is emitted with whatever data has been accumulated so far.
 	for pos < len(b) {
 		c := b[pos]
 		pos++
 
 		switch state {
 		case 43: // 13.2.5.43 Comment start state.
 			switch c {
 			case '-':
 				state = 44
 			case '>':
 				break scan
 			default:
 				reconsumeIn(45)
 			}
 
 		case 44: // 13.2.5.44 Comment start dash state.
 			switch c {
 			case '-':
 				state = 51
 			case '>':
 				break scan
 			default:
 				data = append(data, '-')
 				reconsumeIn(45)
 			}
 
 		case 45: // 13.2.5.45 Comment state.
 			if c == '-' {
 				state = 50
 			} else {
 				data = append(data, c)
 				if c == '<' {
 					state = 46
 				}
 			}
 
 		case 46: // 13.2.5.46 Comment less-than sign state.
 			switch c {
 			case '!':
 				data = append(data, '!')
 				state = 47
 			case '<':
 				// Stay in this state.
 				data = append(data, '<')
 			default:
 				reconsumeIn(45)
 			}
 
 		case 47: // 13.2.5.47 Comment less-than sign bang state.
 			if c == '-' {
 				state = 48
 			} else {
 				reconsumeIn(45)
 			}
 
 		case 48: // 13.2.5.48 Comment less-than sign bang dash state.
 			if c == '-' {
 				state = 49
 			} else {
 				reconsumeIn(50)
 			}
 
 		case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
 			if c == '>' {
 				break scan
 			}
 			reconsumeIn(51)
 
 		case 50: // 13.2.5.50 Comment end dash state.
 			if c == '-' {
 				state = 51
 			} else {
 				data = append(data, '-')
 				reconsumeIn(45)
 			}
 
 		case 51: // 13.2.5.51 Comment end state.
 			switch c {
 			case '>':
 				break scan
 			case '!':
 				state = 52
 			case '-':
 				data = append(data, '-')
 			default:
 				data = append(data, "--"...)
 				reconsumeIn(45)
 			}
 
 		case 52: // 13.2.5.52 Comment end bang state.
 			if c == '>' {
 				break scan
 			}
 			data = append(data, "--!"...)
 			if c == '-' {
 				state = 50
 			} else {
 				reconsumeIn(45)
 			}
 
 		default:
 			t.Fatalf("input=%q: unexpected state %d", b, state)
 		}
 	}
 
 	// pos never exceeds len(b), so this is "" once all input is consumed.
 	wantRemainder := string(b[pos:])
 	wantComment := string(data)
 	if gotComment != wantComment || gotRemainder != wantRemainder {
 		t.Errorf("input=%q\ngot:  %q + %q\nwant: %q + %q",
 			b, gotComment, gotRemainder, wantComment, wantRemainder)
 		return
 	}
 
 	// suffix is the "N-4 byte suffix" per the TestComments comment.
 	suffix := string(b[len("<!--"):])
 
 	// Round trip: render (escaped) a comment token whose Token.Data is the
 	// suffix, re-parse the rendering, and expect the suffix back.
 	commentTok := Token{
 		Type: CommentToken,
 		Data: suffix,
 	}
 	reparsed := NewTokenizer(strings.NewReader(commentTok.String()))
 	if tt := reparsed.Next(); tt != CommentToken {
 		t.Fatalf("round-trip Next(%q): got %v, want %v", suffix, tt, CommentToken)
 	}
 	if got := string(reparsed.Text()); got != suffix {
 		t.Errorf("round-trip\ngot:  %q\nwant: %q", got, suffix)
 	}
 }

 // This table below summarizes the HTML-comment-related state machine from
 // 13.2.5.43 "Comment start state" and subsequent sections.
 // https://html.spec.whatwg.org/#comment-start-state
 //
 // Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
 // initial 13.2.5.1 "Data state":
 //   - "<"  moves to 13.2.5.6  "Tag open state",
 //   - "!"  moves to 13.2.5.42 "Markup declaration open state",
 //   - "--" moves to 13.2.5.43 "Comment start state".
 // Each of these transitions is the only way to get to the 6/42/43 states.
 //
 // State   !         -         <         >         NUL       EOF       default   HTML spec section
 // 43      ...       s44       ...       s01.T.E0  ...       ...       r45       13.2.5.43 Comment start state
 // 44      ...       s51       ...       s01.T.E0  ...       T.Z.E1    r45.A-    13.2.5.44 Comment start dash state
 // 45      ...       s50       s46.A<    ...       t45.A?.E2 T.Z.E1    t45.Ax    13.2.5.45 Comment state
 // 46      s47.A!    ...       t46.A<    ...       ...       ...       r45       13.2.5.46 Comment less-than sign state
 // 47      ...       s48       ...       ...       ...       ...       r45       13.2.5.47 Comment less-than sign bang state
 // 48      ...       s49       ...       ...       ...       ...       r50       13.2.5.48 Comment less-than sign bang dash state
 // 49      ...       ...       ...       s01.T     ...       T.Z.E1    r51.E3    13.2.5.49 Comment less-than sign bang dash dash state
 // 50      ...       s51       ...       ...       ...       T.Z.E1    r45.A-    13.2.5.50 Comment end dash state
 // 51      s52       t51.A-    ...       s01.T     ...       T.Z.E1    r45.A--   13.2.5.51 Comment end state
 // 52      ...       s50.A--!  ...       s01.T.E4  ...       T.Z.E1    r45.A--!  13.2.5.52 Comment end bang state
 //
 // State 43 is the "Comment start state" meaning that we've only seen "<!--"
 // and nothing else. Similarly, state 44 means that we've only seen "<!---",
 // with three dashes, and nothing else. For the other states, we deduce
 // (working backwards) that the immediate prior input must be:
 //   - 45  something that's not '-'
 //   - 46  "<"
 //   - 47  "<!"
 //   - 48  "<!-"
 //   - 49  "<!--"  not including the opening "<!--"
 //   - 50  "-"     not including the opening "<!--" and also not "--"
 //   - 51  "--"    not including the opening "<!--"
 //   - 52  "--!"
 //
 // The table cell actions:
 //   - ...   do the default action
 //   - A!    append "!"      to the comment token's data.
 //   - A-    append "-"      to the comment token's data.
 //   - A--   append "--"     to the comment token's data.
 //   - A--!  append "--!"    to the comment token's data.
 //   - A<    append "<"      to the comment token's data.
 //   - A?    append "\uFFFD" to the comment token's data.
 //   - Ax    append the current input character to the comment token's data.
 //   - E0    parse error (abrupt-closing-of-empty-comment).
 //   - E1    parse error (eof-in-comment).
 //   - E2    parse error (unexpected-null-character).
 //   - E3    parse error (nested-comment).
 //   - E4    parse error (incorrectly-closed-comment).
 //   - T     emit the current comment token.
 //   - Z     emit an end-of-file token.
 //   - rNN   reconsume in the 13.2.5.NN     state (after any A* or E* operations).
 //   - s01   switch to the    13.2.5.1 Data state (after any A* or E* operations).
 //   - sNN   switch to the    13.2.5.NN     state (after any A* or E* operations).
 //   - tNN   stay in the      13.2.5.NN     state (after any A* or E* operations).
 //
 // The E* actions are called errors in the HTML spec but they are not fatal
 // (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
 // the parser"). They are warnings that, in practice, browsers simply ignore.
	// Copyright 2023 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package html

	import (
	"bytes"
	"strings"
	"testing"
	)

	// TestComments checks, exhaustively, that every 'interesting' N-byte input is
	// tokenized as a comment exactly as the HTML spec prescribes. N runs from 4+1
	// up to and including 4+maxSuffixLen, where 4 is the length of the "<!--"
	// prefix that opens an HTML comment.
	//
	// An input is 'interesting' when every byte of its N-4 byte suffix is drawn
	// from the interestingCommentBytes array, below. Those bytes cover all of the
	// possible state transitions from comment-related parser states, as listed in
	// the HTML spec (https://html.spec.whatwg.org/#comment-start-state and
	// subsequent sections).
	//
	// The spec describes an explicit state machine that, as a side effect,
	// accumulates "the comment token's data" in a separate buffer.
	// Tokenizer.readComment in this package has no explicit state machine and
	// usually returns the comment text as a sub-slice of the input, between the
	// opening '<' and closing '>' or EOF. This test confirms that the two
	// algorithms agree.
	func TestComments(t *testing.T) {
		const (
			prefix       = "<!--"
			maxSuffixLen = 6
		)
		// The length covers only the prefix; the spare capacity is the room that
		// testAllComments fills in, one suffix byte at a time.
		buffer := make([]byte, len(prefix), len(prefix)+maxSuffixLen)
		copy(buffer, prefix)
		testAllComments(t, buffer)
	}

	// interestingCommentBytes holds the bytes from which testAllComments builds
	// every comment suffix.
	//
	// NUL is not listed, even though the HTML spec sections 13.2.5.43 -
	// 13.2.5.52 mention it. It is not interesting in terms of state transitions:
	// it is equivalent to any other non-interesting byte (other than being
	// replaced by U+FFFD REPLACEMENT CHARACTER).
	//
	// EOF is not listed either. The HTML spec treats EOF as "an input character"
	// but testOneComment below breaks the loop instead.
	//
	// 'x' stands in for all other "non-interesting" comment bytes.
	var interestingCommentBytes = [...]byte{'!', '-', '<', '>', 'x'}

	// testAllComments extends buffer by one interesting byte at a time, testing
	// each extended input with testOneComment and recursing until the spare
	// capacity buffer[len(buffer):cap(buffer)] is used up. Together the calls
	// check that this package's tokenization matches the HTML spec.
	//
	// Precondition: len(buffer) < cap(buffer)
	// Precondition: string(buffer[:4]) == "<!--"
	func testAllComments(t *testing.T, buffer []byte) {
		for k := 0; k < len(interestingCommentBytes); k++ {
			// Given the first precondition, append writes into buffer's spare
			// capacity, so every candidate at this depth reuses the same
			// backing array.
			candidate := append(buffer, interestingCommentBytes[k])
			testOneComment(t, candidate)
			if len(candidate) >= cap(candidate) {
				// No room left for a longer suffix.
				continue
			}
			testAllComments(t, candidate)
		}
	}

	func testOneComment(t *testing.T, b []byte) {
	z := NewTokenizer(bytes.NewReader(b))
	if next := z.Next(); next != CommentToken {
	t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken)
	}
	gotRemainder := string(b[len(z.Raw()):])
	gotComment := string(z.Text())

	i := len("<!--")
	wantBuffer := []byte(nil)
	loop:
	for state := 43; ; {
	// Consume the next input character, handling EOF.
	if i >= len(b) {
	break
	}
	nextInputCharacter := b[i]
	i++

	switch state {
	case 43: // 13.2.5.43 Comment start state.
	switch nextInputCharacter {
	case '-':
	state = 44
	case '>':
	break loop
	default:
	i-- // Reconsume.
	state = 45
	}

	case 44: // 13.2.5.44 Comment start dash state.
	switch nextInputCharacter {
	case '-':
	state = 51
	case '>':
	break loop
	default:
	wantBuffer = append(wantBuffer, '-')
	i-- // Reconsume.
	state = 45
	}

	case 45: // 13.2.5.45 Comment state.
	switch nextInputCharacter {
	case '-':
	state = 50
	case '<':
	wantBuffer = append(wantBuffer, '<')
	state = 46
	default:
	wantBuffer = append(wantBuffer, nextInputCharacter)
	}

	case 46: // 13.2.5.46 Comment less-than sign state.
	switch nextInputCharacter {
	case '!':
	wantBuffer = append(wantBuffer, '!')
	state = 47
	case '<':
	wantBuffer = append(wantBuffer, '<')
	state = 46
	default:
	i-- // Reconsume.
	state = 45
	}

	case 47: // 13.2.5.47 Comment less-than sign bang state.
	switch nextInputCharacter {
	case '-':
	state = 48
	default:
	i-- // Reconsume.
	state = 45
	}

	case 48: // 13.2.5.48 Comment less-than sign bang dash state.
	switch nextInputCharacter {
	case '-':
	state = 49
	default:
	i-- // Reconsume.
	state = 50
	}

	case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
	switch nextInputCharacter {
	case '>':
	break loop
	default:
	i-- // Reconsume.
	state = 51
	}

	case 50: // 13.2.5.50 Comment end dash state.
	switch nextInputCharacter {
	case '-':
	state = 51
	default:
	wantBuffer = append(wantBuffer, '-')
	i-- // Reconsume.
	state = 45
	}

	case 51: // 13.2.5.51 Comment end state.
	switch nextInputCharacter {
	case '!':
	state = 52
	case '-':
	wantBuffer = append(wantBuffer, '-')
	case '>':
	break loop
	default:
	wantBuffer = append(wantBuffer, "--"...)
	i-- // Reconsume.
	state = 45
	}

	case 52: // 13.2.5.52 Comment end bang state.
	switch nextInputCharacter {
	case '-':
	wantBuffer = append(wantBuffer, "--!"...)
	state = 50
	case '>':
	break loop
	default:
	wantBuffer = append(wantBuffer, "--!"...)
	i-- // Reconsume.
	state = 45
	}

	default:
	t.Fatalf("input=%q: unexpected state %d", b, state)
	}
	}

	wantRemainder := ""
	if i < len(b) {
	wantRemainder = string(b[i:])
	}
	wantComment := string(wantBuffer)
	if (gotComment != wantComment) \|\| (gotRemainder != wantRemainder) {
	t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q",
	b, gotComment, gotRemainder, wantComment, wantRemainder)
	return
	}

	// suffix is the "N-4 byte suffix" per the TestComments comment.
	suffix := string(b[4:])

	// Test that a round trip, rendering (escaped) and re-parsing, of a comment
	// token (with that suffix as the Token.Data) preserves that string.
	tok := Token{
	Type: CommentToken,
	Data: suffix,
	}
	z2 := NewTokenizer(strings.NewReader(tok.String()))
	if next := z2.Next(); next != CommentToken {
	t.Fatalf("round-trip Next(%q): got %v, want %v", suffix, next, CommentToken)
	}
	gotComment2 := string(z2.Text())
	if gotComment2 != suffix {
	t.Errorf("round-trip\ngot: %q\nwant: %q", gotComment2, suffix)
	return
	}
	}

	// This table below summarizes the HTML-comment-related state machine from
	// 13.2.5.43 "Comment start state" and subsequent sections.
	// https://html.spec.whatwg.org/#comment-start-state
	//
	// Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
	// initial 13.2.5.1 "Data state":
	//   - "<"  moves to 13.2.5.6  "Tag open state",
	//   - "!"  moves to 13.2.5.42 "Markup declaration open state",
	//   - "--" moves to 13.2.5.43 "Comment start state".
	// Each of these transitions is the only way to get to the 6/42/43 states.
	//
	// State   !         -         <         >         NUL       EOF       default   HTML spec section
	// 43      ...       s44       ...       s01.T.E0  ...       ...       r45       13.2.5.43 Comment start state
	// 44      ...       s51       ...       s01.T.E0  ...       T.Z.E1    r45.A-    13.2.5.44 Comment start dash state
	// 45      ...       s50       s46.A<    ...       t45.A?.E2 T.Z.E1    t45.Ax    13.2.5.45 Comment state
	// 46      s47.A!    ...       t46.A<    ...       ...       ...       r45       13.2.5.46 Comment less-than sign state
	// 47      ...       s48       ...       ...       ...       ...       r45       13.2.5.47 Comment less-than sign bang state
	// 48      ...       s49       ...       ...       ...       ...       r50       13.2.5.48 Comment less-than sign bang dash state
	// 49      ...       ...       ...       s01.T     ...       T.Z.E1    r51.E3    13.2.5.49 Comment less-than sign bang dash dash state
	// 50      ...       s51       ...       ...       ...       T.Z.E1    r45.A-    13.2.5.50 Comment end dash state
	// 51      s52       t51.A-    ...       s01.T     ...       T.Z.E1    r45.A--   13.2.5.51 Comment end state
	// 52      ...       s50.A--!  ...       s01.T.E4  ...       T.Z.E1    r45.A--!  13.2.5.52 Comment end bang state
	//
	// State 43 is the "Comment start state" meaning that we've only seen "<!--"
	// and nothing else. Similarly, state 44 means that we've only seen "<!---",
	// with three dashes, and nothing else. For the other states, we deduce
	// (working backwards) that the immediate prior input must be:
	//   - 45  something that's not '-'
	//   - 46  "<"
	//   - 47  "<!"
	//   - 48  "<!-"
	//   - 49  "<!--"  not including the opening "<!--"
	//   - 50  "-"     not including the opening "<!--" and also not "--"
	//   - 51  "--"    not including the opening "<!--"
	//   - 52  "--!"
	//
	// The table cell actions:
	//   - ...   do the default action
	//   - A!    append "!"      to the comment token's data.
	//   - A-    append "-"      to the comment token's data.
	//   - A--   append "--"     to the comment token's data.
	//   - A--!  append "--!"    to the comment token's data.
	//   - A<    append "<"      to the comment token's data.
	//   - A?    append "\uFFFD" to the comment token's data.
	//   - Ax    append the current input character to the comment token's data.
	//   - E0    parse error (abrupt-closing-of-empty-comment).
	//   - E1    parse error (eof-in-comment).
	//   - E2    parse error (unexpected-null-character).
	//   - E3    parse error (nested-comment).
	//   - E4    parse error (incorrectly-closed-comment).
	//   - T     emit the current comment token.
	//   - Z     emit an end-of-file token.
	//   - rNN   reconsume in the 13.2.5.NN     state (after any A* or E* operations).
	//   - s01   switch to the    13.2.5.1 Data state (after any A* or E* operations).
	//   - sNN   switch to the    13.2.5.NN     state (after any A* or E* operations).
	//   - tNN   stay in the      13.2.5.NN     state (after any A* or E* operations).
	//
	// The E* actions are called errors in the HTML spec but they are not fatal
	// (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
	// the parser"). They are warnings that, in practice, browsers simply ignore.