| // Copyright 2023 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package html |
| |
| import ( |
| "bytes" |
| "strings" |
| "testing" |
| ) |
| |
| // TestComments exhaustively tests every 'interesting' N-byte string is |
| // correctly parsed as a comment. N ranges from 4+1 to 4+maxSuffixLen |
| // inclusive. 4 is the length of the "<!--" prefix that starts an HTML comment. |
| // |
| // 'Interesting' means that the N-4 byte suffix consists entirely of bytes |
| // sampled from the interestingCommentBytes const string, below. These cover |
| // all of the possible state transitions from comment-related parser states, as |
| // listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state |
| // and subsequent sections). |
| // |
| // The spec is written as an explicit state machine that, as a side effect, |
| // accumulates "the comment token's data" to a separate buffer. |
| // Tokenizer.readComment in this package does not have an explicit state |
| // machine and usually returns the comment text as a sub-slice of the input, |
| // between the opening '<' and closing '>' or EOF. This test confirms that the |
| // two algorithms match. |
| func TestComments(t *testing.T) { |
| const prefix = "<!--" |
| const maxSuffixLen = 6 |
| buffer := make([]byte, 0, len(prefix)+maxSuffixLen) |
| testAllComments(t, append(buffer, prefix...)) |
| } |
| |
// interestingCommentBytes lists the bytes from which comment test inputs are
// assembled.
//
// NUL is deliberately absent, even though sections 13.2.5.43 - 13.2.5.52 of
// the HTML spec mention it. It is not interesting in terms of state
// transitions: it behaves like any other non-interesting byte (other than
// being replaced by U+FFFD REPLACEMENT CHARACTER).
//
// EOF is absent too. The HTML spec treats EOF as "an input character" but
// testOneComment breaks out of its loop instead.
var interestingCommentBytes = [...]byte{
	'!',
	'-',
	'<',
	'>',
	'x', // Stands in for all other "non-interesting" comment bytes.
}
| |
| // testAllComments recursively fills in buffer[len(buffer):cap(buffer)] with |
| // interesting bytes and then tests that this package's tokenization matches |
| // the HTML spec. |
| // |
| // Precondition: len(buffer) < cap(buffer) |
| // Precondition: string(buffer[:4]) == "<!--" |
| func testAllComments(t *testing.T, buffer []byte) { |
| for _, interesting := range interestingCommentBytes { |
| b := append(buffer, interesting) |
| testOneComment(t, b) |
| if len(b) < cap(b) { |
| testAllComments(t, b) |
| } |
| } |
| } |
| |
| func testOneComment(t *testing.T, b []byte) { |
| z := NewTokenizer(bytes.NewReader(b)) |
| if next := z.Next(); next != CommentToken { |
| t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken) |
| } |
| gotRemainder := string(b[len(z.Raw()):]) |
| gotComment := string(z.Text()) |
| |
| i := len("<!--") |
| wantBuffer := []byte(nil) |
| loop: |
| for state := 43; ; { |
| // Consume the next input character, handling EOF. |
| if i >= len(b) { |
| break |
| } |
| nextInputCharacter := b[i] |
| i++ |
| |
| switch state { |
| case 43: // 13.2.5.43 Comment start state. |
| switch nextInputCharacter { |
| case '-': |
| state = 44 |
| case '>': |
| break loop |
| default: |
| i-- // Reconsume. |
| state = 45 |
| } |
| |
| case 44: // 13.2.5.44 Comment start dash state. |
| switch nextInputCharacter { |
| case '-': |
| state = 51 |
| case '>': |
| break loop |
| default: |
| wantBuffer = append(wantBuffer, '-') |
| i-- // Reconsume. |
| state = 45 |
| } |
| |
| case 45: // 13.2.5.45 Comment state. |
| switch nextInputCharacter { |
| case '-': |
| state = 50 |
| case '<': |
| wantBuffer = append(wantBuffer, '<') |
| state = 46 |
| default: |
| wantBuffer = append(wantBuffer, nextInputCharacter) |
| } |
| |
| case 46: // 13.2.5.46 Comment less-than sign state. |
| switch nextInputCharacter { |
| case '!': |
| wantBuffer = append(wantBuffer, '!') |
| state = 47 |
| case '<': |
| wantBuffer = append(wantBuffer, '<') |
| state = 46 |
| default: |
| i-- // Reconsume. |
| state = 45 |
| } |
| |
| case 47: // 13.2.5.47 Comment less-than sign bang state. |
| switch nextInputCharacter { |
| case '-': |
| state = 48 |
| default: |
| i-- // Reconsume. |
| state = 45 |
| } |
| |
| case 48: // 13.2.5.48 Comment less-than sign bang dash state. |
| switch nextInputCharacter { |
| case '-': |
| state = 49 |
| default: |
| i-- // Reconsume. |
| state = 50 |
| } |
| |
| case 49: // 13.2.5.49 Comment less-than sign bang dash dash state. |
| switch nextInputCharacter { |
| case '>': |
| break loop |
| default: |
| i-- // Reconsume. |
| state = 51 |
| } |
| |
| case 50: // 13.2.5.50 Comment end dash state. |
| switch nextInputCharacter { |
| case '-': |
| state = 51 |
| default: |
| wantBuffer = append(wantBuffer, '-') |
| i-- // Reconsume. |
| state = 45 |
| } |
| |
| case 51: // 13.2.5.51 Comment end state. |
| switch nextInputCharacter { |
| case '!': |
| state = 52 |
| case '-': |
| wantBuffer = append(wantBuffer, '-') |
| case '>': |
| break loop |
| default: |
| wantBuffer = append(wantBuffer, "--"...) |
| i-- // Reconsume. |
| state = 45 |
| } |
| |
| case 52: // 13.2.5.52 Comment end bang state. |
| switch nextInputCharacter { |
| case '-': |
| wantBuffer = append(wantBuffer, "--!"...) |
| state = 50 |
| case '>': |
| break loop |
| default: |
| wantBuffer = append(wantBuffer, "--!"...) |
| i-- // Reconsume. |
| state = 45 |
| } |
| |
| default: |
| t.Fatalf("input=%q: unexpected state %d", b, state) |
| } |
| } |
| |
| wantRemainder := "" |
| if i < len(b) { |
| wantRemainder = string(b[i:]) |
| } |
| wantComment := string(wantBuffer) |
| if (gotComment != wantComment) || (gotRemainder != wantRemainder) { |
| t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q", |
| b, gotComment, gotRemainder, wantComment, wantRemainder) |
| return |
| } |
| |
| // suffix is the "N-4 byte suffix" per the TestComments comment. |
| suffix := string(b[4:]) |
| |
| // Test that a round trip, rendering (escaped) and re-parsing, of a comment |
| // token (with that suffix as the Token.Data) preserves that string. |
| tok := Token{ |
| Type: CommentToken, |
| Data: suffix, |
| } |
| z2 := NewTokenizer(strings.NewReader(tok.String())) |
| if next := z2.Next(); next != CommentToken { |
| t.Fatalf("round-trip Next(%q): got %v, want %v", suffix, next, CommentToken) |
| } |
| gotComment2 := string(z2.Text()) |
| if gotComment2 != suffix { |
| t.Errorf("round-trip\ngot: %q\nwant: %q", gotComment2, suffix) |
| return |
| } |
| } |
| |
| // This table below summarizes the HTML-comment-related state machine from |
| // 13.2.5.43 "Comment start state" and subsequent sections. |
| // https://html.spec.whatwg.org/#comment-start-state |
| // |
| // Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the |
| // initial 13.2.5.1 "Data state": |
| // - "<" moves to 13.2.5.6 "Tag open state", |
| // - "!" moves to 13.2.5.42 "Markup declaration open state", |
| // - "--" moves to 13.2.5.43 "Comment start state". |
// Each of these transitions is the only way to get to the 6/42/43 states.
| // |
// State   !         -         <         >         NUL       EOF       default   HTML spec section
// 43      ...       s44       ...       s01.T.E0  ...       ...       r45       13.2.5.43 Comment start state
// 44      ...       s51       ...       s01.T.E0  ...       T.Z.E1    r45.A-    13.2.5.44 Comment start dash state
// 45      ...       s50       s46.A<    ...       t45.A?.E2 T.Z.E1    t45.Ax    13.2.5.45 Comment state
// 46      s47.A!    ...       t46.A<    ...       ...       ...       r45       13.2.5.46 Comment less-than sign state
// 47      ...       s48       ...       ...       ...       ...       r45       13.2.5.47 Comment less-than sign bang state
// 48      ...       s49       ...       ...       ...       ...       r50       13.2.5.48 Comment less-than sign bang dash state
// 49      ...       ...       ...       s01.T     ...       T.Z.E1    r51.E3    13.2.5.49 Comment less-than sign bang dash dash state
// 50      ...       s51       ...       ...       ...       T.Z.E1    r45.A-    13.2.5.50 Comment end dash state
// 51      s52       t51.A-    ...       s01.T     ...       T.Z.E1    r45.A--   13.2.5.51 Comment end state
// 52      ...       s50.A--!  ...       s01.T.E4  ...       T.Z.E1    r45.A--!  13.2.5.52 Comment end bang state
| // |
| // State 43 is the "Comment start state" meaning that we've only seen "<!--" |
| // and nothing else. Similarly, state 44 means that we've only seen "<!---", |
| // with three dashes, and nothing else. For the other states, we deduce |
| // (working backwards) that the immediate prior input must be: |
| // - 45 something that's not '-' |
| // - 46 "<" |
| // - 47 "<!" |
| // - 48 "<!-" |
| // - 49 "<!--" not including the opening "<!--" |
| // - 50 "-" not including the opening "<!--" and also not "--" |
| // - 51 "--" not including the opening "<!--" |
| // - 52 "--!" |
| // |
| // The table cell actions: |
| // - ... do the default action |
| // - A! append "!" to the comment token's data. |
| // - A- append "-" to the comment token's data. |
| // - A-- append "--" to the comment token's data. |
| // - A--! append "--!" to the comment token's data. |
| // - A< append "<" to the comment token's data. |
| // - A? append "\uFFFD" to the comment token's data. |
| // - Ax append the current input character to the comment token's data. |
| // - E0 parse error (abrupt-closing-of-empty-comment). |
| // - E1 parse error (eof-in-comment). |
| // - E2 parse error (unexpected-null-character). |
| // - E3 parse error (nested-comment). |
| // - E4 parse error (incorrectly-closed-comment). |
| // - T emit the current comment token. |
| // - Z emit an end-of-file token. |
| // - rNN reconsume in the 13.2.5.NN state (after any A* or E* operations). |
| // - s01 switch to the 13.2.5.1 Data state (after any A* or E* operations). |
| // - sNN switch to the 13.2.5.NN state (after any A* or E* operations). |
| // - tNN stay in the 13.2.5.NN state (after any A* or E* operations). |
| // |
| // The E* actions are called errors in the HTML spec but they are not fatal |
| // (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort |
| // the parser"). They are warnings that, in practice, browsers simply ignore. |