go.net/html: have Tokenizer.Raw return the original input even when tokenizer errors occur.

Two tweaks enable this:
1) Resetting the raw and data span pointers whenever Tokenizer.Next is called,
even if an error has already occurred. This prevents Raw from returning
duplicate data in the common case of hitting EOF.

2) Treating '</>' as an empty comment token, so that its raw text is still
exposed as a tokenization event (see the sketch after this list). This is
consistent with how other non-token input is already surfaced as comments,
e.g., '</ >' is tokenized as '<!-- -->'.
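
For illustration only, a minimal passthrough client built on this change; the
import path is an assumption from the go.net era and the sample input is made
up:

    package main

    import (
        "bytes"
        "fmt"
        "strings"

        "code.google.com/p/go.net/html"
    )

    func main() {
        const input = "a</>b<b>text at EOF"
        z := html.NewTokenizer(strings.NewReader(input))
        var out bytes.Buffer
        for {
            tt := z.Next()
            // Raw now reports "</>" (tokenized as an empty comment) and the
            // zero-length ErrorToken at EOF, so no input bytes are dropped.
            out.Write(z.Raw())
            if tt == html.ErrorToken {
                break
            }
        }
        fmt.Println(out.String() == input) // prints "true"
    }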

Fixes golang/go#7029.

R=golang-codereviews, r, bradfitz
CC=golang-codereviews
https://golang.org/cl/46370043
diff --git a/html/token.go b/html/token.go
index 3a1ee7c..c43debb 100644
--- a/html/token.go
+++ b/html/token.go
@@ -734,7 +734,6 @@
 			brackets = 0
 		}
 	}
-	panic("unreachable")
 }
 
 // startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
@@ -934,13 +933,13 @@
 
 // Next scans the next token and returns its type.
 func (z *Tokenizer) Next() TokenType {
+	z.raw.start = z.raw.end
+	z.data.start = z.raw.end
+	z.data.end = z.raw.end
 	if z.err != nil {
 		z.tt = ErrorToken
 		return z.tt
 	}
-	z.raw.start = z.raw.end
-	z.data.start = z.raw.end
-	z.data.end = z.raw.end
 	if z.rawTag != "" {
 		if z.rawTag == "plaintext" {
 			// Read everything up to EOF.
@@ -1010,12 +1009,11 @@
 				break loop
 			}
 			if c == '>' {
-				// "</>" does not generate a token at all.
+				// "</>" does not generate a token at all. Generate an empty comment
+				// to allow passthrough clients to pick up the data using Raw.
 				// Reset the tokenizer state and start again.
-				z.raw.start = z.raw.end
-				z.data.start = z.raw.end
-				z.data.end = z.raw.end
-				continue loop
+				z.tt = CommentToken
+				return z.tt
 			}
 			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
 				z.readTag(false)
diff --git a/html/token_test.go b/html/token_test.go
index ca408da..7d54d89 100644
--- a/html/token_test.go
+++ b/html/token_test.go
@@ -63,12 +63,12 @@
 	{
 		"not a tag #2",
 		"</>",
-		"",
+		"<!---->",
 	},
 	{
 		"not a tag #3",
 		"a</>b",
-		"a$b",
+		"a$<!---->$b",
 	},
 	{
 		"not a tag #4",
@@ -469,6 +469,25 @@
 	}
 }
 
+func TestPassthrough(t *testing.T) {
+	// Accumulating the raw output for each parse event should reconstruct the
+	// original input.
+	for _, test := range tokenTests {
+		z := NewTokenizer(strings.NewReader(test.html))
+		var parsed bytes.Buffer
+		for {
+			tt := z.Next()
+			parsed.Write(z.Raw())
+			if tt == ErrorToken {
+				break
+			}
+		}
+		if got, want := parsed.String(), test.html; got != want {
+			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
+		}
+	}
+}
+
 func TestBufAPI(t *testing.T) {
 	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
 	z := NewTokenizer(bytes.NewBufferString(s))