html: handle character entities without semicolons Fix the TODO: unescape("&notit;") should be "¬it;" Also accept digits in entity names. R=nigeltao CC=golang-dev, rsc https://golang.org/cl/4781042

commit: 816c972ff04c3975b29605a6cb9b16382460e47c [log] [tgz]
author: Andrew Balholm <andybalholm@gmail.com> Thu Jul 21 09:10:49 2011 +1000
committer: Nigel Tao <nigeltao@golang.org> Thu Jul 21 09:10:49 2011 +1000
tree: ed031018c523d96154db4c2ae31317bf4d1f65db
parent: 78c89d21bcb33b71d716165a9204c397cf1eaf63 [diff]
diff --git a/src/pkg/html/entity.go b/src/pkg/html/entity.go
index 1530290..21263e2 100644
--- a/src/pkg/html/entity.go
+++ b/src/pkg/html/entity.go

@@ -4,6 +4,9 @@
 
 package html
 
+// All entities that do not end with ';' are 6 or fewer bytes long.
+const longestEntityWithoutSemicolon = 6
+
 // entity is a map from HTML entity names to their values. The semicolon matters:
 // http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html
 // lists both "amp" and "amp;" as two separate entries.

diff --git a/src/pkg/html/entity_test.go b/src/pkg/html/entity_test.go
index a1eb4d4..2cf49d6 100644
--- a/src/pkg/html/entity_test.go
+++ b/src/pkg/html/entity_test.go

@@ -17,6 +17,9 @@
 		if 1+len(k) < utf8.RuneLen(v) {
 			t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
 		}
+		if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
+			t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
+		}
 	}
 	for k, v := range entity2 {
 		if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {

diff --git a/src/pkg/html/escape.go b/src/pkg/html/escape.go
index 2799f69..0de97c5 100644
--- a/src/pkg/html/escape.go
+++ b/src/pkg/html/escape.go

@@ -53,7 +53,8 @@
 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
 // Precondition: b[src] == '&' && dst <= src.
-func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
+// attribute should be true if parsing an attribute value.
+func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
 
 	// i starts at 1 because we already know that s[0] == '&'.
@@ -121,12 +122,11 @@
 	// Consume the maximum number of characters possible, with the
 	// consumed characters matching one of the named references.
 
-	// TODO(nigeltao): unescape("&notit;") should be "¬it;"
 	for i < len(s) {
 		c := s[i]
 		i++
 		// Lower-cased characters are more common in entities, so we check for them first.
-		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
 			continue
 		}
 		if c != ';' {
@@ -136,11 +136,25 @@
 	}
 
 	entityName := string(s[1:i])
-	if x := entity[entityName]; x != 0 {
+	if entityName == "" {
+		// No-op.
+	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
+		// No-op.
+	} else if x := entity[entityName]; x != 0 {
 		return dst + utf8.EncodeRune(b[dst:], x), src + i
-	} else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
+	} else if x := entity2[entityName]; x[0] != 0 {
 		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
 		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
+	} else if !attribute {
+		maxLen := len(entityName) - 1
+		if maxLen > longestEntityWithoutSemicolon {
+			maxLen = longestEntityWithoutSemicolon
+		}
+		for j := maxLen; j > 1; j-- {
+			if x := entity[entityName[:j]]; x != 0 {
+				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
+			}
+		}
 	}
 
 	dst1, src1 = dst+i, src+i
@@ -152,11 +166,11 @@
 func unescape(b []byte) []byte {
 	for i, c := range b {
 		if c == '&' {
-			dst, src := unescapeEntity(b, i, i)
+			dst, src := unescapeEntity(b, i, i, false)
 			for src < len(b) {
 				c := b[src]
 				if c == '&' {
-					dst, src = unescapeEntity(b, dst, src)
+					dst, src = unescapeEntity(b, dst, src, false)
 				} else {
 					b[dst] = c
 					dst, src = dst+1, src+1

diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index 23c95ec..5c6ed16 100644
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go

@@ -459,7 +459,7 @@
 			src++
 			break loop
 		case '&':
-			dst, src = unescapeEntity(z.buf, dst, src)
+			dst, src = unescapeEntity(z.buf, dst, src, true)
 		case '\\':
 			if src == z.p1 {
 				z.buf[dst] = '\\'

diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index c794612..c8dcc88 100644
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go

@@ -107,6 +107,16 @@
 		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
 		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
 	},
+	{
+		"entity without semicolon",
+		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
+		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
+	},
+	{
+		"entity with digits",
+		"&frac12;",
+		"½",
+	},
 
 	// Attribute tests:
 	// http://dev.w3.org/html5/spec/Overview.html#attributes-0
commit	816c972ff04c3975b29605a6cb9b16382460e47c	[log] [tgz]
author	Andrew Balholm <andybalholm@gmail.com>	Thu Jul 21 09:10:49 2011 +1000
committer	Nigel Tao <nigeltao@golang.org>	Thu Jul 21 09:10:49 2011 +1000
tree	ed031018c523d96154db4c2ae31317bf4d1f65db
parent	78c89d21bcb33b71d716165a9204c397cf1eaf63 [diff]