internal/sanitizer: be more flexible with what we allow

We're allowing more invalid attribute values to reduce diffs with what
pkgsite is currently doing with bluemonday.

For #61399

Change-Id: If8d52f26a793093e1c210ff8c499469f6dccd7ad
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/547676
Reviewed-by: Jonathan Amsterdam <jba@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
kokoro-CI: kokoro <noreply+kokoro@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
diff --git a/internal/sanitizer/sanitizer.go b/internal/sanitizer/sanitizer.go
index 577f458..5b3df23 100644
--- a/internal/sanitizer/sanitizer.go
+++ b/internal/sanitizer/sanitizer.go
@@ -264,8 +264,8 @@
 var allowAttrs = []allowAttr{
 	// bluemonday AllowStandardAttributes
 	{"", "dir", re(`^(?i)(rtl|ltr)$`)},
-	{"", "lang", re(`^[a-zA-Z]{2,20}$`)},
-	{"", "id", re(`^[a-zA-Z0-9\:\-_\.]+$`)},
+	{"", "lang", re(`^[a-zA-Z-]{2,20}$`)},
+	{"", "id", re(`^#?[a-zA-Z0-9\:\-_\.]+$`)},
 	{"", "title", para},
 
 	{"details", "open", re(`(?i)^(|open)$`)},
@@ -276,11 +276,11 @@
 	{"map", "name", re(`([\p{L}\p{N}_-]+)`)},
 	{"img", "usemap", re(`(?i)^#[\p{L}\p{N}_-]+$`)},
 	{"img", "src", validURL},
-	{"img", "align", re(`(?i)^(left|right|top|texttop|middle|absmiddle|baseline|bottom|absbottom)$`)},
+	{"img", "align", re(`(?i)^(left|right|top|texttop|middle|absmiddle|baseline|bottom|absbottom|center)?;?$`)}, // allow invalid value center and semicolon to reduce diffs
 	{"img", "alt", para},
 	{"img", "height", numOrPercent},
-	{"img", "width", re(`^[0-9]+([%]|[a-z]+)?;?/?$`)}, // a hacky regexp to allow most commonly appearing width errors through
-	{"div", "align", align},
+	{"img", "width", re(`^ *[0-9.,]*([%]|[a-z]+)?;?/? *$`)}, // a hacky regexp to allow most commonly appearing width errors through
+	{"div", "align", flexiblealign},
 	{"div", "width", numOrPercent},
 	{"div", "role", re(`^[a-z]+$`)},
 	{"div", "aria-level", integer},
@@ -288,8 +288,8 @@
 	{"del", "datetime", iso8601},
 	{"ins", "cite", para},
 	{"ins", "datetime", iso8601},
-	{"p", "align", align},        // pkgsite allows all values
-	{"p", "width", numOrPercent}, // pkgsite allows all values
+	{"p", "align", flexiblealign}, // pkgsite allows all values
+	{"p", "width", flexiblewidth}, // pkgsite allows all values
 	{"q", "cite", validURL},
 	{"time", "datetime", iso8601},
 	{"ol", "type", re(`(?i)^(circle|disc|square|a|A|i|I|1)$`)},
@@ -364,12 +364,20 @@
 
 var align = re(`(?i)^(center|justify|left|right)$`)
 
+// flexiblealign allows non-valid align values to reduce diffs
+// with the old pkgsite sanitization code.
+var flexiblealign = re(`(?i)^(|middle|center|justify|left|right);? ?$`)
+
 var valign = re(`(?i)^(baseline|bottom|middle|top)$`)
 
 var para = re(`^[\p{L}\p{N}\s\-_',\[\]!\./\\\(\)]*$`)
 
 var spaceSepTokens = re(`^([\s\p{L}\p{N}_-]+)$`)
 
+// flexiblewidth allows non-valid width values to reduce diffs
+// with the old pkgsite sanitization code.
+var flexiblewidth = re(`^[0-9]+[%]?(px)?$`)
+
 var numOrPercent = re(`^[0-9]+[%]?$`)
 
 var integer = re(`^[0-9]+$`)
diff --git a/internal/sanitizer/sanitizer_test.go b/internal/sanitizer/sanitizer_test.go
index ba91d96..aa6acba 100644
--- a/internal/sanitizer/sanitizer_test.go
+++ b/internal/sanitizer/sanitizer_test.go
@@ -36,7 +36,7 @@
 			`<p dir="RTL" lang="en" id="foo" title="a title"></p>`,
 		},
 		{
-			`<p dir="ABC" lang="e" id="#foo" title="a title%"></p>`,
+			`<p dir="ABC" lang="e" id=" foo" title="a title%"></p>`,
 			`<p></p>`,
 		},
 		{