internal/export/idna: sanitize errors
- Documented that ToASCII and ToUnicode always return some IDN.
- Label length checking disabled by default
(most browsers seem to do this)
- Refactored test logic
- Associate (internal) error codes with errors. This allows:
- more accurate testing
- better documentation
- makes it easier in the future to have options to disable
specific tests
- Added tests for very esoteric cases that often display
very different behavior between browsers.
Change-Id: Ia70335212f1089f280653c2b7bea9f769171ae1c
Reviewed-on: https://go-review.googlesource.com/31470
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/internal/export/idna/idna.go b/internal/export/idna/idna.go
index 330d2e6..24755df 100644
--- a/internal/export/idna/idna.go
+++ b/internal/export/idna/idna.go
@@ -18,7 +18,6 @@
package idna // import "golang.org/x/text/internal/export/idna"
import (
- "errors"
"fmt"
"strings"
"unicode/utf8"
@@ -27,16 +26,33 @@
"golang.org/x/text/unicode/norm"
)
+// NOTE: Unlike common practice in Go APIs, the functions will return a
+// sanitized domain name in case of errors. Browsers sometimes use a partially
+// evaluated string as lookup.
+// TODO: the current error handling is, in my opinion, the least opinionated.
+// Other strategies are also viable, though:
+// Option 1) Return an empty string in case of error, but allow the user to
+// specify explicitly which errors to ignore.
+// Option 2) Return the partially evaluated string if it is itself a valid
+// string, otherwise return the empty string in case of error.
+// Option 3) Option 1 and 2.
+// Option 4) Always return an empty string for now and implement Option 1 as
+// needed, and document that the return string may not be empty in case of
+// error in the future.
+// I think Option 1 is best, but it is quite opinionated.
+
// ToASCII converts a domain or domain label to its ASCII form. For example,
// ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and
-// ToASCII("golang") is "golang".
+// ToASCII("golang") is "golang". If an error is encountered it will return
+// an error and a (partially) processed result.
func ToASCII(s string) (string, error) {
return Resolve.process(s, true)
}
// ToUnicode converts a domain or domain label to its Unicode form. For example,
// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
-// ToUnicode("golang") is "golang".
+// ToUnicode("golang") is "golang". If an error is encountered it will return
+// an error and a (partially) processed result.
func ToUnicode(s string) (string, error) {
return NonTransitional.process(s, false)
}
@@ -45,20 +61,22 @@
type Profile struct {
Transitional bool
IgnoreSTD3Rules bool
- IgnoreDNSLength bool
+ VerifyDNSLength bool
// ErrHandler func(error)
}
// ToASCII converts a domain or domain label to its ASCII form. For example,
// ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and
-// ToASCII("golang") is "golang".
+// ToASCII("golang") is "golang". If an error is encountered it will return
+// an error and a (partially) processed result.
func (p *Profile) ToASCII(s string) (string, error) {
return p.process(s, true)
}
// ToUnicode converts a domain or domain label to its Unicode form. For example,
// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
-// ToUnicode("golang") is "golang".
+// ToUnicode("golang") is "golang". If an error is encountered it will return
+// an error and a (partially) processed result.
func (p *Profile) ToUnicode(s string) (string, error) {
pp := *p
pp.Transitional = false
@@ -72,7 +90,7 @@
if p.Transitional {
s = "Transitional"
} else {
- s = "NonTraditional"
+ s = "NonTransitional"
}
if p.IgnoreSTD3Rules {
s += ":NoSTD3Rules"
@@ -108,15 +126,19 @@
// bundle or block deviation characters.
)
-// TODO: rethink error strategy
+type labelError struct{ label, code_ string }
-var (
- // errDisallowed indicates a domain name contains a disallowed rune.
- errDisallowed = errors.New("idna: disallowed rune")
+func (e labelError) code() string { return e.code_ }
+func (e labelError) Error() string {
+ return fmt.Sprintf("idna: invalid label %q", e.label)
+}
- // errEmptyLabel indicates a label was empty.
- errEmptyLabel = errors.New("idna: empty label")
-)
+type runeError rune
+
+func (e runeError) code() string { return "P1" }
+func (e runeError) Error() string {
+ return fmt.Sprintf("idna: disallowed rune %r", e)
+}
// process implements the algorithm described in section 4 of UTS #46,
// see http://www.unicode.org/reports/tr46.
@@ -136,7 +158,8 @@
continue
case disallowed:
if err == nil {
- err = errDisallowed
+ r, _ := utf8.DecodeRuneInString(s[i:])
+ err = runeError(r)
}
continue
case mapped, deviation:
@@ -166,7 +189,7 @@
for ; len(s) > 0 && s[0] == '.'; s = s[1:] {
}
if s == "" {
- return "", errors.New("idna: there are no labels")
+ return "", &labelError{s, "A4"}
}
labels := labelIter{orig: s}
for ; !labels.done(); labels.next() {
@@ -175,7 +198,7 @@
// Empty labels are not okay. The label iterator skips the last
// label if it is empty.
if err == nil {
- err = errEmptyLabel
+ err = &labelError{s, "A4"}
}
continue
}
@@ -211,20 +234,20 @@
labels.set(a)
}
n := len(label)
- if !p.IgnoreDNSLength && err == nil && (n == 0 || n > 63) {
- err = fmt.Errorf("idna: label with invalid length %d", n)
+ if p.VerifyDNSLength && err == nil && (n == 0 || n > 63) {
+ err = &labelError{label, "A4"}
}
}
}
s = labels.result()
- if toASCII && !p.IgnoreDNSLength && err == nil {
+ if toASCII && p.VerifyDNSLength && err == nil {
// Compute the length of the domain name minus the root label and its dot.
n := len(s)
if n > 0 && s[n-1] == '.' {
n--
}
if len(s) < 1 || n > 253 {
- err = fmt.Errorf("idna: doman name with invalid length %d", n)
+ err = &labelError{s, "A4"}
}
}
return s, err
@@ -320,12 +343,12 @@
func (p *Profile) validateFromPunycode(s string) error {
if !norm.NFC.IsNormalString(s) {
- return errors.New("idna: punycode is not normalized")
+ return &labelError{s, "V1"}
}
for i := 0; i < len(s); {
v, sz := trie.lookupString(s[i:])
if c := p.simplify(info(v).category()); c != valid && c != deviation {
- return fmt.Errorf("idna: invalid character %+q in expanded punycode", s[i:i+sz])
+ return &labelError{s, "V6"}
}
i += sz
}
@@ -398,19 +421,19 @@
// already implicitly satisfied by the overall implementation.
func (p *Profile) validate(s string) error {
if len(s) > 4 && s[2] == '-' && s[3] == '-' {
- return errors.New("idna: label starts with ??--")
+ return &labelError{s, "V2"}
}
if s[0] == '-' || s[len(s)-1] == '-' {
- return errors.New("idna: label may not start or end with '-'")
+ return &labelError{s, "V3"}
}
// TODO: merge the use of this in the trie.
v, sz := trie.lookupString(s)
x := info(v)
if x.isModifier() {
- return fmt.Errorf("idna: label starts with modifier %U", []rune(s[:sz])[0])
+ return &labelError{s, "V5"}
}
if !bidirule.ValidString(s) {
- return errors.New("idna: label violates Bidi Rule")
+ return &labelError{s, "B"}
}
// Quickly return in the absence of zero-width (non) joiners.
if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 {
@@ -435,7 +458,7 @@
x = info(v)
}
if st == stateFAIL || st == stateAfter {
- return errors.New("idna: label violates Context J rule")
+ return &labelError{s, "C"}
}
return nil
}
diff --git a/internal/export/idna/idna_test.go b/internal/export/idna/idna_test.go
index b4ee138..72dbeb1 100644
--- a/internal/export/idna/idna_test.go
+++ b/internal/export/idna/idna_test.go
@@ -33,6 +33,125 @@
}
}
+// doTest performs a single test f(input) and verifies that the output matches
+// out and that the returned error is expected. The errors string contains
+// all allowed error codes as categorized in
+// http://www.unicode.org/Public/idna/9.0.0/IdnaTest.txt:
+// P: Processing
+// V: Validity
+// A: to ASCII
+// B: Bidi
+// C: Context J
+func doTest(t *testing.T, f func(string) (string, error), name, input, want, errors string) {
+ errors = strings.Trim(errors, "[]")
+ test := "ok"
+ if errors != "" {
+ test = "err:" + errors
+ }
+ // Replace some of the escape sequences to make it easier to single out
+ // tests on the command name.
+ in := strings.Trim(strconv.QuoteToASCII(input), `"`)
+ in = strings.Replace(in, `\u`, "#", -1)
+ in = strings.Replace(in, `\U`, "#", -1)
+ name = fmt.Sprintf("%s/%s/%s", name, in, test)
+
+ testtext.Run(t, name, func(t *testing.T) {
+ got, err := f(input)
+
+ if err != nil {
+ code := err.(interface {
+ code() string
+ }).code()
+ if strings.Index(errors, code) == -1 {
+ t.Errorf("error %q not in set of expected errors {%v}", code, errors)
+ }
+ } else if errors != "" {
+ t.Errorf("no errors; want error in {%v}", errors)
+ }
+
+ if want != "" && got != want {
+ t.Errorf(`string: got %+q; want %+q`, got, want)
+ }
+ })
+}
+
+// TestLabelErrors tests strings returned in case of error. All results should
+// be identical to the reference implementation and can be verified at
+// http://unicode.org/cldr/utility/idna.jsp. The reference implementation,
+// however, seems to not display Bidi and ContextJ errors.
+//
+// In some cases the behavior of browsers is added as a comment. In all cases,
+// whenever a resolve search returns an error here, Chrome will treat the input
+// string as a search string (including those for Bidi and Context J errors),
+// unless noted otherwise.
+func TestLabelErrors(t *testing.T) {
+ encode := func(s string) string { s, _ = encode(acePrefix, s); return s }
+ type kind struct {
+ name string
+ f func(string) (string, error)
+ }
+ resolve := kind{"ToASCII", Resolve.ToASCII}
+ display := kind{"ToUnicode", Display.ToUnicode}
+ testCases := []struct {
+ kind
+ input string
+ want string
+ wantErr string
+ }{
+ // Don't map U+2490 (DIGIT NINE FULL STOP). This is the behavior of
+ // Chrome, Safari, and IE. Firefox will first map ⒐ to 9. and return
+ // lab9.be.
+ {resolve, "lab⒐be", "xn--labbe-zh9b", "P1"}, // encode("lab⒐be")
+ {display, "lab⒐be", "lab⒐be", "P1"},
+
+ {resolve, "plan⒐faß.de", "xn--planfass-c31e.de", "P1"}, // encode("plan⒐fass") + ".de"
+ {display, "plan⒐faß.de", "plan⒐faß.de", "P1"},
+
+ // Chrome 54.0 recognizes the error and treats this input verbatim as a
+ // search string.
+ // Safari 10.0 (non-conform spec) decomposes "⒈" and computes the
+ // punycode on the result using transitional mapping.
+ // Firefox 49.0.1 goes haywire on this string and prints a bunch of what
+ // seems to be nested punycode encodings.
+ {resolve, "日本⒈co.ßßß.de", "xn--co-wuw5954azlb.ssssss.de", "P1"},
+ {display, "日本⒈co.ßßß.de", "日本⒈co.ßßß.de", "P1"},
+
+ {resolve, "a\u200Cb", "ab", ""},
+ {display, "a\u200Cb", "a\u200Cb", "C"},
+
+ {resolve, encode("a\u200Cb"), encode("a\u200Cb"), "C"},
+ {display, "a\u200Cb", "a\u200Cb", "C"},
+
+ {resolve, "grﻋﺮﺑﻲ.de", "xn--gr-gtd9a1b0g.de", "B"},
+ {
+ // Notice how the string gets transformed, even with an error.
+ // Chrome will use the original string if it finds an error, so not
+ // the transformed one.
+ display,
+ "gr\ufecb\ufeae\ufe91\ufef2.de",
+ "gr\u0639\u0631\u0628\u064a.de",
+ "B",
+ },
+
+ {resolve, "\u0671.\u03c3\u07dc", "xn--qib.xn--4xa21s", "B"}, // ٱ.σߜ
+ {display, "\u0671.\u03c3\u07dc", "\u0671.\u03c3\u07dc", "B"},
+
+ // normalize input
+ {resolve, "a\u0323\u0322", "xn--jta191l", ""}, // ạ̢
+ {display, "a\u0323\u0322", "\u1ea1\u0322", ""},
+
+ // Non-normalized strings are not normalized when they originate from
+ // punycode. Despite the error, Chrome, Safari and Firefox will attempt
+ // to look up the input punycode.
+ {resolve, encode("a\u0323\u0322") + ".com", "xn--a-tdbc.com", "V1"},
+ {display, encode("a\u0323\u0322") + ".com", "a\u0323\u0322.com", "V1"},
+ }
+
+ for _, tc := range testCases {
+ doTest(t, tc.f, tc.name, tc.input, tc.want, tc.wantErr)
+ }
+}
+
func TestConformance(t *testing.T) {
testtext.SkipIfNotLong(t)
@@ -46,6 +165,13 @@
section = strings.ToLower(strings.Split(s, " ")[0])
}
}))
+ transitional := &Profile{
+ Transitional: true,
+ VerifyDNSLength: true,
+ }
+ nonTransitional := &Profile{
+ VerifyDNSLength: true,
+ }
for p.Next() {
started = true
@@ -53,12 +179,12 @@
profiles := []*Profile{}
switch p.String(0) {
case "T":
- profiles = append(profiles, Transitional)
+ profiles = append(profiles, transitional)
case "N":
- profiles = append(profiles, NonTransitional)
+ profiles = append(profiles, nonTransitional)
case "B":
- profiles = append(profiles, Transitional)
- profiles = append(profiles, NonTransitional)
+ profiles = append(profiles, transitional)
+ profiles = append(profiles, nonTransitional)
}
src := unescape(p.String(1))
@@ -71,48 +197,24 @@
if wantToASCII == "" {
wantToASCII = wantToUnicode
}
- test := "err:"
+ wantErrToUnicode := ""
if strings.HasPrefix(wantToUnicode, "[") {
- test += strings.Replace(strings.Trim(wantToUnicode, "[]"), " ", "", -1)
+ wantErrToUnicode = wantToUnicode
+ wantToUnicode = ""
}
+ wantErrToASCII := ""
if strings.HasPrefix(wantToASCII, "[") {
- test += strings.Replace(strings.Trim(wantToASCII, "[]"), " ", "", -1)
- }
- if test == "err:" {
- test = "ok"
+ wantErrToASCII = wantToASCII
+ wantToASCII = ""
}
// TODO: also do IDNA tests.
// invalidInIDNA2008 := p.String(4) == "NV8"
for _, p := range profiles {
- testtext.Run(t, fmt.Sprintf("%s:%s/%s/%+q", section, test, p, src), func(t *testing.T) {
- got, err := p.ToUnicode(src)
- wantErr := strings.HasPrefix(wantToUnicode, "[")
- gotErr := err != nil
- if wantErr {
- if gotErr != wantErr {
- t.Errorf(`ToUnicode:err got %v; want %v (%s)`,
- gotErr, wantErr, wantToUnicode)
- }
- } else if got != wantToUnicode || gotErr != wantErr {
- t.Errorf(`ToUnicode: got %+q, %v (%v); want %+q, %v`,
- got, gotErr, err, wantToUnicode, wantErr)
- }
-
- got, err = p.ToASCII(src)
- wantErr = strings.HasPrefix(wantToASCII, "[")
- gotErr = err != nil
- if wantErr {
- if gotErr != wantErr {
- t.Errorf(`ToASCII:err got %v; want %v (%s)`,
- gotErr, wantErr, wantToASCII)
- }
- } else if got != wantToASCII || gotErr != wantErr {
- t.Errorf(`ToASCII: got %+q, %v (%v); want %+q, %v`,
- got, gotErr, err, wantToASCII, wantErr)
- }
- })
+ name := fmt.Sprintf("%s:%s", section, p)
+ doTest(t, p.ToUnicode, name+":ToUnicode", src, wantToUnicode, wantErrToUnicode)
+ doTest(t, p.ToASCII, name+":ToASCII", src, wantToASCII, wantErrToASCII)
}
}
}
diff --git a/internal/export/idna/punycode.go b/internal/export/idna/punycode.go
index 026dc79..f0cbd48 100644
--- a/internal/export/idna/punycode.go
+++ b/internal/export/idna/punycode.go
@@ -7,7 +7,6 @@
// This file implements the Punycode algorithm from RFC 3492.
import (
- "fmt"
"math"
"strings"
"unicode/utf8"
@@ -27,6 +26,8 @@
tmin int32 = 1
)
+func punyError(s string) error { return &labelError{s, "A3"} }
+
// decode decodes a string as specified in section 6.2.
func decode(encoded string) (string, error) {
if encoded == "" {
@@ -34,7 +35,7 @@
}
pos := 1 + strings.LastIndex(encoded, "-")
if pos == 1 {
- return "", fmt.Errorf("idna: invalid label %q", encoded)
+ return "", punyError(encoded)
}
if pos == len(encoded) {
return encoded[:len(encoded)-1], nil
@@ -50,16 +51,16 @@
oldI, w := i, int32(1)
for k := base; ; k += base {
if pos == len(encoded) {
- return "", fmt.Errorf("idna: invalid label %q", encoded)
+ return "", punyError(encoded)
}
digit, ok := decodeDigit(encoded[pos])
if !ok {
- return "", fmt.Errorf("idna: invalid label %q", encoded)
+ return "", punyError(encoded)
}
pos++
i += digit * w
if i < 0 {
- return "", fmt.Errorf("idna: invalid label %q", encoded)
+ return "", punyError(encoded)
}
t := k - bias
if t < tmin {
@@ -72,7 +73,7 @@
}
w *= base - t
if w >= math.MaxInt32/base {
- return "", fmt.Errorf("idna: invalid label %q", encoded)
+ return "", punyError(encoded)
}
}
x := int32(len(output) + 1)
@@ -80,7 +81,7 @@
n += i / x
i %= x
if n > utf8.MaxRune || len(output) >= 1024 {
- return "", fmt.Errorf("idna: invalid label %q", encoded)
+ return "", punyError(encoded)
}
output = append(output, 0)
copy(output[i+1:], output[i:])
@@ -121,14 +122,14 @@
}
delta += (m - n) * (h + 1)
if delta < 0 {
- return "", fmt.Errorf("idna: invalid label %q", s)
+ return "", punyError(s)
}
n = m
for _, r := range s {
if r < n {
delta++
if delta < 0 {
- return "", fmt.Errorf("idna: invalid label %q", s)
+ return "", punyError(s)
}
continue
}