internal/export/idna: sanitize errors

- Documented that ToASCII and ToUnicode always return some IDN.
- Label length checking disabled by default
  (most browsers seem to do this)
- Refactored test logic
- Associate (internal) error codes with errors. This allows:
  - more accurate testing
  - better documentation
  - makes it easier in the future to have options to disable
    specific tests
- Added tests for very esoteric cases that often display
   very different behavior between browsers.

Change-Id: Ia70335212f1089f280653c2b7bea9f769171ae1c
Reviewed-on: https://go-review.googlesource.com/31470
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/internal/export/idna/idna.go b/internal/export/idna/idna.go
index 330d2e6..24755df 100644
--- a/internal/export/idna/idna.go
+++ b/internal/export/idna/idna.go
@@ -18,7 +18,6 @@
 package idna // import "golang.org/x/text/internal/export/idna"
 
 import (
-	"errors"
 	"fmt"
 	"strings"
 	"unicode/utf8"
@@ -27,16 +26,33 @@
 	"golang.org/x/text/unicode/norm"
 )
 
+// NOTE: Unlike common practice in Go APIs, the functions will return a
+// sanitized domain name in case of errors. Browsers sometimes use a partially
+// evaluated string as lookup.
+// TODO: the current error handling is, in my opinion, the least opinionated.
+// Other strategies are also viable, though:
+// Option 1) Return an empty string in case of error, but allow the user to
+//    specify explicitly which errors to ignore.
+// Option 2) Return the partially evaluated string if it is itself a valid
+//    string, otherwise return the empty string in case of error.
+// Option 3) Option 1 and 2.
+// Option 4) Always return an empty string for now and implement Option 1 as
+//    needed, and document that the return string may not be empty in case of
+//    error in the future.
+// I think Option 1 is best, but it is quite opinionated.
+
 // ToASCII converts a domain or domain label to its ASCII form. For example,
 // ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and
-// ToASCII("golang") is "golang".
+// ToASCII("golang") is "golang". If an error is encountered it will return
+// an error and a (partially) processed result.
 func ToASCII(s string) (string, error) {
 	return Resolve.process(s, true)
 }
 
 // ToUnicode converts a domain or domain label to its Unicode form. For example,
 // ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
-// ToUnicode("golang") is "golang".
+// ToUnicode("golang") is "golang". If an error is encountered it will return
+// an error and a (partially) processed result.
 func ToUnicode(s string) (string, error) {
 	return NonTransitional.process(s, false)
 }
@@ -45,20 +61,22 @@
 type Profile struct {
 	Transitional    bool
 	IgnoreSTD3Rules bool
-	IgnoreDNSLength bool
+	VerifyDNSLength bool
 	// ErrHandler      func(error)
 }
 
 // ToASCII converts a domain or domain label to its ASCII form. For example,
 // ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and
-// ToASCII("golang") is "golang".
+// ToASCII("golang") is "golang". If an error is encountered it will return
+// an error and a (partially) processed result.
 func (p *Profile) ToASCII(s string) (string, error) {
 	return p.process(s, true)
 }
 
 // ToUnicode converts a domain or domain label to its Unicode form. For example,
 // ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
-// ToUnicode("golang") is "golang".
+// ToUnicode("golang") is "golang". If an error is encountered it will return
+// an error and a (partially) processed result.
 func (p *Profile) ToUnicode(s string) (string, error) {
 	pp := *p
 	pp.Transitional = false
@@ -72,7 +90,7 @@
 	if p.Transitional {
 		s = "Transitional"
 	} else {
-		s = "NonTraditional"
+		s = "NonTransitional"
 	}
 	if p.IgnoreSTD3Rules {
 		s += ":NoSTD3Rules"
@@ -108,15 +126,19 @@
 	// bundle or block deviation characters.
 )
 
-// TODO: rethink error strategy
+type labelError struct{ label, code_ string }
 
-var (
-	// errDisallowed indicates a domain name contains a disallowed rune.
-	errDisallowed = errors.New("idna: disallowed rune")
+func (e labelError) code() string { return e.code_ }
+func (e labelError) Error() string {
+	return fmt.Sprintf("idna: invalid label %q", e.label)
+}
 
-	// errEmptyLabel indicates a label was empty.
-	errEmptyLabel = errors.New("idna: empty label")
-)
+type runeError rune
+
+func (e runeError) code() string { return "P1" }
+func (e runeError) Error() string {
+	return fmt.Sprintf("idna: disallowed rune %r", e)
+}
 
 // process implements the algorithm described in section 4 of UTS #46,
 // see http://www.unicode.org/reports/tr46.
@@ -136,7 +158,8 @@
 			continue
 		case disallowed:
 			if err == nil {
-				err = errDisallowed
+				r, _ := utf8.DecodeRuneInString(s[i:])
+				err = runeError(r)
 			}
 			continue
 		case mapped, deviation:
@@ -166,7 +189,7 @@
 	for ; len(s) > 0 && s[0] == '.'; s = s[1:] {
 	}
 	if s == "" {
-		return "", errors.New("idna: there are no labels")
+		return "", &labelError{s, "A4"}
 	}
 	labels := labelIter{orig: s}
 	for ; !labels.done(); labels.next() {
@@ -175,7 +198,7 @@
 			// Empty labels are not okay. The label iterator skips the last
 			// label if it is empty.
 			if err == nil {
-				err = errEmptyLabel
+				err = &labelError{s, "A4"}
 			}
 			continue
 		}
@@ -211,20 +234,20 @@
 				labels.set(a)
 			}
 			n := len(label)
-			if !p.IgnoreDNSLength && err == nil && (n == 0 || n > 63) {
-				err = fmt.Errorf("idna: label with invalid length %d", n)
+			if p.VerifyDNSLength && err == nil && (n == 0 || n > 63) {
+				err = &labelError{label, "A4"}
 			}
 		}
 	}
 	s = labels.result()
-	if toASCII && !p.IgnoreDNSLength && err == nil {
+	if toASCII && p.VerifyDNSLength && err == nil {
 		// Compute the length of the domain name minus the root label and its dot.
 		n := len(s)
 		if n > 0 && s[n-1] == '.' {
 			n--
 		}
 		if len(s) < 1 || n > 253 {
-			err = fmt.Errorf("idna: doman name with invalid length %d", n)
+			err = &labelError{s, "A4"}
 		}
 	}
 	return s, err
@@ -320,12 +343,12 @@
 
 func (p *Profile) validateFromPunycode(s string) error {
 	if !norm.NFC.IsNormalString(s) {
-		return errors.New("idna: punycode is not normalized")
+		return &labelError{s, "V1"}
 	}
 	for i := 0; i < len(s); {
 		v, sz := trie.lookupString(s[i:])
 		if c := p.simplify(info(v).category()); c != valid && c != deviation {
-			return fmt.Errorf("idna: invalid character %+q in expanded punycode", s[i:i+sz])
+			return &labelError{s, "V6"}
 		}
 		i += sz
 	}
@@ -398,19 +421,19 @@
 // already implicitly satisfied by the overall implementation.
 func (p *Profile) validate(s string) error {
 	if len(s) > 4 && s[2] == '-' && s[3] == '-' {
-		return errors.New("idna: label starts with ??--")
+		return &labelError{s, "V2"}
 	}
 	if s[0] == '-' || s[len(s)-1] == '-' {
-		return errors.New("idna: label may not start or end with '-'")
+		return &labelError{s, "V3"}
 	}
 	// TODO: merge the use of this in the trie.
 	v, sz := trie.lookupString(s)
 	x := info(v)
 	if x.isModifier() {
-		return fmt.Errorf("idna: label starts with modifier %U", []rune(s[:sz])[0])
+		return &labelError{s, "V5"}
 	}
 	if !bidirule.ValidString(s) {
-		return errors.New("idna: label violates Bidi Rule")
+		return &labelError{s, "B"}
 	}
 	// Quickly return in the absence of zero-width (non) joiners.
 	if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 {
@@ -435,7 +458,7 @@
 		x = info(v)
 	}
 	if st == stateFAIL || st == stateAfter {
-		return errors.New("idna: label violates Context J rule")
+		return &labelError{s, "C"}
 	}
 	return nil
 }
diff --git a/internal/export/idna/idna_test.go b/internal/export/idna/idna_test.go
index b4ee138..72dbeb1 100644
--- a/internal/export/idna/idna_test.go
+++ b/internal/export/idna/idna_test.go
@@ -33,6 +33,125 @@
 	}
 }
 
+// doTest performs a single test f(input) and verifies that the output matches
+// out and that the returned error is expected. The errors string contains
+// all allowed error codes as categorized in
+// http://www.unicode.org/Public/idna/9.0.0/IdnaTest.txt:
+// P: Processing
+// V: Validity
+// A: to ASCII
+// B: Bidi
+// C: Context J
+func doTest(t *testing.T, f func(string) (string, error), name, input, want, errors string) {
+	errors = strings.Trim(errors, "[]")
+	test := "ok"
+	if errors != "" {
+		test = "err:" + errors
+	}
+	// Replace some of the escape sequences to make it easier to single out
+	// tests on the command name.
+	in := strings.Trim(strconv.QuoteToASCII(input), `"`)
+	in = strings.Replace(in, `\u`, "#", -1)
+	in = strings.Replace(in, `\U`, "#", -1)
+	name = fmt.Sprintf("%s/%s/%s", name, in, test)
+
+	testtext.Run(t, name, func(t *testing.T) {
+		got, err := f(input)
+
+		if err != nil {
+			code := err.(interface {
+				code() string
+			}).code()
+			if strings.Index(errors, code) == -1 {
+				t.Errorf("error %q not in set of expected errors {%v}", code, errors)
+			}
+		} else if errors != "" {
+			t.Errorf("no errors; want error in {%v}", errors)
+		}
+
+		if want != "" && got != want {
+			t.Errorf(`string: got %+q; want %+q`, got, want)
+		}
+	})
+}
+
+// TestLabelErrors tests strings returned in case of error. All results should
+// be identical to the reference implementation and can be verified at
+// http://unicode.org/cldr/utility/idna.jsp. The reference implementation,
+// however, seems to not display Bidi and ContextJ errors.
+//
+// In some cases the behavior of browsers is added as a comment. In all cases,
+// whenever a resolve search returns an error here, Chrome will treat the input
+// string as a search string (including those for Bidi and Context J errors),
+// unless noted otherwise.
+func TestLabelErrors(t *testing.T) {
+	encode := func(s string) string { s, _ = encode(acePrefix, s); return s }
+	type kind struct {
+		name string
+		f    func(string) (string, error)
+	}
+	resolve := kind{"ToASCII", Resolve.ToASCII}
+	display := kind{"ToUnicode", Display.ToUnicode}
+	testCases := []struct {
+		kind
+		input   string
+		want    string
+		wantErr string
+	}{
+		// Don't map U+2490 (DIGIT NINE FULL STOP). This is the behavior of
+		// Chrome, Safari, and IE. Firefox will first map ⒐ to 9. and return
+		// lab9.be.
+		{resolve, "lab⒐be", "xn--labbe-zh9b", "P1"}, // encode("lab⒐be")
+		{display, "lab⒐be", "lab⒐be", "P1"},
+
+		{resolve, "plan⒐faß.de", "xn--planfass-c31e.de", "P1"}, // encode("plan⒐fass") + ".de"
+		{display, "plan⒐faß.de", "plan⒐faß.de", "P1"},
+
+		// Chrome 54.0 recognizes the error and treats this input verbatim as a
+		// search string.
+		// Safari 10.0 (non-conform spec) decomposes "⒈" and computes the
+		// punycode on the result using transitional mapping.
+		// Firefox 49.0.1 goes haywire on this string and prints a bunch of what
+		// seems to be nested punycode encodings.
+		{resolve, "日本⒈co.ßßß.de", "xn--co-wuw5954azlb.ssssss.de", "P1"},
+		{display, "日本⒈co.ßßß.de", "日本⒈co.ßßß.de", "P1"},
+
+		{resolve, "a\u200Cb", "ab", ""},
+		{display, "a\u200Cb", "a\u200Cb", "C"},
+
+		{resolve, encode("a\u200Cb"), encode("a\u200Cb"), "C"},
+		{display, "a\u200Cb", "a\u200Cb", "C"},
+
+		{resolve, "grﻋﺮﺑﻲ.de", "xn--gr-gtd9a1b0g.de", "B"},
+		{
+			// Notice how the string gets transformed, even with an error.
+			// Chrome will use the original string if it finds an error, so not
+			// the transformed one.
+			display,
+			"gr\ufecb\ufeae\ufe91\ufef2.de",
+			"gr\u0639\u0631\u0628\u064a.de",
+			"B",
+		},
+
+		{resolve, "\u0671.\u03c3\u07dc", "xn--qib.xn--4xa21s", "B"}, // ٱ.σߜ
+		{display, "\u0671.\u03c3\u07dc", "\u0671.\u03c3\u07dc", "B"},
+
+		// normalize input
+		{resolve, "a\u0323\u0322", "xn--jta191l", ""}, // ạ̢
+		{display, "a\u0323\u0322", "\u1ea1\u0322", ""},
+
+		// Non-normalized strings are not normalized when they originate from
+		// punycode. Despite the error, Chrome, Safari and Firefox will attempt
+		// to look up the input punycode.
+		{resolve, encode("a\u0323\u0322") + ".com", "xn--a-tdbc.com", "V1"},
+		{display, encode("a\u0323\u0322") + ".com", "a\u0323\u0322.com", "V1"},
+	}
+
+	for _, tc := range testCases {
+		doTest(t, tc.f, tc.name, tc.input, tc.want, tc.wantErr)
+	}
+}
+
 func TestConformance(t *testing.T) {
 	testtext.SkipIfNotLong(t)
 
@@ -46,6 +165,13 @@
 			section = strings.ToLower(strings.Split(s, " ")[0])
 		}
 	}))
+	transitional := &Profile{
+		Transitional:    true,
+		VerifyDNSLength: true,
+	}
+	nonTransitional := &Profile{
+		VerifyDNSLength: true,
+	}
 	for p.Next() {
 		started = true
 
@@ -53,12 +179,12 @@
 		profiles := []*Profile{}
 		switch p.String(0) {
 		case "T":
-			profiles = append(profiles, Transitional)
+			profiles = append(profiles, transitional)
 		case "N":
-			profiles = append(profiles, NonTransitional)
+			profiles = append(profiles, nonTransitional)
 		case "B":
-			profiles = append(profiles, Transitional)
-			profiles = append(profiles, NonTransitional)
+			profiles = append(profiles, transitional)
+			profiles = append(profiles, nonTransitional)
 		}
 
 		src := unescape(p.String(1))
@@ -71,48 +197,24 @@
 		if wantToASCII == "" {
 			wantToASCII = wantToUnicode
 		}
-		test := "err:"
+		wantErrToUnicode := ""
 		if strings.HasPrefix(wantToUnicode, "[") {
-			test += strings.Replace(strings.Trim(wantToUnicode, "[]"), " ", "", -1)
+			wantErrToUnicode = wantToUnicode
+			wantToUnicode = ""
 		}
+		wantErrToASCII := ""
 		if strings.HasPrefix(wantToASCII, "[") {
-			test += strings.Replace(strings.Trim(wantToASCII, "[]"), " ", "", -1)
-		}
-		if test == "err:" {
-			test = "ok"
+			wantErrToASCII = wantToASCII
+			wantToASCII = ""
 		}
 
 		// TODO: also do IDNA tests.
 		// invalidInIDNA2008 := p.String(4) == "NV8"
 
 		for _, p := range profiles {
-			testtext.Run(t, fmt.Sprintf("%s:%s/%s/%+q", section, test, p, src), func(t *testing.T) {
-				got, err := p.ToUnicode(src)
-				wantErr := strings.HasPrefix(wantToUnicode, "[")
-				gotErr := err != nil
-				if wantErr {
-					if gotErr != wantErr {
-						t.Errorf(`ToUnicode:err got %v; want %v (%s)`,
-							gotErr, wantErr, wantToUnicode)
-					}
-				} else if got != wantToUnicode || gotErr != wantErr {
-					t.Errorf(`ToUnicode: got %+q, %v (%v); want %+q, %v`,
-						got, gotErr, err, wantToUnicode, wantErr)
-				}
-
-				got, err = p.ToASCII(src)
-				wantErr = strings.HasPrefix(wantToASCII, "[")
-				gotErr = err != nil
-				if wantErr {
-					if gotErr != wantErr {
-						t.Errorf(`ToASCII:err got %v; want %v (%s)`,
-							gotErr, wantErr, wantToASCII)
-					}
-				} else if got != wantToASCII || gotErr != wantErr {
-					t.Errorf(`ToASCII: got %+q, %v (%v); want %+q, %v`,
-						got, gotErr, err, wantToASCII, wantErr)
-				}
-			})
+			name := fmt.Sprintf("%s:%s", section, p)
+			doTest(t, p.ToUnicode, name+":ToUnicode", src, wantToUnicode, wantErrToUnicode)
+			doTest(t, p.ToASCII, name+":ToASCII", src, wantToASCII, wantErrToASCII)
 		}
 	}
 }
diff --git a/internal/export/idna/punycode.go b/internal/export/idna/punycode.go
index 026dc79..f0cbd48 100644
--- a/internal/export/idna/punycode.go
+++ b/internal/export/idna/punycode.go
@@ -7,7 +7,6 @@
 // This file implements the Punycode algorithm from RFC 3492.
 
 import (
-	"fmt"
 	"math"
 	"strings"
 	"unicode/utf8"
@@ -27,6 +26,8 @@
 	tmin        int32 = 1
 )
 
+func punyError(s string) error { return &labelError{s, "A3"} }
+
 // decode decodes a string as specified in section 6.2.
 func decode(encoded string) (string, error) {
 	if encoded == "" {
@@ -34,7 +35,7 @@
 	}
 	pos := 1 + strings.LastIndex(encoded, "-")
 	if pos == 1 {
-		return "", fmt.Errorf("idna: invalid label %q", encoded)
+		return "", punyError(encoded)
 	}
 	if pos == len(encoded) {
 		return encoded[:len(encoded)-1], nil
@@ -50,16 +51,16 @@
 		oldI, w := i, int32(1)
 		for k := base; ; k += base {
 			if pos == len(encoded) {
-				return "", fmt.Errorf("idna: invalid label %q", encoded)
+				return "", punyError(encoded)
 			}
 			digit, ok := decodeDigit(encoded[pos])
 			if !ok {
-				return "", fmt.Errorf("idna: invalid label %q", encoded)
+				return "", punyError(encoded)
 			}
 			pos++
 			i += digit * w
 			if i < 0 {
-				return "", fmt.Errorf("idna: invalid label %q", encoded)
+				return "", punyError(encoded)
 			}
 			t := k - bias
 			if t < tmin {
@@ -72,7 +73,7 @@
 			}
 			w *= base - t
 			if w >= math.MaxInt32/base {
-				return "", fmt.Errorf("idna: invalid label %q", encoded)
+				return "", punyError(encoded)
 			}
 		}
 		x := int32(len(output) + 1)
@@ -80,7 +81,7 @@
 		n += i / x
 		i %= x
 		if n > utf8.MaxRune || len(output) >= 1024 {
-			return "", fmt.Errorf("idna: invalid label %q", encoded)
+			return "", punyError(encoded)
 		}
 		output = append(output, 0)
 		copy(output[i+1:], output[i:])
@@ -121,14 +122,14 @@
 		}
 		delta += (m - n) * (h + 1)
 		if delta < 0 {
-			return "", fmt.Errorf("idna: invalid label %q", s)
+			return "", punyError(s)
 		}
 		n = m
 		for _, r := range s {
 			if r < n {
 				delta++
 				if delta < 0 {
-					return "", fmt.Errorf("idna: invalid label %q", s)
+					return "", punyError(s)
 				}
 				continue
 			}