acme: automatically retry on badNonce errors

After receiving a badNonce error, the call can be safely retried. Nonce
errors can happen unexpectedly based on an unknown expiration date or
server-side changes. Rather than force the caller to handle these errors,
retryPostJWS will keep retrying until success or a different error.

According to the spec, the error returned should be
"urn:ietf:params:acme:error:badNonce", but the error that Let's Encrypt
returns is "urn:acme:error:badNonce" so we just check the suffix.

Fixes golang/go#19703

Change-Id: Id15012dff91e51d28ed8bc54f13a6212186cb7df
Reviewed-on: https://go-review.googlesource.com/40130
Run-TryBot: Alex Vaghin <ddos@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Alex Vaghin <ddos@google.com>
diff --git a/acme/acme.go b/acme/acme.go
index 140d422..d650604 100644
--- a/acme/acme.go
+++ b/acme/acme.go
@@ -198,7 +198,7 @@
 		req.NotAfter = now.Add(exp).Format(time.RFC3339)
 	}
 
-	res, err := c.postJWS(ctx, c.Key, c.dir.CertURL, req)
+	res, err := c.retryPostJWS(ctx, c.Key, c.dir.CertURL, req)
 	if err != nil {
 		return nil, "", err
 	}
@@ -273,7 +273,7 @@
 	if key == nil {
 		key = c.Key
 	}
-	res, err := c.postJWS(ctx, key, c.dir.RevokeURL, body)
+	res, err := c.retryPostJWS(ctx, key, c.dir.RevokeURL, body)
 	if err != nil {
 		return err
 	}
@@ -361,7 +361,7 @@
 		Resource:   "new-authz",
 		Identifier: authzID{Type: "dns", Value: domain},
 	}
-	res, err := c.postJWS(ctx, c.Key, c.dir.AuthzURL, req)
+	res, err := c.retryPostJWS(ctx, c.Key, c.dir.AuthzURL, req)
 	if err != nil {
 		return nil, err
 	}
@@ -419,7 +419,7 @@
 		Status:   "deactivated",
 		Delete:   true,
 	}
-	res, err := c.postJWS(ctx, c.Key, url, req)
+	res, err := c.retryPostJWS(ctx, c.Key, url, req)
 	if err != nil {
 		return err
 	}
@@ -438,21 +438,7 @@
 // In all other cases WaitAuthorization returns an error.
 // If the Status is StatusInvalid, the returned error is ErrAuthorizationFailed.
 func (c *Client) WaitAuthorization(ctx context.Context, url string) (*Authorization, error) {
-	var count int
-	sleep := func(v string, inc int) error {
-		count += inc
-		d := backoff(count, 10*time.Second)
-		d = retryAfter(v, d)
-		wakeup := time.NewTimer(d)
-		defer wakeup.Stop()
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		case <-wakeup.C:
-			return nil
-		}
-	}
-
+	sleep := sleeper(ctx)
 	for {
 		res, err := c.get(ctx, url)
 		if err != nil {
@@ -525,7 +511,7 @@
 		Type:     chal.Type,
 		Auth:     auth,
 	}
-	res, err := c.postJWS(ctx, c.Key, chal.URI, req)
+	res, err := c.retryPostJWS(ctx, c.Key, chal.URI, req)
 	if err != nil {
 		return nil, err
 	}
@@ -658,7 +644,7 @@
 		req.Contact = acct.Contact
 		req.Agreement = acct.AgreedTerms
 	}
-	res, err := c.postJWS(ctx, c.Key, url, req)
+	res, err := c.retryPostJWS(ctx, c.Key, url, req)
 	if err != nil {
 		return nil, err
 	}
@@ -695,6 +681,40 @@
 	}, nil
 }
 
+// retryPostJWS calls postJWS in a loop, retrying whenever the server answers
+// with a badNonce error; stored nonces are cleared and sleep runs before each retry.
+// Every 4XX-5XX response is passed to responseError and its body is closed;
+// errors other than badNonce are returned to the caller with a nil response.
+func (c *Client) retryPostJWS(ctx context.Context, key crypto.Signer, url string, body interface{}) (*http.Response, error) {
+	sleep := sleeper(ctx)
+	for {
+		res, err := c.postJWS(ctx, key, url, body)
+		if err != nil {
+			return nil, err
+		}
+		// Convert any 4XX-5XX response into an error via responseError.
+		if res.StatusCode >= 400 && res.StatusCode <= 599 {
+			err := responseError(res)
+			res.Body.Close()
+			// The spec names this error urn:ietf:params:acme:error:badNonce,
+			// but ACME servers in the wild use other prefixes, so match the suffix only.
+			// https://tools.ietf.org/html/draft-ietf-acme-acme-02#section-5.4
+			if ae, ok := err.(*Error); ok && strings.HasSuffix(strings.ToLower(ae.ProblemType), ":badnonce") {
+				// Drop every stored nonce, since any of them might now be
+				// considered bad by the server.
+				c.clearNonces()
+				retry := res.Header.Get("retry-after")
+				if err := sleep(retry, 1); err != nil {
+					return nil, err
+				}
+				continue
+			}
+			return nil, err
+		}
+		return res, nil
+	}
+}
+
 // postJWS signs the body with the given key and POSTs it to the provided url.
 // The body argument must be JSON-serializable.
 func (c *Client) postJWS(ctx context.Context, key crypto.Signer, url string, body interface{}) (*http.Response, error) {
@@ -730,6 +750,13 @@
 	return nonce, nil
 }
 
+// clearNonces discards all stored nonces by replacing c.nonces with an empty map while holding c.noncesMu.
+func (c *Client) clearNonces() {
+	c.noncesMu.Lock()
+	defer c.noncesMu.Unlock()
+	c.nonces = make(map[string]struct{})
+}
+
 // addNonce stores a nonce value found in h (if any) for future use.
 func (c *Client) addNonce(h http.Header) {
 	v := nonceFromHeader(h)
@@ -941,6 +968,28 @@
 	return links
 }
 
+// sleeper returns a function that sleeps between retries until ctx is done.
+// The returned function takes ra, a Retry-After HTTP header value, and inc,
+// which is added to a call counter shared across invocations. The counter is
+// fed to backoff with a 10 second maximum, and that result is the fallback
+// passed to retryAfter along with ra. It returns ctx.Err() if ctx ends first.
+func sleeper(ctx context.Context) func(ra string, inc int) error {
+	var count int
+	return func(ra string, inc int) error {
+		count += inc
+		d := backoff(count, 10*time.Second)
+		d = retryAfter(ra, d)
+		wakeup := time.NewTimer(d)
+		defer wakeup.Stop()
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-wakeup.C:
+			return nil
+		}
+	}
+}
+
 // retryAfter parses a Retry-After HTTP header value,
 // trying to convert v into an int (seconds) or use http.ParseTime otherwise.
 // It returns d if v cannot be parsed.
diff --git a/acme/acme_test.go b/acme/acme_test.go
index b91533d..0210ce3 100644
--- a/acme/acme_test.go
+++ b/acme/acme_test.go
@@ -1065,6 +1065,44 @@
 	}
 }
 
+func TestRetryPostJWS(t *testing.T) {
+	var count int
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		count++
+		w.Header().Set("replay-nonce", fmt.Sprintf("nonce%d", count))
+		if r.Method == "HEAD" {
+			// The client is expected to make 2 HEAD requests to fetch nonces:
+			// one before the first POST and another after the badNonce error.
+			return
+		}
+
+		head, err := decodeJWSHead(r)
+		if err != nil {
+			t.Errorf("decodeJWSHead: %v", err)
+		} else if head.Nonce == "" {
+			t.Error("head.Nonce is empty")
+		} else if head.Nonce == "nonce1" {
+			// Reject the first nonce with a badNonce error to force a retry.
+			w.WriteHeader(http.StatusBadRequest)
+			w.Write([]byte(`{"type":"urn:ietf:params:acme:error:badNonce"}`))
+			return
+		}
+		// Keep client.Authorize happy; its result is not what this test checks.
+		w.WriteHeader(http.StatusCreated)
+		w.Write([]byte(`{"status":"valid"}`))
+	}))
+	defer ts.Close()
+
+	client := Client{Key: testKey, dir: &Directory{AuthzURL: ts.URL}}
+	// The first POST is answered with badNonce; Authorize should succeed after the retry.
+	if _, err := client.Authorize(context.Background(), "example.com"); err != nil {
+		t.Errorf("client.Authorize 1: %v", err)
+	}
+	if count != 4 {
+		t.Errorf("total requests count: %d; want 4", count)
+	}
+}
+
 func TestLinkHeader(t *testing.T) {
 	h := http.Header{"Link": {
 		`<https://example.com/acme/new-authz>;rel="next"`,