misc/ios: retry loop to handle builder flakiness

After moving the darwin/arm builder to new hardware, several new flaky
error messages appeared. These messages gave enough detail to search
for online, and the results make it clear that iOS build systems have
been flaky for many years, and that this is unlikely to change any
time soon.

However, all of the pain of lldb and using a breakpoint early in
program initialization gives us an advantage: all install and
initialization flakiness appears to happen before the Go program ever
gets going. So if we see an error or we time out before we reach our
breakpoint (before any test code has executed), we can assume it is
the fault of the builder and restart without risking hiding a flaky
Go test.

This code has successfully processed the last 8 builds. I am hopeful.

Change-Id: Ide24aaae4fa7bdab9d8f4432bb85d8f2256c7606
Reviewed-on: https://go-review.googlesource.com/8241
Reviewed-by: Hyang-Ah Hana Kim <hyangah@gmail.com>
diff --git a/misc/ios/go_darwin_arm_exec.go b/misc/ios/go_darwin_arm_exec.go
index 431ddcc..4495f52 100644
--- a/misc/ios/go_darwin_arm_exec.go
+++ b/misc/ios/go_darwin_arm_exec.go
@@ -26,6 +26,10 @@
 
 const debug = false
 
+var errRetry = errors.New("failed to start test harness (retry attempted)")
+
+var tmpdir string
+
 func main() {
 	log.SetFlags(0)
 	log.SetPrefix("go_darwin_arm_exec: ")
@@ -36,39 +40,39 @@
 		log.Fatal("usage: go_darwin_arm_exec a.out")
 	}
 
-	if err := run(os.Args[1], os.Args[2:]); err != nil {
+	var err error
+	tmpdir, err = ioutil.TempDir("", "go_darwin_arm_exec_")
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Approximately 1 in 100 binaries fail to start. If it happens,
+	// try again. These failures happen for several reasons beyond
+	// our control, but all of them are safe to retry as they happen
+	// before lldb encounters the initial getwd breakpoint. As we
+	// know the tests haven't started, we are not hiding flaky tests
+	// with this retry.
+	for i := 0; i < 5; i++ {
+		if i > 0 {
+			fmt.Fprintln(os.Stderr, "start timeout, trying again")
+		}
+		err = run(os.Args[1], os.Args[2:])
+		if err == nil || err != errRetry {
+			break
+		}
+	}
+	if !debug {
+		os.RemoveAll(tmpdir)
+	}
+	if err != nil {
 		fmt.Fprintf(os.Stderr, "go_darwin_arm_exec: %v\n", err)
 		os.Exit(1)
 	}
 }
 
 func run(bin string, args []string) (err error) {
-	type waitPanic struct {
-		err error
-	}
-	defer func() {
-		if r := recover(); r != nil {
-			if w, ok := r.(waitPanic); ok {
-				err = w.err
-				return
-			}
-			panic(r)
-		}
-	}()
-
-	defer exec.Command("killall", "ios-deploy").Run() // cleanup
-
-	exec.Command("killall", "ios-deploy").Run()
-
-	tmpdir, err := ioutil.TempDir("", "go_darwin_arm_exec_")
-	if err != nil {
-		log.Fatal(err)
-	}
-	if !debug {
-		defer os.RemoveAll(tmpdir)
-	}
-
 	appdir := filepath.Join(tmpdir, "gotest.app")
+	os.RemoveAll(appdir)
 	if err := os.MkdirAll(appdir, 0755); err != nil {
 		return err
 	}
@@ -109,9 +113,31 @@
 		return fmt.Errorf("codesign: %v", err)
 	}
 
-	if err := os.Chdir(tmpdir); err != nil {
+	oldwd, err := os.Getwd()
+	if err != nil {
 		return err
 	}
+	if err := os.Chdir(filepath.Join(appdir, "..")); err != nil {
+		return err
+	}
+	defer os.Chdir(oldwd)
+
+	type waitPanic struct {
+		err error
+	}
+	defer func() {
+		if r := recover(); r != nil {
+			if w, ok := r.(waitPanic); ok {
+				err = w.err
+				return
+			}
+			panic(r)
+		}
+	}()
+
+	defer exec.Command("killall", "ios-deploy").Run() // cleanup
+
+	exec.Command("killall", "ios-deploy").Run()
 
 	// ios-deploy invokes lldb to give us a shell session with the app.
 	cmd = exec.Command(
@@ -175,11 +201,11 @@
 			w.printBuf()
 			return fmt.Errorf("failed (stage %s): %v", stage, err)
 		case i := <-w.find(str, timeout):
-			if i >= 0 {
-				w.clearTo(i + len(str))
-			} else {
-				log.Printf("timed out on stage %s, continuing", stage)
+			if i < 0 {
+				log.Printf("timed out on stage %q, retrying", stage)
+				return errRetry
 			}
+			w.clearTo(i + len(str))
 			return nil
 		}
 	}
@@ -192,7 +218,11 @@
 
 	// Wait for installation and connection.
 	if err := waitFor("ios-deploy before run", "(lldb)     connect\r\nProcess 0 connected\r\n", 0); err != nil {
-		return err
+		// Retry if we see a rare and longstanding ios-deploy bug.
+		// https://github.com/phonegap/ios-deploy/issues/11
+		//	Assertion failed: (AMDeviceStartService(device, CFSTR("com.apple.debugserver"), &gdbfd, NULL) == 0)
+		log.Printf("%v, retrying", err)
+		return errRetry
 	}
 
 	// Script LLDB. Oh dear.
@@ -205,9 +235,21 @@
 	do(`breakpoint set -n getwd`) // in runtime/cgo/gcc_darwin_arm.go
 
 	fmt.Fprintln(lldb, `run`)
-	// Sometimes we don't see "reason = breakpoint", so we time out
-	// and try to continue.
-	if err := waitFor("br getwd", "stop reason = breakpoint", 10*time.Second); err != nil {
+	if err := waitFor("br getwd", "stop reason = breakpoint", 20*time.Second); err != nil {
+		// At this point we see several flaky errors from the iOS
+		// build infrastructure. The most common is never reaching
+		// the breakpoint, which we catch with a timeout. Very
+		// occasionally lldb can produce errors like:
+		//
+		//	Breakpoint 1: no locations (pending).
+		//	WARNING:  Unable to resolve breakpoint to any actual locations.
+		//
+		// As no actual test code has been executed by this point,
+		// we treat all errors as recoverable.
+		if err != errRetry {
+			log.Printf("%v, retrying", err)
+			err = errRetry
+		}
 		return err
 	}
 	if err := waitFor("br getwd prompt", "(lldb)", 0); err != nil {