blob: 8acc08c952dc2e78ac1e1620272fa1879d5ab623 [file] [log] [blame]
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Locktrigger “locks” a given build trigger, making sure that
// the currently running build is the only trigger running.
//
// Usage:
//
// locktrigger -project=$PROJECT_ID -build=$BUILD_ID
//
// The $PROJECT_ID and $BUILD_ID are typically written literally in cloudbuild.yaml
// and then substituted by Cloud Build.
//
// When a project uses “continuous deployment powered by Cloud Build”,
// the deployment is a little bit too continuous: when multiple commits
// land in a short time window, Cloud Build will run all the triggered
// build jobs in parallel. If each job does “gcloud app deploy”, there
// is no guarantee which will win: perhaps an older commit will complete
// last, resulting in the newest commit not actually being the final
// deployed version of the site. This should probably be fixed in
// “continuous deployment powered by Cloud Build”, but until then,
// locktrigger works around the problem.
//
// All triggered builds must run locktrigger to guarantee mutual exclusion.
// When there is contention—that is, when multiple builds are running and
// they all run locktrigger—the build corresponding to the newest commit
// is permitted to continue running, and older builds are canceled.
//
// When locktrigger exits successfully, then, at that moment, the current
// build is (or recently was) the only running build for its trigger.
// Of course, another build may start immediately after locktrigger exits.
// As long as that build also runs locktrigger, then either it will cancel
// itself (if it is older than we are), or it will cancel us before proceeding
// (if we are older than it is).
package main
import (
"bytes"
"context"
"flag"
"fmt"
"log"
"os"
"os/exec"
"strings"
"time"
cloudbuild "cloud.google.com/go/cloudbuild/apiv1/v2"
"google.golang.org/api/iterator"
cloudbuildpb "google.golang.org/genproto/googleapis/devtools/cloudbuild/v1"
)
var (
project = flag.String("project", "", "GCP project `name` (required)")
build = flag.String("build", "", "GCP build `id` (required)")
repo = flag.String("repo", "", "`URL` of repository (required)")
)
func usage() {
fmt.Fprintf(os.Stderr, "usage: locktrigger -project=name -build=id -repo=URL\n")
os.Exit(2)
}
func main() {
flag.Usage = usage
flag.Parse()
log.SetPrefix("locktrigger: ")
log.SetFlags(0)
if *project == "" || *build == "" || *repo == "" {
usage()
}
ctx := context.Background()
c, err := cloudbuild.NewClient(ctx)
if err != nil {
log.Fatal(err)
}
defer c.Close()
// Find commit hash of local Git
myHash := run("git", "rev-parse", "HEAD")
log.Printf("my hash: %v", myHash)
// Find build object for current build, check that it matches.
self := getBuild(c, ctx, *build)
if hash := self.Substitutions["COMMIT_SHA"]; hash != myHash {
log.Fatalf("build COMMIT_SHA does not match local hash: %v != %v", hash, myHash)
}
log.Printf("my build: %v", self.Id)
if self.BuildTriggerId == "" {
log.Fatalf("build has no trigger ID")
}
log.Printf("my trigger: %v", self.BuildTriggerId)
// List all builds for our trigger that are still running.
req := &cloudbuildpb.ListBuildsRequest{
ProjectId: *project,
// Note: Really want "status=WORKING buildTriggerId="+self.BuildTriggerId,
// but that fails with an InvalidArgument error for unknown reasons.
// status=WORKING will narrow the list down to something reasonable,
// and we filter the unrelated triggers below.
Filter: "status=WORKING",
}
it := c.ListBuilds(ctx, req)
foundSelf := false
shallow := false
if _, err := os.Stat(run("git", "rev-parse", "--git-dir") + "/shallow"); err == nil {
shallow = true
}
for {
b, err := it.Next()
if err == iterator.Done {
break
}
if err != nil {
log.Fatalf("reading builds: %v (%q)", err, req.Filter)
}
if b.BuildTriggerId != self.BuildTriggerId {
continue
}
// Check whether this build is an older or newer commit.
// If this build is older, cancel it.
// If this build is newer, cancel ourselves.
if b.Id == self.Id {
foundSelf = true
continue
}
hash := b.Substitutions["COMMIT_SHA"]
if hash == "" {
log.Fatalf("cannot find COMMIT_SHA for build %v", b.Id)
}
if hash == myHash {
log.Fatalf("found another build %v at same commit %v", b.Id, hash)
}
// Fetch the full Git repo so we can answer the history questions.
// This is delayed until now to avoid the expense of fetching the full repo
// if we are the only build that is running.
if shallow {
log.Printf("git fetch --unshallow")
run("git", "fetch", "--unshallow", *repo)
shallow = false
}
// Contention.
// Find the common ancestor between us and that build,
// to tell whether we're older, it's older, or we're unrelated.
log.Printf("checking %v", hash)
switch run("git", "merge-base", myHash, hash) {
default:
log.Fatalf("unexpected build for unrelated commit %v", hash)
case myHash:
// myHash is older than b's hash. Cancel self.
log.Printf("canceling self, for build %v commit %v", b.Id, hash)
cancel(c, ctx, self.Id)
case hash:
// b's hash is older than myHash. Cancel b.
log.Printf("canceling build %v commit %v", b.Id, hash)
cancel(c, ctx, b.Id)
}
}
// If we listed all the in-progress builds, we should have seen ourselves.
if !foundSelf {
log.Fatalf("reading builds: didn't find self")
}
}
// getBuild returns the build info for the build with the given id.
func getBuild(c *cloudbuild.Client, ctx context.Context, id string) *cloudbuildpb.Build {
req := &cloudbuildpb.GetBuildRequest{
ProjectId: *project,
Id: id,
}
b, err := c.GetBuild(ctx, req)
if err != nil {
log.Fatalf("getbuild %v: %v", id, err)
}
return b
}
// cancel cancels the build with the given id.
func cancel(c *cloudbuild.Client, ctx context.Context, id string) {
req := &cloudbuildpb.CancelBuildRequest{
ProjectId: *project,
Id: id,
}
_, err := c.CancelBuild(ctx, req)
if err != nil {
// Not Fatal: maybe cancel failed because the build exited.
// Waiting for it to stop running below will take care of that case.
log.Printf("cancel %v: %v", id, err)
}
// Wait for build to report being stopped,
// in case cancel only queues the cancellation and doesn't actually wait,
// or in case cancel failed.
// Willing to wait a few minutes.
now := time.Now()
for time.Since(now) < 3*time.Minute {
b := getBuild(c, ctx, id)
if b.Status != cloudbuildpb.Build_WORKING {
log.Printf("canceled %v: now %v", id, b.Status)
return
}
time.Sleep(10 * time.Second)
}
log.Fatalf("cancel %v: did not stop", id)
}
// run runs the given command line and returns the standard output, with spaces trimmed.
func run(args ...string) string {
var stdout, stderr bytes.Buffer
cmd := exec.Command(args[0], args[1:]...)
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
log.Fatalf("exec %v: %v\n%s%s", args, err, stdout.String(), stderr.String())
}
return strings.TrimSpace(stdout.String())
}