| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build linux || darwin |
| // +build linux darwin |
| |
| // Code interacting with Google Compute Engine (GCE) and |
| // a GCE implementation of the BuildletPool interface. |
| |
| package pool |
| |
| import ( |
| "context" |
| "encoding/json" |
| "errors" |
| "fmt" |
| "io" |
| "io/ioutil" |
| "log" |
| "net/http" |
| "os" |
| "path" |
| "sort" |
| "strconv" |
| "strings" |
| "sync" |
| "sync/atomic" |
| "time" |
| |
| "cloud.google.com/go/compute/metadata" |
| "cloud.google.com/go/datastore" |
| "cloud.google.com/go/errorreporting" |
| "cloud.google.com/go/storage" |
| "golang.org/x/build/buildenv" |
| "golang.org/x/build/buildlet" |
| "golang.org/x/build/dashboard" |
| "golang.org/x/build/gerrit" |
| "golang.org/x/build/internal/buildgo" |
| "golang.org/x/build/internal/buildstats" |
| "golang.org/x/build/internal/lru" |
| "golang.org/x/build/internal/secret" |
| "golang.org/x/build/internal/spanlog" |
| "golang.org/x/oauth2" |
| "golang.org/x/oauth2/google" |
| "google.golang.org/api/compute/v1" |
| "google.golang.org/api/googleapi" |
| ) |
| |
| func init() { |
| buildlet.GCEGate = gceAPIGate |
| } |
| |
| // apiCallTicker ticks regularly, preventing us from accidentally making |
| // GCE API calls too quickly. Our quota is 20 QPS, but we temporarily |
| // limit ourselves to less than that. |
| var apiCallTicker = time.NewTicker(time.Second / 10) |
| |
| func gceAPIGate() { |
| <-apiCallTicker.C |
| } |
| |
| // Initialized by InitGCE: |
| // |
| // TODO(golang.org/issue/38337): These should be moved into a struct as |
| // part of the effort to reduce package level variables. |
| var ( |
| buildEnv *buildenv.Environment |
| |
| // dsClient is a datastore client for the build project (symbolic-datum-552), where build progress is stored. |
| dsClient *datastore.Client |
| // goDSClient is a datastore client for golang-org, where build status is stored. |
| goDSClient *datastore.Client |
| // oAuthHTTPClient is the OAuth2 HTTP client used to make API calls to Google Cloud APIs. |
| oAuthHTTPClient *http.Client |
| computeService *compute.Service |
| gcpCreds *google.Credentials |
| errTryDeps error // non-nil if try bots are disabled |
| gerritClient *gerrit.Client |
| storageClient *storage.Client |
| inStaging bool // are we running in the staging project? (named -dev) |
| errorsClient *errorreporting.Client // Stackdriver errors client |
| gkeNodeHostname string |
| |
| // values created due to separating the buildlet pools into a separate package |
| gceMode string |
| deleteTimeout time.Duration |
| testFiles map[string]string |
| basePinErr *atomic.Value |
| isGCERemoteBuildlet IsRemoteBuildletFunc |
| ) |
| |
| // InitGCE initializes the GCE buildlet pool. |
| func InitGCE(sc *secret.Client, vmDeleteTimeout time.Duration, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error { |
| gceMode = mode |
| deleteTimeout = vmDeleteTimeout |
| testFiles = tFiles |
| basePinErr = basePin |
| isGCERemoteBuildlet = fn |
| |
| ctx := context.Background() |
| var err error |
| |
| // If the coordinator is running on a GCE instance and a |
| // buildEnv was not specified with the env flag, set the |
| // buildEnvName to the project ID |
| if buildEnvName == "" { |
| if mode == "dev" { |
| buildEnvName = "dev" |
| } else if metadata.OnGCE() { |
| buildEnvName, err = metadata.ProjectID() |
| if err != nil { |
| log.Fatalf("metadata.ProjectID: %v", err) |
| } |
| } |
| } |
| |
| buildEnv = buildenv.ByProjectID(buildEnvName) |
| inStaging = buildEnv == buildenv.Staging |
| |
| // If running on GCE, override the zone and static IP, and check service account permissions. |
| if metadata.OnGCE() { |
| gkeNodeHostname, err = metadata.Get("instance/hostname") |
| if err != nil { |
| return fmt.Errorf("failed to get current instance hostname: %v", err) |
| } |
| |
| if buildEnv.KubeBuild.Zone == "" { |
| projectZone, err := metadata.Get("instance/zone") |
| if err != nil || projectZone == "" { |
| return fmt.Errorf("failed to get current GCE zone: %v", err) |
| } |
| // Convert the zone from "projects/1234/zones/us-central1-a" to "us-central1-a". |
| projectZone = path.Base(projectZone) |
| buildEnv.KubeBuild.Zone = projectZone |
| } |
| |
| if buildEnv.StaticIP == "" { |
| buildEnv.StaticIP, err = metadata.ExternalIP() |
| if err != nil { |
| return fmt.Errorf("ExternalIP: %v", err) |
| } |
| } |
| |
| if !hasComputeScope() { |
| return errors.New("coordinator is not running with access to read and write Compute resources. VM support disabled") |
| } |
| } |
| |
| cfgDump, _ := json.MarshalIndent(buildEnv, "", " ") |
| log.Printf("Loaded configuration %q for project %q:\n%s", buildEnvName, buildEnv.ProjectName, cfgDump) |
| |
| if mode != "dev" { |
| storageClient, err = storage.NewClient(ctx) |
| if err != nil { |
| log.Fatalf("storage.NewClient: %v", err) |
| } |
| } |
| |
| dsClient, err = datastore.NewClient(ctx, buildEnv.ProjectName) |
| if err != nil { |
| if mode == "dev" { |
| log.Printf("Error creating datastore client for %q: %v", buildEnv.ProjectName, err) |
| } else { |
| log.Fatalf("Error creating datastore client for %q: %v", buildEnv.ProjectName, err) |
| } |
| } |
| goDSClient, err = datastore.NewClient(ctx, buildEnv.GoProjectName) |
| if err != nil { |
| if mode == "dev" { |
| log.Printf("Error creating datastore client for %q: %v", buildEnv.GoProjectName, err) |
| } else { |
| log.Fatalf("Error creating datastore client for %q: %v", buildEnv.GoProjectName, err) |
| } |
| } |
| |
| // don't send dev errors to Stackdriver. |
| if mode != "dev" { |
| errorsClient, err = errorreporting.NewClient(ctx, buildEnv.ProjectName, errorreporting.Config{ |
| ServiceName: "coordinator", |
| }) |
| if err != nil { |
| // don't exit, we still want to run coordinator |
| log.Printf("Error creating errors client: %v", err) |
| } |
| } |
| |
| gcpCreds, err = buildEnv.Credentials(ctx) |
| if err != nil { |
| if mode == "dev" { |
| // don't try to do anything else with GCE, as it will likely fail |
| return nil |
| } |
| log.Fatalf("failed to get a token source: %v", err) |
| } |
| oAuthHTTPClient = oauth2.NewClient(ctx, gcpCreds.TokenSource) |
| computeService, _ = compute.New(oAuthHTTPClient) |
| errTryDeps = checkTryBuildDeps(ctx, sc) |
| if errTryDeps != nil { |
| log.Printf("TryBot builders disabled due to error: %v", errTryDeps) |
| } else { |
| log.Printf("TryBot builders enabled.") |
| } |
| |
| if mode != "dev" && metadata.OnGCE() && (buildEnv == buildenv.Production || buildEnv == buildenv.Staging) { |
| go syncBuildStatsLoop(buildEnv) |
| go gcePool.pollQuotaLoop() |
| go createBasepinDisks(ctx) |
| } |
| |
| return nil |
| } |
| |
| // TODO(golang.org/issue/38337): These should be moved into a struct as |
| // part of the effort to reduce package level variables. |
| |
| // GCEConfiguration manages and contains all of the GCE configuration. |
| type GCEConfiguration struct{} |
| |
| // NewGCEConfiguration creates a new GCEConfiguration. |
| func NewGCEConfiguration() *GCEConfiguration { return &GCEConfiguration{} } |
| |
| // StorageClient retrieves the GCE storage client. |
| func (c *GCEConfiguration) StorageClient() *storage.Client { |
| return storageClient |
| } |
| |
| // BuildEnv retrieves the GCE build env. |
| func (c *GCEConfiguration) BuildEnv() *buildenv.Environment { |
| return buildEnv |
| } |
| |
| // SetBuildEnv sets the GCE build env. This is primarily reserved for |
| // testing purposes. |
| func (c *GCEConfiguration) SetBuildEnv(b *buildenv.Environment) { |
| buildEnv = b |
| } |
| |
| // BuildletPool retrieves the GCE buildlet pool. |
| func (c *GCEConfiguration) BuildletPool() *GCEBuildlet { |
| return gcePool |
| } |
| |
| // InStaging returns a boolean denoting if the environment is staging. |
| func (c *GCEConfiguration) InStaging() bool { |
| return inStaging |
| } |
| |
| // GerritClient retrieves a gerrit client. |
| func (c *GCEConfiguration) GerritClient() *gerrit.Client { |
| return gerritClient |
| } |
| |
| // GKENodeHostname retrieves the GKE node hostname. |
| func (c *GCEConfiguration) GKENodeHostname() string { |
| return gkeNodeHostname |
| } |
| |
| // DSClient retrieves the datastore client. |
| func (c *GCEConfiguration) DSClient() *datastore.Client { |
| return dsClient |
| } |
| |
| // GoDSClient retrieves the datastore client for golang.org project. |
| func (c *GCEConfiguration) GoDSClient() *datastore.Client { |
| return goDSClient |
| } |
| |
| // TryDepsErr retrives any Trybot dependency error. |
| func (c *GCEConfiguration) TryDepsErr() error { |
| return errTryDeps |
| } |
| |
| // ErrorsClient retrieves the stackdriver errors client. |
| func (c *GCEConfiguration) ErrorsClient() *errorreporting.Client { |
| return errorsClient |
| } |
| |
| // OAuthHTTPClient retrieves an OAuth2 HTTP client used to make API calls to GCP. |
| func (c *GCEConfiguration) OAuthHTTPClient() *http.Client { |
| return oAuthHTTPClient |
| } |
| |
| // GCPCredentials retrieves the GCP credentials. |
| func (c *GCEConfiguration) GCPCredentials() *google.Credentials { |
| return gcpCreds |
| } |
| |
| func checkTryBuildDeps(ctx context.Context, sc *secret.Client) error { |
| if !hasStorageScope() { |
| return errors.New("coordinator's GCE instance lacks the storage service scope") |
| } |
| if gceMode == "dev" { |
| return errors.New("running in dev mode") |
| } |
| wr := storageClient.Bucket(buildEnv.LogBucket).Object("hello.txt").NewWriter(context.Background()) |
| fmt.Fprintf(wr, "Hello, world! Coordinator start-up at %v", time.Now()) |
| if err := wr.Close(); err != nil { |
| return fmt.Errorf("test write of a GCS object to bucket %q failed: %v", buildEnv.LogBucket, err) |
| } |
| if inStaging { |
| // Don't expect to write to Gerrit in staging mode. |
| gerritClient = gerrit.NewClient("https://go-review.googlesource.com", gerrit.NoAuth) |
| } else { |
| ctxSec, cancel := context.WithTimeout(ctx, 10*time.Second) |
| defer cancel() |
| |
| gobotPass, err := sc.Retrieve(ctxSec, secret.NameGobotPassword) |
| if err != nil { |
| return fmt.Errorf("failed to get project metadata 'gobot-password': %v", err) |
| } |
| gerritClient = gerrit.NewClient("https://go-review.googlesource.com", |
| gerrit.BasicAuth("git-gobot.golang.org", strings.TrimSpace(string(gobotPass)))) |
| } |
| |
| return nil |
| } |
| |
| var gcePool = &GCEBuildlet{} |
| |
| var _ Buildlet = (*GCEBuildlet)(nil) |
| |
// GCEBuildlet manages a pool of GCE buildlets.
//
// It tracks GCE quota (instances and per-family CPUs) and the set of
// instances this process has created.
type GCEBuildlet struct {
	mu sync.Mutex // guards all following

	// disabled is set through SetEnabled; while true, no quota is handed out.
	disabled bool

	// Quota still available. pollQuota refreshes these periodically;
	// in between, they reflect our own bookkeeping of instances as
	// they are created and destroyed.
	instLeft   int
	cpuLeft    int
	c2cpuLeft  int
	n2cpuLeft  int
	n2dcpuLeft int

	// Quota currently in use, maintained the same way as the
	// remaining-quota fields above.
	instUsage   int
	cpuUsage    int
	c2cpuUsage  int
	n2cpuUsage  int
	n2dcpuUsage int

	// inst maps GCE VM instance name to its creation time.
	inst map[string]time.Time
}
| |
| func (p *GCEBuildlet) pollQuotaLoop() { |
| for { |
| p.pollQuota() |
| time.Sleep(5 * time.Second) |
| } |
| } |
| |
| func (p *GCEBuildlet) pollQuota() { |
| gceAPIGate() |
| reg, err := computeService.Regions.Get(buildEnv.ProjectName, buildEnv.KubeBuild.Region).Do() |
| if err != nil { |
| log.Printf("Failed to get quota for %s/%s: %v", buildEnv.ProjectName, buildEnv.KubeBuild.Region, err) |
| return |
| } |
| p.mu.Lock() |
| defer p.mu.Unlock() |
| for _, quota := range reg.Quotas { |
| switch quota.Metric { |
| case "CPUS": |
| p.cpuLeft = int(quota.Limit) - int(quota.Usage) |
| p.cpuUsage = int(quota.Usage) |
| case "C2_CPUS": |
| p.c2cpuLeft = int(quota.Limit) - int(quota.Usage) |
| p.c2cpuUsage = int(quota.Usage) |
| case "N2_CPUS": |
| p.n2cpuLeft = int(quota.Limit) - int(quota.Usage) |
| p.n2cpuUsage = int(quota.Usage) |
| case "N2D_CPUS": |
| p.n2dcpuLeft = int(quota.Limit) - int(quota.Usage) |
| p.n2dcpuUsage = int(quota.Usage) |
| case "INSTANCES": |
| p.instLeft = int(quota.Limit) - int(quota.Usage) |
| p.instUsage = int(quota.Usage) |
| } |
| } |
| } |
| |
| // SetEnabled marks the buildlet pool as enabled. |
| func (p *GCEBuildlet) SetEnabled(enabled bool) { |
| p.mu.Lock() |
| defer p.mu.Unlock() |
| p.disabled = !enabled |
| } |
| |
| // GetBuildlet retrieves a buildlet client for an available buildlet. |
| func (p *GCEBuildlet) GetBuildlet(ctx context.Context, hostType string, lg Logger) (bc buildlet.Client, err error) { |
| hconf, ok := dashboard.Hosts[hostType] |
| if !ok { |
| return nil, fmt.Errorf("gcepool: unknown host type %q", hostType) |
| } |
| qsp := lg.CreateSpan("awaiting_gce_quota") |
| err = p.awaitVMCountQuota(ctx, hconf) |
| qsp.Done(err) |
| if err != nil { |
| return nil, err |
| } |
| |
| instName := instanceName(hostType, 7) |
| instName = strings.Replace(instName, "_", "-", -1) // Issue 22905; can't use underscores in GCE VMs |
| p.setInstanceUsed(instName, true) |
| |
| gceBuildletSpan := lg.CreateSpan("create_gce_buildlet", instName) |
| defer func() { gceBuildletSpan.Done(err) }() |
| |
| var ( |
| needDelete bool |
| createSpan = lg.CreateSpan("create_gce_instance", instName) |
| waitBuildlet spanlog.Span // made after create is done |
| curSpan = createSpan // either instSpan or waitBuildlet |
| ) |
| |
| zone := buildEnv.RandomVMZone() |
| |
| log.Printf("Creating GCE VM %q for %s at %s", instName, hostType, zone) |
| bc, err = buildlet.StartNewVM(gcpCreds, buildEnv, instName, hostType, buildlet.VMOpts{ |
| DeleteIn: determineDeleteTimeout(hconf, deleteTimeout), |
| OnInstanceRequested: func() { |
| log.Printf("GCE VM %q now booting", instName) |
| }, |
| OnInstanceCreated: func() { |
| needDelete = true |
| |
| createSpan.Done(nil) |
| waitBuildlet = lg.CreateSpan("wait_buildlet_start", instName) |
| curSpan = waitBuildlet |
| }, |
| OnGotInstanceInfo: func(*compute.Instance) { |
| lg.LogEventTime("got_instance_info", "waiting_for_buildlet...") |
| }, |
| Zone: zone, |
| }) |
| if err != nil { |
| curSpan.Done(err) |
| log.Printf("Failed to create VM for %s at %s: %v", hostType, zone, err) |
| if needDelete { |
| deleteVM(zone, instName) |
| p.putVMCountQuota(hconf) |
| } |
| p.setInstanceUsed(instName, false) |
| return nil, err |
| } |
| waitBuildlet.Done(nil) |
| bc.SetDescription("GCE VM: " + instName) |
| bc.SetGCEInstanceName(instName) |
| bc.SetOnHeartbeatFailure(func() { |
| p.putBuildlet(bc, hostType, zone, instName) |
| }) |
| return bc, nil |
| } |
| |
| func (p *GCEBuildlet) putBuildlet(bc buildlet.Client, hostType, zone, instName string) error { |
| // TODO(bradfitz): add the buildlet to a freelist (of max N |
| // items) for up to 10 minutes since when it got started if |
| // it's never seen a command execution failure, and we can |
| // wipe all its disk content? (perhaps wipe its disk content |
| // when it's retrieved, like the reverse buildlet pool) But |
| // this will require re-introducing a distinction in the |
| // buildlet client library between Close, Destroy/Halt, and |
| // tracking execution errors. That was all half-baked before |
| // and thus removed. Now Close always destroys everything. |
| deleteVM(zone, instName) |
| p.setInstanceUsed(instName, false) |
| |
| hconf, ok := dashboard.Hosts[hostType] |
| if !ok { |
| panic("failed to lookup conf") // should've worked if we did it before |
| } |
| p.putVMCountQuota(hconf) |
| return nil |
| } |
| |
| // WriteHTMLStatus writes the status of the buildlet pool to an io.Writer. |
| func (p *GCEBuildlet) WriteHTMLStatus(w io.Writer) { |
| fmt.Fprintf(w, "<b>GCE pool</b> capacity: %s", p.capacityString()) |
| const show = 6 // must be even |
| active := p.instancesActive() |
| if len(active) > 0 { |
| fmt.Fprintf(w, "<ul>") |
| for i, inst := range active { |
| if i < show/2 || i >= len(active)-(show/2) { |
| fmt.Fprintf(w, "<li>%v, %s</li>\n", inst.Name, friendlyDuration(time.Since(inst.Creation))) |
| } else if i == show/2 { |
| fmt.Fprintf(w, "<li>... %d of %d total omitted ...</li>\n", len(active)-show, len(active)) |
| } |
| } |
| fmt.Fprintf(w, "</ul>") |
| } |
| } |
| |
| func (p *GCEBuildlet) String() string { |
| return fmt.Sprintf("GCE pool capacity: %s", p.capacityString()) |
| } |
| |
| func (p *GCEBuildlet) capacityString() string { |
| p.mu.Lock() |
| defer p.mu.Unlock() |
| return fmt.Sprintf("%d/%d instances; %d/%d CPUs, %d/%d C2_CPUS, %d/%d N2_CPUS, %d/%d N2D_CPUS", |
| len(p.inst), p.instUsage+p.instLeft, |
| p.cpuUsage, p.cpuUsage+p.cpuLeft, |
| p.c2cpuUsage, p.c2cpuUsage+p.c2cpuLeft, |
| p.n2cpuUsage, p.n2cpuUsage+p.n2cpuLeft, |
| p.n2dcpuUsage, p.n2dcpuUsage+p.n2dcpuLeft) |
| } |
| |
| // awaitVMCountQuota waits for numCPU CPUs of quota to become available, |
| // or returns ctx.Err. |
| func (p *GCEBuildlet) awaitVMCountQuota(ctx context.Context, hconf *dashboard.HostConfig) error { |
| // Poll every 2 seconds, which could be better, but works and |
| // is simple. |
| for { |
| if p.tryAllocateQuota(hconf) { |
| return nil |
| } |
| select { |
| case <-time.After(2 * time.Second): |
| case <-ctx.Done(): |
| return ctx.Err() |
| } |
| } |
| } |
| |
| func (p *GCEBuildlet) tryAllocateQuota(hconf *dashboard.HostConfig) bool { |
| p.mu.Lock() |
| defer p.mu.Unlock() |
| if p.disabled { |
| return false |
| } |
| if p.instLeft < 1 { |
| return false |
| } |
| numCPU := hconf.GCENumCPU() |
| mt := hconf.MachineType() |
| if strings.HasPrefix(mt, "n2-") { |
| if p.n2cpuLeft < numCPU { |
| return false |
| } |
| p.n2cpuUsage += numCPU |
| p.n2cpuLeft -= numCPU |
| p.instLeft-- |
| } else if strings.HasPrefix(mt, "n2d-") { |
| if p.n2dcpuLeft < numCPU { |
| return false |
| } |
| p.n2dcpuUsage += numCPU |
| p.n2dcpuLeft -= numCPU |
| p.instLeft-- |
| } else if strings.HasPrefix(mt, "c2-") { |
| if p.c2cpuLeft < numCPU { |
| return false |
| } |
| p.c2cpuUsage += numCPU |
| p.c2cpuLeft -= numCPU |
| p.instLeft-- |
| } else { |
| // E2 and N1 instances are counted here. We do not use M1, M2, |
| // or A2 quotas. See |
| // https://cloud.google.com/compute/quotas#cpu_quota. |
| if p.cpuLeft < numCPU { |
| return false |
| } |
| p.cpuUsage += numCPU |
| p.cpuLeft -= numCPU |
| p.instLeft-- |
| } |
| return true |
| } |
| |
| // putVMCountQuota adjusts the dead-reckoning of our quota usage by |
| // one instance and cpu CPUs. |
| func (p *GCEBuildlet) putVMCountQuota(hconf *dashboard.HostConfig) { |
| p.mu.Lock() |
| defer p.mu.Unlock() |
| mt := hconf.MachineType() |
| numCPU := hconf.GCENumCPU() |
| if strings.HasPrefix(mt, "n2-") { |
| p.n2cpuUsage -= numCPU |
| p.n2cpuLeft += numCPU |
| p.instLeft++ |
| } else if strings.HasPrefix(mt, "n2d-") { |
| p.n2dcpuUsage -= numCPU |
| p.n2dcpuLeft += numCPU |
| p.instLeft++ |
| } else if strings.HasPrefix(mt, "c2-") { |
| p.c2cpuUsage -= numCPU |
| p.c2cpuLeft += numCPU |
| p.instLeft++ |
| } else { |
| // E2 and N1 instances are counted here. We do not use M1, M2, |
| // or A2 quotas. See |
| // https://cloud.google.com/compute/quotas#cpu_quota. |
| p.cpuUsage -= numCPU |
| p.cpuLeft += numCPU |
| p.instLeft++ |
| } |
| } |
| |
| func (p *GCEBuildlet) setInstanceUsed(instName string, used bool) { |
| p.mu.Lock() |
| defer p.mu.Unlock() |
| if p.inst == nil { |
| p.inst = make(map[string]time.Time) |
| } |
| if used { |
| p.inst[instName] = time.Now() |
| } else { |
| delete(p.inst, instName) |
| } |
| } |
| |
| func (p *GCEBuildlet) instanceUsed(instName string) bool { |
| p.mu.Lock() |
| defer p.mu.Unlock() |
| _, ok := p.inst[instName] |
| return ok |
| } |
| |
| func (p *GCEBuildlet) instancesActive() (ret []ResourceTime) { |
| p.mu.Lock() |
| defer p.mu.Unlock() |
| for name, create := range p.inst { |
| ret = append(ret, ResourceTime{ |
| Name: name, |
| Creation: create, |
| }) |
| } |
| sort.Sort(ByCreationTime(ret)) |
| return ret |
| } |
| |
// ResourceTime pairs the name of a GCE instance or Kube pod with the
// time it was created.
type ResourceTime struct {
	Name     string
	Creation time.Time
}

// ByCreationTime implements sort.Interface, ordering resource times
// from the earliest creation time to the latest.
type ByCreationTime []ResourceTime

// Len returns the number of elements.
func (s ByCreationTime) Len() int {
	return len(s)
}

// Less reports whether element i was created strictly before element j.
func (s ByCreationTime) Less(i, j int) bool {
	return s[j].Creation.After(s[i].Creation)
}

// Swap exchanges elements i and j.
func (s ByCreationTime) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}
| |
| // CleanUpOldVMs loops forever and periodically enumerates virtual |
| // machines and deletes those which have expired. |
| // |
| // A VM is considered expired if it has a "delete-at" metadata |
| // attribute having a unix timestamp before the current time. |
| // |
| // This is the safety mechanism to delete VMs which stray from the |
| // normal deleting process. VMs are created to run a single build and |
| // should be shut down by a controlling process. Due to various types |
| // of failures, they might get stranded. To prevent them from getting |
| // stranded and wasting resources forever, we instead set the |
| // "delete-at" metadata attribute on them when created to some time |
| // that's well beyond their expected lifetime. |
| func (p *GCEBuildlet) CleanUpOldVMs() { |
| if gceMode == "dev" { |
| return |
| } |
| if computeService == nil { |
| return |
| } |
| |
| // TODO(bradfitz): remove this list and just query it from the compute API? |
| // https://godoc.org/google.golang.org/api/compute/v1#RegionsService.Get |
| // and Region.Zones: https://godoc.org/google.golang.org/api/compute/v1#Region |
| |
| for { |
| for _, zone := range buildEnv.VMZones { |
| if err := p.cleanZoneVMs(zone); err != nil { |
| log.Printf("Error cleaning VMs in zone %q: %v", zone, err) |
| } |
| } |
| time.Sleep(time.Minute) |
| } |
| } |
| |
| // cleanZoneVMs is part of cleanUpOldVMs, operating on a single zone. |
| func (p *GCEBuildlet) cleanZoneVMs(zone string) error { |
| // Fetch the first 500 (default) running instances and clean |
| // those. We expect that we'll be running many fewer than |
| // that. Even if we have more, eventually the first 500 will |
| // either end or be cleaned, and then the next call will get a |
| // partially-different 500. |
| // TODO(bradfitz): revisit this code if we ever start running |
| // thousands of VMs. |
| gceAPIGate() |
| list, err := computeService.Instances.List(buildEnv.ProjectName, zone).Do() |
| if err != nil { |
| return fmt.Errorf("listing instances: %v", err) |
| } |
| for _, inst := range list.Items { |
| if inst.Metadata == nil { |
| // Defensive. Not seen in practice. |
| continue |
| } |
| if isGCERemoteBuildlet(inst.Name) { |
| // Remote buildlets have their own expiration mechanism that respects active SSH sessions. |
| log.Printf("cleanZoneVMs: skipping remote buildlet %q", inst.Name) |
| continue |
| } |
| var sawDeleteAt bool |
| var deleteReason string |
| for _, it := range inst.Metadata.Items { |
| if it.Key == "delete-at" { |
| if it.Value == nil { |
| log.Printf("missing delete-at value; ignoring") |
| continue |
| } |
| unixDeadline, err := strconv.ParseInt(*it.Value, 10, 64) |
| if err != nil { |
| log.Printf("invalid delete-at value %q seen; ignoring", *it.Value) |
| continue |
| } |
| sawDeleteAt = true |
| if time.Now().Unix() > unixDeadline { |
| deleteReason = "delete-at expiration" |
| } |
| } |
| } |
| isBuildlet := isBuildlet(inst.Name) |
| |
| if isBuildlet && !sawDeleteAt && !p.instanceUsed(inst.Name) { |
| createdAt, _ := time.Parse(time.RFC3339Nano, inst.CreationTimestamp) |
| if createdAt.Before(time.Now().Add(-3 * time.Hour)) { |
| deleteReason = fmt.Sprintf("no delete-at, created at %s", inst.CreationTimestamp) |
| } |
| } |
| |
| // Delete buildlets (things we made) from previous |
| // generations. Only deleting things starting with "buildlet-" |
| // is a historical restriction, but still fine for paranoia. |
| if deleteReason == "" && sawDeleteAt && isBuildlet && !p.instanceUsed(inst.Name) { |
| if _, ok := deletedVMCache.Get(inst.Name); !ok { |
| deleteReason = "from earlier coordinator generation" |
| } |
| } |
| |
| if deleteReason != "" { |
| log.Printf("deleting VM %q in zone %q; %s ...", inst.Name, zone, deleteReason) |
| deleteVM(zone, inst.Name) |
| } |
| } |
| return nil |
| } |
| |
| var deletedVMCache = lru.New(100) // keyed by instName |
| |
| type token struct{} |
| |
| // deleteVM starts a delete of an instance in a given zone. |
| // |
| // It either returns an operation name (if delete is pending) or the |
| // empty string if the instance didn't exist. |
| func deleteVM(zone, instName string) (operation string, err error) { |
| deletedVMCache.Add(instName, token{}) |
| gceAPIGate() |
| op, err := computeService.Instances.Delete(buildEnv.ProjectName, zone, instName).Do() |
| apiErr, ok := err.(*googleapi.Error) |
| if ok { |
| if apiErr.Code == 404 { |
| return "", nil |
| } |
| } |
| if err != nil { |
| log.Printf("Failed to delete instance %q in zone %q: %v", instName, zone, err) |
| return "", err |
| } |
| log.Printf("Sent request to delete instance %q in zone %q. Operation ID, Name: %v, %v", instName, zone, op.Id, op.Name) |
| return op.Name, nil |
| } |
| |
| // HasScope returns true if the GCE metadata contains the default scopes. |
| func HasScope(want string) bool { |
| // If not on GCE, assume full access |
| if !metadata.OnGCE() { |
| return true |
| } |
| scopes, err := metadata.Scopes("default") |
| if err != nil { |
| log.Printf("failed to query metadata default scopes: %v", err) |
| return false |
| } |
| for _, v := range scopes { |
| if v == want { |
| return true |
| } |
| } |
| return false |
| } |
| |
| func hasComputeScope() bool { |
| return HasScope(compute.ComputeScope) || HasScope(compute.CloudPlatformScope) |
| } |
| |
| func hasStorageScope() bool { |
| return HasScope(storage.ScopeReadWrite) || HasScope(storage.ScopeFullControl) || HasScope(compute.CloudPlatformScope) |
| } |
| |
| // ReadGCSFile reads the named file from the GCS bucket. |
| func ReadGCSFile(name string) ([]byte, error) { |
| if gceMode == "dev" { |
| b, ok := testFiles[name] |
| if !ok { |
| return nil, &os.PathError{ |
| Op: "open", |
| Path: name, |
| Err: os.ErrNotExist, |
| } |
| } |
| return []byte(b), nil |
| } |
| |
| r, err := storageClient.Bucket(buildEnv.BuildletBucket).Object(name).NewReader(context.Background()) |
| if err != nil { |
| return nil, err |
| } |
| defer r.Close() |
| return ioutil.ReadAll(r) |
| } |
| |
| // syncBuildStatsLoop runs forever in its own goroutine and syncs the |
| // coordinator's datastore Build & Span entities to BigQuery |
| // periodically. |
| func syncBuildStatsLoop(env *buildenv.Environment) { |
| ticker := time.NewTicker(5 * time.Minute) |
| for { |
| ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) |
| if err := buildstats.SyncBuilds(ctx, env); err != nil { |
| log.Printf("buildstats: SyncBuilds: %v", err) |
| } |
| if err := buildstats.SyncSpans(ctx, env); err != nil { |
| log.Printf("buildstats: SyncSpans: %v", err) |
| } |
| cancel() |
| <-ticker.C |
| } |
| } |
| |
| // createBasepinDisks creates zone-local copies of VM disk images, to |
| // speed up VM creations in the future. |
| // |
| // Other than a list call, this a no-op unless new VM images were |
| // added or updated recently. |
| func createBasepinDisks(ctx context.Context) { |
| for { |
| t0 := time.Now() |
| bgc, err := buildgo.NewClient(ctx, buildEnv) |
| if err != nil { |
| log.Printf("basepin: NewClient: %v", err) |
| return |
| } |
| log.Printf("basepin: creating basepin disks...") |
| err = bgc.MakeBasepinDisks(ctx) |
| d := time.Since(t0).Round(time.Second / 10) |
| if err != nil { |
| basePinErr.Store(err.Error()) |
| log.Printf("basepin: error creating basepin disks, after %v: %v", d, err) |
| time.Sleep(5 * time.Minute) |
| continue |
| } |
| basePinErr.Store("") |
| log.Printf("basepin: created basepin disks after %v", d) |
| return |
| } |
| } |