| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package buildlet |
| |
| import ( |
| "context" |
| "crypto/tls" |
| "errors" |
| "fmt" |
| "log" |
| "net/http" |
| "strings" |
| "time" |
| |
| "golang.org/x/build/buildenv" |
| "golang.org/x/build/dashboard" |
| "golang.org/x/oauth2" |
| "google.golang.org/api/compute/v1" |
| ) |
| |
| // GCEGate optionally specifies a function to run before any GCE API call. |
| // It's intended to be used to bound QPS rate to GCE. |
| var GCEGate func() |
| |
| func apiGate() { |
| if GCEGate != nil { |
| GCEGate() |
| } |
| } |
| |
| // VMOpts control how new VMs are started. |
| type VMOpts struct { |
| // Zone is the GCE zone to create the VM in. Required. |
| Zone string |
| |
| // ProjectID is the GCE project ID. Required. |
| ProjectID string |
| |
| // TLS optionally specifies the TLS keypair to use. |
| // If zero, http without auth is used. |
| TLS KeyPair |
| |
| // Optional description of the VM. |
| Description string |
| |
| // Optional metadata to put on the instance. |
| Meta map[string]string |
| |
| // DeleteIn optionally specifies a duration at which |
| // to delete the VM. |
| DeleteIn time.Duration |
| |
| // OnInstanceRequested optionally specifies a hook to run synchronously |
| // after the computeService.Instances.Insert call, but before |
| // waiting for its operation to proceed. |
| OnInstanceRequested func() |
| |
| // OnInstanceCreated optionally specifies a hook to run synchronously |
| // after the instance operation succeeds. |
| OnInstanceCreated func() |
| |
| // OnInstanceCreated optionally specifies a hook to run synchronously |
| // after the computeService.Instances.Get call. |
| OnGotInstanceInfo func() |
| |
| // OnBeginBuildletProbe optionally specifies a hook to run synchronously |
| // before StartNewVM tries to hit buildletURL to see if it's up yet. |
| OnBeginBuildletProbe func(buildletURL string) |
| |
| // OnEndBuildletProbe optionally specifies a hook to run synchronously |
| // after StartNewVM tries to hit the buildlet's URL to see if it's up. |
| // The hook parameters are the return values from http.Get. |
| OnEndBuildletProbe func(*http.Response, error) |
| |
| // FallbackToFullPrice optionally specifies a hook to return a new |
| // GCE instance name if the first one failed to launch |
| // as preemptible. (If you use the same name, GCE complains about |
| // resources already existing, even if it failed to be created) |
| FallbackToFullPrice func() (newInstname string) |
| } |
| |
| // StartNewVM boots a new VM on GCE and returns a buildlet client |
| // configured to speak to it. |
| func StartNewVM(ts oauth2.TokenSource, instName, hostType string, opts VMOpts) (*Client, error) { |
| computeService, _ := compute.New(oauth2.NewClient(context.TODO(), ts)) |
| |
| hconf, ok := dashboard.Hosts[hostType] |
| if !ok { |
| return nil, fmt.Errorf("invalid host type %q", hostType) |
| } |
| if !hconf.IsGCE() { |
| return nil, fmt.Errorf("host type %q is not a GCE host type", hostType) |
| } |
| |
| zone := opts.Zone |
| if zone == "" { |
| // TODO: automatic? maybe that's not useful. |
| // For now just return an error. |
| return nil, errors.New("buildlet: missing required Zone option") |
| } |
| projectID := opts.ProjectID |
| if projectID == "" { |
| return nil, errors.New("buildlet: missing required ProjectID option") |
| } |
| |
| usePreempt := false |
| Try: |
| prefix := "https://www.googleapis.com/compute/v1/projects/" + projectID |
| machType := prefix + "/zones/" + zone + "/machineTypes/" + hconf.MachineType() |
| diskType := "https://www.googleapis.com/compute/v1/projects/" + projectID + "/zones/" + zone + "/diskTypes/pd-ssd" |
| if hconf.RegularDisk { |
| diskType = "" // a spinning disk |
| } |
| |
| // Request an IP address if this is a world-facing buildlet. |
| var accessConfigs []*compute.AccessConfig |
| // TODO(bradfitz): remove the "true ||" part once we figure out why the buildlet |
| // never boots without an IP address. Userspace seems to hang before we get to the buildlet? |
| if true || !opts.TLS.IsZero() { |
| accessConfigs = []*compute.AccessConfig{ |
| &compute.AccessConfig{ |
| Type: "ONE_TO_ONE_NAT", |
| Name: "External NAT", |
| }, |
| } |
| } |
| |
| instance := &compute.Instance{ |
| Name: instName, |
| Description: opts.Description, |
| MachineType: machType, |
| Disks: []*compute.AttachedDisk{ |
| { |
| AutoDelete: true, |
| Boot: true, |
| Type: "PERSISTENT", |
| InitializeParams: &compute.AttachedDiskInitializeParams{ |
| DiskName: instName, |
| SourceImage: "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + hconf.VMImage, |
| DiskType: diskType, |
| }, |
| }, |
| }, |
| Tags: &compute.Tags{ |
| // Warning: do NOT list "http-server" or "allow-ssh" (our |
| // project's custom tag to allow ssh access) here; the |
| // buildlet provides full remote code execution. |
| // The https-server is authenticated, though. |
| Items: []string{"https-server"}, |
| }, |
| Metadata: &compute.Metadata{}, |
| NetworkInterfaces: []*compute.NetworkInterface{ |
| &compute.NetworkInterface{ |
| AccessConfigs: accessConfigs, |
| Network: prefix + "/global/networks/default", |
| }, |
| }, |
| Scheduling: &compute.Scheduling{ |
| Preemptible: usePreempt, |
| }, |
| } |
| addMeta := func(key, value string) { |
| instance.Metadata.Items = append(instance.Metadata.Items, &compute.MetadataItems{ |
| Key: key, |
| Value: &value, |
| }) |
| } |
| // The buildlet-binary-url is the URL of the buildlet binary |
| // which the VMs are configured to download at boot and run. |
| // This lets us/ update the buildlet more easily than |
| // rebuilding the whole VM image. |
| addMeta("buildlet-binary-url", hconf.BuildletBinaryURL(buildenv.ByProjectID(opts.ProjectID))) |
| addMeta("buildlet-host-type", hostType) |
| if !opts.TLS.IsZero() { |
| addMeta("tls-cert", opts.TLS.CertPEM) |
| addMeta("tls-key", opts.TLS.KeyPEM) |
| addMeta("password", opts.TLS.Password()) |
| } |
| |
| if opts.DeleteIn != 0 { |
| // In case the VM gets away from us (generally: if the |
| // coordinator dies while a build is running), then we |
| // set this attribute of when it should be killed so |
| // we can kill it later when the coordinator is |
| // restarted. The cleanUpOldVMs goroutine loop handles |
| // that killing. |
| addMeta("delete-at", fmt.Sprint(time.Now().Add(opts.DeleteIn).Unix())) |
| } |
| |
| for k, v := range opts.Meta { |
| addMeta(k, v) |
| } |
| |
| apiGate() |
| op, err := computeService.Instances.Insert(projectID, zone, instance).Do() |
| if err != nil { |
| return nil, fmt.Errorf("Failed to create instance: %v", err) |
| } |
| condRun(opts.OnInstanceRequested) |
| createOp := op.Name |
| |
| // Wait for instance create operation to succeed. |
| OpLoop: |
| for { |
| time.Sleep(2 * time.Second) |
| apiGate() |
| op, err := computeService.ZoneOperations.Get(projectID, zone, createOp).Do() |
| if err != nil { |
| return nil, fmt.Errorf("Failed to get op %s: %v", createOp, err) |
| } |
| switch op.Status { |
| case "PENDING", "RUNNING": |
| continue |
| case "DONE": |
| if op.Error != nil { |
| for _, operr := range op.Error.Errors { |
| log.Printf("failed to create instance %s in zone %s: %v", instName, zone, operr.Code) |
| if operr.Code == "ZONE_RESOURCE_POOL_EXHAUSTED" && usePreempt && opts.FallbackToFullPrice != nil { |
| oldName := instName |
| usePreempt = false |
| instName = opts.FallbackToFullPrice() |
| log.Printf("buildlet/gce: retrying without preempt with name %q (previously: %q)", instName, oldName) |
| goto Try |
| } |
| // TODO: catch Code=="QUOTA_EXCEEDED" and "Message" and return |
| // a known error value/type. |
| return nil, fmt.Errorf("Error creating instance: %+v", operr) |
| } |
| return nil, errors.New("Failed to start.") |
| } |
| break OpLoop |
| default: |
| return nil, fmt.Errorf("Unknown create status %q: %+v", op.Status, op) |
| } |
| } |
| condRun(opts.OnInstanceCreated) |
| |
| apiGate() |
| inst, err := computeService.Instances.Get(projectID, zone, instName).Do() |
| if err != nil { |
| return nil, fmt.Errorf("Error getting instance %s details after creation: %v", instName, err) |
| } |
| |
| // Finds its internal and/or external IP addresses. |
| intIP, extIP := instanceIPs(inst) |
| |
| // Wait for it to boot and its buildlet to come up. |
| var buildletURL string |
| var ipPort string |
| if !opts.TLS.IsZero() { |
| if extIP == "" { |
| return nil, errors.New("didn't find its external IP address") |
| } |
| buildletURL = "https://" + extIP |
| ipPort = extIP + ":443" |
| } else { |
| if intIP == "" { |
| return nil, errors.New("didn't find its internal IP address") |
| } |
| buildletURL = "http://" + intIP |
| ipPort = intIP + ":80" |
| } |
| condRun(opts.OnGotInstanceInfo) |
| |
| const timeout = 5 * time.Minute |
| var alive bool |
| impatientClient := &http.Client{ |
| Timeout: 5 * time.Second, |
| Transport: &http.Transport{ |
| Dial: defaultDialer(), |
| DisableKeepAlives: true, |
| TLSClientConfig: &tls.Config{ |
| InsecureSkipVerify: true, |
| }, |
| }, |
| } |
| deadline := time.Now().Add(timeout) |
| try := 0 |
| for time.Now().Before(deadline) { |
| try++ |
| if fn := opts.OnBeginBuildletProbe; fn != nil { |
| fn(buildletURL) |
| } |
| res, err := impatientClient.Get(buildletURL) |
| if fn := opts.OnEndBuildletProbe; fn != nil { |
| fn(res, err) |
| } |
| if err != nil { |
| time.Sleep(1 * time.Second) |
| continue |
| } |
| res.Body.Close() |
| if res.StatusCode != 200 { |
| return nil, fmt.Errorf("buildlet returned HTTP status code %d on try number %d", res.StatusCode, try) |
| } |
| alive = true |
| break |
| } |
| if !alive { |
| return nil, fmt.Errorf("buildlet didn't come up at %s in %v", buildletURL, timeout) |
| } |
| |
| return NewClient(ipPort, opts.TLS), nil |
| } |
| |
| // DestroyVM sends a request to delete a VM. Actual VM description is |
| // currently (2015-01-19) very slow for no good reason. This function |
| // returns once it's been requested, not when it's done. |
| func DestroyVM(ts oauth2.TokenSource, proj, zone, instance string) error { |
| computeService, _ := compute.New(oauth2.NewClient(context.TODO(), ts)) |
| apiGate() |
| _, err := computeService.Instances.Delete(proj, zone, instance).Do() |
| return err |
| } |
| |
| type VM struct { |
| // Name is the name of the GCE VM instance. |
| // For example, it's of the form "mote-bradfitz-plan9-386-foo", |
| // and not "plan9-386-foo". |
| Name string |
| IPPort string |
| TLS KeyPair |
| Type string // buildlet type |
| } |
| |
| // ListVMs lists all VMs. |
| func ListVMs(ts oauth2.TokenSource, proj, zone string) ([]VM, error) { |
| var vms []VM |
| computeService, _ := compute.New(oauth2.NewClient(context.TODO(), ts)) |
| |
| // TODO(bradfitz): paging over results if more than 500 |
| apiGate() |
| list, err := computeService.Instances.List(proj, zone).Do() |
| if err != nil { |
| return nil, err |
| } |
| for _, inst := range list.Items { |
| if inst.Metadata == nil { |
| // Defensive. Not seen in practice. |
| continue |
| } |
| meta := map[string]string{} |
| for _, it := range inst.Metadata.Items { |
| if it.Value != nil { |
| meta[it.Key] = *it.Value |
| } |
| } |
| hostType := meta["buildlet-host-type"] |
| if hostType == "" { |
| continue |
| } |
| vm := VM{ |
| Name: inst.Name, |
| Type: hostType, |
| TLS: KeyPair{ |
| CertPEM: meta["tls-cert"], |
| KeyPEM: meta["tls-key"], |
| }, |
| } |
| _, extIP := instanceIPs(inst) |
| if extIP == "" || vm.TLS.IsZero() { |
| continue |
| } |
| vm.IPPort = extIP + ":443" |
| vms = append(vms, vm) |
| } |
| return vms, nil |
| } |
| |
| func instanceIPs(inst *compute.Instance) (intIP, extIP string) { |
| for _, iface := range inst.NetworkInterfaces { |
| if strings.HasPrefix(iface.NetworkIP, "10.") { |
| intIP = iface.NetworkIP |
| } |
| for _, accessConfig := range iface.AccessConfigs { |
| if accessConfig.Type == "ONE_TO_ONE_NAT" { |
| extIP = accessConfig.NatIP |
| } |
| } |
| } |
| return |
| } |