blob: de58941db2e5d9ecbf9d7c58eb4bc23ef576f70e [file] [log] [blame]
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main // import "golang.org/x/build/cmd/coordinator/buildongce"
import (
"bytes"
"context"
"crypto/rand"
"errors"
"flag"
"fmt"
"log"
"net/http"
"os"
"path/filepath"
"sort"
"strings"
"text/template"
"time"
"golang.org/x/build/buildenv"
"golang.org/x/oauth2/google"
compute "google.golang.org/api/compute/v1"
dm "google.golang.org/api/deploymentmanager/v2"
)
var (
proj = flag.String("project", "", "Optional name of the Google Cloud Platform project to create the infrastructure in. If empty, the project defined in golang.org/x/build/buildenv is used, for either production or staging (if the -staging flag is used)")
staging = flag.Bool("staging", false, "If true, buildenv.Staging will be used to provide default configuration values. Otherwise, buildenv.Production is used.")
makeClusters = flag.String("make-clusters", "go,buildlets", "comma-separated list of clusters to create. Empty means none.")
makeDisks = flag.Bool("make-basepin", false, "Create the basepin disk images for all builders, then stop. Does not create the VM.")
computeService *compute.Service
deploymentService *dm.Service
oauthClient *http.Client
err error
buildEnv *buildenv.Environment
)
// Deployment Manager V2 manifest for creating a Google Container Engine
// cluster to run buildlets, as well as an autoscaler attached to the
// cluster's instance group to add capacity based on CPU utilization
const kubeConfig = `
resources:
- name: "{{ .Kube.Name }}"
type: container.v1.cluster
properties:
zone: "{{ .Env.Zone }}"
cluster:
initial_node_count: {{ .Kube.MinNodes }}
network: "default"
logging_service: "logging.googleapis.com"
monitoring_service: "none"
node_config:
machine_type: "{{ .Kube.MachineType }}"
oauth_scopes:
- "https://www.googleapis.com/auth/cloud-platform"
- "https://www.googleapis.com/auth/userinfo.email"
master_auth:
username: "admin"
password: "{{ .Password }}"`
// Old autoscaler part:
/*
`
- name: autoscaler
type: compute.v1.autoscaler
properties:
zone: "{{ .Zone }}"
name: "{{ .KubeName }}"
target: "$(ref.{{ .KubeName }}.instanceGroupUrls[0])"
autoscalingPolicy:
minNumReplicas: {{ .KubeMinNodes }}
maxNumReplicas: {{ .KubeMaxNodes }}
coolDownPeriodSec: 1200
cpuUtilization:
utilizationTarget: .6`
*/
func main() {
buildEnv = buildenv.Production
flag.Parse()
if *staging {
buildEnv = buildenv.Staging
}
if *proj != "" {
buildEnv.ProjectName = *proj
}
// Brad is sick of google.DefaultClient giving him the
// permissions from the instance via the metadata service. Use
// the service account from disk if it exists instead:
keyFile := filepath.Join(os.Getenv("HOME"), "keys", buildEnv.ProjectName+".key.json")
if _, err := os.Stat(keyFile); err == nil {
log.Printf("Using service account from %s", keyFile)
os.Setenv("GOOGLE_APPLICATION_CREDENTIALS", keyFile)
}
oauthClient, err = google.DefaultClient(context.Background(), compute.CloudPlatformScope, compute.ComputeScope, compute.DevstorageFullControlScope)
if err != nil {
log.Fatalf("could not create oAuth client: %v", err)
}
computeService, err = compute.New(oauthClient)
if err != nil {
log.Fatalf("could not create client for Google Compute Engine: %v", err)
}
if *makeDisks {
if err := makeBasepinDisks(computeService); err != nil {
log.Fatalf("could not create basepin disks: %v", err)
}
return
}
for _, c := range []*buildenv.KubeConfig{&buildEnv.KubeBuild, &buildEnv.KubeTools} {
err := createCluster(c)
if err != nil {
log.Fatalf("Error creating Kubernetes cluster %q: %v", c.Name, err)
}
}
}
func awaitOp(svc *compute.Service, op *compute.Operation) error {
opName := op.Name
log.Printf("Waiting on operation %v", opName)
for {
time.Sleep(2 * time.Second)
op, err := svc.ZoneOperations.Get(buildEnv.ProjectName, buildEnv.Zone, opName).Do()
if err != nil {
return fmt.Errorf("Failed to get op %s: %v", opName, err)
}
switch op.Status {
case "PENDING", "RUNNING":
log.Printf("Waiting on operation %v", opName)
continue
case "DONE":
if op.Error != nil {
var last error
for _, operr := range op.Error.Errors {
log.Printf("Error: %+v", operr)
last = fmt.Errorf("%v", operr)
}
return last
}
log.Printf("Success. %+v", op)
return nil
default:
return fmt.Errorf("Unknown status %q: %+v", op.Status, op)
}
}
}
type deploymentTemplateData struct {
Env *buildenv.Environment
Kube *buildenv.KubeConfig
Password string
}
func wantClusterCreate(name string) bool {
for _, want := range strings.Split(*makeClusters, ",") {
if want == name {
return true
}
}
return false
}
func createCluster(kube *buildenv.KubeConfig) error {
if !wantClusterCreate(kube.Name) {
log.Printf("skipping kubernetes cluster %q per flag", kube.Name)
return nil
}
log.Printf("Creating Kubernetes cluster: %v", kube.Name)
deploymentService, err = dm.New(oauthClient)
if err != nil {
return fmt.Errorf("could not create client for Google Cloud Deployment Manager: %v", err)
}
if kube.MaxNodes == 0 || kube.MinNodes == 0 {
return fmt.Errorf("MaxNodes/MinNodes values cannot be 0")
}
tpl, err := template.New("kube").Parse(kubeConfig)
if err != nil {
return fmt.Errorf("could not parse Deployment Manager template: %v", err)
}
var result bytes.Buffer
err = tpl.Execute(&result, deploymentTemplateData{
Env: buildEnv,
Kube: kube,
Password: randomPassword(),
})
if err != nil {
return fmt.Errorf("could not execute Deployment Manager template: %v", err)
}
deployment := &dm.Deployment{
Name: kube.Name,
Target: &dm.TargetConfiguration{
Config: &dm.ConfigFile{
Content: result.String(),
},
},
}
op, err := deploymentService.Deployments.Insert(buildEnv.ProjectName, deployment).Do()
if err != nil {
return fmt.Errorf("Failed to create cluster with Deployment Manager: %v", err)
}
opName := op.Name
log.Printf("Created. Waiting on operation %v", opName)
OpLoop:
for {
time.Sleep(2 * time.Second)
op, err := deploymentService.Operations.Get(buildEnv.ProjectName, opName).Do()
if err != nil {
return fmt.Errorf("Failed to get op %s: %v", opName, err)
}
switch op.Status {
case "PENDING", "RUNNING":
log.Printf("Waiting on operation %v", opName)
continue
case "DONE":
// If no errors occurred, op.StatusMessage is empty.
if op.StatusMessage != "" {
log.Printf("Error: %+v", op.StatusMessage)
return fmt.Errorf("Failed to create.")
}
log.Printf("Success.")
break OpLoop
default:
return fmt.Errorf("Unknown status %q: %+v", op.Status, op)
}
}
return nil
}
func randomPassword() string {
buf := make([]byte, 10)
if _, err := rand.Read(buf); err != nil {
log.Fatalf("randomPassword: %v", err)
}
return fmt.Sprintf("%x", buf)
}
func makeBasepinDisks(svc *compute.Service) error {
// Try to find it by name.
imList, err := svc.Images.List(buildEnv.ProjectName).Do()
if err != nil {
return fmt.Errorf("Error listing images for %s: %v", buildEnv.ProjectName, err)
}
if imList.NextPageToken != "" {
return errors.New("too many images; pagination not supported")
}
diskList, err := svc.Disks.List(buildEnv.ProjectName, buildEnv.Zone).Do()
if err != nil {
return err
}
if diskList.NextPageToken != "" {
return errors.New("too many disks; pagination not supported (yet?)")
}
need := make(map[string]*compute.Image) // keys like "https://www.googleapis.com/compute/v1/projects/symbolic-datum-552/global/images/linux-buildlet-arm"
for _, im := range imList.Items {
need[im.SelfLink] = im
}
for _, d := range diskList.Items {
if !strings.HasPrefix(d.Name, "basepin-") {
continue
}
if si, ok := need[d.SourceImage]; ok && d.SourceImageId == fmt.Sprint(si.Id) {
log.Printf("Have %s: %s (%v)\n", d.Name, d.SourceImage, d.SourceImageId)
delete(need, d.SourceImage)
}
}
var needed []string
for imageName := range need {
needed = append(needed, imageName)
}
sort.Strings(needed)
for _, n := range needed {
log.Printf("Need %v", n)
}
for i, imName := range needed {
im := need[imName]
log.Printf("(%d/%d) Creating %s ...", i+1, len(needed), im.Name)
op, err := svc.Disks.Insert(buildEnv.ProjectName, buildEnv.Zone, &compute.Disk{
Description: "zone-cached basepin image of " + im.Name,
Name: "basepin-" + im.Name + "-" + fmt.Sprint(im.Id),
SizeGb: im.DiskSizeGb,
SourceImage: im.SelfLink,
SourceImageId: fmt.Sprint(im.Id),
Type: "https://www.googleapis.com/compute/v1/projects/" + buildEnv.ProjectName + "/zones/" + buildEnv.Zone + "/diskTypes/pd-ssd",
}).Do()
if err != nil {
return err
}
if err := awaitOp(svc, op); err != nil {
log.Fatalf("failed to create: %v", err)
}
}
return nil
}