internal/coordinator/pool: add ec2 pool

The EC2 buildlet pool added by this commit will manage the lifecycle
of buildlets running on EC2. EC2 VMs will only be created for the
ARM64 architecture. As VMs are requested, the pool will ensure that
there are enough resources for the VM to be created and keep track of
the VMs created. Once a VM is destroyed, the resources will be made
available for other VMs. This pool will only keep instances as they
are needed.

Updates golang/go#36841
Updates golang/go#38337

Change-Id: Ic777485c0b0a69ec13726c58b49e9fdc1df4777e
Reviewed-on: https://go-review.googlesource.com/c/build/+/247907
Run-TryBot: Carlos Amedee <carlos@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Alexander Rakoczy <alex@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
diff --git a/internal/coordinator/pool/ec2.go b/internal/coordinator/pool/ec2.go
new file mode 100644
index 0000000..b97052c
--- /dev/null
+++ b/internal/coordinator/pool/ec2.go
@@ -0,0 +1,361 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build linux darwin
+
+package pool
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"html"
+	"io"
+	"log"
+	"sync"
+	"time"
+
+	"golang.org/x/build/buildenv"
+	"golang.org/x/build/buildlet"
+	"golang.org/x/build/dashboard"
+	"golang.org/x/build/internal/cloud"
+	"golang.org/x/build/internal/spanlog"
+)
+
+var _ Buildlet = (*EC2Buildlet)(nil)
+
+// ec2Buildlet is the package level buildlet pool.
+//
+// TODO(golang.org/issues/38337) remove once a package level variable is no longer
+// required by the main package.
+var ec2Buildlet *EC2Buildlet
+
+// EC2BuildetPool retrieves the package level EC2Buildlet pool set by the constructor.
+//
+// TODO(golang.org/issues/38337) remove once a package level variable is no longer
+// required by the main package.
+func EC2BuildetPool() *EC2Buildlet {
+	return ec2Buildlet
+}
+
+func init() {
+	// initializes a basic package level ec2Buildlet pool to enable basic testing in other
+	// packages.
+	//
+	// TODO(golang.org/issues/38337) remove once a package level variable is no longer
+	// required by the main package.
+	ec2Buildlet = &EC2Buildlet{
+		ledger: newLedger(),
+	}
+}
+
+// awsClient represents the aws client used to interact with AWS. This is a partial
+// implementation of pool.AWSClient.
+type awsClient interface {
+	DestroyInstances(ctx context.Context, instIDs ...string) error
+	Quota(ctx context.Context, service, code string) (int64, error)
+	InstanceTypesARM(ctx context.Context) ([]*cloud.InstanceType, error)
+	RunningInstances(ctx context.Context) ([]*cloud.Instance, error)
+}
+
+// EC2Opt is optional configuration for the the buildlet.
+type EC2Opt func(*EC2Buildlet)
+
+// WithVMDeleteTimeout sets the VM deletion timeout for all EC2 VMs.
+func WithVMDeleteTimeout(timeout time.Duration) EC2Opt {
+	return func(eb *EC2Buildlet) {
+		eb.vmDeleteTimeout = timeout
+	}
+}
+
+// EC2Buildlet manages a pool of AWS EC2 buildlets.
+type EC2Buildlet struct {
+	once sync.Once
+	// done channel closing will signal the pollers to discontinue polling
+	done chan struct{}
+
+	// awsClient is the client used to interact with AWS services.
+	awsClient awsClient
+	// buildEnv contains the build enviornment settings.
+	buildEnv *buildenv.Environment
+	// buildletClient is the client used to create a buildlet.
+	buildletClient ec2BuildletClient
+	// hosts provides the host configuration for all hosts. It is passed in to facilitate
+	// testing.
+	hosts map[string]*dashboard.HostConfig
+	// isRemoteBuildletFunc informs the caller is a VM instance is being used as a remote
+	// buildlet.
+	//
+	// TODO(golang.org/issues/38337) remove once we find a way to pass in remote buildlet
+	// information at the get buidlet request.
+	isRemoteBuildlet IsRemoteBuildletFunc
+	// ledger tracks instances and their resource allocations.
+	ledger *ledger
+	// vmDeleteTimeout contains the timeout used to determine if a VM should be deleted.
+	vmDeleteTimeout time.Duration
+}
+
+// ec2BuildletClient represents an EC2 buildlet client in the buildlet package.
+type ec2BuildletClient interface {
+	StartNewVM(ctx context.Context, buildEnv *buildenv.Environment, hconf *dashboard.HostConfig, vmName, hostType string, opts *buildlet.VMOpts) (*buildlet.Client, error)
+}
+
+// NewEC2Buildlet creates a new EC2 buildlet pool used to create and manage the lifecycle of
+// EC2 buildlets. Information about ARM64 instance types is retrieved before starting the pool.
+// EC2 quota types are also retrieved before starting the pool. The pool will continuously poll
+// for quotas which limit the resources that can be consumed by the pool. It will also periodically
+// search for VMs which are no longer in use or are untracked by the pool in order to delete them.
+func NewEC2Buildlet(client *cloud.AWSClient, buildEnv *buildenv.Environment, hosts map[string]*dashboard.HostConfig, fn IsRemoteBuildletFunc, opts ...EC2Opt) (*EC2Buildlet, error) {
+	if fn == nil {
+		return nil, errors.New("remote buildlet check function is not set")
+	}
+	b := &EC2Buildlet{
+		awsClient:        client,
+		buildEnv:         buildEnv,
+		buildletClient:   buildlet.NewEC2Client(client),
+		done:             make(chan struct{}),
+		hosts:            hosts,
+		isRemoteBuildlet: fn,
+		ledger:           newLedger(),
+		vmDeleteTimeout:  45 * time.Minute, // default VM delete timeout
+	}
+	for _, opt := range opts {
+		opt(b)
+	}
+	if err := b.retrieveAndSetQuota(); err != nil {
+		return nil, fmt.Errorf("unable to create EC2 pool: %w", err)
+	}
+	if err := b.retrieveAndSetInstanceTypes(); err != nil {
+		return nil, fmt.Errorf("unable to create EC2 pool: %w", err)
+	}
+	go b.cleanupUnusedVMs()
+	go b.pollQuota()
+
+	// TODO(golang.org/issues/38337) remove once a package level variable is no longer
+	// required by the main package.
+	ec2Buildlet = b
+	return b, nil
+}
+
+// GetBuildlet retrieves a buildlet client for a newly created buildlet.
+func (eb *EC2Buildlet) GetBuildlet(ctx context.Context, hostType string, lg Logger) (*buildlet.Client, error) {
+	hconf, ok := eb.hosts[hostType]
+	if !ok {
+		return nil, fmt.Errorf("ec2 pool: unknown host type %q", hostType)
+	}
+	instName := instanceName(hostType, 7)
+	log.Printf("Creating EC2 VM %q for %s", instName, hostType)
+	kp, err := buildlet.NewKeyPair()
+	if err != nil {
+		log.Printf("failed to create TLS key pair for %s: %s", hostType, err)
+		return nil, fmt.Errorf("failed to create TLS key pair: %w", err)
+	}
+
+	qsp := lg.CreateSpan("awaiting_ec2_quota")
+	err = eb.ledger.ReserveResources(ctx, instName, hconf.MachineType())
+	qsp.Done(err)
+	if err != nil {
+		return nil, err
+	}
+
+	ec2BuildletSpan := lg.CreateSpan("create_ec2_buildlet", instName)
+	defer func() { ec2BuildletSpan.Done(err) }()
+
+	var (
+		createSpan      = lg.CreateSpan("create_ec2_instance", instName)
+		waitBuildlet    spanlog.Span
+		curSpan         = createSpan
+		instanceCreated bool
+	)
+	bc, err := eb.buildletClient.StartNewVM(ctx, eb.buildEnv, hconf, instName, hostType, &buildlet.VMOpts{
+		Zone:     "", // allow the EC2 api pick an availability zone with capacity
+		TLS:      kp,
+		Meta:     make(map[string]string),
+		DeleteIn: deleteTimeoutFromContextOrValue(ctx, eb.vmDeleteTimeout),
+		OnInstanceRequested: func() {
+			log.Printf("EC2 VM %q now booting", instName)
+		},
+		OnInstanceCreated: func() {
+			log.Printf("EC2 VM %q now running", instName)
+			createSpan.Done(nil)
+			instanceCreated = true
+			waitBuildlet = lg.CreateSpan("wait_buildlet_start", instName)
+			curSpan = waitBuildlet
+		},
+		OnGotEC2InstanceInfo: func(inst *cloud.Instance) {
+			lg.LogEventTime("got_instance_info", "waiting_for_buildlet...")
+			eb.ledger.UpdateReservation(instName, inst.ID)
+		},
+	})
+	if err != nil {
+		curSpan.Done(err)
+		log.Printf("EC2 VM creation failed for %s: %v", hostType, err)
+		if instanceCreated {
+			log.Printf("EC2 VM %q failed initialize buildlet client. deleting...", instName)
+			eb.buildletDone(instName)
+		} else {
+			eb.ledger.Remove(instName)
+		}
+		return nil, err
+	}
+	waitBuildlet.Done(nil)
+	bc.SetDescription(fmt.Sprintf("EC2 VM: %s", instName))
+	bc.SetOnHeartbeatFailure(func() {
+		log.Printf("EC2 VM %q failed heartbeat", instName)
+		eb.buildletDone(instName)
+	})
+	return bc, nil
+}
+
+// String gives a report of capacity usage for the EC2 buildlet pool.
+func (eb *EC2Buildlet) String() string {
+	return fmt.Sprintf("EC2 pool capacity: %s", eb.capacityString())
+}
+
+// capacityString() gives a report of capacity usage.
+func (eb *EC2Buildlet) capacityString() string {
+	r := eb.ledger.Resources()
+	return fmt.Sprintf("%d instances; %d/%d CPUs", r.InstCount, r.CPUUsed, r.CPULimit)
+}
+
+// WriteHTMLStatus writes the status of the EC2 buildlet pool to an io.Writer.
+func (eb *EC2Buildlet) WriteHTMLStatus(w io.Writer) {
+	fmt.Fprintf(w, "<b>EC2 pool</b> capacity: %s", eb.capacityString())
+
+	active := eb.ledger.ResourceTime()
+	if len(active) > 0 {
+		fmt.Fprintf(w, "<ul>")
+		for _, inst := range active {
+			fmt.Fprintf(w, "<li>%v, %s</li>\n", html.EscapeString(inst.Name), friendlyDuration(time.Since(inst.Creation)))
+		}
+		fmt.Fprintf(w, "</ul>")
+	}
+}
+
+// buildletDone issues a call to destroy the EC2 instance and removes
+// the instance from the ledger. Removing the instance from the ledger
+// also releases any resources allocated to that instance. If an instance
+// is not found in the ledger or on EC2 then an error is logged. All
+// untracked instances will be cleaned up by the polling cleanupUnusedVMs
+// method.
+func (eb *EC2Buildlet) buildletDone(instName string) {
+	vmID := eb.ledger.InstanceID(instName)
+	if vmID == "" {
+		log.Printf("EC2 vm %s not found", instName)
+		return
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	if err := eb.awsClient.DestroyInstances(ctx, vmID); err != nil {
+		log.Printf("EC2 VM %s deletion failed: %s", instName, err)
+	}
+	eb.ledger.Remove(instName)
+}
+
+// Close stops the pollers used by the EC2Buildlet pool from running.
+func (eb *EC2Buildlet) Close() {
+	eb.once.Do(func() {
+		close(eb.done)
+	})
+}
+
+// retrieveAndSetQuota queries EC2 for account relevant quotas and sets the quota in the ledger.
+func (eb *EC2Buildlet) retrieveAndSetQuota() error {
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	cpuQuota, err := eb.awsClient.Quota(ctx, cloud.QuotaServiceEC2, cloud.QuotaCodeCPUOnDemand)
+	if err != nil {
+		log.Printf("unable to query for cpu quota: %s", err)
+		return err
+	}
+	eb.ledger.SetCPULimit(cpuQuota)
+	return nil
+}
+
+// pollQuota repeatedly polls for the EC2 quota data and sets the quota data in
+// the ledger. It stops polling once the done channel has been closed.
+func (eb *EC2Buildlet) pollQuota() {
+	t := time.NewTicker(time.Hour)
+	defer t.Stop()
+	for {
+		select {
+		case <-t.C:
+			err := eb.retrieveAndSetQuota()
+			if err != nil {
+				log.Printf("polling for EC2 quota failed: %s", err)
+			}
+		case <-eb.done:
+			// closing the done channel signals the end of the polling loop.
+			log.Printf("stopped polling for EC2 quota")
+			return
+		}
+	}
+}
+
+// retrieveAndSetInstanceTypes retrieves the ARM64 instance types from the EC2
+// service and sets them in the ledger.
+func (eb *EC2Buildlet) retrieveAndSetInstanceTypes() error {
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	its, err := eb.awsClient.InstanceTypesARM(ctx)
+	if err != nil {
+		return fmt.Errorf("unable to retrieve instance types: %w", err)
+	}
+	eb.ledger.UpdateInstanceTypes(its)
+	log.Printf("ec2 buildlet pool instance types updated")
+	return nil
+}
+
+// cleanupUnusedVMs periodically queries for VMs which are not tracked in the ledger and
+// deletes them. If the done channel has been closed then the polling will exit.
+func (eb *EC2Buildlet) cleanupUnusedVMs() {
+	t := time.NewTicker(2 * time.Minute)
+	defer t.Stop()
+	for {
+		select {
+		case <-t.C:
+			log.Printf("cleaning up unused EC2 instances")
+			eb.destroyUntrackedInstances()
+		case <-eb.done:
+			// closing the done channel signals the end of the polling loop.
+			log.Printf("stopped cleaning up unused EC2 instances")
+			return
+		}
+	}
+}
+
+// destroyUntrackedInstances searches for VMs which exist but are not being tracked in the
+// ledger and deletes them.
+func (eb *EC2Buildlet) destroyUntrackedInstances() {
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	insts, err := eb.awsClient.RunningInstances(ctx)
+	if err != nil {
+		log.Printf("failed to query for instances: %s", err)
+		return
+	}
+	deleteInsts := make([]string, 0, len(insts))
+	for _, inst := range insts {
+		if eb.isRemoteBuildlet(inst.Name) {
+			// Remote buildlets have their own expiration mechanism that respects active SSH sessions.
+			log.Printf("destroyUntrackedInstances: skipping remote buildlet %q", inst.Name)
+			continue
+		}
+		if id := eb.ledger.InstanceID(inst.Name); id != "" {
+			continue
+		}
+		deleteInsts = append(deleteInsts, inst.ID)
+		log.Printf("queued for deleting untracked EC2 VM %q with id %q", inst.Name, inst.ID)
+	}
+	if len(deleteInsts) == 0 {
+		return
+	}
+	if err := eb.awsClient.DestroyInstances(ctx, deleteInsts...); err != nil {
+		log.Printf("failed cleaning EC2 VMs: %s", err)
+	}
+}
diff --git a/internal/coordinator/pool/ec2_test.go b/internal/coordinator/pool/ec2_test.go
new file mode 100644
index 0000000..502b412
--- /dev/null
+++ b/internal/coordinator/pool/ec2_test.go
@@ -0,0 +1,699 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build linux darwin
+
+package pool
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sort"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"golang.org/x/build/buildenv"
+	"golang.org/x/build/buildlet"
+	"golang.org/x/build/dashboard"
+	"golang.org/x/build/internal/cloud"
+	"golang.org/x/build/internal/spanlog"
+)
+
+func TestEC2BuildletGetBuildlet(t *testing.T) {
+	host := "host-type-x"
+
+	l := newLedger()
+	l.UpdateInstanceTypes([]*cloud.InstanceType{
+		// set to default gce type because there is no way to set the machine
+		// type from outside of the buildenv package.
+		{
+			Type: "n1-standard-4",
+			CPU:  4,
+		},
+	})
+	l.SetCPULimit(20)
+
+	bp := &EC2Buildlet{
+		buildletClient: &fakeEC2BuildletClient{
+			createVMRequestSuccess: true,
+			VMCreated:              true,
+			buildletCreated:        true,
+		},
+		buildEnv: &buildenv.Environment{},
+		ledger:   l,
+		hosts: map[string]*dashboard.HostConfig{
+			host: {
+				VMImage:        "ami-15",
+				ContainerImage: "bar-arm64:latest",
+				SSHUsername:    "foo",
+			},
+		},
+	}
+	_, err := bp.GetBuildlet(context.Background(), host, noopEventTimeLogger{})
+	if err != nil {
+		t.Errorf("EC2Buildlet.GetBuildlet(ctx, %q, %+v) = _, %s; want no error", host, noopEventTimeLogger{}, err)
+	}
+}
+
+func TestEC2BuildletGetBuildletError(t *testing.T) {
+	host := "host-type-x"
+	testCases := []struct {
+		desc           string
+		hostType       string
+		logger         Logger
+		ledger         *ledger
+		buildletClient ec2BuildletClient
+		hosts          map[string]*dashboard.HostConfig
+	}{
+		{
+			desc:     "invalid-host-type",
+			hostType: host,
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  0,
+				entries:  make(map[string]*entry),
+				types: map[string]*cloud.InstanceType{
+					"n1-highcpu-2": {
+						Type: "n1-highcpu-2",
+						CPU:  4,
+					},
+				},
+			},
+			hosts: map[string]*dashboard.HostConfig{
+				"wrong-host-type": {},
+			},
+			logger: noopEventTimeLogger{},
+			buildletClient: &fakeEC2BuildletClient{
+				createVMRequestSuccess: true,
+				VMCreated:              true,
+			},
+		},
+		{
+			desc:     "buildlet-client-failed-instance-created",
+			hostType: host,
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  0,
+				entries:  make(map[string]*entry),
+				types: map[string]*cloud.InstanceType{
+					"n1-highcpu-2": {
+						Type: "n1-highcpu-2",
+						CPU:  4,
+					},
+				},
+			},
+			hosts: map[string]*dashboard.HostConfig{
+				host: {},
+			},
+			logger: noopEventTimeLogger{},
+			buildletClient: &fakeEC2BuildletClient{
+				createVMRequestSuccess: false,
+				VMCreated:              false,
+			},
+		},
+		{
+			desc:     "buildlet-client-failed-instance-not-created",
+			hostType: host,
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  0,
+				entries:  make(map[string]*entry),
+				types: map[string]*cloud.InstanceType{
+					"n1-highcpu-2": {
+						Type: "n1-highcpu-2",
+						CPU:  4,
+					},
+				},
+			},
+			hosts: map[string]*dashboard.HostConfig{
+				host: {},
+			},
+			logger: noopEventTimeLogger{},
+			buildletClient: &fakeEC2BuildletClient{
+				createVMRequestSuccess: true,
+				VMCreated:              false,
+			},
+		},
+	}
+	for _, tt := range testCases {
+		t.Run(tt.desc, func(t *testing.T) {
+			bp := &EC2Buildlet{
+				buildletClient: tt.buildletClient,
+				buildEnv:       &buildenv.Environment{},
+				ledger:         tt.ledger,
+				hosts:          tt.hosts,
+			}
+			_, gotErr := bp.GetBuildlet(context.Background(), tt.hostType, tt.logger)
+			if gotErr == nil {
+				t.Errorf("EC2Buildlet.GetBuildlet(ctx, %q, %+v) = _, %s", tt.hostType, tt.logger, gotErr)
+			}
+		})
+	}
+}
+
+func TestEC2BuildletGetBuildletLogger(t *testing.T) {
+	host := "host-type-x"
+	testCases := []struct {
+		desc           string
+		buildletClient ec2BuildletClient
+		hostType       string
+		hosts          map[string]*dashboard.HostConfig
+		ledger         *ledger
+		wantLogs       []string
+		wantSpans      []string
+		wantSpansErr   []string
+	}{
+		{
+			desc:     "buildlet-client-failed-instance-create-request-failed",
+			hostType: host,
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  0,
+				entries:  make(map[string]*entry),
+				types: map[string]*cloud.InstanceType{
+					"n1-highcpu-2": {
+						Type: "n1-highcpu-2",
+						CPU:  4,
+					},
+				},
+			},
+			hosts: map[string]*dashboard.HostConfig{
+				host: {},
+			},
+			buildletClient: &fakeEC2BuildletClient{
+				createVMRequestSuccess: false,
+				VMCreated:              false,
+				buildletCreated:        false,
+			},
+			wantSpans:    []string{"create_ec2_instance", "awaiting_ec2_quota", "create_ec2_buildlet"},
+			wantSpansErr: []string{"create_ec2_buildlet", "create_ec2_instance"},
+		},
+		{
+			desc:     "buildlet-client-failed-instance-not-created",
+			hostType: host,
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  0,
+				entries:  make(map[string]*entry),
+				types: map[string]*cloud.InstanceType{
+					"n1-highcpu-2": {
+						Type: "n1-highcpu-2",
+						CPU:  4,
+					},
+				},
+			},
+			hosts: map[string]*dashboard.HostConfig{
+				host: {},
+			},
+			buildletClient: &fakeEC2BuildletClient{
+				createVMRequestSuccess: true,
+				VMCreated:              false,
+				buildletCreated:        false,
+			},
+			wantSpans:    []string{"create_ec2_instance", "awaiting_ec2_quota", "create_ec2_buildlet"},
+			wantSpansErr: []string{"create_ec2_buildlet", "create_ec2_instance"},
+		},
+		{
+			desc:     "buildlet-client-failed-instance-created",
+			hostType: host,
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  0,
+				entries:  make(map[string]*entry),
+				types: map[string]*cloud.InstanceType{
+					"n1-highcpu-2": {
+						Type: "n1-highcpu-2",
+						CPU:  4,
+					},
+				},
+			},
+			hosts: map[string]*dashboard.HostConfig{
+				host: {},
+			},
+			buildletClient: &fakeEC2BuildletClient{
+				createVMRequestSuccess: true,
+				VMCreated:              true,
+				buildletCreated:        false,
+			},
+			wantSpans:    []string{"create_ec2_instance", "awaiting_ec2_quota", "create_ec2_buildlet", "wait_buildlet_start"},
+			wantSpansErr: []string{"create_ec2_buildlet", "wait_buildlet_start"},
+		},
+		{
+			desc:     "success",
+			hostType: host,
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  0,
+				entries:  make(map[string]*entry),
+				types: map[string]*cloud.InstanceType{
+					"n1-highcpu-2": {
+						Type: "n1-highcpu-2",
+						CPU:  4,
+					},
+				},
+			},
+			hosts: map[string]*dashboard.HostConfig{
+				host: {},
+			},
+			buildletClient: &fakeEC2BuildletClient{
+				createVMRequestSuccess: true,
+				VMCreated:              true,
+				buildletCreated:        true,
+			},
+			wantSpans:    []string{"create_ec2_instance", "create_ec2_buildlet", "awaiting_ec2_quota", "wait_buildlet_start"},
+			wantSpansErr: []string{},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.desc, func(t *testing.T) {
+			bp := &EC2Buildlet{
+				buildletClient: tc.buildletClient,
+				buildEnv:       &buildenv.Environment{},
+				ledger:         tc.ledger,
+				hosts:          tc.hosts,
+			}
+			l := newTestLogger()
+			_, _ = bp.GetBuildlet(context.Background(), tc.hostType, l)
+			if !cmp.Equal(l.spanEvents(), tc.wantSpans, cmp.Transformer("sort", func(in []string) []string {
+				out := append([]string(nil), in...)
+				sort.Strings(out)
+				return out
+			})) {
+				t.Errorf("span events = %+v; want %+v", l.spanEvents(), tc.wantSpans)
+			}
+			for _, spanErr := range tc.wantSpansErr {
+				s, ok := l.spans[spanErr]
+				if !ok {
+					t.Fatalf("log span %q does not exist", spanErr)
+				}
+				if s.err == nil {
+					t.Fatalf("testLogger.span[%q].err is nil", spanErr)
+				}
+			}
+		})
+	}
+}
+
+func TestEC2BuildletString(t *testing.T) {
+	testCases := []struct {
+		desc      string
+		instCount int64
+		cpuCount  int64
+		cpuLimit  int64
+	}{
+		{"default", 0, 0, 0},
+		{"non-default", 2, 2, 3},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.desc, func(t *testing.T) {
+			es := make([]*entry, tc.instCount)
+			entries := make(map[string]*entry)
+			for i, e := range es {
+				entries[fmt.Sprintf("%d", i)] = e
+			}
+			eb := &EC2Buildlet{
+				ledger: &ledger{
+					cpuLimit: tc.cpuLimit,
+					cpuUsed:  tc.cpuCount,
+					entries:  entries,
+				},
+			}
+			want := fmt.Sprintf("EC2 pool capacity: %d instances; %d/%d CPUs", tc.instCount, tc.cpuCount, tc.cpuLimit)
+			got := eb.String()
+			if got != want {
+				t.Errorf("EC2Buildlet.String() = %s; want %s", got, want)
+			}
+		})
+	}
+}
+
+func TestEC2BuildletCapacityString(t *testing.T) {
+	testCases := []struct {
+		desc      string
+		instCount int64
+		cpuCount  int64
+		cpuLimit  int64
+	}{
+		{"defaults", 0, 0, 0},
+		{"non-default", 2, 2, 3},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.desc, func(t *testing.T) {
+			es := make([]*entry, tc.instCount)
+			entries := make(map[string]*entry)
+			for i, e := range es {
+				entries[fmt.Sprintf("%d", i)] = e
+			}
+			eb := &EC2Buildlet{
+				ledger: &ledger{
+					cpuLimit: tc.cpuLimit,
+					cpuUsed:  tc.cpuCount,
+					entries:  entries,
+				},
+			}
+			want := fmt.Sprintf("%d instances; %d/%d CPUs", tc.instCount, tc.cpuCount, tc.cpuLimit)
+			got := eb.capacityString()
+			if got != want {
+				t.Errorf("EC2Buildlet.capacityString() = %s; want %s", got, want)
+			}
+		})
+	}
+}
+
+func TestEC2BuildletbuildletDone(t *testing.T) {
+	t.Run("done-successful", func(t *testing.T) {
+		instName := "instance-name-x"
+
+		awsC := cloud.NewFakeAWSClient()
+		inst, err := awsC.CreateInstance(context.Background(), &cloud.EC2VMConfiguration{
+			Description: "test instance",
+			ImageID:     "image-x",
+			Name:        instName,
+			SSHKeyID:    "key-14",
+			Tags:        map[string]string{},
+			Type:        "type-x",
+			Zone:        "zone-1",
+		})
+		if err != nil {
+			t.Errorf("unable to create instance: %s", err)
+		}
+
+		pool := &EC2Buildlet{
+			awsClient: awsC,
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  5,
+				entries: map[string]*entry{
+					instName: {
+						createdAt:    time.Now(),
+						instanceID:   inst.ID,
+						instanceName: instName,
+						vCPUCount:    5,
+					},
+				},
+			},
+		}
+		pool.buildletDone(instName)
+		if gotID := pool.ledger.InstanceID(instName); gotID != "" {
+			t.Errorf("ledger.instanceID = %q; want %q", gotID, "")
+		}
+		gotInsts, err := awsC.RunningInstances(context.Background())
+		if err != nil || len(gotInsts) != 0 {
+			t.Errorf("awsClient.RunningInstances(ctx) = %+v, %s; want [], nil", gotInsts, err)
+		}
+	})
+	t.Run("instance-not-in-ledger", func(t *testing.T) {
+		instName := "instance-name-x"
+
+		awsC := cloud.NewFakeAWSClient()
+		inst, err := awsC.CreateInstance(context.Background(), &cloud.EC2VMConfiguration{
+			Description: "test instance",
+			ImageID:     "image-x",
+			Name:        instName,
+			SSHKeyID:    "key-14",
+			Tags:        map[string]string{},
+			Type:        "type-x",
+			Zone:        "zone-1",
+		})
+		if err != nil {
+			t.Errorf("unable to create instance: %s", err)
+		}
+
+		pool := &EC2Buildlet{
+			awsClient: awsC,
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  5,
+				entries:  map[string]*entry{},
+			},
+		}
+		pool.buildletDone(inst.Name)
+		gotInsts, err := awsC.RunningInstances(context.Background())
+		if err != nil || len(gotInsts) != 1 {
+			t.Errorf("awsClient.RunningInstances(ctx) = %+v, %s; want 1 instance, nil", gotInsts, err)
+		}
+	})
+	t.Run("instance-not-in-ec2", func(t *testing.T) {
+		instName := "instance-name-x"
+		pool := &EC2Buildlet{
+			awsClient: cloud.NewFakeAWSClient(),
+			ledger: &ledger{
+				cpuLimit: 20,
+				cpuUsed:  5,
+				entries: map[string]*entry{
+					instName: {
+						createdAt:    time.Now(),
+						instanceID:   "instance-id-14",
+						instanceName: instName,
+						vCPUCount:    5,
+					},
+				},
+			},
+		}
+		pool.buildletDone(instName)
+		if gotID := pool.ledger.InstanceID(instName); gotID != "" {
+			t.Errorf("ledger.instanceID = %q; want %q", gotID, "")
+		}
+	})
+}
+
+func TestEC2BuildletClose(t *testing.T) {
+	pool := &EC2Buildlet{
+		done: make(chan struct{}),
+	}
+	defer func() {
+		if r := recover(); r != nil {
+			t.Errorf("EC2Buildlet.Close() paniced=%s", r)
+		}
+	}()
+	pool.Close()
+	pool.Close()
+	select {
+	case _, ok := <-pool.done:
+		if ok {
+			t.Error("EC2Buildlet.done not closed; read from channel")
+		}
+	default:
+		t.Error("EC2Buildlet.done not closed: waiting for read")
+	}
+}
+
+func TestEC2BuildletRetrieveAndSetQuota(t *testing.T) {
+	pool := &EC2Buildlet{
+		awsClient: cloud.NewFakeAWSClient(),
+		ledger:    newLedger(),
+	}
+	err := pool.retrieveAndSetQuota()
+	if err != nil {
+		t.Errorf("EC2Buildlet.retrieveAndSetQuota(ctx) = %s; want nil", err)
+	}
+	if pool.ledger.cpuLimit == 0 {
+		t.Errorf("ledger.cpuLimit = %d; want non-zero", pool.ledger.cpuLimit)
+	}
+}
+
+func TestEC2BuildletRetrieveAndSetInstanceTypes(t *testing.T) {
+	pool := &EC2Buildlet{
+		awsClient: cloud.NewFakeAWSClient(),
+		ledger:    newLedger(),
+	}
+	err := pool.retrieveAndSetInstanceTypes()
+	if err != nil {
+		t.Errorf("EC2Buildlet.retrieveAndSetInstanceTypes() = %s; want nil", err)
+	}
+	if len(pool.ledger.types) == 0 {
+		t.Errorf("len(pool.ledger.types) = %d; want non-zero", len(pool.ledger.types))
+	}
+}
+
+func TestEC2BuildeletDestroyUntrackedInstances(t *testing.T) {
+	awsC := cloud.NewFakeAWSClient()
+	create := func() *cloud.Instance {
+		inst, err := awsC.CreateInstance(context.Background(), &cloud.EC2VMConfiguration{
+			Description: "test instance",
+			ImageID:     "image-x",
+			Name:        instanceName("host-test-type", 10),
+			SSHKeyID:    "key-14",
+			Tags:        map[string]string{},
+			Type:        "type-x",
+			Zone:        "zone-1",
+		})
+		if err != nil {
+			t.Errorf("unable to create instance: %s", err)
+		}
+		return inst
+	}
+	// create untracked instances
+	for it := 0; it < 10; it++ {
+		_ = create()
+	}
+	wantTrackedInst := create()
+	wantRemoteInst := create()
+
+	pool := &EC2Buildlet{
+		awsClient: awsC,
+		isRemoteBuildlet: func(name string) bool {
+			if name == wantRemoteInst.Name {
+				return true
+			}
+			return false
+		},
+		ledger: &ledger{
+			cpuLimit: 200,
+			cpuUsed:  4,
+			entries: map[string]*entry{
+				wantTrackedInst.Name: {
+					createdAt:    time.Now(),
+					instanceID:   wantTrackedInst.ID,
+					instanceName: wantTrackedInst.Name,
+					vCPUCount:    4,
+				},
+			},
+		},
+	}
+	pool.destroyUntrackedInstances()
+	gotInsts, err := awsC.RunningInstances(context.Background())
+	if err != nil || len(gotInsts) != 2 {
+		t.Errorf("awsClient.RunningInstances(ctx) = %+v, %s; want single inst and no error", gotInsts, err)
+	}
+}
+
+// fakeEC2BuildletClient is the client used to create buildlets on EC2.
+type fakeEC2BuildletClient struct {
+	createVMRequestSuccess bool
+	VMCreated              bool
+	buildletCreated        bool
+}
+
+// StartNewVM boots a new VM on EC2, waits until the client is accepting connections
+// on the configured port and returns a buildlet client configured communicate with it.
+func (f *fakeEC2BuildletClient) StartNewVM(ctx context.Context, buildEnv *buildenv.Environment, hconf *dashboard.HostConfig, vmName, hostType string, opts *buildlet.VMOpts) (*buildlet.Client, error) {
+	// check required params
+	if opts == nil || opts.TLS.IsZero() {
+		return nil, errors.New("TLS keypair is not set")
+	}
+	if buildEnv == nil {
+		return nil, errors.New("invalid build enviornment")
+	}
+	if hconf == nil {
+		return nil, errors.New("invalid host configuration")
+	}
+	if vmName == "" || hostType == "" {
+		return nil, fmt.Errorf("invalid vmName: %q and hostType: %q", vmName, hostType)
+	}
+	if opts.DeleteIn == 0 {
+		opts.DeleteIn = 30 * time.Minute
+	}
+	if !f.createVMRequestSuccess {
+		return nil, fmt.Errorf("unable to create instance %s: creation disabled", vmName)
+	}
+	condRun := func(fn func()) {
+		if fn != nil {
+			fn()
+		}
+	}
+	condRun(opts.OnInstanceRequested)
+	if !f.VMCreated {
+		return nil, errors.New("error waiting for instance to exist: vm existance disabled")
+	}
+
+	condRun(opts.OnInstanceCreated)
+
+	if !f.buildletCreated {
+		return nil, errors.New("error waiting for buildlet: buildlet creation disabled")
+	}
+
+	if opts.OnGotEC2InstanceInfo != nil {
+		opts.OnGotEC2InstanceInfo(&cloud.Instance{
+			CPUCount:          4,
+			CreatedAt:         time.Time{},
+			Description:       "sample vm",
+			ID:                "id-" + instanceName("random", 4),
+			IPAddressExternal: "127.0.0.1",
+			IPAddressInternal: "127.0.0.1",
+			ImageID:           "image-x",
+			Name:              vmName,
+			SSHKeyID:          "key-15",
+			SecurityGroups:    nil,
+			State:             "running",
+			Tags: map[string]string{
+				"foo": "bar",
+			},
+			Type: "yy.large",
+			Zone: "zone-a",
+		})
+	}
+	return &buildlet.Client{}, nil
+}
+
+type testLogger struct {
+	eventTimes []eventTime
+	spans      map[string]*span
+}
+
+type eventTime struct {
+	event string
+	opt   []string
+}
+
+type span struct {
+	event      string
+	opt        []string
+	err        error
+	calledDone bool
+}
+
+func (s *span) Done(err error) error {
+	s.err = err
+	s.calledDone = true
+	return nil
+}
+
+func newTestLogger() *testLogger {
+	return &testLogger{
+		eventTimes: make([]eventTime, 0, 5),
+		spans:      make(map[string]*span),
+	}
+}
+
+func (l *testLogger) LogEventTime(event string, optText ...string) {
+	l.eventTimes = append(l.eventTimes, eventTime{
+		event: event,
+		opt:   optText,
+	})
+}
+
+func (l *testLogger) CreateSpan(event string, optText ...string) spanlog.Span {
+	s := &span{
+		event: event,
+		opt:   optText,
+	}
+	l.spans[event] = s
+	return s
+}
+
+func (l *testLogger) spanEvents() []string {
+	se := make([]string, 0, len(l.spans))
+	for k, s := range l.spans {
+		if !s.calledDone {
+			continue
+		}
+		se = append(se, k)
+	}
+	return se
+}
+
+type noopEventTimeLogger struct{}
+
+func (l noopEventTimeLogger) LogEventTime(event string, optText ...string) {}
+func (l noopEventTimeLogger) CreateSpan(event string, optText ...string) spanlog.Span {
+	return noopSpan{}
+}
+
+type noopSpan struct{}
+
+func (s noopSpan) Done(err error) error { return nil }
diff --git a/internal/coordinator/pool/gce.go b/internal/coordinator/pool/gce.go
index 76a1b03..2a1a21a 100644
--- a/internal/coordinator/pool/gce.go
+++ b/internal/coordinator/pool/gce.go
@@ -61,12 +61,9 @@
 	<-apiCallTicker.C
 }
 
-// IsGCERemoteBuildletFunc should return true if the buildlet instance name is
-// is a GCE remote buildlet.
-type IsGCERemoteBuildletFunc func(instanceName string) bool
-
 // Initialized by InitGCE:
-// TODO(http://golang.org/issue/38337): These should be moved into a struct as
+//
+// TODO(golang.org/issue/38337): These should be moved into a struct as
 // part of the effort to reduce package level variables.
 var (
 	buildEnv *buildenv.Environment
@@ -92,11 +89,11 @@
 	deleteTimeout       time.Duration
 	testFiles           map[string]string
 	basePinErr          *atomic.Value
-	isGCERemoteBuildlet IsGCERemoteBuildletFunc
+	isGCERemoteBuildlet IsRemoteBuildletFunc
 )
 
 // InitGCE initializes the GCE buildlet pool.
-func InitGCE(sc *secret.Client, vmDeleteTimeout time.Duration, tFiles map[string]string, basePin *atomic.Value, fn IsGCERemoteBuildletFunc, buildEnvName, mode string) error {
+func InitGCE(sc *secret.Client, vmDeleteTimeout time.Duration, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error {
 	gceMode = mode
 	deleteTimeout = vmDeleteTimeout
 	testFiles = tFiles
@@ -220,7 +217,7 @@
 	return nil
 }
 
-// TODO(http://golang.org/issue/38337): These should be moved into a struct as
+// TODO(golang.org/issue/38337): These should be moved into a struct as
 // part of the effort to reduce package level variables.
 
 // GCEConfiguration manages and contains all of the GCE configuration.
@@ -404,10 +401,7 @@
 		return nil, err
 	}
 
-	deleteIn, ok := ctx.Value(BuildletTimeoutOpt{}).(time.Duration)
-	if !ok {
-		deleteIn = deleteTimeout
-	}
+	deleteIn := deleteTimeoutFromContextOrValue(ctx, deleteTimeout)
 
 	instName := instanceName(hostType, 7)
 	instName = strings.Replace(instName, "_", "-", -1) // Issue 22905; can't use underscores in GCE VMs
@@ -633,8 +627,8 @@
 	}
 
 	// TODO(bradfitz): remove this list and just query it from the compute API?
-	// http://godoc.org/google.golang.org/api/compute/v1#RegionsService.Get
-	// and Region.Zones: http://godoc.org/google.golang.org/api/compute/v1#Region
+	// https://godoc.org/google.golang.org/api/compute/v1#RegionsService.Get
+	// and Region.Zones: https://godoc.org/google.golang.org/api/compute/v1#Region
 
 	for {
 		for _, zone := range buildEnv.VMZones {
diff --git a/internal/coordinator/pool/pool.go b/internal/coordinator/pool/pool.go
index 1330a6c..d430bc2 100644
--- a/internal/coordinator/pool/pool.go
+++ b/internal/coordinator/pool/pool.go
@@ -36,6 +36,13 @@
 	String() string // TODO(bradfitz): more status stuff
 }
 
+// IsRemoteBuildletFunc should report whether the buildlet instance name is
+// is a remote buildlet. This is applicable to GCE and EC2 instances.
+//
+// TODO(golang.org/issue/38337): should be removed once remote buildlet management
+// functions are moved into a package.
+type IsRemoteBuildletFunc func(instanceName string) bool
+
 // randHex generates a random hex string.
 func randHex(n int) string {
 	buf := make([]byte, n/2+1)
@@ -58,6 +65,23 @@
 	return d2.String()
 }
 
+// instanceName generates a random instance name according to the host type.
 func instanceName(hostType string, length int) string {
 	return fmt.Sprintf("buildlet-%s-rn%s", strings.TrimPrefix(hostType, "host-"), randHex(length))
 }
+
+// deleteTimeoutFromContextOrValue retrieves the buildlet timeout duration from the
+// context. If it it is not found in the context, it will fallback to using the timeout passed
+// into the function.
+func deleteTimeoutFromContextOrValue(ctx context.Context, timeout time.Duration) time.Duration {
+	deleteIn, ok := ctx.Value(BuildletTimeoutOpt{}).(time.Duration)
+	if !ok {
+		deleteIn = timeout
+	}
+	return deleteIn
+}
+
+// isBuildlet checks the name string in order to determine if the name is for a buildlet.
+func isBuildlet(name string) bool {
+	return strings.HasPrefix(name, "buildlet-")
+}
diff --git a/internal/coordinator/pool/pool_test.go b/internal/coordinator/pool/pool_test.go
new file mode 100644
index 0000000..25d665b
--- /dev/null
+++ b/internal/coordinator/pool/pool_test.go
@@ -0,0 +1,62 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pool
+
+import (
+	"context"
+	"testing"
+	"time"
+)
+
+func TestPoolDeleteTimeoutFromContextOrValue(t *testing.T) {
+	testCases := []struct {
+		desc        string
+		ctxValue    interface{}
+		timeout     time.Duration
+		wantTimeout time.Duration
+	}{
+		{
+			desc:        "from-context",
+			ctxValue:    time.Hour,
+			timeout:     time.Minute,
+			wantTimeout: time.Hour,
+		},
+		{
+			desc:        "from-argument",
+			ctxValue:    nil,
+			timeout:     time.Minute,
+			wantTimeout: time.Minute,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.desc, func(t *testing.T) {
+			ctx := context.Background()
+			if tc.ctxValue != nil {
+				ctx = context.WithValue(ctx, BuildletTimeoutOpt{}, tc.ctxValue)
+			}
+			if got := deleteTimeoutFromContextOrValue(ctx, tc.timeout); got != tc.wantTimeout {
+				t.Errorf("deleteTimeoutFromContextOrValue(%+v, %s) = %s; want %s", ctx, tc.timeout, got, tc.wantTimeout)
+			}
+		})
+	}
+}
+
+func TestPoolIsBuildlet(t *testing.T) {
+	testCases := []struct {
+		desc string
+		name string
+		want bool
+	}{
+		{"valid", "buildlet-gce-tinker", true},
+		{"invalid", "gce-tinker", false},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.desc, func(t *testing.T) {
+			if got := isBuildlet(tc.name); got != tc.want {
+				t.Errorf("isBuildlet(%q) = %t; want %t", tc.name, got, tc.want)
+			}
+		})
+	}
+}