syscall: add AmbientCaps to linux SysProcAttr

Fixes #19713

Change-Id: Id1ca61b35bca2a4bea23dd64c7fb001a3a14fd88
Reviewed-on: https://go-review.googlesource.com/43512
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go
index 5cbabf7..fd87b86 100644
--- a/src/syscall/exec_linux.go
+++ b/src/syscall/exec_linux.go
@@ -40,6 +40,7 @@
 	// This parameter is no-op if GidMappings == nil. Otherwise for unprivileged
 	// users this should be set to false for mappings work.
 	GidMappingsEnableSetgroups bool
+	AmbientCaps                []uintptr // Ambient capabilities (Linux only)
 }
 
 var (
@@ -101,6 +102,12 @@
 //go:noinline
 //go:norace
 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
+	// Defined in linux/prctl.h starting with Linux 4.3.
+	const (
+		PR_CAP_AMBIENT       = 0x2f
+		PR_CAP_AMBIENT_RAISE = 0x2
+	)
+
 	// vfork requires that the child not touch any of the parent's
 	// active stack frames. Hence, the child does all post-fork
 	// processing in this stack frame and never returns, while the
@@ -165,6 +172,14 @@
 
 	runtime_AfterForkInChild()
 
+	// Enable the "keep capabilities" flag to set ambient capabilities later.
+	if len(sys.AmbientCaps) > 0 {
+		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
+		if err1 != 0 {
+			goto childerror
+		}
+	}
+
 	// Wait for User ID/Group ID mappings to be written.
 	if sys.UidMappings != nil || sys.GidMappings != nil {
 		if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
@@ -279,6 +294,13 @@
 		}
 	}
 
+	for _, c := range sys.AmbientCaps {
+		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
+		if err1 != 0 {
+			goto childerror
+		}
+	}
+
 	// Chdir
 	if dir != nil {
 		_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
diff --git a/src/syscall/exec_linux_test.go b/src/syscall/exec_linux_test.go
index 7a88968..854060b 100644
--- a/src/syscall/exec_linux_test.go
+++ b/src/syscall/exec_linux_test.go
@@ -9,13 +9,17 @@
 import (
 	"flag"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"os"
 	"os/exec"
+	"os/user"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"syscall"
 	"testing"
+	"unsafe"
 )
 
 // Check if we are in a chroot by checking if the inode of / is
@@ -382,3 +386,158 @@
 		t.Errorf("rmdir failed on %v: %v", d, err)
 	}
 }
+
+type capHeader struct {
+	version uint32
+	pid     int
+}
+
+type capData struct {
+	effective   uint32
+	permitted   uint32
+	inheritable uint32
+}
+
+const CAP_SYS_TIME = 25
+
+type caps struct {
+	hdr  capHeader
+	data [2]capData
+}
+
+func getCaps() (caps, error) {
+	var c caps
+
+	// Get capability version
+	if _, _, errno := syscall.Syscall(syscall.SYS_CAPGET, uintptr(unsafe.Pointer(&c.hdr)), uintptr(unsafe.Pointer(nil)), 0); errno != 0 {
+		return c, fmt.Errorf("SYS_CAPGET: %v", errno)
+	}
+
+	// Get current capabilities
+	if _, _, errno := syscall.Syscall(syscall.SYS_CAPGET, uintptr(unsafe.Pointer(&c.hdr)), uintptr(unsafe.Pointer(&c.data[0])), 0); errno != 0 {
+		return c, fmt.Errorf("SYS_CAPGET: %v", errno)
+	}
+
+	return c, nil
+}
+
+func mustSupportAmbientCaps(t *testing.T) {
+	var uname syscall.Utsname
+	if err := syscall.Uname(&uname); err != nil {
+		t.Fatalf("Uname: %v", err)
+	}
+	var buf [65]byte
+	for i, b := range uname.Release {
+		buf[i] = byte(b)
+	}
+	ver := string(buf[:])
+	if i := strings.Index(ver, "\x00"); i != -1 {
+		ver = ver[:i]
+	}
+	if strings.HasPrefix(ver, "2.") ||
+		strings.HasPrefix(ver, "3.") ||
+		strings.HasPrefix(ver, "4.1.") ||
+		strings.HasPrefix(ver, "4.2.") {
+		t.Skipf("kernel version %q predates required 4.3; skipping test", ver)
+	}
+}
+
+// TestAmbientCapsHelper isn't a real test. It's used as a helper process for
+// TestAmbientCaps.
+func TestAmbientCapsHelper(*testing.T) {
+	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
+		return
+	}
+	defer os.Exit(0)
+
+	caps, err := getCaps()
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+	if caps.data[0].effective&(1<<uint(CAP_SYS_TIME)) == 0 {
+		fmt.Fprintln(os.Stderr, "CAP_SYS_TIME unexpectedly not in the effective capability mask")
+		os.Exit(2)
+	}
+}
+
+func TestAmbientCaps(t *testing.T) {
+	// Make sure we are running as root so we have permissions to use unshare
+	// and create a network namespace.
+	if os.Getuid() != 0 {
+		t.Skip("kernel prohibits unshare in unprivileged process, unless using user namespace")
+	}
+	mustSupportAmbientCaps(t)
+
+	// When running under the Go continuous build, skip tests for
+	// now when under Kubernetes. (where things are root but not quite)
+	// Both of these are our own environment variables.
+	// See Issue 12815.
+	if os.Getenv("GO_BUILDER_NAME") != "" && os.Getenv("IN_KUBERNETES") == "1" {
+		t.Skip("skipping test on Kubernetes-based builders; see Issue 12815")
+	}
+
+	caps, err := getCaps()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Add CAP_SYS_TIME to the permitted and inheritable capability mask,
+	// otherwise we will not be able to add it to the ambient capability mask.
+	caps.data[0].permitted |= 1 << uint(CAP_SYS_TIME)
+	caps.data[0].inheritable |= 1 << uint(CAP_SYS_TIME)
+
+	if _, _, errno := syscall.Syscall(syscall.SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); errno != 0 {
+		t.Fatalf("SYS_CAPSET: %v", errno)
+	}
+
+	u, err := user.Lookup("nobody")
+	if err != nil {
+		t.Fatal(err)
+	}
+	uid, err := strconv.ParseInt(u.Uid, 0, 32)
+	if err != nil {
+		t.Fatal(err)
+	}
+	gid, err := strconv.ParseInt(u.Gid, 0, 32)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Copy the test binary to a temporary location which is readable by nobody.
+	f, err := ioutil.TempFile("", "gotest")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.Remove(f.Name())
+	defer f.Close()
+	e, err := os.Open(os.Args[0])
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer e.Close()
+	if _, err := io.Copy(f, e); err != nil {
+		t.Fatal(err)
+	}
+	if err := f.Chmod(0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := f.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	cmd := exec.Command(f.Name(), "-test.run=TestAmbientCapsHelper")
+	cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		Credential: &syscall.Credential{
+			Uid: uint32(uid),
+			Gid: uint32(gid),
+		},
+		AmbientCaps: []uintptr{CAP_SYS_TIME},
+	}
+	if err := cmd.Run(); err != nil {
+		t.Fatal(err.Error())
+	}
+}