unix: add CPUSetDynamic for systems with more than 1024 CPUs

The existing CPUSet type is a fixed-size array limited to 1024 CPUs,
which makes it problematic to use for large systems (such as Google's
X4 instances with 1440 and 1920 vCPUs), see e.g.
https://github.com/opencontainers/runc/issues/5023.

Introduce CPUSetDynamic type and NewCPUSet constructor to support large
systems. The bit-managing routines (set/clear/isset/fill/count) are
separated and reused.

Add variants of SchedGetaffinity, SchedSetaffinity and SetMemPolicy
that accept the new type.

Amend the documentation for CPUSet.

Amend the existing TestSchedSetaffinity to:
 - test set.Fill;
 - use t.Cleanup to restore the affinity.

Add tests for new functionality (mostly a copy of existing tests).

This is an alternative to CL 727540 / CL 727541.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Change-Id: I51bba0305b8dfa7a88a4e7fb8758d73f798574f1
Reviewed-on: https://go-review.googlesource.com/c/sys/+/735380
Reviewed-by: Tobias Klauser <tobias.klauser@gmail.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
diff --git a/unix/affinity_linux.go b/unix/affinity_linux.go
index 3ea4703..acd6257 100644
--- a/unix/affinity_linux.go
+++ b/unix/affinity_linux.go
@@ -13,11 +13,19 @@
 
 const cpuSetSize = _CPU_SETSIZE / _NCPUBITS
 
-// CPUSet represents a CPU affinity mask.
+// CPUSet represents a bit mask of CPUs, to be used with [SchedGetaffinity], [SchedSetaffinity],
+// and [SetMemPolicy].
+//
+// Note this type can only represent CPU IDs 0 through 1023.
+// Use [CPUSetDynamic]/[NewCPUSet] instead to avoid this limit.
 type CPUSet [cpuSetSize]cpuMask
 
-func schedAffinity(trap uintptr, pid int, set *CPUSet) error {
-	_, _, e := RawSyscall(trap, uintptr(pid), uintptr(unsafe.Sizeof(*set)), uintptr(unsafe.Pointer(set)))
+// CPUSetDynamic represents a bit mask of CPUs, to be used with [SchedGetaffinityDynamic],
+// [SchedSetaffinityDynamic], and [SetMemPolicyDynamic]. Use [NewCPUSet] to allocate.
+type CPUSetDynamic []cpuMask
+
+func schedAffinity(trap uintptr, pid int, size uintptr, ptr unsafe.Pointer) error {
+	_, _, e := RawSyscall(trap, uintptr(pid), uintptr(size), uintptr(ptr))
 	if e != 0 {
 		return errnoErr(e)
 	}
@@ -27,13 +35,13 @@
 // SchedGetaffinity gets the CPU affinity mask of the thread specified by pid.
 // If pid is 0 the calling thread is used.
 func SchedGetaffinity(pid int, set *CPUSet) error {
-	return schedAffinity(SYS_SCHED_GETAFFINITY, pid, set)
+	return schedAffinity(SYS_SCHED_GETAFFINITY, pid, unsafe.Sizeof(*set), unsafe.Pointer(set))
 }
 
 // SchedSetaffinity sets the CPU affinity mask of the thread specified by pid.
 // If pid is 0 the calling thread is used.
 func SchedSetaffinity(pid int, set *CPUSet) error {
-	return schedAffinity(SYS_SCHED_SETAFFINITY, pid, set)
+	return schedAffinity(SYS_SCHED_SETAFFINITY, pid, unsafe.Sizeof(*set), unsafe.Pointer(set))
 }
 
 // Zero clears the set s, so that it contains no CPUs.
@@ -45,9 +53,7 @@
 // will silently ignore any invalid CPU bits in [CPUSet] so this is an
 // efficient way of resetting the CPU affinity of a process.
 func (s *CPUSet) Fill() {
-	for i := range s {
-		s[i] = ^cpuMask(0)
-	}
+	cpuMaskFill(s[:])
 }
 
 func cpuBitsIndex(cpu int) int {
@@ -58,24 +64,27 @@
 	return cpuMask(1 << (uint(cpu) % _NCPUBITS))
 }
 
-// Set adds cpu to the set s.
-func (s *CPUSet) Set(cpu int) {
+func cpuMaskFill(s []cpuMask) {
+	for i := range s {
+		s[i] = ^cpuMask(0)
+	}
+}
+
+func cpuMaskSet(s []cpuMask, cpu int) {
 	i := cpuBitsIndex(cpu)
 	if i < len(s) {
 		s[i] |= cpuBitsMask(cpu)
 	}
 }
 
-// Clear removes cpu from the set s.
-func (s *CPUSet) Clear(cpu int) {
+func cpuMaskClear(s []cpuMask, cpu int) {
 	i := cpuBitsIndex(cpu)
 	if i < len(s) {
 		s[i] &^= cpuBitsMask(cpu)
 	}
 }
 
-// IsSet reports whether cpu is in the set s.
-func (s *CPUSet) IsSet(cpu int) bool {
+func cpuMaskIsSet(s []cpuMask, cpu int) bool {
 	i := cpuBitsIndex(cpu)
 	if i < len(s) {
 		return s[i]&cpuBitsMask(cpu) != 0
@@ -83,11 +92,98 @@
 	return false
 }
 
-// Count returns the number of CPUs in the set s.
-func (s *CPUSet) Count() int {
+func cpuMaskCount(s []cpuMask) int {
 	c := 0
 	for _, b := range s {
 		c += bits.OnesCount64(uint64(b))
 	}
 	return c
 }
+
+// Set adds cpu to the set s. If cpu is out of bounds for s, no action is taken.
+func (s *CPUSet) Set(cpu int) {
+	cpuMaskSet(s[:], cpu)
+}
+
+// Clear removes cpu from the set s. If cpu is out of bounds for s, no action is taken.
+func (s *CPUSet) Clear(cpu int) {
+	cpuMaskClear(s[:], cpu)
+}
+
+// IsSet reports whether cpu is in the set s.
+func (s *CPUSet) IsSet(cpu int) bool {
+	return cpuMaskIsSet(s[:], cpu)
+}
+
+// Count returns the number of CPUs in the set s.
+func (s *CPUSet) Count() int {
+	return cpuMaskCount(s[:])
+}
+
+// NewCPUSet creates a CPU affinity mask capable of representing CPU IDs
+// up to maxCPU (exclusive).
+func NewCPUSet(maxCPU int) CPUSetDynamic {
+	numMasks := (maxCPU + _NCPUBITS - 1) / _NCPUBITS
+	if numMasks == 0 {
+		numMasks = 1
+	}
+	return make(CPUSetDynamic, numMasks)
+}
+
+// Zero clears the set s, so that it contains no CPUs.
+func (s CPUSetDynamic) Zero() {
+	clear(s)
+}
+
+// Fill adds all possible CPU bits to the set s. On Linux, [SchedSetaffinityDynamic]
+// will silently ignore any invalid CPU bits in [CPUSetDynamic] so this is an
+// efficient way of resetting the CPU affinity of a process.
+func (s CPUSetDynamic) Fill() {
+	cpuMaskFill(s)
+}
+
+// Set adds cpu to the set s. If cpu is out of bounds for s, no action is taken.
+func (s CPUSetDynamic) Set(cpu int) {
+	cpuMaskSet(s, cpu)
+}
+
+// Clear removes cpu from the set s. If cpu is out of bounds for s, no action is taken.
+func (s CPUSetDynamic) Clear(cpu int) {
+	cpuMaskClear(s, cpu)
+}
+
+// IsSet reports whether cpu is in the set s.
+func (s CPUSetDynamic) IsSet(cpu int) bool {
+	return cpuMaskIsSet(s, cpu)
+}
+
+// Count returns the number of CPUs in the set s.
+func (s CPUSetDynamic) Count() int {
+	return cpuMaskCount(s)
+}
+
+func (s CPUSetDynamic) size() uintptr {
+	return uintptr(len(s)) * unsafe.Sizeof(cpuMask(0))
+}
+
+func (s CPUSetDynamic) pointer() unsafe.Pointer {
+	if len(s) == 0 {
+		return nil
+	}
+	return unsafe.Pointer(&s[0])
+}
+
+// SchedGetaffinityDynamic gets the CPU affinity mask of the thread specified by pid.
+// If pid is 0 the calling thread is used.
+//
+// If the set is smaller than the size of the affinity mask used by the kernel,
+// [EINVAL] is returned.
+func SchedGetaffinityDynamic(pid int, set CPUSetDynamic) error {
+	return schedAffinity(SYS_SCHED_GETAFFINITY, pid, set.size(), set.pointer())
+}
+
+// SchedSetaffinityDynamic sets the CPU affinity mask of the thread specified by pid.
+// If pid is 0 the calling thread is used.
+func SchedSetaffinityDynamic(pid int, set CPUSetDynamic) error {
+	return schedAffinity(SYS_SCHED_SETAFFINITY, pid, set.size(), set.pointer())
+}
diff --git a/unix/syscall_linux.go b/unix/syscall_linux.go
index 06c0eea..f7b82bc 100644
--- a/unix/syscall_linux.go
+++ b/unix/syscall_linux.go
@@ -2644,8 +2644,12 @@
 //sys	Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error)
 //sys	Mseal(b []byte, flags uint) (err error)
 
-//sys	setMemPolicy(mode int, mask *CPUSet, size int) (err error) = SYS_SET_MEMPOLICY
+//sys	setMemPolicy(mode int, mask unsafe.Pointer, size uintptr) (err error) = SYS_SET_MEMPOLICY
 
 func SetMemPolicy(mode int, mask *CPUSet) error {
-	return setMemPolicy(mode, mask, _CPU_SETSIZE)
+	return setMemPolicy(mode, unsafe.Pointer(mask), _CPU_SETSIZE)
+}
+
+func SetMemPolicyDynamic(mode int, mask CPUSetDynamic) error {
+	return setMemPolicy(mode, mask.pointer(), mask.size())
 }
diff --git a/unix/syscall_linux_test.go b/unix/syscall_linux_test.go
index d3075ca..99332ea 100644
--- a/unix/syscall_linux_test.go
+++ b/unix/syscall_linux_test.go
@@ -19,6 +19,7 @@
 	"path/filepath"
 	"runtime"
 	"runtime/debug"
+	"slices"
 	"strconv"
 	"strings"
 	"syscall"
@@ -512,7 +513,12 @@
 }
 
 func TestSchedSetaffinity(t *testing.T) {
+	const maxcpus = 1024 // _CPU_SETSIZE
 	var newMask unix.CPUSet
+	newMask.Fill()
+	if count := newMask.Count(); count != maxcpus {
+		t.Errorf("Fill: got %d CPUs, want %d", count, maxcpus)
+	}
 	newMask.Zero()
 	if newMask.Count() != 0 {
 		t.Errorf("CpuZero: didn't zero CPU set: %v", newMask)
@@ -566,6 +572,14 @@
 		}
 	}
 
+	t.Cleanup(func() {
+		// Restore old mask so it doesn't affect successive tests.
+		err = unix.SchedSetaffinity(0, &oldMask)
+		if err != nil {
+			t.Fatalf("SchedSetaffinity: %v", err)
+		}
+	})
+
 	err = unix.SchedSetaffinity(0, &newMask)
 	if err != nil {
 		t.Fatalf("SchedSetaffinity: %v", err)
@@ -580,11 +594,90 @@
 	if gotMask != newMask {
 		t.Errorf("SchedSetaffinity: returned affinity mask does not match set affinity mask")
 	}
+}
 
-	// Restore old mask so it doesn't affect successive tests
-	err = unix.SchedSetaffinity(0, &oldMask)
+func TestSchedSetaffinityDynamic(t *testing.T) {
+	const maxcpus = 4096
+
+	newMask := unix.NewCPUSet(maxcpus)
+	newMask.Fill()
+	if count := newMask.Count(); count != maxcpus {
+		t.Errorf("Fill: got %d CPUs, want %d", count, maxcpus)
+	}
+	newMask.Zero()
+	if newMask.Count() != 0 {
+		t.Errorf("Zero: didn't zero CPU set: %v", newMask)
+	}
+	cpu := 1
+	newMask.Set(cpu)
+	if newMask.Count() != 1 || !newMask.IsSet(cpu) {
+		t.Errorf("Set: didn't set CPU %d in set: %v", cpu, newMask)
+	}
+	cpu = 5
+	newMask.Set(cpu)
+	if newMask.Count() != 2 || !newMask.IsSet(cpu) {
+		t.Errorf("Set: didn't set CPU %d in set: %v", cpu, newMask)
+	}
+	newMask.Clear(cpu)
+	if newMask.Count() != 1 || newMask.IsSet(cpu) {
+		t.Errorf("Clear: didn't clear CPU %d in set: %v", cpu, newMask)
+	}
+
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	oldMask := unix.NewCPUSet(maxcpus)
+	err := unix.SchedGetaffinityDynamic(0, oldMask)
 	if err != nil {
-		t.Fatalf("SchedSetaffinity: %v", err)
+		t.Fatalf("SchedGetaffinityDynamic: %v", err)
+	}
+
+	if runtime.NumCPU() < 2 {
+		t.Skip("skipping setaffinity tests on single CPU system")
+	}
+	if runtime.GOOS == "android" {
+		t.Skip("skipping setaffinity tests on android")
+	}
+
+	// On a system like ppc64x where some cores can be disabled using ppc64_cpu,
+	// setaffinity should only be called with enabled cores. The valid cores
+	// are found from the oldMask, but if none are found then the setaffinity
+	// tests are skipped. Issue #27875.
+	cpu = 1
+	if !oldMask.IsSet(cpu) {
+		newMask.Zero()
+		for i := range len(oldMask) {
+			if oldMask.IsSet(i) {
+				newMask.Set(i)
+				break
+			}
+		}
+		if newMask.Count() == 0 {
+			t.Skip("skipping setaffinity tests if CPU not available")
+		}
+	}
+
+	t.Cleanup(func() {
+		// Restore old mask so it doesn't affect successive tests.
+		err = unix.SchedSetaffinityDynamic(0, oldMask)
+		if err != nil {
+			t.Fatalf("SchedSetaffinityDynamic: %v", err)
+		}
+	})
+
+	err = unix.SchedSetaffinityDynamic(0, newMask)
+	if err != nil {
+		t.Fatalf("SchedSetaffinityDynamic: %v", err)
+	}
+
+	gotMask := unix.NewCPUSet(maxcpus)
+	err = unix.SchedGetaffinityDynamic(0, gotMask)
+	if err != nil {
+		t.Fatalf("SchedGetaffinityDynamic: %v", err)
+	}
+
+	if !slices.Equal(gotMask, newMask) {
+		t.Errorf("SchedSetaffinityDynamic: returned affinity mask does not match set affinity mask (%+v != %+v)", gotMask, newMask)
 	}
 }
 
diff --git a/unix/zsyscall_linux.go b/unix/zsyscall_linux.go
index 8935d10..886f5de 100644
--- a/unix/zsyscall_linux.go
+++ b/unix/zsyscall_linux.go
@@ -2241,8 +2241,8 @@
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
-func setMemPolicy(mode int, mask *CPUSet, size int) (err error) {
-	_, _, e1 := Syscall(SYS_SET_MEMPOLICY, uintptr(mode), uintptr(unsafe.Pointer(mask)), uintptr(size))
+func setMemPolicy(mode int, mask unsafe.Pointer, size uintptr) (err error) {
+	_, _, e1 := Syscall(SYS_SET_MEMPOLICY, uintptr(mode), uintptr(mask), uintptr(size))
 	if e1 != 0 {
 		err = errnoErr(e1)
 	}