unix: add CPUSetDynamic for systems with more than 1024 CPUs The existing CPUSet type is a fixed-size array limited to 1024 CPUs, which makes it problematic to use for large systems (such as Google's X4 instances with 1440 and 1920 vCPUs), see e.g. https://github.com/opencontainers/runc/issues/5023. Introduce CPUSetDynamic type and NewCPUSet constructor to support large systems. The bit-managing routines (set/clear/isset/fill/count) are separated and reused. Add variants of SchedGetaffinity, SchedSetaffinity and SetMemPolicy that accept the new type. Amend the documentation for CPUSet. Amend the existing TestSchedSetaffinity to: - test set.Fill; - use t.Cleanup to restore the affinity. Add tests for new functionality (mostly a copy of existing tests). This is an alternative to CL 727540 / CL 727541. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> Change-Id: I51bba0305b8dfa7a88a4e7fb8758d73f798574f1 Reviewed-on: https://go-review.googlesource.com/c/sys/+/735380 Reviewed-by: Tobias Klauser <tobias.klauser@gmail.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Michael Pratt <mpratt@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
diff --git a/unix/affinity_linux.go b/unix/affinity_linux.go index 3ea4703..acd6257 100644 --- a/unix/affinity_linux.go +++ b/unix/affinity_linux.go
@@ -13,11 +13,19 @@ const cpuSetSize = _CPU_SETSIZE / _NCPUBITS -// CPUSet represents a CPU affinity mask. +// CPUSet represents a bit mask of CPUs, to be used with [SchedGetaffinity], [SchedSetaffinity], +// and [SetMemPolicy]. +// +// Note this type can only represent CPU IDs 0 through 1023. +// Use [CPUSetDynamic]/[NewCPUSet] instead to avoid this limit. type CPUSet [cpuSetSize]cpuMask -func schedAffinity(trap uintptr, pid int, set *CPUSet) error { - _, _, e := RawSyscall(trap, uintptr(pid), uintptr(unsafe.Sizeof(*set)), uintptr(unsafe.Pointer(set))) +// CPUSetDynamic represents a bit mask of CPUs, to be used with [SchedGetaffinityDynamic], +// [SchedSetaffinityDynamic], and [SetMemPolicyDynamic]. Use [NewCPUSet] to allocate. +type CPUSetDynamic []cpuMask + +func schedAffinity(trap uintptr, pid int, size uintptr, ptr unsafe.Pointer) error { + _, _, e := RawSyscall(trap, uintptr(pid), uintptr(size), uintptr(ptr)) if e != 0 { return errnoErr(e) } @@ -27,13 +35,13 @@ // SchedGetaffinity gets the CPU affinity mask of the thread specified by pid. // If pid is 0 the calling thread is used. func SchedGetaffinity(pid int, set *CPUSet) error { - return schedAffinity(SYS_SCHED_GETAFFINITY, pid, set) + return schedAffinity(SYS_SCHED_GETAFFINITY, pid, unsafe.Sizeof(*set), unsafe.Pointer(set)) } // SchedSetaffinity sets the CPU affinity mask of the thread specified by pid. // If pid is 0 the calling thread is used. func SchedSetaffinity(pid int, set *CPUSet) error { - return schedAffinity(SYS_SCHED_SETAFFINITY, pid, set) + return schedAffinity(SYS_SCHED_SETAFFINITY, pid, unsafe.Sizeof(*set), unsafe.Pointer(set)) } // Zero clears the set s, so that it contains no CPUs. @@ -45,9 +53,7 @@ // will silently ignore any invalid CPU bits in [CPUSet] so this is an // efficient way of resetting the CPU affinity of a process. 
func (s *CPUSet) Fill() { - for i := range s { - s[i] = ^cpuMask(0) - } + cpuMaskFill(s[:]) } func cpuBitsIndex(cpu int) int { @@ -58,24 +64,27 @@ return cpuMask(1 << (uint(cpu) % _NCPUBITS)) } -// Set adds cpu to the set s. -func (s *CPUSet) Set(cpu int) { +func cpuMaskFill(s []cpuMask) { + for i := range s { + s[i] = ^cpuMask(0) + } +} + +func cpuMaskSet(s []cpuMask, cpu int) { i := cpuBitsIndex(cpu) if i < len(s) { s[i] |= cpuBitsMask(cpu) } } -// Clear removes cpu from the set s. -func (s *CPUSet) Clear(cpu int) { +func cpuMaskClear(s []cpuMask, cpu int) { i := cpuBitsIndex(cpu) if i < len(s) { s[i] &^= cpuBitsMask(cpu) } } -// IsSet reports whether cpu is in the set s. -func (s *CPUSet) IsSet(cpu int) bool { +func cpuMaskIsSet(s []cpuMask, cpu int) bool { i := cpuBitsIndex(cpu) if i < len(s) { return s[i]&cpuBitsMask(cpu) != 0 @@ -83,11 +92,98 @@ return false } -// Count returns the number of CPUs in the set s. -func (s *CPUSet) Count() int { +func cpuMaskCount(s []cpuMask) int { c := 0 for _, b := range s { c += bits.OnesCount64(uint64(b)) } return c } + +// Set adds cpu to the set s. If cpu is out of bounds for s, no action is taken. +func (s *CPUSet) Set(cpu int) { + cpuMaskSet(s[:], cpu) +} + +// Clear removes cpu from the set s. If cpu is out of bounds for s, no action is taken. +func (s *CPUSet) Clear(cpu int) { + cpuMaskClear(s[:], cpu) +} + +// IsSet reports whether cpu is in the set s. +func (s *CPUSet) IsSet(cpu int) bool { + return cpuMaskIsSet(s[:], cpu) +} + +// Count returns the number of CPUs in the set s. +func (s *CPUSet) Count() int { + return cpuMaskCount(s[:]) +} + +// NewCPUSet creates a CPU affinity mask capable of representing CPU IDs +// up to maxCPU (exclusive). +func NewCPUSet(maxCPU int) CPUSetDynamic { + numMasks := (maxCPU + _NCPUBITS - 1) / _NCPUBITS + if numMasks == 0 { + numMasks = 1 + } + return make(CPUSetDynamic, numMasks) +} + +// Zero clears the set s, so that it contains no CPUs. 
+func (s CPUSetDynamic) Zero() { + clear(s) +} + +// Fill adds all possible CPU bits to the set s. On Linux, [SchedSetaffinityDynamic] +// will silently ignore any invalid CPU bits in [CPUSetDynamic] so this is an +// efficient way of resetting the CPU affinity of a process. +func (s CPUSetDynamic) Fill() { + cpuMaskFill(s) +} + +// Set adds cpu to the set s. If cpu is out of bounds for s, no action is taken. +func (s CPUSetDynamic) Set(cpu int) { + cpuMaskSet(s, cpu) +} + +// Clear removes cpu from the set s. If cpu is out of bounds for s, no action is taken. +func (s CPUSetDynamic) Clear(cpu int) { + cpuMaskClear(s, cpu) +} + +// IsSet reports whether cpu is in the set s. +func (s CPUSetDynamic) IsSet(cpu int) bool { + return cpuMaskIsSet(s, cpu) +} + +// Count returns the number of CPUs in the set s. +func (s CPUSetDynamic) Count() int { + return cpuMaskCount(s) +} + +func (s CPUSetDynamic) size() uintptr { + return uintptr(len(s)) * unsafe.Sizeof(cpuMask(0)) +} + +func (s CPUSetDynamic) pointer() unsafe.Pointer { + if len(s) == 0 { + return nil + } + return unsafe.Pointer(&s[0]) +} + +// SchedGetaffinityDynamic gets the CPU affinity mask of the thread specified by pid. +// If pid is 0 the calling thread is used. +// +// If the set is smaller than the size of the affinity mask used by the kernel, +// [EINVAL] is returned. +func SchedGetaffinityDynamic(pid int, set CPUSetDynamic) error { + return schedAffinity(SYS_SCHED_GETAFFINITY, pid, set.size(), set.pointer()) +} + +// SchedSetaffinityDynamic sets the CPU affinity mask of the thread specified by pid. +// If pid is 0 the calling thread is used. +func SchedSetaffinityDynamic(pid int, set CPUSetDynamic) error { + return schedAffinity(SYS_SCHED_SETAFFINITY, pid, set.size(), set.pointer()) +}
diff --git a/unix/syscall_linux.go b/unix/syscall_linux.go index 06c0eea..f7b82bc 100644 --- a/unix/syscall_linux.go +++ b/unix/syscall_linux.go
@@ -2644,8 +2644,12 @@ //sys Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error) //sys Mseal(b []byte, flags uint) (err error) -//sys setMemPolicy(mode int, mask *CPUSet, size int) (err error) = SYS_SET_MEMPOLICY +//sys setMemPolicy(mode int, mask unsafe.Pointer, size uintptr) (err error) = SYS_SET_MEMPOLICY func SetMemPolicy(mode int, mask *CPUSet) error { - return setMemPolicy(mode, mask, _CPU_SETSIZE) + return setMemPolicy(mode, unsafe.Pointer(mask), _CPU_SETSIZE) +} + +func SetMemPolicyDynamic(mode int, mask CPUSetDynamic) error { + return setMemPolicy(mode, mask.pointer(), mask.size()) }
diff --git a/unix/syscall_linux_test.go b/unix/syscall_linux_test.go index d3075ca..99332ea 100644 --- a/unix/syscall_linux_test.go +++ b/unix/syscall_linux_test.go
@@ -19,6 +19,7 @@ "path/filepath" "runtime" "runtime/debug" + "slices" "strconv" "strings" "syscall" @@ -512,7 +513,12 @@ } func TestSchedSetaffinity(t *testing.T) { + const maxcpus = 1024 // _CPU_SETSIZE var newMask unix.CPUSet + newMask.Fill() + if count := newMask.Count(); count != maxcpus { + t.Errorf("Fill: got %d CPUs, want %d", count, maxcpus) + } newMask.Zero() if newMask.Count() != 0 { t.Errorf("CpuZero: didn't zero CPU set: %v", newMask) @@ -566,6 +572,14 @@ } } + t.Cleanup(func() { + // Restore old mask so it doesn't affect successive tests. + err = unix.SchedSetaffinity(0, &oldMask) + if err != nil { + t.Fatalf("SchedSetaffinity: %v", err) + } + }) + err = unix.SchedSetaffinity(0, &newMask) if err != nil { t.Fatalf("SchedSetaffinity: %v", err) @@ -580,11 +594,90 @@ if gotMask != newMask { t.Errorf("SchedSetaffinity: returned affinity mask does not match set affinity mask") } +} - // Restore old mask so it doesn't affect successive tests - err = unix.SchedSetaffinity(0, &oldMask) +func TestSchedSetaffinityDynamic(t *testing.T) { + const maxcpus = 4096 + + newMask := unix.NewCPUSet(maxcpus) + newMask.Fill() + if count := newMask.Count(); count != maxcpus { + t.Errorf("Fill: got %d CPUs, want %d", count, maxcpus) + } + newMask.Zero() + if newMask.Count() != 0 { + t.Errorf("Zero: didn't zero CPU set: %v", newMask) + } + cpu := 1 + newMask.Set(cpu) + if newMask.Count() != 1 || !newMask.IsSet(cpu) { + t.Errorf("Set: didn't set CPU %d in set: %v", cpu, newMask) + } + cpu = 5 + newMask.Set(cpu) + if newMask.Count() != 2 || !newMask.IsSet(cpu) { + t.Errorf("Set: didn't set CPU %d in set: %v", cpu, newMask) + } + newMask.Clear(cpu) + if newMask.Count() != 1 || newMask.IsSet(cpu) { + t.Errorf("Clear: didn't clear CPU %d in set: %v", cpu, newMask) + } + + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + oldMask := unix.NewCPUSet(maxcpus) + err := unix.SchedGetaffinityDynamic(0, oldMask) if err != nil { - t.Fatalf("SchedSetaffinity: %v", err) + 
t.Fatalf("SchedGetaffinityDynamic: %v", err) + } + + if runtime.NumCPU() < 2 { + t.Skip("skipping setaffinity tests on single CPU system") + } + if runtime.GOOS == "android" { + t.Skip("skipping setaffinity tests on android") + } + + // On a system like ppc64x where some cores can be disabled using ppc64_cpu, + // setaffinity should only be called with enabled cores. The valid cores + // are found from the oldMask, but if none are found then the setaffinity + // tests are skipped. Issue #27875. + cpu = 1 + if !oldMask.IsSet(cpu) { + newMask.Zero() + for i := range len(oldMask) { + if oldMask.IsSet(i) { + newMask.Set(i) + break + } + } + if newMask.Count() == 0 { + t.Skip("skipping setaffinity tests if CPU not available") + } + } + + t.Cleanup(func() { + // Restore old mask so it doesn't affect successive tests. + err = unix.SchedSetaffinityDynamic(0, oldMask) + if err != nil { + t.Fatalf("SchedSetaffinityDynamic: %v", err) + } + }) + + err = unix.SchedSetaffinityDynamic(0, newMask) + if err != nil { + t.Fatalf("SchedSetaffinityDynamic: %v", err) + } + + gotMask := unix.NewCPUSet(maxcpus) + err = unix.SchedGetaffinityDynamic(0, gotMask) + if err != nil { + t.Fatalf("SchedGetaffinityDynamic: %v", err) + } + + if !slices.Equal(gotMask, newMask) { + t.Errorf("SchedSetaffinityDynamic: returned affinity mask does not match set affinity mask (%+v != %+v)", gotMask, newMask) } }
diff --git a/unix/zsyscall_linux.go b/unix/zsyscall_linux.go index 8935d10..886f5de 100644 --- a/unix/zsyscall_linux.go +++ b/unix/zsyscall_linux.go
@@ -2241,8 +2241,8 @@ // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func setMemPolicy(mode int, mask *CPUSet, size int) (err error) { - _, _, e1 := Syscall(SYS_SET_MEMPOLICY, uintptr(mode), uintptr(unsafe.Pointer(mask)), uintptr(size)) +func setMemPolicy(mode int, mask unsafe.Pointer, size uintptr) (err error) { + _, _, e1 := Syscall(SYS_SET_MEMPOLICY, uintptr(mode), uintptr(mask), uintptr(size)) if e1 != 0 { err = errnoErr(e1) }