cpu: conditionally re-enable AVX512 support on darwin/amd64

The Darwin opmask clobbering bug was fixed in kernel version 21.3.0,
as released in MacOS 12.2.0. This commit resolves the issue by
checking for Darwin AVX512 support via a sysctl call, with the
addition of a minimum kernel version check.

The kernel version check is completed without adding new
dependencies to x/sys/cpu. A sysctl call is accomplished by
copying a minimal amount of code from x/sys/unix, to retrieve
only the needed KERN_OSRELEASE value. This code is structured
in the same manner as an existing analogous AIX/PPC64 syscall.

The resulting dotted version string value is then parsed for
numeric comparison by a dependency-free function.

All code in this contribution is structured to ease removal of
the special darwin/amd64 codepaths when that OS/arch combination
is eventually no longer supported by Go.

Resolves issue: golang/go#49233, reinstates fix for issue: golang/go#43089

Change-Id: I4755fc8b3865eb6562b0959ecc910e2c46ac6cb4
Reviewed-on: https://go-review.googlesource.com/c/sys/+/620256
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: vsivsi@yahoo.com <vsivsi@yahoo.com>
diff --git a/cpu/asm_darwin_x86_gc.s b/cpu/asm_darwin_x86_gc.s
new file mode 100644
index 0000000..ec2acfe
--- /dev/null
+++ b/cpu/asm_darwin_x86_gc.s
@@ -0,0 +1,17 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build darwin && amd64 && gc
+
+#include "textflag.h"
+
+TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_sysctl(SB)
+GLOBL	·libc_sysctl_trampoline_addr(SB), RODATA, $8
+DATA	·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
+
+TEXT libc_sysctlbyname_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_sysctlbyname(SB)
+GLOBL	·libc_sysctlbyname_trampoline_addr(SB), RODATA, $8
+DATA	·libc_sysctlbyname_trampoline_addr(SB)/8, $libc_sysctlbyname_trampoline<>(SB)
diff --git a/cpu/cpu_darwin_x86.go b/cpu/cpu_darwin_x86.go
new file mode 100644
index 0000000..b838cb9
--- /dev/null
+++ b/cpu/cpu_darwin_x86.go
@@ -0,0 +1,61 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build darwin && amd64 && gc
+
+package cpu
+
+// darwinSupportsAVX512 checks the Darwin kernel for AVX512 support via a sysctl
+// call (see issue 43089). It also restricts AVX512 support for Darwin to
+// kernel version 21.3.0 (MacOS 12.2.0) or later (see issue 49233).
+//
+// Background:
+// Darwin implements a special mechanism to economize on thread state when
+// AVX512 specific registers are not in use. This scheme minimizes state when
+// preempting threads that haven't yet used any AVX512 instructions, but adds
+// special requirements to check for AVX512 hardware support at runtime (e.g.
+// via sysctl call or commpage inspection). See issue 43089 and link below for
+// full background:
+// https://github.com/apple-oss-distributions/xnu/blob/xnu-11215.1.10/osfmk/i386/fpu.c#L214-L240
+//
+// Additionally, all versions of the Darwin kernel from 19.6.0 through 21.2.0
+// (corresponding to MacOS 10.15.6 - 12.1) have a bug that can cause corruption
+// of the AVX512 mask registers (K0-K7) upon signal return. For this reason
+// AVX512 is considered unsafe to use on Darwin for kernel versions prior to
+// 21.3.0, where a fix has been confirmed. See issue 49233 for full background.
+func darwinSupportsAVX512() bool {
+	return darwinSysctlEnabled([]byte("hw.optional.avx512f\x00")) && darwinKernelVersionCheck(21, 3, 0)
+}
+
+// darwinKernelVersionCheck reports whether the Darwin kernel version is at least major.minor.patch, avoiding dependencies
+func darwinKernelVersionCheck(major, minor, patch int) bool {
+	var release [256]byte
+	err := darwinOSRelease(&release)
+	if err != nil {
+		return false
+	}
+
+	var mmp [3]int
+	c := 0
+Loop:
+	for _, b := range release[:] {
+		switch {
+		case b >= '0' && b <= '9':
+			mmp[c] = 10*mmp[c] + int(b-'0')
+		case b == '.':
+			c++
+			if c > 2 {
+				return false
+			}
+		case b == 0:
+			break Loop
+		default:
+			return false
+		}
+	}
+	if c != 2 {
+		return false
+	}
+	return mmp[0] > major || mmp[0] == major && (mmp[1] > minor || mmp[1] == minor && mmp[2] >= patch)
+}
diff --git a/cpu/cpu_gc_x86.go b/cpu/cpu_gc_x86.go
index 910728f..32a4451 100644
--- a/cpu/cpu_gc_x86.go
+++ b/cpu/cpu_gc_x86.go
@@ -6,10 +6,10 @@
 
 package cpu
 
-// cpuid is implemented in cpu_x86.s for gc compiler
+// cpuid is implemented in cpu_gc_x86.s for gc compiler
 // and in cpu_gccgo.c for gccgo.
 func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
 
-// xgetbv with ecx = 0 is implemented in cpu_x86.s for gc compiler
+// xgetbv with ecx = 0 is implemented in cpu_gc_x86.s for gc compiler
 // and in cpu_gccgo.c for gccgo.
 func xgetbv() (eax, edx uint32)
diff --git a/cpu/cpu_x86.s b/cpu/cpu_gc_x86.s
similarity index 94%
rename from cpu/cpu_x86.s
rename to cpu/cpu_gc_x86.s
index 7d7ba33..ce208ce 100644
--- a/cpu/cpu_x86.s
+++ b/cpu/cpu_gc_x86.s
@@ -18,7 +18,7 @@
 	RET
 
 // func xgetbv() (eax, edx uint32)
-TEXT ·xgetbv(SB),NOSPLIT,$0-8
+TEXT ·xgetbv(SB), NOSPLIT, $0-8
 	MOVL $0, CX
 	XGETBV
 	MOVL AX, eax+0(FP)
diff --git a/cpu/cpu_gccgo_x86.go b/cpu/cpu_gccgo_x86.go
index 99c60fe..170d21d 100644
--- a/cpu/cpu_gccgo_x86.go
+++ b/cpu/cpu_gccgo_x86.go
@@ -23,9 +23,3 @@
 	gccgoXgetbv(&a, &d)
 	return a, d
 }
-
-// gccgo doesn't build on Darwin, per:
-// https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/gcc.rb#L76
-func darwinSupportsAVX512() bool {
-	return false
-}
diff --git a/cpu/cpu_other_x86.go b/cpu/cpu_other_x86.go
new file mode 100644
index 0000000..a0fd7e2
--- /dev/null
+++ b/cpu/cpu_other_x86.go
@@ -0,0 +1,11 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64p32 || (amd64 && (!darwin || !gc))
+
+package cpu
+
+func darwinSupportsAVX512() bool {
+	panic("only implemented for gc && amd64 && darwin")
+}
diff --git a/cpu/cpu_x86.go b/cpu/cpu_x86.go
index c29f5e4..600a680 100644
--- a/cpu/cpu_x86.go
+++ b/cpu/cpu_x86.go
@@ -92,10 +92,8 @@
 		osSupportsAVX = isSet(1, eax) && isSet(2, eax)
 
 		if runtime.GOOS == "darwin" {
-			// Darwin doesn't save/restore AVX-512 mask registers correctly across signal handlers.
-			// Since users can't rely on mask register contents, let's not advertise AVX-512 support.
-			// See issue 49233.
-			osSupportsAVX512 = false
+			// Darwin requires special AVX512 checks, see cpu_darwin_x86.go
+			osSupportsAVX512 = osSupportsAVX && darwinSupportsAVX512()
 		} else {
 			// Check if OPMASK and ZMM registers have OS support.
 			osSupportsAVX512 = osSupportsAVX && isSet(5, eax) && isSet(6, eax) && isSet(7, eax)
diff --git a/cpu/syscall_darwin_x86_gc.go b/cpu/syscall_darwin_x86_gc.go
new file mode 100644
index 0000000..4d0888b
--- /dev/null
+++ b/cpu/syscall_darwin_x86_gc.go
@@ -0,0 +1,98 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Minimal copy of x/sys/unix so the cpu package can make a
+// system call on Darwin without depending on x/sys/unix.
+
+//go:build darwin && amd64 && gc
+
+package cpu
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+type _C_int int32
+
+// adapted from unix.Uname() at x/sys/unix/syscall_darwin.go L419
+func darwinOSRelease(release *[256]byte) error {
+	// from x/sys/unix/zerrors_openbsd_amd64.go
+	const (
+		CTL_KERN       = 0x1
+		KERN_OSRELEASE = 0x2
+	)
+
+	mib := []_C_int{CTL_KERN, KERN_OSRELEASE}
+	n := unsafe.Sizeof(*release)
+
+	return sysctl(mib, &release[0], &n, nil, 0)
+}
+
+type Errno = syscall.Errno
+
+var _zero uintptr // Single-word zero for use when we need a valid pointer to 0 bytes.
+
+// from x/sys/unix/zsyscall_darwin_amd64.go L791-807
+func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) error {
+	var _p0 unsafe.Pointer
+	if len(mib) > 0 {
+		_p0 = unsafe.Pointer(&mib[0])
+	} else {
+		_p0 = unsafe.Pointer(&_zero)
+	}
+	if _, _, err := syscall_syscall6(
+		libc_sysctl_trampoline_addr,
+		uintptr(_p0),
+		uintptr(len(mib)),
+		uintptr(unsafe.Pointer(old)),
+		uintptr(unsafe.Pointer(oldlen)),
+		uintptr(unsafe.Pointer(new)),
+		uintptr(newlen),
+	); err != 0 {
+		return err
+	}
+
+	return nil
+}
+
+var libc_sysctl_trampoline_addr uintptr
+
+// adapted from internal/cpu/cpu_arm64_darwin.go
+func darwinSysctlEnabled(name []byte) bool {
+	out := int32(0)
+	nout := unsafe.Sizeof(out)
+	if ret := sysctlbyname(&name[0], (*byte)(unsafe.Pointer(&out)), &nout, nil, 0); ret != nil {
+		return false
+	}
+	return out > 0
+}
+
+//go:cgo_import_dynamic libc_sysctl sysctl "/usr/lib/libSystem.B.dylib"
+
+var libc_sysctlbyname_trampoline_addr uintptr
+
+// adapted from runtime/sys_darwin.go in the pattern of sysctl() above, as defined in x/sys/unix
+func sysctlbyname(name *byte, old *byte, oldlen *uintptr, new *byte, newlen uintptr) error {
+	if _, _, err := syscall_syscall6(
+		libc_sysctlbyname_trampoline_addr,
+		uintptr(unsafe.Pointer(name)),
+		uintptr(unsafe.Pointer(old)),
+		uintptr(unsafe.Pointer(oldlen)),
+		uintptr(unsafe.Pointer(new)),
+		uintptr(newlen),
+		0,
+	); err != 0 {
+		return err
+	}
+
+	return nil
+}
+
+//go:cgo_import_dynamic libc_sysctlbyname sysctlbyname "/usr/lib/libSystem.B.dylib"
+
+// Implemented in the runtime package (runtime/sys_darwin.go)
+func syscall_syscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
+//go:linkname syscall_syscall6 syscall.syscall6