internal/cpu: repair VNNI feature check

This is a pain to test.
Also the original test was never executed, because it was wrong.

It looks like processors that might lack this feature
include Intel 11th generation and AMD Zen 4.  These might
or might not have bit 2 set in the 7th cpuid "leaf" (SM4)
which is what the incorrect test was checking; the bug
is triggered by ^VNNI & SM4.  Apparently the SM4 bit is
not usually set, else we would have seen a test failure.

The "Lion Cove" microarchitecture (Arrow Lake, Lunar Lake)
appears to trigger this problem; it's not clear whether there are
others.  It was hard to verify this from online information.

Fixes #76881.

Change-Id: I21be6b4f47134d81e89799b0f06f89fcb6563264
Reviewed-on: https://go-review.googlesource.com/c/go/+/731240
TryBot-Bypass: David Chase <drchase@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go
index 4610ce8..711fb04 100644
--- a/src/internal/cpu/cpu_x86.go
+++ b/src/internal/cpu/cpu_x86.go
@@ -219,7 +219,7 @@
 	if eax7 >= 1 {
 		eax71, _, _, _ := cpuid(7, 1)
 		if X86.HasAVX {
-			X86.HasAVXVNNI = isSet(4, eax71)
+			X86.HasAVXVNNI = isSet(eax71, cpuid_AVXVNNI)
 		}
 	}
 
diff --git a/src/simd/archsimd/internal/simd_test/simd_test.go b/src/simd/archsimd/internal/simd_test/simd_test.go
index 83925ae..5fd7407 100644
--- a/src/simd/archsimd/internal/simd_test/simd_test.go
+++ b/src/simd/archsimd/internal/simd_test/simd_test.go
@@ -1135,18 +1135,22 @@
 	wanted2 := make([]int32, 4)
 	res1 := make([]int32, 4)
 	res2 := make([]int32, 4)
-	for i := range 4 {
-		xd[i] = 5
-		yd[i] = 6
-		zd[i] = 3
-		wanted1[i] = 30
-		wanted2[i] = 30
+	for i := range 16 {
+		xd[i] = int8(i + 112)  // 112+15 = 127
+		yd[i] = uint8(i + 240) // 240+15 = 255
 	}
+	for i := range 4 {
+		i4 := 4 * i
+		wanted1[i] = int32(xd[i4])*int32(yd[i4]) + int32(xd[i4+1])*int32(yd[i4+1]) + int32(xd[i4+2])*int32(yd[i4+2]) + int32(xd[i4+3])*int32(yd[i4+3])
+		zd[i] = int32(i + 1)
+		wanted2[i] = wanted1[i] + zd[i]
+	}
+
 	x := archsimd.LoadInt8x16Slice(xd)
 	y := archsimd.LoadUint8x16Slice(yd)
 	z := archsimd.LoadInt32x4Slice(zd)
 	x.DotProductQuadruple(y).StoreSlice(res1)
-	x.DotProductQuadruple(y).Add(z).StoreSlice(res1)
+	x.DotProductQuadruple(y).Add(z).StoreSlice(res2)
 	for i := range 4 {
 		if res1[i] != wanted1[i] {
 			t.Errorf("got %d wanted %d", res1[i], wanted1[i])