all: REVERSE MERGE dev.simd (0e5948d) into master

This commit is a REVERSE MERGE.
It merges dev.simd back into its parent branch, master, for Go 1.27
one last time.

Merge List:

+ 2026-06-01 0e5948dc59 [dev.simd] all: merge master (1cae25d) into dev.simd
+ 2026-06-01 a92cf0ee94 [dev.simd] simd, cmd/compile: add Midway GODEBUG=simd=0 emulation switch
+ 2026-06-01 e3afca43e2 [dev.simd] simd: attempting to pin down the simdgen output-order glitch
+ 2026-06-01 cea3788e05 [dev.simd] simd: add carryless multiply for wasm and for midway
+ 2026-06-01 627bc968ea [dev.simd] simd: add ARM64 PMULL (carrylessMultiplyWidenLo) intrinsic
+ 2026-05-30 80ab7bc1fa [dev.simd] simd: rename LoLong intrinsics to WidenLo
+ 2026-05-30 9764721859 [dev.simd] simdgen: filter arrangement symbols for shaped ARM64 instructions

Change-Id: Id0010ed9bd8e5d7da033d9506f48d367e1232194
diff --git a/src/cmd/compile/internal/arm64/simdssa.go b/src/cmd/compile/internal/arm64/simdssa.go
index 891a381..642d985 100644
--- a/src/cmd/compile/internal/arm64/simdssa.go
+++ b/src/cmd/compile/internal/arm64/simdssa.go
@@ -373,6 +373,9 @@
 		ssa.OpARM64VUMULL16B:
 		p = simdV21Long(s, v, arm64.ARNG_16B)
 
+	case ssa.OpARM64VPMULL2D:
+		p = simdV21Long(s, v, arm64.ARNG_2D)
+
 	case ssa.OpARM64VSMULL4S,
 		ssa.OpARM64VUMULL4S:
 		p = simdV21Long(s, v, arm64.ARNG_4S)
@@ -438,6 +441,9 @@
 		ssa.OpARM64VUMULL2_16B:
 		p = simdV21Long2(s, v, arm64.ARNG_16B)
 
+	case ssa.OpARM64VPMULL2_2D:
+		p = simdV21Long2(s, v, arm64.ARNG_2D)
+
 	case ssa.OpARM64VSMULL2_4S,
 		ssa.OpARM64VUMULL2_4S:
 		p = simdV21Long2(s, v, arm64.ARNG_4S)
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 5c526cb..44bf01d 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -240,6 +240,8 @@
 		return arm64.ARNG_4S
 	case arm64.ARNG_2S:
 		return arm64.ARNG_2D
+	case arm64.ARNG_1D:
+		return arm64.ARNG_1Q
 	default:
 		base.Fatalf("unsupported long input arrangement: %d", arng)
 		return 0
@@ -256,6 +258,8 @@
 		return arm64.ARNG_4H
 	case arm64.ARNG_4S:
 		return arm64.ARNG_2S
+	case arm64.ARNG_2D:
+		return arm64.ARNG_1D
 	default:
 		base.Fatalf("unsupported halfLanes input arrangement: %d", arng)
 		return 0
diff --git a/src/cmd/compile/internal/midway/deepcopy.go b/src/cmd/compile/internal/midway/deepcopy.go
index 95fc1f2..088ff23 100644
--- a/src/cmd/compile/internal/midway/deepcopy.go
+++ b/src/cmd/compile/internal/midway/deepcopy.go
@@ -98,10 +98,20 @@
 		name := id.Value
 		width := nameToElemBitWidth(name)
 		if width > 0 {
+			archsimdId := syntax.NewName(id.Pos(), archPkg)
+			if c.VecLen == 0 {
+				// special case for emulation
+				newSel := &syntax.SelectorExpr{
+					X:   archsimdId,
+					Sel: id, // name is unchanged for emulation
+				}
+				newSel.SetPos(id.Pos())
+				return newSel
+			}
+
 			count := c.VecLen / width
 			base := name[:len(name)-1]
 			newName := fmt.Sprintf("%sx%d", base, count)
-			archsimdId := syntax.NewName(id.Pos(), archPkg)
 			newSelId := syntax.NewName(id.Pos(), newName)
 			newSel := &syntax.SelectorExpr{
 				X:   archsimdId,
@@ -144,6 +154,17 @@
 			}
 			width := nameToElemBitWidth(name)
 			if width > 0 {
+				archsimdId := syntax.NewName(se.Pos(), archPkg)
+				if c.VecLen == 0 {
+					// special case for emulation: the selector name is unchanged
+					newSel := &syntax.SelectorExpr{
+						X:   archsimdId,
+						Sel: se.Sel,
+					}
+					newSel.SetPos(se.Pos())
+					return newSel
+				}
+
 				count := c.VecLen / width
 				base := name[:len(name)-1]
 				newName := fmt.Sprintf("%sx%d", base, count)
@@ -151,7 +172,6 @@
 					newName = "Load" + newName + nameSuffix
 				}
 
-				archsimdId := syntax.NewName(se.Pos(), archPkg)
 				newSelId := syntax.NewName(se.Sel.Pos(), newName)
 
 				newSel := &syntax.SelectorExpr{
diff --git a/src/cmd/compile/internal/midway/midway.go b/src/cmd/compile/internal/midway/midway.go
index d737a30..9adeec8 100644
--- a/src/cmd/compile/internal/midway/midway.go
+++ b/src/cmd/compile/internal/midway/midway.go
@@ -11,11 +11,11 @@
 func rewriteSizes() []int {
 	switch buildcfg.GOARCH {
 	case "wasm":
-		return []int{128}
+		return []int{0, 128}
 	case "amd64":
-		return []int{128, 256, 512}
+		return []int{0, 128, 256, 512}
 	case "arm64":
-		return []int{128} // this will change for SVE and cannot just be a size-based choice.
+		return []int{0, 128} // this will change for SVE and cannot just be a size-based choice.
 	}
 	return nil
 }
@@ -24,6 +24,7 @@
 const archFullPkg = "simd/internal/bridge"
 const archPkg = "bridge"
 const vectorSizeFn = "VectorBitSize"
+const emulatedFn = "Emulated"
 
 func isSimdTypeName(s string) bool {
 	switch s {
diff --git a/src/cmd/compile/internal/midway/rewrite.go b/src/cmd/compile/internal/midway/rewrite.go
index 27685e2..083761c 100644
--- a/src/cmd/compile/internal/midway/rewrite.go
+++ b/src/cmd/compile/internal/midway/rewrite.go
@@ -201,10 +201,13 @@
 	// switch ast node.
 	// the goal is something like (for now, till there are finer-grained choices)
 	// switch simd.VectorSize() {
-	//   case 128: call the specialize-for-128-code(args)
+	//   case 128: if simd.Emulated() { call the specialize-for-emulation-code(args) }
+	//             else { call the specialize-for-128-code(args) }
 	//   case 256: call the specialize-for-256-code(args)
 	//   etc
 	// }
+	//
+	// each call above is emitted as `return call(...)` if the function has results, else as `call(...); return`
 	switchStmt := &syntax.SwitchStmt{
 		Tag: pe(&syntax.CallExpr{
 			Fun: pe(&syntax.SelectorExpr{
@@ -215,6 +218,8 @@
 		Body: []*syntax.CaseClause{},
 	}
 
+	var emulation syntax.Stmt
+
 	for _, k := range r.sizes {
 		fnName := fmt.Sprintf("%s@simd%d", d.Name.Value, k)
 		fnIdent := syntax.NewName(d.Pos(), fnName)
@@ -224,22 +229,57 @@
 			ArgList: args(),
 		})
 
-		var branchStmt syntax.Stmt
+		// callReturnStmt is either `return call(...)` or `call(...); return`
+		var callReturnStmt syntax.Stmt
 		if d.Type.ResultList != nil && len(d.Type.ResultList) > 0 {
-			branchStmt = &syntax.ReturnStmt{Results: callExpr}
+			callReturnStmt = &syntax.ReturnStmt{Results: callExpr}
 		} else {
-			branchStmt = &syntax.BlockStmt{
+			callReturnStmt = &syntax.BlockStmt{
 				List: []syntax.Stmt{
 					ps(&syntax.ExprStmt{X: callExpr}),
 					ps(&syntax.ReturnStmt{}),
 				},
+				Rbrace: d.Pos(),
 			}
 		}
-		branchStmt.SetPos(d.Pos())
+		callReturnStmt.SetPos(d.Pos())
+
+		if k == 0 {
+			// emulation == `if simd.Emulated() { callReturnStmt }`
+			// save it for the first part of the 128 case.
+			cond := pe(&syntax.CallExpr{
+				Fun: pe(&syntax.SelectorExpr{
+					X:   syntax.NewName(d.Pos(), simdPkg), // Assume this is resolvable
+					Sel: syntax.NewName(d.Pos(), emulatedFn),
+				})})
+
+			blockStmt, ok := callReturnStmt.(*syntax.BlockStmt)
+			if !ok {
+				blockStmt = &syntax.BlockStmt{
+					List:   []syntax.Stmt{callReturnStmt},
+					Rbrace: d.Pos(),
+				}
+				blockStmt.SetPos(d.Pos())
+			}
+
+			emulation = ps(&syntax.IfStmt{
+				Cond: cond,
+				Then: blockStmt,
+			})
+			continue
+		}
+
+		var caseBody []syntax.Stmt
+		// assume that 128 is a case; when we do scalable simd, this may change.
+		// For now, if there is emulation, it is 128-bit (only).
+		if emulation != nil && k == 128 {
+			caseBody = append(caseBody, emulation)
+			emulation = nil
+		}
 
 		caseClause := &syntax.CaseClause{
 			Cases: pe(&syntax.BasicLit{Kind: syntax.IntLit, Value: fmt.Sprintf("%d", k)}),
-			Body:  []syntax.Stmt{branchStmt},
+			Body:  append(caseBody, callReturnStmt),
 		}
 		caseClause.SetPos(d.Pos())
 		switchStmt.Body = append(switchStmt.Body, caseClause)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
index 5de91a3..b7dc48e 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
@@ -17,6 +17,7 @@
 (VMOVDins0 [1] dst (VDUPDextr [0] (VXTN2D y))) => (VXTN2_2D dst y)
 (VMOVDins0 [1] dst (VDUPDextr [0] (VXTN4S y))) => (VXTN2_4S dst y)
 (VMOVDins0 [1] dst (VDUPDextr [0] (VXTN8H y))) => (VXTN2_8H dst y)
+(VPMULL2D (VDUPDextr [1] x) (VDUPDextr [1] y)) => (VPMULL2_2D x y)
 (VSMULL16B (VDUPDextr [1] x) (VDUPDextr [1] y)) => (VSMULL2_16B x y)
 (VSMULL4S (VDUPDextr [1] x) (VDUPDextr [1] y)) => (VSMULL2_4S x y)
 (VSMULL8H (VDUPDextr [1] x) (VDUPDextr [1] y)) => (VSMULL2_8H x y)
@@ -232,12 +233,12 @@
 (MulAddUint8x16 x y z)  => (VMLA16B z x y) // earlyMatchRule
 (MulAddUint16x8 x y z)  => (VMLA8H z x y) // earlyMatchRule
 (MulAddUint32x4 x y z)  => (VMLA4S z x y) // earlyMatchRule
-(MulLoLongInt8x16 ...) => (VSMULL16B ...) // pureVreg
-(MulLoLongInt16x8 ...) => (VSMULL8H ...) // pureVreg
-(MulLoLongInt32x4 ...) => (VSMULL4S ...) // pureVreg
-(MulLoLongUint8x16 ...) => (VUMULL16B ...) // pureVreg
-(MulLoLongUint16x8 ...) => (VUMULL8H ...) // pureVreg
-(MulLoLongUint32x4 ...) => (VUMULL4S ...) // pureVreg
+(MulWidenLoInt8x16 ...) => (VSMULL16B ...) // pureVreg
+(MulWidenLoInt16x8 ...) => (VSMULL8H ...) // pureVreg
+(MulWidenLoInt32x4 ...) => (VSMULL4S ...) // pureVreg
+(MulWidenLoUint8x16 ...) => (VUMULL16B ...) // pureVreg
+(MulWidenLoUint16x8 ...) => (VUMULL8H ...) // pureVreg
+(MulWidenLoUint32x4 ...) => (VUMULL4S ...) // pureVreg
 (NegFloat32x4 ...) => (VFNEG4S ...) // pureVreg
 (NegFloat64x2 ...) => (VFNEG2D ...) // pureVreg
 (NegInt8x16 ...) => (VNEG16B ...) // pureVreg
@@ -349,12 +350,6 @@
 (ShiftLeftConstUint16x8 ...) => (VSHL8H ...) // pureVreg
 (ShiftLeftConstUint32x4 ...) => (VSHL4S ...) // pureVreg
 (ShiftLeftConstUint64x2 ...) => (VSHL2D ...) // pureVreg
-(ShiftLeftLoLongConstInt8x16 ...) => (VSSHLL16B ...) // pureVreg
-(ShiftLeftLoLongConstInt16x8 ...) => (VSSHLL8H ...) // pureVreg
-(ShiftLeftLoLongConstInt32x4 ...) => (VSSHLL4S ...) // pureVreg
-(ShiftLeftLoLongConstUint8x16 ...) => (VUSHLL16B ...) // pureVreg
-(ShiftLeftLoLongConstUint16x8 ...) => (VUSHLL8H ...) // pureVreg
-(ShiftLeftLoLongConstUint32x4 ...) => (VUSHLL4S ...) // pureVreg
 (ShiftLeftSaturatedConstInt8x16 ...) => (VSQSHL16Bconst ...) // pureVreg
 (VSQSHL16Bconst [a] x) && a==0 => x // asmRule
 (ShiftLeftSaturatedConstInt16x8 ...) => (VSQSHL8Hconst ...) // pureVreg
@@ -371,6 +366,12 @@
 (VUQSHL4Sconst [a] x) && a==0 => x // asmRule
 (ShiftLeftSaturatedConstUint64x2 ...) => (VUQSHL2Dconst ...) // pureVreg
 (VUQSHL2Dconst [a] x) && a==0 => x // asmRule
+(ShiftLeftWidenLoConstInt8x16 ...) => (VSSHLL16B ...) // pureVreg
+(ShiftLeftWidenLoConstInt16x8 ...) => (VSSHLL8H ...) // pureVreg
+(ShiftLeftWidenLoConstInt32x4 ...) => (VSSHLL4S ...) // pureVreg
+(ShiftLeftWidenLoConstUint8x16 ...) => (VUSHLL16B ...) // pureVreg
+(ShiftLeftWidenLoConstUint16x8 ...) => (VUSHLL8H ...) // pureVreg
+(ShiftLeftWidenLoConstUint32x4 ...) => (VUSHLL4S ...) // pureVreg
 (ShiftRightConstInt8x16 ...) => (VSSHR16B ...) // pureVreg
 (VSSHR16B [a] x) && a==0 => x // asmRule
 (ShiftRightConstInt16x8 ...) => (VSSHR8H ...) // pureVreg
@@ -468,3 +469,4 @@
 (broadcast1To16Int8x16 x) => (VDUPBbcast [0] x) // pureVreg
 (VDUPBbcast [i] (VMOVBins [j] _ (MOVDconst [c]))) && i == j && c>=-128 && c<=255 => (VMOVI16B [uint8(c)]) // argsMatchRule
 (broadcast1To16Uint8x16 x) => (VDUPBbcast [0] x) // pureVreg
+(carrylessMultiplyWidenLoUint64x2 ...) => (VPMULL2D ...) // pureVreg
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
index d3c03ec..3d4907a 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
@@ -109,6 +109,8 @@
 		{name: "VNOT16B", argLength: 1, reg: v11, asm: "VNOT", typ: "Vec128"},
 		{name: "VORN16B", argLength: 2, reg: v21, asm: "VORN", typ: "Vec128"},
 		{name: "VORR16B", argLength: 2, reg: v21, asm: "VORR", commutative: true, typ: "Vec128"},
+		{name: "VPMULL2D", argLength: 2, reg: v21, asm: "VPMULL", commutative: true, typ: "Vec128"},
+		{name: "VPMULL2_2D", argLength: 2, reg: v21, asm: "VPMULL2", commutative: true, typ: "Vec128"},
 		{name: "VSCVTF2D", argLength: 1, reg: v11, asm: "VSCVTF", typ: "Vec128"},
 		{name: "VSCVTF4S", argLength: 1, reg: v11, asm: "VSCVTF", typ: "Vec128"},
 		{name: "VSMAX4S", argLength: 2, reg: v21, asm: "VSMAX", commutative: true, typ: "Vec128"},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index d42b19e..e06c3bea 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -736,12 +736,6 @@
 		{name: "MulInt64x2", argLength: 2, commutative: true},                           // ARCH:amd64,wasm
 		{name: "MulInt64x4", argLength: 2, commutative: true},                           // ARCH:amd64
 		{name: "MulInt64x8", argLength: 2, commutative: true},                           // ARCH:amd64
-		{name: "MulLoLongInt8x16", argLength: 2, commutative: true},                     // ARCH:arm64
-		{name: "MulLoLongInt16x8", argLength: 2, commutative: true},                     // ARCH:arm64
-		{name: "MulLoLongInt32x4", argLength: 2, commutative: true},                     // ARCH:arm64
-		{name: "MulLoLongUint8x16", argLength: 2, commutative: true},                    // ARCH:arm64
-		{name: "MulLoLongUint16x8", argLength: 2, commutative: true},                    // ARCH:arm64
-		{name: "MulLoLongUint32x4", argLength: 2, commutative: true},                    // ARCH:arm64
 		{name: "MulSignInt8x16", argLength: 2},                                          // ARCH:amd64
 		{name: "MulSignInt8x32", argLength: 2},                                          // ARCH:amd64
 		{name: "MulSignInt16x8", argLength: 2},                                          // ARCH:amd64
@@ -764,12 +758,12 @@
 		{name: "MulWidenHiUint8x16", argLength: 2, commutative: true},                   // ARCH:wasm
 		{name: "MulWidenHiUint16x8", argLength: 2, commutative: true},                   // ARCH:wasm
 		{name: "MulWidenHiUint32x4", argLength: 2, commutative: true},                   // ARCH:wasm
-		{name: "MulWidenLoInt8x16", argLength: 2, commutative: true},                    // ARCH:wasm
-		{name: "MulWidenLoInt16x8", argLength: 2, commutative: true},                    // ARCH:wasm
-		{name: "MulWidenLoInt32x4", argLength: 2, commutative: true},                    // ARCH:wasm
-		{name: "MulWidenLoUint8x16", argLength: 2, commutative: true},                   // ARCH:wasm
-		{name: "MulWidenLoUint16x8", argLength: 2, commutative: true},                   // ARCH:wasm
-		{name: "MulWidenLoUint32x4", argLength: 2, commutative: true},                   // ARCH:wasm
+		{name: "MulWidenLoInt8x16", argLength: 2, commutative: true},                    // ARCH:arm64,wasm
+		{name: "MulWidenLoInt16x8", argLength: 2, commutative: true},                    // ARCH:arm64,wasm
+		{name: "MulWidenLoInt32x4", argLength: 2, commutative: true},                    // ARCH:arm64,wasm
+		{name: "MulWidenLoUint8x16", argLength: 2, commutative: true},                   // ARCH:arm64,wasm
+		{name: "MulWidenLoUint16x8", argLength: 2, commutative: true},                   // ARCH:arm64,wasm
+		{name: "MulWidenLoUint32x4", argLength: 2, commutative: true},                   // ARCH:arm64,wasm
 		{name: "NegFloat32x4", argLength: 1},                                            // ARCH:arm64,wasm
 		{name: "NegFloat64x2", argLength: 1},                                            // ARCH:arm64,wasm
 		{name: "NegInt8x16", argLength: 1},                                              // ARCH:arm64,wasm
@@ -1396,6 +1390,7 @@
 		{name: "broadcast1To64MaskedInt8x16", argLength: 2},                             // ARCH:amd64
 		{name: "broadcast1To64MaskedUint8x16", argLength: 2},                            // ARCH:amd64
 		{name: "broadcast1To64Uint8x16", argLength: 1},                                  // ARCH:amd64
+		{name: "carrylessMultiplyWidenLoUint64x2", argLength: 2, commutative: true},     // ARCH:arm64
 		{name: "AESRoundKeyGenAssistUint32x4", argLength: 1, aux: "UInt8"},              // ARCH:amd64
 		{name: "CeilScaledFloat32x4", argLength: 1, aux: "UInt8"},                       // ARCH:amd64
 		{name: "CeilScaledFloat32x8", argLength: 1, aux: "UInt8"},                       // ARCH:amd64
@@ -1517,12 +1512,6 @@
 		{name: "ShiftLeftConstUint16x8", argLength: 1, aux: "UInt8"},                    // ARCH:arm64
 		{name: "ShiftLeftConstUint32x4", argLength: 1, aux: "UInt8"},                    // ARCH:arm64
 		{name: "ShiftLeftConstUint64x2", argLength: 1, aux: "UInt8"},                    // ARCH:arm64
-		{name: "ShiftLeftLoLongConstInt8x16", argLength: 1, aux: "UInt8"},               // ARCH:arm64
-		{name: "ShiftLeftLoLongConstInt16x8", argLength: 1, aux: "UInt8"},               // ARCH:arm64
-		{name: "ShiftLeftLoLongConstInt32x4", argLength: 1, aux: "UInt8"},               // ARCH:arm64
-		{name: "ShiftLeftLoLongConstUint8x16", argLength: 1, aux: "UInt8"},              // ARCH:arm64
-		{name: "ShiftLeftLoLongConstUint16x8", argLength: 1, aux: "UInt8"},              // ARCH:arm64
-		{name: "ShiftLeftLoLongConstUint32x4", argLength: 1, aux: "UInt8"},              // ARCH:arm64
 		{name: "ShiftLeftSaturatedConstInt8x16", argLength: 1, aux: "UInt8"},            // ARCH:arm64
 		{name: "ShiftLeftSaturatedConstInt16x8", argLength: 1, aux: "UInt8"},            // ARCH:arm64
 		{name: "ShiftLeftSaturatedConstInt32x4", argLength: 1, aux: "UInt8"},            // ARCH:arm64
@@ -1531,6 +1520,12 @@
 		{name: "ShiftLeftSaturatedConstUint16x8", argLength: 1, aux: "UInt8"},           // ARCH:arm64
 		{name: "ShiftLeftSaturatedConstUint32x4", argLength: 1, aux: "UInt8"},           // ARCH:arm64
 		{name: "ShiftLeftSaturatedConstUint64x2", argLength: 1, aux: "UInt8"},           // ARCH:arm64
+		{name: "ShiftLeftWidenLoConstInt8x16", argLength: 1, aux: "UInt8"},              // ARCH:arm64
+		{name: "ShiftLeftWidenLoConstInt16x8", argLength: 1, aux: "UInt8"},              // ARCH:arm64
+		{name: "ShiftLeftWidenLoConstInt32x4", argLength: 1, aux: "UInt8"},              // ARCH:arm64
+		{name: "ShiftLeftWidenLoConstUint8x16", argLength: 1, aux: "UInt8"},             // ARCH:arm64
+		{name: "ShiftLeftWidenLoConstUint16x8", argLength: 1, aux: "UInt8"},             // ARCH:arm64
+		{name: "ShiftLeftWidenLoConstUint32x4", argLength: 1, aux: "UInt8"},             // ARCH:arm64
 		{name: "ShiftRightConstInt8x16", argLength: 1, aux: "UInt8"},                    // ARCH:arm64
 		{name: "ShiftRightConstInt16x8", argLength: 1, aux: "UInt8"},                    // ARCH:arm64
 		{name: "ShiftRightConstInt32x4", argLength: 1, aux: "UInt8"},                    // ARCH:arm64
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index b2705c9..c113437 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -5106,6 +5106,8 @@
 	OpARM64VNOT16B
 	OpARM64VORN16B
 	OpARM64VORR16B
+	OpARM64VPMULL2D
+	OpARM64VPMULL2_2D
 	OpARM64VSCVTF2D
 	OpARM64VSCVTF4S
 	OpARM64VSMAX4S
@@ -7933,12 +7935,6 @@
 	OpMulInt64x2
 	OpMulInt64x4
 	OpMulInt64x8
-	OpMulLoLongInt8x16
-	OpMulLoLongInt16x8
-	OpMulLoLongInt32x4
-	OpMulLoLongUint8x16
-	OpMulLoLongUint16x8
-	OpMulLoLongUint32x4
 	OpMulSignInt8x16
 	OpMulSignInt8x32
 	OpMulSignInt16x8
@@ -8593,6 +8589,7 @@
 	Opbroadcast1To64MaskedInt8x16
 	Opbroadcast1To64MaskedUint8x16
 	Opbroadcast1To64Uint8x16
+	OpcarrylessMultiplyWidenLoUint64x2
 	OpAESRoundKeyGenAssistUint32x4
 	OpCeilScaledFloat32x4
 	OpCeilScaledFloat32x8
@@ -8714,12 +8711,6 @@
 	OpShiftLeftConstUint16x8
 	OpShiftLeftConstUint32x4
 	OpShiftLeftConstUint64x2
-	OpShiftLeftLoLongConstInt8x16
-	OpShiftLeftLoLongConstInt16x8
-	OpShiftLeftLoLongConstInt32x4
-	OpShiftLeftLoLongConstUint8x16
-	OpShiftLeftLoLongConstUint16x8
-	OpShiftLeftLoLongConstUint32x4
 	OpShiftLeftSaturatedConstInt8x16
 	OpShiftLeftSaturatedConstInt16x8
 	OpShiftLeftSaturatedConstInt32x4
@@ -8728,6 +8719,12 @@
 	OpShiftLeftSaturatedConstUint16x8
 	OpShiftLeftSaturatedConstUint32x4
 	OpShiftLeftSaturatedConstUint64x2
+	OpShiftLeftWidenLoConstInt8x16
+	OpShiftLeftWidenLoConstInt16x8
+	OpShiftLeftWidenLoConstInt32x4
+	OpShiftLeftWidenLoConstUint8x16
+	OpShiftLeftWidenLoConstUint16x8
+	OpShiftLeftWidenLoConstUint32x4
 	OpShiftRightConstInt8x16
 	OpShiftRightConstInt16x8
 	OpShiftRightConstInt32x4
@@ -81330,6 +81327,36 @@
 		},
 	},
 	{
+		name:        "VPMULL2D",
+		argLen:      2,
+		commutative: true,
+		asm:         arm64.AVPMULL,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+				{1, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+			outputs: []outputInfo{
+				{0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+		},
+	},
+	{
+		name:        "VPMULL2_2D",
+		argLen:      2,
+		commutative: true,
+		asm:         arm64.AVPMULL2,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+				{1, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+			outputs: []outputInfo{
+				{0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+		},
+	},
+	{
 		name:   "VSCVTF2D",
 		argLen: 1,
 		asm:    arm64.AVSCVTF,
@@ -111007,42 +111034,6 @@
 		generic:     true,
 	},
 	{
-		name:        "MulLoLongInt8x16",
-		argLen:      2,
-		commutative: true,
-		generic:     true,
-	},
-	{
-		name:        "MulLoLongInt16x8",
-		argLen:      2,
-		commutative: true,
-		generic:     true,
-	},
-	{
-		name:        "MulLoLongInt32x4",
-		argLen:      2,
-		commutative: true,
-		generic:     true,
-	},
-	{
-		name:        "MulLoLongUint8x16",
-		argLen:      2,
-		commutative: true,
-		generic:     true,
-	},
-	{
-		name:        "MulLoLongUint16x8",
-		argLen:      2,
-		commutative: true,
-		generic:     true,
-	},
-	{
-		name:        "MulLoLongUint32x4",
-		argLen:      2,
-		commutative: true,
-		generic:     true,
-	},
-	{
 		name:    "MulSignInt8x16",
 		argLen:  2,
 		generic: true,
@@ -114405,6 +114396,12 @@
 		generic: true,
 	},
 	{
+		name:        "carrylessMultiplyWidenLoUint64x2",
+		argLen:      2,
+		commutative: true,
+		generic:     true,
+	},
+	{
 		name:    "AESRoundKeyGenAssistUint32x4",
 		auxType: auxUInt8,
 		argLen:  1,
@@ -115131,42 +115128,6 @@
 		generic: true,
 	},
 	{
-		name:    "ShiftLeftLoLongConstInt8x16",
-		auxType: auxUInt8,
-		argLen:  1,
-		generic: true,
-	},
-	{
-		name:    "ShiftLeftLoLongConstInt16x8",
-		auxType: auxUInt8,
-		argLen:  1,
-		generic: true,
-	},
-	{
-		name:    "ShiftLeftLoLongConstInt32x4",
-		auxType: auxUInt8,
-		argLen:  1,
-		generic: true,
-	},
-	{
-		name:    "ShiftLeftLoLongConstUint8x16",
-		auxType: auxUInt8,
-		argLen:  1,
-		generic: true,
-	},
-	{
-		name:    "ShiftLeftLoLongConstUint16x8",
-		auxType: auxUInt8,
-		argLen:  1,
-		generic: true,
-	},
-	{
-		name:    "ShiftLeftLoLongConstUint32x4",
-		auxType: auxUInt8,
-		argLen:  1,
-		generic: true,
-	},
-	{
 		name:    "ShiftLeftSaturatedConstInt8x16",
 		auxType: auxUInt8,
 		argLen:  1,
@@ -115215,6 +115176,42 @@
 		generic: true,
 	},
 	{
+		name:    "ShiftLeftWidenLoConstInt8x16",
+		auxType: auxUInt8,
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "ShiftLeftWidenLoConstInt16x8",
+		auxType: auxUInt8,
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "ShiftLeftWidenLoConstInt32x4",
+		auxType: auxUInt8,
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "ShiftLeftWidenLoConstUint8x16",
+		auxType: auxUInt8,
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "ShiftLeftWidenLoConstUint16x8",
+		auxType: auxUInt8,
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "ShiftLeftWidenLoConstUint32x4",
+		auxType: auxUInt8,
+		argLen:  1,
+		generic: true,
+	},
+	{
 		name:    "ShiftRightConstInt8x16",
 		auxType: auxUInt8,
 		argLen:  1,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index 5564c5b..428d313 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -412,6 +412,8 @@
 		return rewriteValueARM64_OpARM64VMOVDins0(v)
 	case OpARM64VMOVSins0:
 		return rewriteValueARM64_OpARM64VMOVSins0(v)
+	case OpARM64VPMULL2D:
+		return rewriteValueARM64_OpARM64VPMULL2D(v)
 	case OpARM64VSHL16B:
 		return rewriteValueARM64_OpARM64VSHL16B(v)
 	case OpARM64VSHL2D:
@@ -1579,24 +1581,6 @@
 	case OpMulInt8x16:
 		v.Op = OpARM64VMUL16B
 		return true
-	case OpMulLoLongInt16x8:
-		v.Op = OpARM64VSMULL8H
-		return true
-	case OpMulLoLongInt32x4:
-		v.Op = OpARM64VSMULL4S
-		return true
-	case OpMulLoLongInt8x16:
-		v.Op = OpARM64VSMULL16B
-		return true
-	case OpMulLoLongUint16x8:
-		v.Op = OpARM64VUMULL8H
-		return true
-	case OpMulLoLongUint32x4:
-		v.Op = OpARM64VUMULL4S
-		return true
-	case OpMulLoLongUint8x16:
-		v.Op = OpARM64VUMULL16B
-		return true
 	case OpMulUint16x8:
 		v.Op = OpARM64VMUL8H
 		return true
@@ -1606,6 +1590,24 @@
 	case OpMulUint8x16:
 		v.Op = OpARM64VMUL16B
 		return true
+	case OpMulWidenLoInt16x8:
+		v.Op = OpARM64VSMULL8H
+		return true
+	case OpMulWidenLoInt32x4:
+		v.Op = OpARM64VSMULL4S
+		return true
+	case OpMulWidenLoInt8x16:
+		v.Op = OpARM64VSMULL16B
+		return true
+	case OpMulWidenLoUint16x8:
+		v.Op = OpARM64VUMULL8H
+		return true
+	case OpMulWidenLoUint32x4:
+		v.Op = OpARM64VUMULL4S
+		return true
+	case OpMulWidenLoUint8x16:
+		v.Op = OpARM64VUMULL16B
+		return true
 	case OpNeg16:
 		v.Op = OpARM64NEG
 		return true
@@ -2055,24 +2057,6 @@
 	case OpShiftLeftConstUint8x16:
 		v.Op = OpARM64VSHL16B
 		return true
-	case OpShiftLeftLoLongConstInt16x8:
-		v.Op = OpARM64VSSHLL8H
-		return true
-	case OpShiftLeftLoLongConstInt32x4:
-		v.Op = OpARM64VSSHLL4S
-		return true
-	case OpShiftLeftLoLongConstInt8x16:
-		v.Op = OpARM64VSSHLL16B
-		return true
-	case OpShiftLeftLoLongConstUint16x8:
-		v.Op = OpARM64VUSHLL8H
-		return true
-	case OpShiftLeftLoLongConstUint32x4:
-		v.Op = OpARM64VUSHLL4S
-		return true
-	case OpShiftLeftLoLongConstUint8x16:
-		v.Op = OpARM64VUSHLL16B
-		return true
 	case OpShiftLeftSaturatedConstInt16x8:
 		v.Op = OpARM64VSQSHL8Hconst
 		return true
@@ -2097,6 +2081,24 @@
 	case OpShiftLeftSaturatedConstUint8x16:
 		v.Op = OpARM64VUQSHL16Bconst
 		return true
+	case OpShiftLeftWidenLoConstInt16x8:
+		v.Op = OpARM64VSSHLL8H
+		return true
+	case OpShiftLeftWidenLoConstInt32x4:
+		v.Op = OpARM64VSSHLL4S
+		return true
+	case OpShiftLeftWidenLoConstInt8x16:
+		v.Op = OpARM64VSSHLL16B
+		return true
+	case OpShiftLeftWidenLoConstUint16x8:
+		v.Op = OpARM64VUSHLL8H
+		return true
+	case OpShiftLeftWidenLoConstUint32x4:
+		v.Op = OpARM64VUSHLL4S
+		return true
+	case OpShiftLeftWidenLoConstUint8x16:
+		v.Op = OpARM64VUSHLL16B
+		return true
 	case OpShiftRightConstInt16x8:
 		v.Op = OpARM64VSSHR8H
 		return true
@@ -2464,6 +2466,9 @@
 		return rewriteValueARM64_Opbroadcast1To8Int16x8(v)
 	case Opbroadcast1To8Uint16x8:
 		return rewriteValueARM64_Opbroadcast1To8Uint16x8(v)
+	case OpcarrylessMultiplyWidenLoUint64x2:
+		v.Op = OpARM64VPMULL2D
+		return true
 	}
 	return false
 }
@@ -18509,6 +18514,29 @@
 	}
 	return false
 }
+func rewriteValueARM64_OpARM64VPMULL2D(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (VPMULL2D (VDUPDextr [1] x) (VDUPDextr [1] y))
+	// result: (VPMULL2_2D x y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			if v_0.Op != OpARM64VDUPDextr || auxIntToUint8(v_0.AuxInt) != 1 {
+				continue
+			}
+			x := v_0.Args[0]
+			if v_1.Op != OpARM64VDUPDextr || auxIntToUint8(v_1.AuxInt) != 1 {
+				continue
+			}
+			y := v_1.Args[0]
+			v.reset(OpARM64VPMULL2_2D)
+			v.AddArg2(x, y)
+			return true
+		}
+		break
+	}
+	return false
+}
 func rewriteValueARM64_OpARM64VSHL16B(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (VSHL16B [a] x)
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
index 81857d7..c05c5d7 100644
--- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -103,9 +103,9 @@
 	addF(simdPackage, "Uint32x4.And", opLen2(ssa.OpAndUint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x8.And", opLen2(ssa.OpAndUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x16.And", opLen2(ssa.OpAndUint32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint64x2.And", opLen2(ssa.OpAndUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x4.And", opLen2(ssa.OpAndUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.And", opLen2(ssa.OpAndUint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint64x2.And", opLen2(ssa.OpAndUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x16.AndNot", opLen2_21(ssa.OpAndNotInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.AndNot", opLen2_21(ssa.OpAndNotInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.AndNot", opLen2_21(ssa.OpAndNotInt8x64, types.TypeVec512), sys.AMD64)
@@ -127,9 +127,9 @@
 	addF(simdPackage, "Uint32x4.AndNot", opLen2_21(ssa.OpAndNotUint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x8.AndNot", opLen2_21(ssa.OpAndNotUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x16.AndNot", opLen2_21(ssa.OpAndNotUint32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint64x2.AndNot", opLen2_21(ssa.OpAndNotUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x4.AndNot", opLen2_21(ssa.OpAndNotUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.AndNot", opLen2_21(ssa.OpAndNotUint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint64x2.AndNot", opLen2_21(ssa.OpAndNotUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x16.Average", opLen2(ssa.OpAverageUint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x32.Average", opLen2(ssa.OpAverageUint8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint8x64.Average", opLen2(ssa.OpAverageUint8x64, types.TypeVec512), sys.AMD64)
@@ -774,9 +774,9 @@
 	addF(simdPackage, "Uint32x4.Or", opLen2(ssa.OpOrUint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x8.Or", opLen2(ssa.OpOrUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x16.Or", opLen2(ssa.OpOrUint32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
@@ -1232,9 +1232,9 @@
 	addF(simdPackage, "Uint32x4.Xor", opLen2(ssa.OpXorUint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x8.Xor", opLen2(ssa.OpXorUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x16.Xor", opLen2(ssa.OpXorUint32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint64x2.Xor", opLen2(ssa.OpXorUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x4.Xor", opLen2(ssa.OpXorUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Xor", opLen2(ssa.OpXorUint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint64x2.Xor", opLen2(ssa.OpXorUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x16.blend", opLen3(ssa.OpblendInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.blend", opLen3(ssa.OpblendInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.blendMasked", opLen3(ssa.OpblendMaskedInt8x64, types.TypeVec512), sys.AMD64)
diff --git a/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go b/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
index 31dddb2..9a29e74 100644
--- a/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
@@ -207,12 +207,12 @@
 	addF(simdPackage, "Uint8x16.MulAdd", opLen3(ssa.OpMulAddUint8x16, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Uint16x8.MulAdd", opLen3(ssa.OpMulAddUint16x8, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Uint32x4.MulAdd", opLen3(ssa.OpMulAddUint32x4, types.TypeVec128), sys.ARM64)
-	addF(simdPackage, "Int8x16.MulLoLong", opLen2(ssa.OpMulLoLongInt8x16, types.TypeVec128), sys.ARM64)
-	addF(simdPackage, "Int16x8.MulLoLong", opLen2(ssa.OpMulLoLongInt16x8, types.TypeVec128), sys.ARM64)
-	addF(simdPackage, "Int32x4.MulLoLong", opLen2(ssa.OpMulLoLongInt32x4, types.TypeVec128), sys.ARM64)
-	addF(simdPackage, "Uint8x16.MulLoLong", opLen2(ssa.OpMulLoLongUint8x16, types.TypeVec128), sys.ARM64)
-	addF(simdPackage, "Uint16x8.MulLoLong", opLen2(ssa.OpMulLoLongUint16x8, types.TypeVec128), sys.ARM64)
-	addF(simdPackage, "Uint32x4.MulLoLong", opLen2(ssa.OpMulLoLongUint32x4, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int8x16.MulWidenLo", opLen2(ssa.OpMulWidenLoInt8x16, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int16x8.MulWidenLo", opLen2(ssa.OpMulWidenLoInt16x8, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int32x4.MulWidenLo", opLen2(ssa.OpMulWidenLoInt32x4, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint8x16.MulWidenLo", opLen2(ssa.OpMulWidenLoUint8x16, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint16x8.MulWidenLo", opLen2(ssa.OpMulWidenLoUint16x8, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint32x4.MulWidenLo", opLen2(ssa.OpMulWidenLoUint32x4, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Float32x4.Neg", opLen1(ssa.OpNegFloat32x4, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Float64x2.Neg", opLen1(ssa.OpNegFloat64x2, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Int8x16.Neg", opLen1(ssa.OpNegInt8x16, types.TypeVec128), sys.ARM64)
@@ -318,12 +318,6 @@
 	addF(simdPackage, "Uint16x8.ShiftLeftConst", opLen1Imm(ssa.OpShiftLeftConstUint16x8, types.TypeVec128, 0, 15), sys.ARM64)
 	addF(simdPackage, "Uint32x4.ShiftLeftConst", opLen1Imm(ssa.OpShiftLeftConstUint32x4, types.TypeVec128, 0, 31), sys.ARM64)
 	addF(simdPackage, "Uint64x2.ShiftLeftConst", opLen1Imm(ssa.OpShiftLeftConstUint64x2, types.TypeVec128, 0, 63), sys.ARM64)
-	addF(simdPackage, "Int8x16.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstInt8x16, types.TypeVec128, 0, 7), sys.ARM64)
-	addF(simdPackage, "Int16x8.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstInt16x8, types.TypeVec128, 0, 15), sys.ARM64)
-	addF(simdPackage, "Int32x4.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstInt32x4, types.TypeVec128, 0, 31), sys.ARM64)
-	addF(simdPackage, "Uint8x16.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstUint8x16, types.TypeVec128, 0, 7), sys.ARM64)
-	addF(simdPackage, "Uint16x8.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstUint16x8, types.TypeVec128, 0, 15), sys.ARM64)
-	addF(simdPackage, "Uint32x4.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstUint32x4, types.TypeVec128, 0, 31), sys.ARM64)
 	addF(simdPackage, "Int8x16.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstInt8x16, types.TypeVec128, 0, 7), sys.ARM64)
 	addF(simdPackage, "Int16x8.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstInt16x8, types.TypeVec128, 0, 15), sys.ARM64)
 	addF(simdPackage, "Int32x4.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstInt32x4, types.TypeVec128, 0, 31), sys.ARM64)
@@ -332,6 +326,12 @@
 	addF(simdPackage, "Uint16x8.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstUint16x8, types.TypeVec128, 0, 15), sys.ARM64)
 	addF(simdPackage, "Uint32x4.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstUint32x4, types.TypeVec128, 0, 31), sys.ARM64)
 	addF(simdPackage, "Uint64x2.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstUint64x2, types.TypeVec128, 0, 63), sys.ARM64)
+	addF(simdPackage, "Int8x16.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstInt8x16, types.TypeVec128, 0, 7), sys.ARM64)
+	addF(simdPackage, "Int16x8.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstInt16x8, types.TypeVec128, 0, 15), sys.ARM64)
+	addF(simdPackage, "Int32x4.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstInt32x4, types.TypeVec128, 0, 31), sys.ARM64)
+	addF(simdPackage, "Uint8x16.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstUint8x16, types.TypeVec128, 0, 7), sys.ARM64)
+	addF(simdPackage, "Uint16x8.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstUint16x8, types.TypeVec128, 0, 15), sys.ARM64)
+	addF(simdPackage, "Uint32x4.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstUint32x4, types.TypeVec128, 0, 31), sys.ARM64)
 	addF(simdPackage, "Int8x16.ShiftRightConst", opLen1Imm(ssa.OpShiftRightConstInt8x16, types.TypeVec128, 0, 7), sys.ARM64)
 	addF(simdPackage, "Int16x8.ShiftRightConst", opLen1Imm(ssa.OpShiftRightConstInt16x8, types.TypeVec128, 0, 15), sys.ARM64)
 	addF(simdPackage, "Int32x4.ShiftRightConst", opLen1Imm(ssa.OpShiftRightConstInt32x4, types.TypeVec128, 0, 31), sys.ARM64)
@@ -415,6 +415,7 @@
 	addF(simdPackage, "Uint16x8.broadcast1To8", opLen1(ssa.Opbroadcast1To8Uint16x8, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Int8x16.broadcast1To16", opLen1(ssa.Opbroadcast1To16Int8x16, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Uint8x16.broadcast1To16", opLen1(ssa.Opbroadcast1To16Uint8x16, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint64x2.carrylessMultiplyWidenLo", opLen2(ssa.OpcarrylessMultiplyWidenLoUint64x2, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Float32x4.AsFloat64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
 	addF(simdPackage, "Float32x4.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
 	addF(simdPackage, "Float32x4.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go
index 7e0d2d3..5942544 100644
--- a/src/internal/cpu/cpu.go
+++ b/src/internal/cpu/cpu.go
@@ -57,6 +57,7 @@
 	HasSSE41            bool
 	HasSSE42            bool
 	HasVAES             bool
+	HasVPCLMULQDQ       bool
 	_                   CacheLinePad
 }
 
diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go
index 3c0a0ad..515b2c7 100644
--- a/src/internal/cpu/cpu_x86.go
+++ b/src/internal/cpu/cpu_x86.go
@@ -57,6 +57,7 @@
 	cpuid_AVX512_VBMI      = 1 << 1
 	cpuid_AVX512_VBMI2     = 1 << 6
 	cpuid_GFNI             = 1 << 8
+	cpuid_VPCLMULQDQ       = 1 << 10 // applies to not just AVX512
 	cpuid_AVX512VPCLMULQDQ = 1 << 10
 	cpuid_AVX512_BITALG    = 1 << 12
 
@@ -174,6 +175,7 @@
 	X86.HasADX = isSet(ebx7, cpuid_ADX)
 	X86.HasSHA = isSet(ebx7, cpuid_SHA)
 	X86.HasVAES = isSet(ecx7, cpuid_VAES) && X86.HasAVX
+	X86.HasVPCLMULQDQ = isSet(ecx7, cpuid_VPCLMULQDQ)
 
 	X86.HasAVX512F = isSet(ebx7, cpuid_AVX512F) && osSupportsAVX512
 	if X86.HasAVX512F {
diff --git a/src/simd/archsimd/_gen/midway/comments.yaml b/src/simd/archsimd/_gen/midway/comments.yaml
index c40440d..a300664 100644
--- a/src/simd/archsimd/_gen/midway/comments.yaml
+++ b/src/simd/archsimd/_gen/midway/comments.yaml
@@ -64,6 +64,38 @@
   ToInt32s: "ToInt32s converts the mask to an Int32s vector."
   ToInt64s: "ToInt64s converts the mask to an Int64s vector."
 
+  CarrylessMultiplyEven: |-
+    CarrylessMultiplyEven computes the carryless
+    // multiplications of selected even indexed elements of x and y.
+    // Each product is 128 bits wide and fills the corresponding
+    // even-odd pairs in the result.
+    //
+    // A carryless multiplication uses bitwise XOR instead of
+    // add-with-carry, for example (in base two):
+    //
+    //  11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+    //
+    // This also models multiplication of polynomials with coefficients
+    // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+    // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+    // polynomial terms, but coefficients "add" with XOR.)
+
+  CarrylessMultiplyOdd: |-
+    CarrylessMultiplyOdd computes the carryless
+    // multiplications of selected odd indexed elements of x and y.
+    // Each product is 128 bits wide and fills the corresponding
+    // even-odd pairs in the result.
+    //
+    // A carryless multiplication uses bitwise XOR instead of
+    // add-with-carry, for example (in base two):
+    //
+    //  11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+    //
+    // This also models multiplication of polynomials with coefficients
+    // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+    // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+    // polynomial terms, but coefficients "add" with XOR.)
+
 types:
   _simd: "internal SIMD marker."
   Int8s: "Int8s represents a vector of 8-bit signed integers."
diff --git a/src/simd/archsimd/_gen/midway/intersect_simd_ops.go b/src/simd/archsimd/_gen/midway/intersect_simd_ops.go
index 90434db..67aa790 100644
--- a/src/simd/archsimd/_gen/midway/intersect_simd_ops.go
+++ b/src/simd/archsimd/_gen/midway/intersect_simd_ops.go
@@ -103,9 +103,16 @@
 	archSimdPath := *goRoot + "/src/simd/archsimd"
 
 	// Hardcoded list of files
-	amd64Files := []string{"ops_amd64.go", "compare_gen_amd64.go", "types_amd64.go", "other_gen_amd64.go", "extra_amd64.go", "maskmerge_gen_amd64.go", "shuffles_amd64.go", "slice_gen_amd64.go", "slicepart_amd64.go", "slicepart_128.go", "string.go"}
-	wasmFiles := []string{"ops_wasm.go", "types_wasm.go", "slicepart_wasm.go", "string.go", "slicepart_128.go", "ops_emulated_wasm.go"}
-	neonFiles := []string{"compare_gen_arm64.go", "maskmerge_gen_arm64.go", "ops_arm64.go", "slicepart_128.go", "ops_internal_arm64.go", "other_gen_arm64.go", "slice_gen_arm64.go", "slicepart_arm64.go", "types_arm64.go"}
+	amd64Files := []string{"ops_amd64.go", "compare_gen_amd64.go", "types_amd64.go",
+		"other_gen_amd64.go", "extra_amd64.go", "maskmerge_gen_amd64.go",
+		"shuffles_amd64.go", "slice_gen_amd64.go", "slicepart_amd64.go",
+		"slicepart_128.go", "string.go", "ops_emulated_amd64.go"}
+	wasmFiles := []string{"ops_wasm.go", "types_wasm.go", "slicepart_wasm.go",
+		"string.go", "slicepart_128.go", "ops_emulated_wasm.go"}
+	neonFiles := []string{"clmul_arm64.go", "compare_gen_arm64.go",
+		"maskmerge_gen_arm64.go", "ops_arm64.go", "slicepart_128.go",
+		"ops_internal_arm64.go", "other_gen_arm64.go", "slice_gen_arm64.go",
+		"slicepart_arm64.go", "types_arm64.go"}
 
 	emulatedFile := *goRoot + "/src/simd/simd_emulated.go"
 
diff --git a/src/simd/archsimd/_gen/sgutil/compare_natural.go b/src/simd/archsimd/_gen/sgutil/compare_natural.go
index f8ca9fa..8d25e58 100644
--- a/src/simd/archsimd/_gen/sgutil/compare_natural.go
+++ b/src/simd/archsimd/_gen/sgutil/compare_natural.go
@@ -50,6 +50,10 @@
 			if num1 > num2 {
 				return 1
 			}
+			// "1" < "01".  Don't expect it in simdgen, but just in case.
+			if ln1, ln2 := i-numStart1, j-numStart2; ln1 != ln2 {
+				return ln1 - ln2
+			}
 			// If numbers are equal, continue to the next segment.
 		} else {
 			// Non-digit comparison.
diff --git a/src/simd/archsimd/_gen/sgutil/sort_test.go b/src/simd/archsimd/_gen/sgutil/sort_test.go
index 9f74296..c86baf2 100644
--- a/src/simd/archsimd/_gen/sgutil/sort_test.go
+++ b/src/simd/archsimd/_gen/sgutil/sort_test.go
@@ -13,7 +13,7 @@
 	}{
 		{"a1", "a2", -1},
 		{"a11a", "a11b", -1},
-		{"a01a1", "a1a01", -1},
+		{"a01a1", "a1a01", 1},
 		{"a2", "a1", 1},
 		{"a10", "a2", 1},
 		{"a1", "a10", -1},
@@ -24,7 +24,7 @@
 		{"file1", "file1", 0},
 		{"file", "file1", -1},
 		{"file1", "file", 1},
-		{"a01", "a1", -1},
+		{"a01", "a1", 1},
 		{"a1a", "a1b", -1},
 	}
 
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/instruction.go b/src/simd/archsimd/_gen/simdgen/arm64/instruction.go
index 0e62e51..c968b2c 100644
--- a/src/simd/archsimd/_gen/simdgen/arm64/instruction.go
+++ b/src/simd/archsimd/_gen/simdgen/arm64/instruction.go
@@ -7,6 +7,7 @@
 import (
 	"fmt"
 	"regexp"
+	"sort"
 	"strconv"
 	"strings"
 
@@ -348,26 +349,98 @@
 		return arrangements, DefaultArngs
 	}
 
+	// Determine the arrangement shape and which symbol to extract from.
+	// For LongArngs and NarrowArngs, we need only the source-side symbol.
+	// For WideArngs, we need only the wide-side symbol.
+	ashape = instruction.ArngShape()
+	var targetSymbol string
+	if ashape == LongArngs || ashape == NarrowArngs {
+		symbols := instruction.arrangementSymbols()
+		if len(symbols) >= 2 {
+			targetSymbol = "<" + symbols[len(symbols)-1] + ">"
+		}
+	} else if ashape == WideArngs {
+		symbols := instruction.arrangementSymbols()
+		if len(symbols) >= 2 {
+			targetSymbol = "<" + symbols[0] + ">"
+		}
+	}
+
+	nonTarget := map[string]bool{}
 	for _, Explanation := range instruction.Explanations.Explanations {
 		Definition := Explanation.Definition
 		if Definition.Table.TGroup.TBody.Row != nil {
+			isTarget := targetSymbol == "" || targetSymbol == strings.TrimSpace(Explanation.Symbol.Value)
 			for _, Row := range Definition.Table.TGroup.TBody.Row {
 				for _, Entry := range Row.Entries {
 					if Entry.Class == "symbol" {
-						arrangements = append(arrangements, strings.TrimSpace(Entry.Value))
+						v := strings.TrimSpace(Entry.Value)
+						if isTarget {
+							arrangements = append(arrangements, v)
+						} else if eb, _, _ := parseArrangement(v); eb > 0 {
+							nonTarget[v] = false
+						}
 					}
 				}
 			}
 		}
 	}
 
+	verifyNonTargetArrangements(instruction.Mnemonic(), ashape, arrangements, nonTarget)
+
 	fixedArrangements := instruction.extractFixedArrangements()
 	arrangements = append(arrangements, fixedArrangements...)
 	arrangements = removeDuplicates(arrangements)
-	ashape = instruction.ArngShape()
 	return arrangements, ashape
 }
 
+// verifyNonTargetArrangements panics unless every non-target arrangement has the element width implied by
+// some target arrangement: doubled for LongArngs, halved for NarrowArngs and WideArngs (nonTarget is updated in place).
+func verifyNonTargetArrangements(mnemonic string, ashape ArngShape, target []string, nonTarget map[string]bool) {
+	if ashape == DefaultArngs || len(nonTarget) == 0 {
+		return
+	}
+	// FCVTN has a FEAT_FP8 variant not covered by NarrowArngs.
+	// The other variants are covered. NOTE(review): FCVTXN is exempted here as well; confirm the reason.
+	switch mnemonic {
+	case "FCVTN", "FCVTXN":
+		return
+	}
+	for _, t := range target {
+		eb, _, _ := parseArrangement(t)
+		if eb == 0 { // no element width parsed for this target; nothing to derive from it
+			continue
+		}
+		var expectedElemBits int
+		switch ashape {
+		case LongArngs:
+			expectedElemBits = eb * 2
+		case NarrowArngs, WideArngs:
+			expectedElemBits = eb / 2
+		}
+		if expectedElemBits == 0 { // shape not handled by the switch above (or width halved to zero)
+			continue
+		}
+		for nt := range nonTarget {
+			ntEb, _, _ := parseArrangement(nt)
+			if ntEb == expectedElemBits {
+				nonTarget[nt] = true // element width matches what this target implies
+			}
+		}
+	}
+	var unexplained []string
+	for nt, explained := range nonTarget {
+		if !explained {
+			unexplained = append(unexplained, nt)
+		}
+	}
+	if len(unexplained) > 0 {
+		sort.Strings(unexplained) // map iteration order is random; sort so the message is stable
+		panic(fmt.Sprintf("%s: non-target arrangements not explained by target: %v\ntarget: %v",
+			mnemonic, unexplained, target))
+	}
+}
+
 // regDiagramArngShape returns the expected arrangement shape based on RegDiagram for NEON.
 // Used for cross-check verification by ArngShape() only.
 func (instruction *Instruction) regDiagramArngShape() ArngShape {
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/instruction_test.go b/src/simd/archsimd/_gen/simdgen/arm64/instruction_test.go
index 1b3b92a..0524e54 100644
--- a/src/simd/archsimd/_gen/simdgen/arm64/instruction_test.go
+++ b/src/simd/archsimd/_gen/simdgen/arm64/instruction_test.go
@@ -121,6 +121,8 @@
 	integerUpTo8Bits  = []string{"int8:16B", "int8:8B", "uint8:16B", "uint8:8B"}
 	integerUpTo16Bits = append([]string{"int16:4H", "int16:8H", "uint16:4H", "uint16:8H"}, integerUpTo8Bits...)
 	integerUpTo32Bits = append([]string{"int32:2S", "int32:4S", "uint32:2S", "uint32:4S"}, integerUpTo16Bits...)
+	integerWideOnly   = []string{"int16:8H", "int32:4S", "int64:2D", "uint16:8H", "uint32:4S", "uint64:2D"}
+	polynomialArrngs  = []string{"int8:8B", "int8:16B", "int64:1D", "int64:2D", "uint8:8B", "uint8:16B", "uint64:1D", "uint64:2D"}
 	integer32And8Bits = append([]string{"int32:2S", "int32:4S", "uint32:2S", "uint32:4S"}, integerUpTo8Bits...)
 	addvArngs         = append([]string{"int32:4S", "uint32:4S"}, integerUpTo16Bits...)
 	integer           = append([]string{"int64:2D", "uint64:2D"}, integerUpTo32Bits...)
@@ -153,7 +155,7 @@
 	{"^FADDP$", matchOps(binary), requireArngs(floating, DefaultArngs), emitsDefs(3)},
 	{"^FADDP$", matchOps(unary), requireArngs([]string{"float32:2S", "float64:2D"}, DefaultArngs), emitsDefs(2)},
 	{"^SABA$", matchOps(threeArgsResultInArg0), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)},
-	{"^SABAL$", matchOps(threeArgsResultInArg0), requireArngs(integer, LongArngs), emitsDefs(14)},
+	{"^SABAL$", matchOps(threeArgsResultInArg0), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)},
 	{"^F(ADD|SUB|DIV)$", requireOps(binary), requireArngs(floating, DefaultArngs), emitsDefs(3)},
 	{"^(AND|ORR|EOR|BIC|ORN)$", matchOps(binary), requireArngs(bitwise, DefaultArngs), emitsDefs(14)},
 	{"^NOT$", requireOps(unary), requireArngs(bitwise, DefaultArngs), emitsDefs(14)},
@@ -193,11 +195,11 @@
 	{"^SHL$", requireOps(unaryWithImm), requireArngs(integer, DefaultArngs), emitsDefs(14)},
 	{"^(S|U)SHR$", requireOps(unaryWithImm), requireArngs(integer, DefaultArngs), emitsDefs(14)},
 	{"^(S|U)SRA$", requireOps(unaryWithImmResultInArg0), requireArngs(integer, DefaultArngs), emitsDefs(14)},
-	{"^(S|U)SHLL$", requireOps(unaryWithImm), requireArngs(integer, LongArngs), emitsDefs(14)},
-	{"^SADALP$", matchOps(twoArgsResultInArg0), requireArngs(integerWith1D, LongArngs), emitsDefs(16)},
-	{"^((S|U)ADDLP)$", requireOps(unary), requireArngs(integerWith1D, LongArngs), emitsDefs(16)},
-	{"^(R?(ADD|SUB)HN)$", requireOps(binary), requireArngs(integer, NarrowArngs), emitsDefs(14)},
-	{"^SHRN$", requireOps(unaryWithImm), requireArngs(integer, NarrowArngs), emitsDefs(14)},
+	{"^(S|U)SHLL$", requireOps(unaryWithImm), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)},
+	{"^SADALP$", matchOps(twoArgsResultInArg0), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)},
+	{"^((S|U)ADDLP)$", requireOps(unary), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)},
+	{"^(R?(ADD|SUB)HN)$", requireOps(binary), requireArngs(integerWideOnly, NarrowArngs), emitsDefs(6)},
+	{"^SHRN$", requireOps(unaryWithImm), requireArngs(integerWideOnly, NarrowArngs), emitsDefs(6)},
 	{"^(CLZ|CLS)$", requireOps(unary), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)},
 	{"^(CNT|RBIT)$", requireOps(unary), requireArngs(integerUpTo8Bits, DefaultArngs), emitsDefs(4)},
 	{"^(S|U)R?HADD$", matchOps(binary), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)},
@@ -205,16 +207,17 @@
 	{"^FMUL$", matchOps(binary), requireArngs(floating, DefaultArngs), emitsDefs(3)},
 	{"^F(MLA|MLS)$", matchOps(threeArgsResultInArg0), requireArngs(floating, DefaultArngs), emitsDefs(3)},
 	{"^MUL$", matchOps(binary), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)},
-	{"^((S|U)MULL)$", matchOps(binary), requireArngs(integer, LongArngs), emitsDefs(14)},
+	{"^((S|U)MULL)$", matchOps(binary), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)},
 	{"^(MLA|MLS)$", matchOps(threeArgsResultInArg0), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)},
-	{"^((S|U)Q)?XTN$", requireOps(unary), requireArngs(integer, NarrowArngs), emitsDefs(14)},
-	{"^(S|U)XTL$", requireOps(unary), requireArngs(integer, LongArngs), emitsDefs(14)},
+	{"^((S|U)Q)?XTN$", requireOps(unary), requireArngs(integerWideOnly, NarrowArngs), emitsDefs(6)},
+	{"^(S|U)XTL$", requireOps(unary), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)},
 	{"^FCVT[NMPZ](S|U)$", matchOps(unary), requireArngs(floating, DefaultArngs), emitsDefs(3)},
 	{"^(S|U)CVTF$", matchOps(unary), requireArngs(floating, DefaultArngs), emitsDefs(3)},
-	{"^(S|U)ADDW$", requireOps(binary), requireArngs(integer, WideArngs), emitsDefs(14)},
-	{"^(S|U)SUBW$", requireOps(binary), requireArngs(integer, WideArngs), emitsDefs(14)},
-	{"^FCVTL$", requireOps(unary), requireArngs(floating, LongArngs), emitsDefs(3)},
+	{"^(S|U)ADDW$", requireOps(binary), requireArngs(integerWideOnly, WideArngs), emitsDefs(6)},
+	{"^(S|U)SUBW$", requireOps(binary), requireArngs(integerWideOnly, WideArngs), emitsDefs(6)},
+	{"^FCVTL$", requireOps(unary), requireArngs([]string{"float32:2S", "float32:4S"}, LongArngs), emitsDefs(2)},
 	{"^USDOT$", matchOps(threeArgsResultInArg0), requireArngs(integer32And8Bits, UnsupportedArngs), emitsDefs(0)},
+	{"^PMULL$", matchOps(binary), requireArngs(polynomialArrngs, LongArngs), emitsDefs(8)},
 }
 
 func TestArm64Instructions(t *testing.T) {
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/operands.go b/src/simd/archsimd/_gen/simdgen/arm64/operands.go
index 30cba28..aa9e84a 100644
--- a/src/simd/archsimd/_gen/simdgen/arm64/operands.go
+++ b/src/simd/archsimd/_gen/simdgen/arm64/operands.go
@@ -93,8 +93,8 @@
 			op.Lanes = arrangement.bits / op.ElemBits
 		case ashape == WideArngs && vregPos == 2:
 			op.ElemBits = arrangement.elemBits / 2
-			op.Bits = arrangement.bits / 2
-			op.Lanes = arrangement.lanes
+			op.Bits = arrangement.bits
+			op.Lanes = arrangement.bits / op.ElemBits
 		default:
 			op.ElemBits = arrangement.elemBits
 			op.Bits = arrangement.bits
diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
index 4aba7f3..4f0751c 100644
--- a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
@@ -514,6 +514,10 @@
 			processArg(arg)
 		}
 	}
+	for _, v := range ret {
+		slices.SortFunc(v, compareSimdTypes)
+	}
+
 	return ret
 }
 
diff --git a/src/simd/archsimd/_gen/simdgen/godefs.go b/src/simd/archsimd/_gen/simdgen/godefs.go
index 732d685..0cbe42e 100644
--- a/src/simd/archsimd/_gen/simdgen/godefs.go
+++ b/src/simd/archsimd/_gen/simdgen/godefs.go
@@ -7,6 +7,7 @@
 import (
 	"fmt"
 	"log"
+	"math/rand/v2"
 	"regexp"
 	"slices"
 	"strconv"
@@ -430,6 +431,10 @@
 			if num1 > num2 {
 				return 1
 			}
+			// "1" < "01".  Don't expect it in simdgen, but just in case.
+			if ln1, ln2 := i-numStart1, j-numStart2; ln1 != ln2 {
+				return ln1 - ln2
+			}
 			// If numbers are equal, continue to the next segment.
 		} else {
 			// Non-digit comparison.
@@ -472,6 +477,11 @@
 		op.adjustAsm()
 		ops = append(ops, op)
 	}
+
+	rand.Shuffle(len(ops), func(i, j int) {
+		ops[i], ops[j] = ops[j], ops[i]
+	})
+
 	slices.SortFunc(ops, compareOperations)
 	// The parsed XED data might contain duplicates, like
 	// 512 bits VPADDP.
@@ -479,7 +489,7 @@
 	slices.SortFunc(deduped, compareOperations)
 
 	if *Verbose {
-		log.Printf("dedup len: %d\n", len(ops))
+		log.Printf("dedup len: %d, ops len: %d\n", len(deduped), len(ops))
 	}
 	var err error
 	if err = overwrite(deduped); err != nil {
@@ -507,6 +517,10 @@
 		log.Printf("dedup len: %d\n", len(deduped))
 	}
 	reportXEDInconsistency(deduped)
+
+	// Sorting again, just in case.
+	slices.SortFunc(deduped, compareOperations)
+
 	typeMap := parseSIMDTypes(deduped)
 
 	archInfo := CurrentArch()
diff --git a/src/simd/archsimd/_gen/simdgen/ops/GaloisField/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/GaloisField/categories.yaml
index 9fbe3f0..19e13be 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/GaloisField/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/GaloisField/categories.yaml
@@ -23,3 +23,19 @@
     // where the characteristic polynomial P is x^8 + x^4 + x^3 + x + 1.
 - go: carrylessMultiply
   commutative: false
+
+- go: carrylessMultiplyWidenLo
+  commutative: true
+  documentation: !string |-
+    // NAME returns the carryless (polynomial) product of the low halves
+    // of x and y.
+    //
+    // A carryless multiplication uses bitwise XOR instead of
+    // add-with-carry, for example (in base two):
+    //
+    //	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+    //
+    // This also models multiplication of polynomials with coefficients
+    // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+    // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+    // polynomial terms, but coefficients "add" with XOR.)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/GaloisField/go_arm64.yaml b/src/simd/archsimd/_gen/simdgen/ops/GaloisField/go_arm64.yaml
new file mode 100644
index 0000000..7659884
--- /dev/null
+++ b/src/simd/archsimd/_gen/simdgen/ops/GaloisField/go_arm64.yaml
@@ -0,0 +1,10 @@
+!sum
+# Polynomial (carryless) multiply long, P64 variant (2D→1Q).
+- go: carrylessMultiplyWidenLo
+  asm: VPMULL
+  hiHalfAsm: VPMULL2
+  in:
+  - go: Uint64x2
+  - go: Uint64x2
+  out:
+  - go: Uint64x2
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml
index d4a5886..646c0e5 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml
@@ -12,7 +12,7 @@
   commutative: true
   documentation: !string |-
     // NAME multiplies elements and stores the high part of the result.
-- go: MulLoLong
+- go: MulWidenLo
   commutative: true
   documentation: !string |-
     // NAME multiplies corresponding low-indexed elements and produces a result with double the element width.
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Mul/go_arm64.yaml b/src/simd/archsimd/_gen/simdgen/ops/Mul/go_arm64.yaml
index 42f00db..7dc7669 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Mul/go_arm64.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Mul/go_arm64.yaml
@@ -10,7 +10,7 @@
   - *any
 
 # Multiply long signed (SMULL)
-- go: MulLoLong
+- go: MulWidenLo
   signed: true
   asm: "VSMULL"
   hiHalfAsm: "VSMULL2"
@@ -24,7 +24,7 @@
     base: int
 
 # Multiply long unsigned (UMULL)
-- go: MulLoLong
+- go: MulWidenLo
   signed: false
   asm: "VUMULL"
   hiHalfAsm: "VUMULL2"
diff --git a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml
index 364b347..673ebc7 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml
@@ -51,13 +51,13 @@
   documentation: !string |-
     // NAME performs a right shift on each element in x by the constant number of bits
     // and narrows the result to half the element width.
-- go: ShiftLeftLoLongConst
+- go: ShiftLeftWidenLoConst
   signed: false
   commutative: false
   documentation: !string |-
     // NAME performs a left shift on each unsigned low-indexed element in x by the constant number of bits
     // and widens the result to double the element width.
-- go: ShiftLeftLoLongConst
+- go: ShiftLeftWidenLoConst
   signed: true
   commutative: false
   documentation: !string |-
diff --git a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go_arm64.yaml b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go_arm64.yaml
index 3c4c637..ed51011 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go_arm64.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go_arm64.yaml
@@ -175,7 +175,7 @@
   - go: $u
     base: uint
 
-- go: ShiftLeftLoLongConst
+- go: ShiftLeftWidenLoConst
   signed: false
   asm: "VUSHLL"
   hiHalfAsm: "VUSHLL2"
@@ -187,7 +187,7 @@
   - go: $u
     base: uint
 
-- go: ShiftLeftLoLongConst
+- go: ShiftLeftWidenLoConst
   signed: true
   asm: "VSSHLL"
   hiHalfAsm: "VSSHLL2"
diff --git a/src/simd/archsimd/_gen/simdgen/sort_test.go b/src/simd/archsimd/_gen/simdgen/sort_test.go
index 399acf0..a743477 100644
--- a/src/simd/archsimd/_gen/simdgen/sort_test.go
+++ b/src/simd/archsimd/_gen/simdgen/sort_test.go
@@ -13,7 +13,7 @@
 	}{
 		{"a1", "a2", -1},
 		{"a11a", "a11b", -1},
-		{"a01a1", "a1a01", -1},
+		{"a01a1", "a1a01", 1},
 		{"a2", "a1", 1},
 		{"a10", "a2", 1},
 		{"a1", "a10", -1},
@@ -24,7 +24,7 @@
 		{"file1", "file1", 0},
 		{"file", "file1", -1},
 		{"file1", "file", 1},
-		{"a01", "a1", -1},
+		{"a01", "a1", 1},
 		{"a1a", "a1b", -1},
 	}
 
diff --git a/src/simd/archsimd/_gen/simdgen/types.yaml b/src/simd/archsimd/_gen/simdgen/types.yaml
index 54b08c8..0e876d3 100644
--- a/src/simd/archsimd/_gen/simdgen/types.yaml
+++ b/src/simd/archsimd/_gen/simdgen/types.yaml
@@ -85,6 +85,8 @@
 
 # Special for carryless multiply
   - {class: vreg, go: Uint64x8,   base: "uint",  elemBits: 128, bits: 512, lanes: 8}
+# Result type of ARM64 carryless multiply, e.g. VPMULL V2.D1, V1.D1, V3.Q1
+  - {class: vreg, go: Uint64x2,   base: "uint",  elemBits: 128, bits: 128, lanes: 1}
 
 # Special shapes just to make VAES(ENC|DEC)(LAST)?512 work.
 # The elemBits field of these shapes are wrong, it would be overwritten by overwriteElemBits.
diff --git a/src/simd/archsimd/clmul_arm64.go b/src/simd/archsimd/clmul_arm64.go
new file mode 100644
index 0000000..5c5fa81
--- /dev/null
+++ b/src/simd/archsimd/clmul_arm64.go
@@ -0,0 +1,79 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+// CarrylessMultiplyEven computes the carryless
+// multiplications of selected even halves of the elements of x and y.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Asm: PMULL, CPU Feature: PMULL
+func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 {
+	return x.carrylessMultiplyWidenLo(y)
+}
+
+// CarrylessMultiplyOdd computes the carryless
+// multiplications of selected odd halves of the elements of x and y.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Asm: PMULL, CPU Feature: PMULL
+func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 {
+	return x.GetHi().carrylessMultiplyWidenLo(y.GetHi())
+}
+
+// CarrylessMultiplyOddEven computes the carryless
+// multiplications of selected odd half of x's elements and even half of y's elements.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Asm: PMULL, CPU Feature: PMULL
+func (x Uint64x2) CarrylessMultiplyOddEven(y Uint64x2) Uint64x2 {
+	return x.GetHi().carrylessMultiplyWidenLo(y)
+}
+
+// CarrylessMultiplyEvenOdd computes the carryless
+// multiplications of selected even half of x's elements and odd half of y's elements.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Asm: PMULL, CPU Feature: PMULL
+func (x Uint64x2) CarrylessMultiplyEvenOdd(y Uint64x2) Uint64x2 {
+	return x.carrylessMultiplyWidenLo(y.GetHi())
+}
diff --git a/src/simd/archsimd/clmul_emulated.go b/src/simd/archsimd/clmul_emulated.go
new file mode 100644
index 0000000..b78af61
--- /dev/null
+++ b/src/simd/archsimd/clmul_emulated.go
@@ -0,0 +1,103 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && (arm64 || wasm)
+
+package archsimd
+
+func new64x2(lo, hi uint64) Uint64x2 {
+	return Uint64x2{}.SetElem(0, lo).SetElem(1, hi)
+}
+
+// These masks all have 4 zeroes between 1s.
+var m0 = new64x2(0x1084210842108421, 0x2108421084210842)
+var m1 = new64x2(0x2108421084210842, 0x4210842108421084)
+var m2 = new64x2(0x4210842108421084, 0x8421084210842108)
+var m3 = new64x2(0x8421084210842108, 0x0842108421084210)
+var m4 = new64x2(0x0842108421084210, 0x1084210842108421)
+
+// Selects the middle 64 bits of a 128-bit simd value
+var middle = new64x2(0xffffffff00000000, 0x00000000ffffffff)
+
+// mwl is a 64x64 into 128 multiply that is missing
+// some carries that we don't need for CLMUL emulation.
+// The high 64 bits of each input are ignored.
+// Also just for fun, accumulate sums with Xor.
+func (x Uint64x2) mwl(y Uint64x2) Uint64x2 {
+	// reshape input into Uint32x4
+	// input is  {a b _ _}.mwl{c d _ _}
+	// need the sum of
+	// ac0_ac1
+	//   0 ad0_ad1
+	//   0 bc0_bc1
+	//   0   0 bd0_bd1
+	// This "sum" is where the carries (not propagated
+	// across lanes) are lost.
+	ab__ := x.ReshapeToUint32s()
+	cd__ := y.ReshapeToUint32s()
+	ac0_ac1_bd0_bd1 := ab__.MulWidenLo(cd__)
+
+	dc__ := y.RotateAllLeft(32).ReshapeToUint32s()
+	ad0_ad1_bc0_bc1 := ab__.MulWidenLo(dc__)
+	//
+	// have        ad0, ad1, bc0, bc1
+	// want        0, ad0+bc0, ad1+bc1, 0
+	// to add to    ac0_ac1_bd0_bd1
+	//
+	// swap 64-bit halves of ad0_ad1_bc0_bc1
+	// to get   bc0_bc1_ad0_ad1
+	bc0_bc1_ad0_ad1 := Uint64x2{}.SetElem(0, ad0_ad1_bc0_bc1.GetElem(1)).SetElem(1, ad0_ad1_bc0_bc1.GetElem(0))
+
+	// added to ad0_ad1_bc0_bc1 yields
+	//   bc0+ad0, bc1+ad1, bc0+ad0, bc1+ad1
+	// rotate 32 (within the two 64-bit elements) yields
+	//   bc1+ad1, bc0+ad0, bc1+ad1, bc0+ad0
+	// and then intersect with mask:
+	//   0      , bc0+ad0, bc1+ad1, 0
+	//
+	// use xor to make it a worse multiply
+	zzz_adPbc0_adPbc1_zzz := bc0_bc1_ad0_ad1.Xor(ad0_ad1_bc0_bc1).RotateAllLeft(32).And(middle)
+	return ac0_ac1_bd0_bd1.Xor(zzz_adPbc0_adPbc1_zzz)
+}
+
+// carrylessMultiply is a constant-time carryless multiply implemented with an
+// absurd number of multiplications. Given that the emulation platforms only have
+// 32x32 into 64, it might make sense to rework this into that primitive, but,
+// for now this works and is easily tested in scalar Go.
+func (x Uint64x2) carrylessMultiply(y Uint64x2) Uint64x2 {
+
+	// This works by masking the two inputs into 5 thinned inputs, with
+	// 4 zeroes separating any 2 set bits.  Multiply will potentially
+	// set more bits with addition of overlapping terms, however this
+	// technique allows as many as 31 additions (filling all 4 separation
+	// positions with 1) without perturbing the bits we care about.  Since
+	// there's at most 13 set bits in a thinned input, 31 is not a problem.
+	// If there were only 3 separating zeroes, there are 16 1s per thinned input
+	// and only 15 additions can be tolerated -- so that's not possible.
+
+	// This is also discussed at
+	// https://timtaubert.de/blog/2017/06/verified-binary-multiplication-for-ghash/
+
+	x0 := x.And(m0)
+	x1 := x.And(m1)
+	x2 := x.And(m2)
+	x3 := x.And(m3)
+	x4 := x.And(m4)
+
+	y0 := y.And(m0)
+	y1 := y.And(m1)
+	y2 := y.And(m2)
+	y3 := y.And(m3)
+	y4 := y.And(m4)
+
+	var z Uint64x2
+	// for a given line, combining (xI).mwl(yJ) terms, I+J == K mod 5; mask index = K
+	z = (x0.mwl(y0)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m0)
+	z = (x3.mwl(y3)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).Xor(x0.mwl(y1)).Xor(x1.mwl(y0)).And(m1).Or(z)
+	z = (x1.mwl(y1)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).Xor(x0.mwl(y2)).Xor(x2.mwl(y0)).And(m2).Or(z)
+	z = (x4.mwl(y4)).Xor(x0.mwl(y3)).Xor(x3.mwl(y0)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m3).Or(z)
+	z = (x2.mwl(y2)).Xor(x0.mwl(y4)).Xor(x4.mwl(y0)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m4).Or(z)
+
+	return z
+}
diff --git a/src/simd/archsimd/cpu_other.go b/src/simd/archsimd/cpu_other.go
new file mode 100644
index 0000000..326040f
--- /dev/null
+++ b/src/simd/archsimd/cpu_other.go
@@ -0,0 +1,21 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+import "internal/cpu"
+
+type ARM64Features struct{}
+
+var ARM64 ARM64Features
+
+// PMULL returns whether the CPU supports the PMULL feature.
+//
+// PMULL is defined on all GOARCHes, but will only return true on
+// GOARCH arm64.
+func (ARM64Features) PMULL() bool {
+	return cpu.ARM64.HasPMULL
+}
diff --git a/src/simd/archsimd/extra_amd64.go b/src/simd/archsimd/extra_amd64.go
index 9f23c22..b0dba6d 100644
--- a/src/simd/archsimd/extra_amd64.go
+++ b/src/simd/archsimd/extra_amd64.go
@@ -179,201 +179,3 @@
 //
 // Asm: VCMPPD, CPU Feature: AVX512
 func (x Float64x8) IsNaN() Mask64x8
-
-// Abs returns the absolute values of the elements of x
-//
-// Emulated, CPU Feature AVX
-func (x Float32x4) Abs() Float32x4 {
-	mask := BroadcastUint32x4(0x80000000)
-	return x.ToBits().AndNot(mask).BitsToFloat32()
-}
-
-// Abs returns the absolute values of the elements of x
-//
-// Emulated, CPU Feature AVX2
-func (x Float32x8) Abs() Float32x8 {
-	// mask will have a 1 in the sign bit UNLESS x is NaN
-	mask := BroadcastUint32x8(0x80000000)
-	return x.ToBits().AndNot(mask).BitsToFloat32()
-}
-
-// Abs returns the absolute values of the elements of x
-//
-// Emulated, CPU Feature AVX512
-func (x Float32x16) Abs() Float32x16 {
-	mask := BroadcastUint32x16(0x80000000)
-	return x.ToBits().AndNot(mask).BitsToFloat32()
-}
-
-// Abs returns the absolute values of the elements of x
-//
-// Emulated, CPU Feature AVX
-func (x Float64x2) Abs() Float64x2 {
-	// mask will have a 1 in the sign bit UNLESS x is NaN
-	mask := BroadcastUint64x2(0x8000000000000000)
-	return x.ToBits().AndNot(mask).BitsToFloat64()
-}
-
-// Abs returns the absolute values of the elements of x
-//
-// Emulated, CPU Feature AVX2
-func (x Float64x4) Abs() Float64x4 {
-	mask := BroadcastUint64x4(0x8000000000000000)
-	return x.ToBits().AndNot(mask).BitsToFloat64()
-}
-
-// Abs returns the absolute values of the elements of x
-//
-// Emulated, CPU Feature AVX512
-func (x Float64x8) Abs() Float64x8 {
-	mask := BroadcastUint64x8(0x8000000000000000)
-	return x.ToBits().AndNot(mask).BitsToFloat64()
-}
-
-// Neg returns the negation of the elements of x
-//
-// Emulated, CPU Feature AVX
-func (x Float32x4) Neg() Float32x4 {
-	mask := BroadcastUint32x4(0x80000000)
-	return x.ToBits().Xor(mask).BitsToFloat32()
-}
-
-// Neg returns the negation of the elements of x
-//
-// Emulated, CPU Feature AVX2
-func (x Float32x8) Neg() Float32x8 {
-	// mask will have a 1 in the sign bit UNLESS x is NaN
-	mask := BroadcastUint32x8(0x80000000)
-	return x.ToBits().Xor(mask).BitsToFloat32()
-}
-
-// Neg returns the negation of the elements of x
-//
-// Emulated, CPU Feature AVX512
-func (x Float32x16) Neg() Float32x16 {
-	mask := BroadcastUint32x16(0x80000000)
-	return x.ToBits().Xor(mask).BitsToFloat32()
-}
-
-// Neg returns the negation of the elements of x
-//
-// Emulated, CPU Feature AVX
-func (x Float64x2) Neg() Float64x2 {
-	// mask will have a 1 in the sign bit UNLESS x is NaN
-	mask := BroadcastUint64x2(0x8000000000000000)
-	return x.ToBits().Xor(mask).BitsToFloat64()
-}
-
-// Neg returns the negation of the elements of x
-//
-// Emulated, CPU Feature AVX2
-func (x Float64x4) Neg() Float64x4 {
-	mask := BroadcastUint64x4(0x8000000000000000)
-	return x.ToBits().Xor(mask).BitsToFloat64()
-}
-
-// Neg returns the negation of the elements of x
-//
-// Emulated, CPU Feature AVX512
-func (x Float64x8) Neg() Float64x8 {
-	mask := BroadcastUint64x8(0x8000000000000000)
-	return x.ToBits().Xor(mask).BitsToFloat64()
-}
-
-var f0x16 = [16]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}
-var f0x32 = [32]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
-	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}
-var f0x64 = [64]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
-	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
-	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
-	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}
-
-// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
-//
-// Emulated, CPU Feature: AVX
-func (x Int8x16) Mul(y Int8x16) Int8x16 {
-	mask := LoadInt8x16Array(&f0x16)
-	mask16 := mask.ToBits().ReshapeToUint16s()
-	xe := x.And(mask).ToBits().ReshapeToUint16s()
-	xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
-	ye := y.And(mask).ToBits().ReshapeToUint16s()
-	yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
-	pe := xe.Mul(ye).And(mask16)
-	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
-	return pe.Or(po).ReshapeToUint8s().BitsToInt8()
-}
-
-// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
-//
-// Emulated, CPU Feature: AVX
-func (x Uint8x16) Mul(y Uint8x16) Uint8x16 {
-	mask := LoadInt8x16Array(&f0x16).ToBits()
-	mask16 := mask.ReshapeToUint16s()
-	xe := x.And(mask).ReshapeToUint16s()
-	xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
-	ye := y.And(mask).ReshapeToUint16s()
-	yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
-	pe := xe.Mul(ye).And(mask16)
-	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
-	return pe.Or(po).ReshapeToUint8s()
-}
-
-// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
-//
-// Emulated, CPU Feature: AVX2
-func (x Int8x32) Mul(y Int8x32) Int8x32 {
-	mask := LoadInt8x32Array(&f0x32)
-	mask16 := mask.ToBits().ReshapeToUint16s()
-	xe := x.And(mask).ToBits().ReshapeToUint16s()
-	xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
-	ye := y.And(mask).ToBits().ReshapeToUint16s()
-	yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
-	pe := xe.Mul(ye).And(mask16)
-	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
-	return pe.Or(po).ReshapeToUint8s().BitsToInt8()
-}
-
-// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
-//
-// Emulated, CPU Feature: AVX512
-func (x Int8x64) Mul(y Int8x64) Int8x64 {
-	mask := LoadInt8x64Array(&f0x64)
-	mask16 := mask.ToBits().ReshapeToUint16s()
-	xe := x.And(mask).ToBits().ReshapeToUint16s()
-	xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
-	ye := y.And(mask).ToBits().ReshapeToUint16s()
-	yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
-	pe := xe.Mul(ye).And(mask16)
-	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
-	return pe.Or(po).ReshapeToUint8s().BitsToInt8()
-}
-
-// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
-//
-// Emulated, CPU Feature: AVX2
-func (x Uint8x32) Mul(y Uint8x32) Uint8x32 {
-	mask := LoadInt8x32Array(&f0x32).ToBits()
-	mask16 := mask.ReshapeToUint16s()
-	xe := x.And(mask).ReshapeToUint16s()
-	xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
-	ye := y.And(mask).ReshapeToUint16s()
-	yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
-	pe := xe.Mul(ye).And(mask16)
-	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
-	return pe.Or(po).ReshapeToUint8s()
-}
-
-// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
-//
-// Emulated, CPU Feature: AVX512
-func (x Uint8x64) Mul(y Uint8x64) Uint8x64 {
-	mask := LoadInt8x64Array(&f0x64).ToBits()
-	mask16 := mask.ReshapeToUint16s()
-	xe := x.And(mask).ReshapeToUint16s()
-	xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
-	ye := y.And(mask).ReshapeToUint16s()
-	yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
-	pe := xe.Mul(ye).And(mask16)
-	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
-	return pe.Or(po).ReshapeToUint8s()
-}
diff --git a/src/simd/archsimd/internal/simd_test/arm64_simd_test.go b/src/simd/archsimd/internal/simd_test/arm64_simd_test.go
index f9082c9..9c48a36 100644
--- a/src/simd/archsimd/internal/simd_test/arm64_simd_test.go
+++ b/src/simd/archsimd/internal/simd_test/arm64_simd_test.go
@@ -125,3 +125,20 @@
 	).Store(got)
 	checkSlices(t, got, want)
 }
+
+// TestClMul checks each CarrylessMultiply variant on x = {1, 5}, y = {3, 9}.
+func TestClMul(t *testing.T) {
+	x := archsimd.LoadUint64x2([]uint64{1, 5})
+	y := archsimd.LoadUint64x2([]uint64{3, 9})
+
+	expect := func(v archsimd.Uint64x2, want []uint64) {
+		got := make([]uint64, 2)
+		v.Store(got)
+		checkSlices[uint64](t, got, want)
+	}
+	expect(x.CarrylessMultiplyEven(y), []uint64{3, 0})
+	expect(x.CarrylessMultiplyEvenOdd(y), []uint64{9, 0})
+	expect(x.CarrylessMultiplyOddEven(y), []uint64{15, 0})
+	expect(x.CarrylessMultiplyOdd(y), []uint64{45, 0})
+	expect(y.CarrylessMultiplyEven(y), []uint64{5, 0})
+}
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index 15cab5a..eadf945 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -539,11 +539,6 @@
 
 // And performs a bitwise x & y.
 //
-// Asm: VPAND, CPU Feature: AVX
-func (x Uint64x2) And(y Uint64x2) Uint64x2
-
-// And performs a bitwise x & y.
-//
 // Asm: VPAND, CPU Feature: AVX2
 func (x Uint64x4) And(y Uint64x4) Uint64x4
 
@@ -552,6 +547,11 @@
 // Asm: VPANDQ, CPU Feature: AVX512
 func (x Uint64x8) And(y Uint64x8) Uint64x8
 
+// And performs a bitwise x & y.
+//
+// Asm: VPAND, CPU Feature: AVX
+func (x Uint64x2) And(y Uint64x2) Uint64x2
+
 /* AndNot */
 
 // AndNot performs a bitwise x &^ y.
@@ -661,11 +661,6 @@
 
 // AndNot performs a bitwise x &^ y.
 //
-// Asm: VPANDN, CPU Feature: AVX
-func (x Uint64x2) AndNot(y Uint64x2) Uint64x2
-
-// AndNot performs a bitwise x &^ y.
-//
 // Asm: VPANDN, CPU Feature: AVX2
 func (x Uint64x4) AndNot(y Uint64x4) Uint64x4
 
@@ -674,6 +669,11 @@
 // Asm: VPANDNQ, CPU Feature: AVX512
 func (x Uint64x8) AndNot(y Uint64x8) Uint64x8
 
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX
+func (x Uint64x2) AndNot(y Uint64x2) Uint64x2
+
 /* Average */
 
 // Average computes the rounded average of corresponding elements.
@@ -4584,11 +4584,6 @@
 
 // Or performs a bitwise x | y.
 //
-// Asm: VPOR, CPU Feature: AVX
-func (x Uint64x2) Or(y Uint64x2) Uint64x2
-
-// Or performs a bitwise x | y.
-//
 // Asm: VPOR, CPU Feature: AVX2
 func (x Uint64x4) Or(y Uint64x4) Uint64x4
 
@@ -4597,6 +4592,11 @@
 // Asm: VPORQ, CPU Feature: AVX512
 func (x Uint64x8) Or(y Uint64x8) Uint64x8
 
+// Or performs a bitwise x | y.
+//
+// Asm: VPOR, CPU Feature: AVX
+func (x Uint64x2) Or(y Uint64x2) Uint64x2
+
 /* Permute */
 
 // Permute permutes x.
@@ -7610,11 +7610,6 @@
 
 // Xor performs a bitwise x ^ y.
 //
-// Asm: VPXOR, CPU Feature: AVX
-func (x Uint64x2) Xor(y Uint64x2) Uint64x2
-
-// Xor performs a bitwise x ^ y.
-//
 // Asm: VPXOR, CPU Feature: AVX2
 func (x Uint64x4) Xor(y Uint64x4) Uint64x4
 
@@ -7623,6 +7618,11 @@
 // Asm: VPXORQ, CPU Feature: AVX512
 func (x Uint64x8) Xor(y Uint64x8) Uint64x8
 
+// Xor performs a bitwise x ^ y.
+//
+// Asm: VPXOR, CPU Feature: AVX
+func (x Uint64x2) Xor(y Uint64x2) Uint64x2
+
 // AsFloat64x2 reinterprets the bits of a Float32x4 vector as a Float64x2 vector
 //
 // Deprecated: use combinations of ToBits, BitsTo{Int<N>,Float<N>}, ReshapeToUint<N>
diff --git a/src/simd/archsimd/ops_arm64.go b/src/simd/archsimd/ops_arm64.go
index 2d60c69..fe6f010 100644
--- a/src/simd/archsimd/ops_arm64.go
+++ b/src/simd/archsimd/ops_arm64.go
@@ -1173,55 +1173,55 @@
 // Asm: VMLA, CPU Feature: NEON
 func (x Uint32x4) MulAdd(y Uint32x4, z Uint32x4) Uint32x4
 
-/* MulLoLong */
+/* MulWidenLo */
 
-// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width.
+// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width.
 // For the high-indexed elements, use GetHi:
 //
-//	x.GetHi().MulLoLong(y.GetHi())
+//	x.GetHi().MulWidenLo(y.GetHi())
 //
 // Asm: VSMULL, CPU Feature: NEON
-func (x Int8x16) MulLoLong(y Int8x16) Int16x8
+func (x Int8x16) MulWidenLo(y Int8x16) Int16x8
 
-// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width.
+// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width.
 // For the high-indexed elements, use GetHi:
 //
-//	x.GetHi().MulLoLong(y.GetHi())
+//	x.GetHi().MulWidenLo(y.GetHi())
 //
 // Asm: VSMULL, CPU Feature: NEON
-func (x Int16x8) MulLoLong(y Int16x8) Int32x4
+func (x Int16x8) MulWidenLo(y Int16x8) Int32x4
 
-// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width.
+// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width.
 // For the high-indexed elements, use GetHi:
 //
-//	x.GetHi().MulLoLong(y.GetHi())
+//	x.GetHi().MulWidenLo(y.GetHi())
 //
 // Asm: VSMULL, CPU Feature: NEON
-func (x Int32x4) MulLoLong(y Int32x4) Int64x2
+func (x Int32x4) MulWidenLo(y Int32x4) Int64x2
 
-// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width.
+// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width.
 // For the high-indexed elements, use GetHi:
 //
-//	x.GetHi().MulLoLong(y.GetHi())
+//	x.GetHi().MulWidenLo(y.GetHi())
 //
 // Asm: VUMULL, CPU Feature: NEON
-func (x Uint8x16) MulLoLong(y Uint8x16) Uint16x8
+func (x Uint8x16) MulWidenLo(y Uint8x16) Uint16x8
 
-// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width.
+// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width.
 // For the high-indexed elements, use GetHi:
 //
-//	x.GetHi().MulLoLong(y.GetHi())
+//	x.GetHi().MulWidenLo(y.GetHi())
 //
 // Asm: VUMULL, CPU Feature: NEON
-func (x Uint16x8) MulLoLong(y Uint16x8) Uint32x4
+func (x Uint16x8) MulWidenLo(y Uint16x8) Uint32x4
 
-// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width.
+// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width.
 // For the high-indexed elements, use GetHi:
 //
-//	x.GetHi().MulLoLong(y.GetHi())
+//	x.GetHi().MulWidenLo(y.GetHi())
 //
 // Asm: VUMULL, CPU Feature: NEON
-func (x Uint32x4) MulLoLong(y Uint32x4) Uint64x2
+func (x Uint32x4) MulWidenLo(y Uint32x4) Uint64x2
 
 /* Neg */
 
@@ -1875,74 +1875,6 @@
 // Asm: VSHL, CPU Feature: NEON
 func (x Uint64x2) ShiftLeftConst(constant uint64) Uint64x2
 
-/* ShiftLeftLoLongConst */
-
-// ShiftLeftLoLongConst performs a left shift on each signed low-indexed element in x by the constant number of bits
-// and widens the result to double the element width.
-// For the high-indexed elements, use GetHi:
-//
-//	x.GetHi().ShiftLeftLoLongConst(...)
-//
-// A non-constant value of constant may result in significantly worse performance for this operation.
-//
-// Asm: VSSHLL, CPU Feature: NEON
-func (x Int8x16) ShiftLeftLoLongConst(constant uint64) Int16x8
-
-// ShiftLeftLoLongConst performs a left shift on each signed low-indexed element in x by the constant number of bits
-// and widens the result to double the element width.
-// For the high-indexed elements, use GetHi:
-//
-//	x.GetHi().ShiftLeftLoLongConst(...)
-//
-// A non-constant value of constant may result in significantly worse performance for this operation.
-//
-// Asm: VSSHLL, CPU Feature: NEON
-func (x Int16x8) ShiftLeftLoLongConst(constant uint64) Int32x4
-
-// ShiftLeftLoLongConst performs a left shift on each signed low-indexed element in x by the constant number of bits
-// and widens the result to double the element width.
-// For the high-indexed elements, use GetHi:
-//
-//	x.GetHi().ShiftLeftLoLongConst(...)
-//
-// A non-constant value of constant may result in significantly worse performance for this operation.
-//
-// Asm: VSSHLL, CPU Feature: NEON
-func (x Int32x4) ShiftLeftLoLongConst(constant uint64) Int64x2
-
-// ShiftLeftLoLongConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits
-// and widens the result to double the element width.
-// For the high-indexed elements, use GetHi:
-//
-//	x.GetHi().ShiftLeftLoLongConst(...)
-//
-// A non-constant value of constant may result in significantly worse performance for this operation.
-//
-// Asm: VUSHLL, CPU Feature: NEON
-func (x Uint8x16) ShiftLeftLoLongConst(constant uint64) Uint16x8
-
-// ShiftLeftLoLongConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits
-// and widens the result to double the element width.
-// For the high-indexed elements, use GetHi:
-//
-//	x.GetHi().ShiftLeftLoLongConst(...)
-//
-// A non-constant value of constant may result in significantly worse performance for this operation.
-//
-// Asm: VUSHLL, CPU Feature: NEON
-func (x Uint16x8) ShiftLeftLoLongConst(constant uint64) Uint32x4
-
-// ShiftLeftLoLongConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits
-// and widens the result to double the element width.
-// For the high-indexed elements, use GetHi:
-//
-//	x.GetHi().ShiftLeftLoLongConst(...)
-//
-// A non-constant value of constant may result in significantly worse performance for this operation.
-//
-// Asm: VUSHLL, CPU Feature: NEON
-func (x Uint32x4) ShiftLeftLoLongConst(constant uint64) Uint64x2
-
 /* ShiftLeftSaturatedConst */
 
 // ShiftLeftSaturatedConst performs a saturating left shift on each element in x by the constant number of bits specified by y.
@@ -2009,6 +1941,74 @@
 // Asm: VUQSHL, CPU Feature: NEON
 func (x Uint64x2) ShiftLeftSaturatedConst(constant uint64) Uint64x2
 
+/* ShiftLeftWidenLoConst */
+
+// ShiftLeftWidenLoConst performs a left shift on each signed low-indexed element in x by the constant number of bits
+// and widens the result to double the element width.
+// For the high-indexed elements, use GetHi:
+//
+//	x.GetHi().ShiftLeftWidenLoConst(...)
+//
+// A non-constant value of constant may result in significantly worse performance for this operation.
+//
+// Asm: VSSHLL, CPU Feature: NEON
+func (x Int8x16) ShiftLeftWidenLoConst(constant uint64) Int16x8
+
+// ShiftLeftWidenLoConst performs a left shift on each signed low-indexed element in x by the constant number of bits
+// and widens the result to double the element width.
+// For the high-indexed elements, use GetHi:
+//
+//	x.GetHi().ShiftLeftWidenLoConst(...)
+//
+// A non-constant value of constant may result in significantly worse performance for this operation.
+//
+// Asm: VSSHLL, CPU Feature: NEON
+func (x Int16x8) ShiftLeftWidenLoConst(constant uint64) Int32x4
+
+// ShiftLeftWidenLoConst performs a left shift on each signed low-indexed element in x by the constant number of bits
+// and widens the result to double the element width.
+// For the high-indexed elements, use GetHi:
+//
+//	x.GetHi().ShiftLeftWidenLoConst(...)
+//
+// A non-constant value of constant may result in significantly worse performance for this operation.
+//
+// Asm: VSSHLL, CPU Feature: NEON
+func (x Int32x4) ShiftLeftWidenLoConst(constant uint64) Int64x2
+
+// ShiftLeftWidenLoConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits
+// and widens the result to double the element width.
+// For the high-indexed elements, use GetHi:
+//
+//	x.GetHi().ShiftLeftWidenLoConst(...)
+//
+// A non-constant value of constant may result in significantly worse performance for this operation.
+//
+// Asm: VUSHLL, CPU Feature: NEON
+func (x Uint8x16) ShiftLeftWidenLoConst(constant uint64) Uint16x8
+
+// ShiftLeftWidenLoConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits
+// and widens the result to double the element width.
+// For the high-indexed elements, use GetHi:
+//
+//	x.GetHi().ShiftLeftWidenLoConst(...)
+//
+// A non-constant value of constant may result in significantly worse performance for this operation.
+//
+// Asm: VUSHLL, CPU Feature: NEON
+func (x Uint16x8) ShiftLeftWidenLoConst(constant uint64) Uint32x4
+
+// ShiftLeftWidenLoConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits
+// and widens the result to double the element width.
+// For the high-indexed elements, use GetHi:
+//
+//	x.GetHi().ShiftLeftWidenLoConst(...)
+//
+// A non-constant value of constant may result in significantly worse performance for this operation.
+//
+// Asm: VUSHLL, CPU Feature: NEON
+func (x Uint32x4) ShiftLeftWidenLoConst(constant uint64) Uint64x2
+
 /* ShiftRightConst */
 
 // ShiftRightConst performs an arithmetic right shift on each element in x by the constant number of bits specified by y.
diff --git a/src/simd/archsimd/ops_emulated_amd64.go b/src/simd/archsimd/ops_emulated_amd64.go
new file mode 100644
index 0000000..cc45326
--- /dev/null
+++ b/src/simd/archsimd/ops_emulated_amd64.go
@@ -0,0 +1,205 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package archsimd
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature AVX
+func (x Float32x4) Abs() Float32x4 {
+	mask := BroadcastUint32x4(0x80000000)
+	return x.ToBits().AndNot(mask).BitsToFloat32()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature AVX2
+func (x Float32x8) Abs() Float32x8 {
+	// mask is the constant float32 sign bit (0x80000000); AndNot clears it.
+	mask := BroadcastUint32x8(0x80000000)
+	return x.ToBits().AndNot(mask).BitsToFloat32()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature AVX512
+func (x Float32x16) Abs() Float32x16 {
+	mask := BroadcastUint32x16(0x80000000)
+	return x.ToBits().AndNot(mask).BitsToFloat32()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature AVX
+func (x Float64x2) Abs() Float64x2 {
+	// mask is the constant float64 sign bit (0x8000000000000000); AndNot clears it.
+	mask := BroadcastUint64x2(0x8000000000000000)
+	return x.ToBits().AndNot(mask).BitsToFloat64()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature AVX2
+func (x Float64x4) Abs() Float64x4 {
+	mask := BroadcastUint64x4(0x8000000000000000)
+	return x.ToBits().AndNot(mask).BitsToFloat64()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature AVX512
+func (x Float64x8) Abs() Float64x8 {
+	mask := BroadcastUint64x8(0x8000000000000000)
+	return x.ToBits().AndNot(mask).BitsToFloat64()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature AVX
+func (x Float32x4) Neg() Float32x4 {
+	mask := BroadcastUint32x4(0x80000000)
+	return x.ToBits().Xor(mask).BitsToFloat32()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature AVX2
+func (x Float32x8) Neg() Float32x8 {
+	// mask is the constant float32 sign bit (0x80000000); Xor flips it.
+	mask := BroadcastUint32x8(0x80000000)
+	return x.ToBits().Xor(mask).BitsToFloat32()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature AVX512
+func (x Float32x16) Neg() Float32x16 {
+	mask := BroadcastUint32x16(0x80000000)
+	return x.ToBits().Xor(mask).BitsToFloat32()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature AVX
+func (x Float64x2) Neg() Float64x2 {
+	// mask is the constant float64 sign bit (0x8000000000000000); Xor flips it.
+	mask := BroadcastUint64x2(0x8000000000000000)
+	return x.ToBits().Xor(mask).BitsToFloat64()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature AVX2
+func (x Float64x4) Neg() Float64x4 {
+	mask := BroadcastUint64x4(0x8000000000000000)
+	return x.ToBits().Xor(mask).BitsToFloat64()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature AVX512
+func (x Float64x8) Neg() Float64x8 {
+	mask := BroadcastUint64x8(0x8000000000000000)
+	return x.ToBits().Xor(mask).BitsToFloat64()
+}
+
+var f0x16 = [16]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}
+var f0x32 = [32]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}
+var f0x64 = [64]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// Emulated, CPU Feature: AVX
+func (x Int8x16) Mul(y Int8x16) Int8x16 {
+	mask := LoadInt8x16Array(&f0x16)
+	mask16 := mask.ToBits().ReshapeToUint16s()
+	xe := x.And(mask).ToBits().ReshapeToUint16s()
+	xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ToBits().ReshapeToUint16s()
+	yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s().BitsToInt8()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// Emulated, CPU Feature: AVX
+func (x Uint8x16) Mul(y Uint8x16) Uint8x16 {
+	mask := LoadInt8x16Array(&f0x16).ToBits()
+	mask16 := mask.ReshapeToUint16s()
+	xe := x.And(mask).ReshapeToUint16s()
+	xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ReshapeToUint16s()
+	yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// Emulated, CPU Feature: AVX2
+func (x Int8x32) Mul(y Int8x32) Int8x32 {
+	mask := LoadInt8x32Array(&f0x32)
+	mask16 := mask.ToBits().ReshapeToUint16s()
+	xe := x.And(mask).ToBits().ReshapeToUint16s()
+	xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ToBits().ReshapeToUint16s()
+	yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s().BitsToInt8()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// Emulated, CPU Feature: AVX512
+func (x Int8x64) Mul(y Int8x64) Int8x64 {
+	mask := LoadInt8x64Array(&f0x64)
+	mask16 := mask.ToBits().ReshapeToUint16s()
+	xe := x.And(mask).ToBits().ReshapeToUint16s()
+	xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ToBits().ReshapeToUint16s()
+	yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s().BitsToInt8()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// Emulated, CPU Feature: AVX2
+func (x Uint8x32) Mul(y Uint8x32) Uint8x32 {
+	mask := LoadInt8x32Array(&f0x32).ToBits()
+	mask16 := mask.ReshapeToUint16s()
+	xe := x.And(mask).ReshapeToUint16s()
+	xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ReshapeToUint16s()
+	yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// Emulated, CPU Feature: AVX512
+func (x Uint8x64) Mul(y Uint8x64) Uint8x64 {
+	mask := LoadInt8x64Array(&f0x64).ToBits()
+	mask16 := mask.ReshapeToUint16s()
+	xe := x.And(mask).ReshapeToUint16s()
+	xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ReshapeToUint16s()
+	yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s()
+}
diff --git a/src/simd/archsimd/ops_emulated_wasm.go b/src/simd/archsimd/ops_emulated_wasm.go
index b8dbe50..46180f1 100644
--- a/src/simd/archsimd/ops_emulated_wasm.go
+++ b/src/simd/archsimd/ops_emulated_wasm.go
@@ -162,3 +162,41 @@
 func (x Uint64x2) OnesCount() Uint64x2 {
 	return x.BitsToInt64().OnesCount().ToBits()
 }
+
+// CarrylessMultiplyEven computes the carryless multiplication of the
+// even halves of x and y; it forwards directly to carrylessMultiply.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1, modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Emulated
+func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 {
+	return x.carrylessMultiply(y)
+}
+
+// CarrylessMultiplyOdd computes the carryless multiplication of the odd
+// halves of x and y: element 1 of each operand is moved to element 0 first.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1, modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Emulated
+func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 {
+	x = x.SetElem(0, x.GetElem(1))
+	y = y.SetElem(0, y.GetElem(1)) // y's own element 1, not x's
+	return x.carrylessMultiply(y)
+}
diff --git a/src/simd/archsimd/ops_internal_arm64.go b/src/simd/archsimd/ops_internal_arm64.go
index 277e581..69da701 100644
--- a/src/simd/archsimd/ops_internal_arm64.go
+++ b/src/simd/archsimd/ops_internal_arm64.go
@@ -85,3 +85,24 @@
 //
 // Asm: VDUP, CPU Feature: NEON
 func (x Uint8x16) broadcast1To16() Uint8x16
+
+/* carrylessMultiplyWidenLo */
+
+// carrylessMultiplyWidenLo returns the carryless (polynomial) product of the low halves of x and y.
+//
+// A carryless multiplication uses bitwise XOR instead of add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients from GF(2) --
+// 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = x**2 + 0x + 1 = x**2 + 1 modeled by 101.
+// (Note that "+" adds polynomial terms, but coefficients "add" with XOR.)
+//
+// For the high-indexed elements, use GetHi:
+//
+//	x.GetHi().carrylessMultiplyWidenLo(y.GetHi())
+//
+// NOTE(review): Arm gates the 64-bit form of PMULL on FEAT_PMULL; confirm NEON is the intended feature.
+//
+// Asm: VPMULL, CPU Feature: NEON
+func (x Uint64x2) carrylessMultiplyWidenLo(y Uint64x2) Uint64x2
diff --git a/src/simd/internal/bridge/decls_amd64.go b/src/simd/internal/bridge/decls_amd64.go
index f8d3921..c8d41ea 100644
--- a/src/simd/internal/bridge/decls_amd64.go
+++ b/src/simd/internal/bridge/decls_amd64.go
@@ -2926,6 +2926,30 @@
 	return Int64x8((archsimd.Uint64x8(x)).BitsToInt64())
 }
 
+func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 {
+	return Uint64x2(archsimd.Uint64x2(x).CarrylessMultiplyEven(archsimd.Uint64x2(y)))
+}
+
+func (x Uint64x4) CarrylessMultiplyEven(y Uint64x4) Uint64x4 {
+	return Uint64x4(archsimd.Uint64x4(x).CarrylessMultiplyEven(archsimd.Uint64x4(y)))
+}
+
+func (x Uint64x8) CarrylessMultiplyEven(y Uint64x8) Uint64x8 {
+	return Uint64x8(archsimd.Uint64x8(x).CarrylessMultiplyEven(archsimd.Uint64x8(y)))
+}
+
+func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 {
+	return Uint64x2(archsimd.Uint64x2(x).CarrylessMultiplyOdd(archsimd.Uint64x2(y)))
+}
+
+func (x Uint64x4) CarrylessMultiplyOdd(y Uint64x4) Uint64x4 {
+	return Uint64x4(archsimd.Uint64x4(x).CarrylessMultiplyOdd(archsimd.Uint64x4(y)))
+}
+
+func (x Uint64x8) CarrylessMultiplyOdd(y Uint64x8) Uint64x8 {
+	return Uint64x8(archsimd.Uint64x8(x).CarrylessMultiplyOdd(archsimd.Uint64x8(y)))
+}
+
 func (x Uint64x2) ConvertToInt64() Int64x2 {
 	return Int64x2((archsimd.Uint64x2(x)).ConvertToInt64())
 }
diff --git a/src/simd/internal/bridge/decls_arm64.go b/src/simd/internal/bridge/decls_arm64.go
index bdf2a87..b2f7c07 100644
--- a/src/simd/internal/bridge/decls_arm64.go
+++ b/src/simd/internal/bridge/decls_arm64.go
@@ -982,6 +982,14 @@
 	return Int64x2((archsimd.Uint64x2(x)).BitsToInt64())
 }
 
+func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 {
+	return Uint64x2(archsimd.Uint64x2(x).CarrylessMultiplyEven(archsimd.Uint64x2(y)))
+}
+
+func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 {
+	return Uint64x2(archsimd.Uint64x2(x).CarrylessMultiplyOdd(archsimd.Uint64x2(y)))
+}
+
 func (x Uint64x2) ConvertToInt64() Int64x2 {
 	return Int64x2((archsimd.Uint64x2(x)).ConvertToInt64())
 }
diff --git a/src/simd/internal/bridge/decls_wasm.go b/src/simd/internal/bridge/decls_wasm.go
index 0818cac..daba992 100644
--- a/src/simd/internal/bridge/decls_wasm.go
+++ b/src/simd/internal/bridge/decls_wasm.go
@@ -982,6 +982,14 @@
 	return Int64x2((archsimd.Uint64x2(x)).BitsToInt64())
 }
 
+func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 {
+	return Uint64x2(archsimd.Uint64x2(x).CarrylessMultiplyEven(archsimd.Uint64x2(y)))
+}
+
+func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 {
+	return Uint64x2(archsimd.Uint64x2(x).CarrylessMultiplyOdd(archsimd.Uint64x2(y)))
+}
+
 func (x Uint64x2) ConvertToInt64() Int64x2 {
 	return Int64x2((archsimd.Uint64x2(x)).ConvertToInt64())
 }
diff --git a/src/simd/internal/bridge/simd_emulated.go b/src/simd/internal/bridge/simd_emulated.go
new file mode 100644
index 0000000..64a728b
--- /dev/null
+++ b/src/simd/internal/bridge/simd_emulated.go
@@ -0,0 +1,3222 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && (amd64 || wasm || arm64)
+
+package bridge
+
+import (
+	"fmt"
+	"math"
+	"math/bits"
+)
+
+// VectorBitSize returns the bit length of the emulated vector (fixed to 128).
+func VectorBitSize() int {
+	return 128
+}
+
+// Emulated reports whether simd is emulated; this implementation always returns true.
+func Emulated() bool {
+	return true
+}
+
+type _simd struct {
+	_ [0]func(*_simd) *_simd // zero-size; the func element type makes the struct non-comparable (presumably the intent — TODO confirm)
+}
+
+// Int8s represents a 128-bit vector of 16 int8 elements.
+type Int8s struct {
+	_    _simd
+	a, b uint64 // a packs elements 0-7 and b elements 8-15, lowest index in the low byte (see get/set)
+}
+
+// LoadInt8s loads the first 16 elements of s into an Int8s vector.
+// Element i lands in byte i%8 of the first (i < 8) or second word.
+// It panics if s holds fewer than 16 elements.
+func LoadInt8s(s []int8) Int8s {
+	var lo, hi uint64
+	for i := 0; i < 8; i++ {
+		lo |= uint64(uint8(s[i])) << (8 * i)
+	}
+	for i := 8; i < 16; i++ {
+		hi |= uint64(uint8(s[i])) << (8 * (i - 8))
+	}
+	return Int8s{a: lo, b: hi}
+}
+
+// LoadInt8sPart loads at most 16 elements from the front of s into an
+// Int8s vector and returns it together with the number of elements
+// loaded. Lanes beyond that count are zero.
+func LoadInt8sPart(s []int8) (Int8s, int) {
+	n := min(len(s), 16)
+	var v Int8s
+	for i, e := range s[:n] {
+		lane := uint64(uint8(e))
+		switch {
+		case i < 8:
+			v.a |= lane << (8 * i)
+		default:
+			v.b |= lane << (8 * (i - 8))
+		}
+	}
+	return v, n
+}
+
+func (x Int8s) get(i int) int8 {
+	if i >= 8 { // elements 8-15 are stored in b
+		return int8(x.b >> (8 * (i - 8)))
+	}
+	return int8(x.a >> (8 * i))
+}
+
+func (x *Int8s) set(i int, v int8) {
+	// Select the word holding element i and the bit offset inside it.
+	word, shift := &x.a, 8*i
+	if i >= 8 {
+		word, shift = &x.b, 8*(i-8)
+	}
+	lane := uint64(uint8(v)) << shift
+	hole := uint64(0xff) << shift
+	*word = (*word &^ hole) | lane
+}
+
+// Abs returns the element-wise absolute value of x. Negating the minimum
+// int8 wraps around, so that element is returned unchanged.
+func (x Int8s) Abs() Int8s {
+	var out Int8s
+	for lane := 0; lane < 16; lane++ {
+		e := x.get(lane)
+		if e < 0 {
+			e = -e
+		}
+		out.set(lane, e)
+	}
+	return out
+}
+
+// Add returns the element-wise sum of x and y.
+func (x Int8s) Add(y Int8s) Int8s {
+	var res Int8s
+	for i := 0; i < 16; i++ {
+		res.set(i, x.get(i)+y.get(i))
+	}
+	return res
+}
+
+// AddSaturated returns the element-wise sum of x and y, clamped to the int8 range instead of wrapping.
+func (x Int8s) AddSaturated(y Int8s) Int8s {
+	var out Int8s
+	for lane := 0; lane < 16; lane++ {
+		wide := int(x.get(lane)) + int(y.get(lane))
+		switch {
+		case wide > math.MaxInt8:
+			wide = math.MaxInt8
+		case wide < math.MinInt8:
+			wide = math.MinInt8
+		}
+		out.set(lane, int8(wide))
+	}
+	return out
+}
+
+// And returns the bitwise AND of x and y, computed on both 64-bit words.
+func (x Int8s) And(y Int8s) Int8s {
+	return Int8s{a: y.a & x.a, b: y.b & x.b}
+}
+
+// AndNot returns x with every bit cleared that is set in y (x AND NOT y).
+func (x Int8s) AndNot(y Int8s) Int8s {
+	return Int8s{a: x.a & ^y.a, b: x.b & ^y.b}
+}
+
+// Equal returns a mask that is set in every lane where x and y hold equal elements.
+func (x Int8s) Equal(y Int8s) Mask8s {
+	var m Mask8s
+	for lane := 0; lane < 16; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe == ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Greater returns a mask that is set in every lane where the element of x exceeds that of y.
+func (x Int8s) Greater(y Int8s) Mask8s {
+	var m Mask8s
+	for lane := 0; lane < 16; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe > ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// GreaterEqual returns a mask that is set in every lane where the element of x is at least that of y.
+func (x Int8s) GreaterEqual(y Int8s) Mask8s {
+	var m Mask8s
+	for lane := 0; lane < 16; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe >= ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Less returns a mask that is set in every lane where the element of x is below that of y.
+func (x Int8s) Less(y Int8s) Mask8s {
+	var m Mask8s
+	for lane := 0; lane < 16; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe < ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// LessEqual returns a mask that is set in every lane where the element of x is at most that of y.
+func (x Int8s) LessEqual(y Int8s) Mask8s {
+	var m Mask8s
+	for lane := 0; lane < 16; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe <= ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// NotEqual returns a mask that is set in every lane where x and y hold different elements.
+func (x Int8s) NotEqual(y Int8s) Mask8s {
+	var m Mask8s
+	for lane := 0; lane < 16; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe != ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Len returns the number of elements in the vector, which is always 16.
+func (x Int8s) Len() int {
+	return 128 / 8 // vector bits / element bits
+}
+
+// Masked returns x with every bit cleared where the corresponding bit of mask is clear.
+func (x Int8s) Masked(mask Mask8s) Int8s {
+	return Int8s{a: mask.a & x.a, b: mask.b & x.b}
+}
+
+// Max returns the element-wise maximum of x and y: every element of the
+// result is the larger of the two corresponding input elements, compared
+// as signed 8-bit integers.
+func (x Int8s) Max(y Int8s) Int8s {
+	var out Int8s
+	for lane := 0; lane < 16; lane++ {
+		xe := x.get(lane)
+		ye := y.get(lane)
+		// The built-in max on integers is exactly the explicit
+		// "pick the larger operand" comparison.
+		out.set(lane, max(xe, ye))
+	}
+	return out
+}
+
+// Mul returns the element-wise product of x and y.
+func (x Int8s) Mul(y Int8s) Int8s {
+	var res Int8s
+	for i := 0; i < 16; i++ {
+		res.set(i, x.get(i)*y.get(i))
+	}
+	return res
+}
+
+// IfElse returns a vector that takes each bit from x where the matching
+// bit of mask is set and from y where it is clear.
+func (x Int8s) IfElse(mask Mask8s, y Int8s) Int8s {
+	x.a = (x.a & mask.a) | (y.a &^ mask.a)
+	x.b = (x.b & mask.b) | (y.b &^ mask.b)
+	return x
+}
+
+// Min returns the element-wise minimum of x and y: every element of the
+// result is the smaller of the two corresponding input elements, compared
+// as signed 8-bit integers.
+func (x Int8s) Min(y Int8s) Int8s {
+	var out Int8s
+	for lane := 0; lane < 16; lane++ {
+		xe := x.get(lane)
+		ye := y.get(lane)
+		// The built-in min on integers is exactly the explicit
+		// "pick the smaller operand" comparison.
+		out.set(lane, min(xe, ye))
+	}
+	return out
+}
+
+// Neg returns the element-wise negation of x.
+func (x Int8s) Neg() Int8s {
+	var res Int8s
+	for i := 0; i < 16; i++ {
+		res.set(i, -x.get(i))
+	}
+	return res
+}
+
+// Not returns the bitwise NOT of x: every bit of both 64-bit words is inverted.
+func (x Int8s) Not() Int8s {
+	return Int8s{a: ^x.a, b: ^x.b}
+}
+
+// Or returns the bitwise OR of x and y, computed on both 64-bit words.
+func (x Int8s) Or(y Int8s) Int8s {
+	return Int8s{a: y.a | x.a, b: y.b | x.b}
+}
+
+// Store writes the elements of x to s, stopping after 16 elements or at the end of s, whichever comes first.
+func (x Int8s) Store(s []int8) {
+	for i := range s[:min(len(s), 16)] {
+		s[i] = x.get(i)
+	}
+}
+
+// StorePart stores as many leading elements of x as fit in s; it simply calls Store.
+func (x Int8s) StorePart(s []int8) {
+	x.Store(s)
+}
+
+// String formats the 16 elements of x, in index order, as fmt prints a [16]int8 array.
+func (x Int8s) String() string {
+	var elems [16]int8
+	for i := range elems {
+		elems[i] = x.get(i)
+	}
+	return fmt.Sprint(elems)
+}
+
+// Sub returns the element-wise difference of x and y.
+func (x Int8s) Sub(y Int8s) Int8s {
+	var res Int8s
+	for i := 0; i < 16; i++ {
+		res.set(i, x.get(i)-y.get(i))
+	}
+	return res
+}
+
+// SubSaturated returns the element-wise difference x-y, clamped to the int8 range instead of wrapping.
+func (x Int8s) SubSaturated(y Int8s) Int8s {
+	var out Int8s
+	for lane := 0; lane < 16; lane++ {
+		wide := int(x.get(lane)) - int(y.get(lane))
+		switch {
+		case wide > math.MaxInt8:
+			wide = math.MaxInt8
+		case wide < math.MinInt8:
+			wide = math.MinInt8
+		}
+		out.set(lane, int8(wide))
+	}
+	return out
+}
+
+// ToMask returns a mask that is set in every lane where the element of x is non-zero.
+func (x Int8s) ToMask() Mask8s {
+	var m Mask8s
+	for lane := 0; lane < 16; lane++ {
+		if e := x.get(lane); e != 0 {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Xor returns the bitwise XOR of x and y, computed on both 64-bit words.
+func (x Int8s) Xor(y Int8s) Int8s {
+	return Int8s{a: y.a ^ x.a, b: y.b ^ x.b}
+}
+
+// ConvertToUint8 converts the elements to uint8 by reusing their bit patterns, exactly as ToBits does.
+func (x Int8s) ConvertToUint8() Uint8s {
+	return x.ToBits()
+}
+
+// ToBits reinterprets the vector bits as a Uint8s vector; both 64-bit words are copied unchanged.
+func (x Int8s) ToBits() Uint8s {
+	return Uint8s{a: x.a, b: x.b}
+}
+
+// Int16s represents a 128-bit vector of 8 int16 elements.
+type Int16s struct {
+	_    _simd
+	a, b uint64 // a packs elements 0-3 and b elements 4-7, lowest index in the low 16 bits (see get/set)
+}
+
+// LoadInt16s loads the first 8 elements of s into an Int16s vector.
+// Element i lands in 16-bit slot i%4 of the first (i < 4) or second word.
+// It panics if s holds fewer than 8 elements.
+func LoadInt16s(s []int16) Int16s {
+	var lo, hi uint64
+	for i := 0; i < 4; i++ {
+		lo |= uint64(uint16(s[i])) << (16 * i)
+	}
+	for i := 4; i < 8; i++ {
+		hi |= uint64(uint16(s[i])) << (16 * (i - 4))
+	}
+	return Int16s{a: lo, b: hi}
+}
+
+// LoadInt16sPart loads at most 8 elements from the front of s into an
+// Int16s vector and returns it together with the number of elements
+// loaded. Lanes beyond that count are zero.
+func LoadInt16sPart(s []int16) (Int16s, int) {
+	n := min(len(s), 8)
+	var v Int16s
+	for i, e := range s[:n] {
+		lane := uint64(uint16(e))
+		switch {
+		case i < 4:
+			v.a |= lane << (16 * i)
+		default:
+			v.b |= lane << (16 * (i - 4))
+		}
+	}
+	return v, n
+}
+
+func (x Int16s) get(i int) int16 {
+	if i >= 4 { // elements 4-7 are stored in b
+		return int16(x.b >> (16 * (i - 4)))
+	}
+	return int16(x.a >> (16 * i))
+}
+
+func (x *Int16s) set(i int, v int16) {
+	// Select the word holding element i and the bit offset inside it.
+	word, shift := &x.a, 16*i
+	if i >= 4 {
+		word, shift = &x.b, 16*(i-4)
+	}
+	lane := uint64(uint16(v)) << shift
+	hole := uint64(0xffff) << shift
+	*word = (*word &^ hole) | lane
+}
+
+// Abs returns the element-wise absolute value of x. Negating the minimum
+// int16 wraps around, so that element is returned unchanged.
+func (x Int16s) Abs() Int16s {
+	var out Int16s
+	for lane := 0; lane < 8; lane++ {
+		e := x.get(lane)
+		if e < 0 {
+			e = -e
+		}
+		out.set(lane, e)
+	}
+	return out
+}
+
+// Add returns the element-wise sum of x and y.
+func (x Int16s) Add(y Int16s) Int16s {
+	var res Int16s
+	for i := 0; i < 8; i++ {
+		res.set(i, x.get(i)+y.get(i))
+	}
+	return res
+}
+
+// AddSaturated returns the element-wise sum of x and y, clamped to the int16 range instead of wrapping.
+func (x Int16s) AddSaturated(y Int16s) Int16s {
+	var out Int16s
+	for lane := 0; lane < 8; lane++ {
+		wide := int(x.get(lane)) + int(y.get(lane))
+		switch {
+		case wide > math.MaxInt16:
+			wide = math.MaxInt16
+		case wide < math.MinInt16:
+			wide = math.MinInt16
+		}
+		out.set(lane, int16(wide))
+	}
+	return out
+}
+
+// And returns the bitwise AND of x and y, computed on both 64-bit words.
+func (x Int16s) And(y Int16s) Int16s {
+	return Int16s{a: y.a & x.a, b: y.b & x.b}
+}
+
+// AndNot returns x with every bit cleared that is set in y (x AND NOT y).
+func (x Int16s) AndNot(y Int16s) Int16s {
+	return Int16s{a: x.a & ^y.a, b: x.b & ^y.b}
+}
+
+// Equal returns a mask that is set in every lane where x and y hold equal elements.
+func (x Int16s) Equal(y Int16s) Mask16s {
+	var m Mask16s
+	for lane := 0; lane < 8; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe == ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Greater returns a mask that is set in every lane where the element of x exceeds that of y.
+func (x Int16s) Greater(y Int16s) Mask16s {
+	var m Mask16s
+	for lane := 0; lane < 8; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe > ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// GreaterEqual returns a mask that is set in every lane where the element of x is at least that of y.
+func (x Int16s) GreaterEqual(y Int16s) Mask16s {
+	var m Mask16s
+	for lane := 0; lane < 8; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe >= ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Less returns a mask that is set in every lane where the element of x is below that of y.
+func (x Int16s) Less(y Int16s) Mask16s {
+	var m Mask16s
+	for lane := 0; lane < 8; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe < ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// LessEqual returns a mask that is set in every lane where the element of x is at most that of y.
+func (x Int16s) LessEqual(y Int16s) Mask16s {
+	var m Mask16s
+	for lane := 0; lane < 8; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe <= ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// NotEqual returns a mask that is set in every lane where x and y hold different elements.
+func (x Int16s) NotEqual(y Int16s) Mask16s {
+	var m Mask16s
+	for lane := 0; lane < 8; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe != ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Len returns the number of elements in the vector, which is always 8.
+func (x Int16s) Len() int {
+	return 128 / 16 // vector bits / element bits
+}
+
+// Masked returns x with every bit cleared where the corresponding bit of mask is clear.
+func (x Int16s) Masked(mask Mask16s) Int16s {
+	return Int16s{a: mask.a & x.a, b: mask.b & x.b}
+}
+
+// Max returns the element-wise maximum of x and y: every element of the
+// result is the larger of the two corresponding input elements, compared
+// as signed 16-bit integers.
+func (x Int16s) Max(y Int16s) Int16s {
+	var out Int16s
+	for lane := 0; lane < 8; lane++ {
+		xe := x.get(lane)
+		ye := y.get(lane)
+		// The built-in max on integers is exactly the explicit
+		// "pick the larger operand" comparison.
+		out.set(lane, max(xe, ye))
+	}
+	return out
+}
+
+// IfElse returns a vector that takes each bit from x where the matching
+// bit of mask is set and from y where it is clear.
+func (x Int16s) IfElse(mask Mask16s, y Int16s) Int16s {
+	x.a = (x.a & mask.a) | (y.a &^ mask.a)
+	x.b = (x.b & mask.b) | (y.b &^ mask.b)
+	return x
+}
+
+// Min returns the element-wise minimum of x and y: every element of the
+// result is the smaller of the two corresponding input elements, compared
+// as signed 16-bit integers.
+func (x Int16s) Min(y Int16s) Int16s {
+	var out Int16s
+	for lane := 0; lane < 8; lane++ {
+		xe := x.get(lane)
+		ye := y.get(lane)
+		// The built-in min on integers is exactly the explicit
+		// "pick the smaller operand" comparison.
+		out.set(lane, min(xe, ye))
+	}
+	return out
+}
+
+// Mul returns the element-wise product of x and y.
+func (x Int16s) Mul(y Int16s) Int16s {
+	var res Int16s
+	for i := 0; i < 8; i++ {
+		res.set(i, x.get(i)*y.get(i))
+	}
+	return res
+}
+
+// Neg returns the element-wise negation of x.
+func (x Int16s) Neg() Int16s {
+	var res Int16s
+	for i := 0; i < 8; i++ {
+		res.set(i, -x.get(i))
+	}
+	return res
+}
+
+// Not returns the bitwise NOT of x: every bit of both 64-bit words is inverted.
+func (x Int16s) Not() Int16s {
+	return Int16s{a: ^x.a, b: ^x.b}
+}
+
+// Or returns the bitwise OR of x and y, computed on both 64-bit words.
+func (x Int16s) Or(y Int16s) Int16s {
+	return Int16s{a: y.a | x.a, b: y.b | x.b}
+}
+
+// ShiftAllLeft shifts all elements left by y bits.
+func (x Int16s) ShiftAllLeft(y uint8) Int16s {
+	var res Int16s
+	for i := 0; i < 8; i++ {
+		res.set(i, x.get(i)<<y)
+	}
+	return res
+}
+
+// ShiftAllRight shifts all elements right by y bits.
+func (x Int16s) ShiftAllRight(y uint8) Int16s {
+	var res Int16s
+	for i := 0; i < 8; i++ {
+		res.set(i, x.get(i)>>y)
+	}
+	return res
+}
+
+// RotateAllLeft rotates all elements left by dist bits.
+func (x Int16s) RotateAllLeft(dist uint64) Int16s {
+	var res Int16s
+	d := dist & 15
+	for i := 0; i < 8; i++ {
+		u := uint16(x.get(i))
+		r := (u << d) | (u >> ((16 - d) & 15))
+		res.set(i, int16(r))
+	}
+	return res
+}
+
+// RotateAllRight rotates all elements right by dist bits.
+func (x Int16s) RotateAllRight(dist uint64) Int16s {
+	var res Int16s
+	d := dist & 15
+	for i := 0; i < 8; i++ {
+		u := uint16(x.get(i))
+		r := (u >> d) | (u << ((16 - d) & 15))
+		res.set(i, int16(r))
+	}
+	return res
+}
+
+// Store writes the elements of x to s, stopping after 8 elements or at the end of s, whichever comes first.
+func (x Int16s) Store(s []int16) {
+	for i := range s[:min(len(s), 8)] {
+		s[i] = x.get(i)
+	}
+}
+
+// StorePart stores as many leading elements of x as fit in s; it simply calls Store.
+func (x Int16s) StorePart(s []int16) {
+	x.Store(s)
+}
+
+// String formats the 8 elements of x, in index order, as fmt prints a [8]int16 array.
+func (x Int16s) String() string {
+	var elems [8]int16
+	for i := range elems {
+		elems[i] = x.get(i)
+	}
+	return fmt.Sprint(elems)
+}
+
+// Sub returns the element-wise difference of x and y.
+func (x Int16s) Sub(y Int16s) Int16s {
+	var res Int16s
+	for i := 0; i < 8; i++ {
+		res.set(i, x.get(i)-y.get(i))
+	}
+	return res
+}
+
+// SubSaturated returns the element-wise difference x-y, clamped to the int16 range instead of wrapping.
+func (x Int16s) SubSaturated(y Int16s) Int16s {
+	var out Int16s
+	for lane := 0; lane < 8; lane++ {
+		wide := int(x.get(lane)) - int(y.get(lane))
+		switch {
+		case wide > math.MaxInt16:
+			wide = math.MaxInt16
+		case wide < math.MinInt16:
+			wide = math.MinInt16
+		}
+		out.set(lane, int16(wide))
+	}
+	return out
+}
+
+// ToMask returns a mask that is set in every lane where the element of x is non-zero.
+func (x Int16s) ToMask() Mask16s {
+	var m Mask16s
+	for lane := 0; lane < 8; lane++ {
+		if e := x.get(lane); e != 0 {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Xor returns the bitwise XOR of x and y, computed on both 64-bit words.
+func (x Int16s) Xor(y Int16s) Int16s {
+	return Int16s{a: y.a ^ x.a, b: y.b ^ x.b}
+}
+
+// ConvertToUint16 converts the elements to uint16 by reusing their bit patterns, exactly as ToBits does.
+func (x Int16s) ConvertToUint16() Uint16s {
+	return x.ToBits()
+}
+
+// ToBits reinterprets the vector bits as a Uint16s vector; both 64-bit words are copied unchanged.
+func (x Int16s) ToBits() Uint16s {
+	return Uint16s{a: x.a, b: x.b}
+}
+
+// Int32s represents a 128-bit vector of 4 int32 elements.
+type Int32s struct {
+	_    _simd
+	a, b uint64 // a packs elements 0-1 and b elements 2-3, lowest index in the low 32 bits (see get/set)
+}
+
+// LoadInt32s loads the first 4 elements of s into an Int32s vector.
+// Element i lands in 32-bit slot i%2 of the first (i < 2) or second word.
+// It panics if s holds fewer than 4 elements.
+func LoadInt32s(s []int32) Int32s {
+	var lo, hi uint64
+	for i := 0; i < 2; i++ {
+		lo |= uint64(uint32(s[i])) << (32 * i)
+	}
+	for i := 2; i < 4; i++ {
+		hi |= uint64(uint32(s[i])) << (32 * (i - 2))
+	}
+	return Int32s{a: lo, b: hi}
+}
+
+// LoadInt32sPart loads at most 4 elements from the front of s into an
+// Int32s vector and returns it together with the number of elements
+// loaded. Lanes beyond that count are zero.
+func LoadInt32sPart(s []int32) (Int32s, int) {
+	n := min(len(s), 4)
+	var v Int32s
+	for i, e := range s[:n] {
+		lane := uint64(uint32(e))
+		switch {
+		case i < 2:
+			v.a |= lane << (32 * i)
+		default:
+			v.b |= lane << (32 * (i - 2))
+		}
+	}
+	return v, n
+}
+
+func (x Int32s) get(i int) int32 {
+	if i >= 2 { // elements 2-3 are stored in b
+		return int32(x.b >> (32 * (i - 2)))
+	}
+	return int32(x.a >> (32 * i))
+}
+
+func (x *Int32s) set(i int, v int32) {
+	// Select the word holding element i and the bit offset inside it.
+	word, shift := &x.a, 32*i
+	if i >= 2 {
+		word, shift = &x.b, 32*(i-2)
+	}
+	lane := uint64(uint32(v)) << shift
+	hole := uint64(0xffffffff) << shift
+	*word = (*word &^ hole) | lane
+}
+
+// Abs returns the element-wise absolute value of x. Negating the minimum
+// int32 wraps around, so that element is returned unchanged.
+func (x Int32s) Abs() Int32s {
+	var out Int32s
+	for lane := 0; lane < 4; lane++ {
+		e := x.get(lane)
+		if e < 0 {
+			e = -e
+		}
+		out.set(lane, e)
+	}
+	return out
+}
+
+// Add returns the element-wise sum of x and y.
+func (x Int32s) Add(y Int32s) Int32s {
+	var res Int32s
+	for i := 0; i < 4; i++ {
+		res.set(i, x.get(i)+y.get(i))
+	}
+	return res
+}
+
+// And returns the bitwise AND of x and y, computed on both 64-bit words.
+func (x Int32s) And(y Int32s) Int32s {
+	return Int32s{a: y.a & x.a, b: y.b & x.b}
+}
+
+// AndNot returns x with every bit cleared that is set in y (x AND NOT y).
+func (x Int32s) AndNot(y Int32s) Int32s {
+	return Int32s{a: x.a & ^y.a, b: x.b & ^y.b}
+}
+
+// ConvertToFloat32 converts every element of x to float32 using Go's numeric conversion.
+func (x Int32s) ConvertToFloat32() Float32s {
+	var out Float32s
+	for lane := 0; lane < 4; lane++ {
+		out.set(lane, float32(x.get(lane)))
+	}
+	return out
+}
+
+// Equal returns a mask that is set in every lane where x and y hold equal elements.
+func (x Int32s) Equal(y Int32s) Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe == ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Greater returns a mask that is set in every lane where the element of x exceeds that of y.
+func (x Int32s) Greater(y Int32s) Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe > ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// GreaterEqual returns a mask that is set in every lane where the element of x is at least that of y.
+func (x Int32s) GreaterEqual(y Int32s) Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe >= ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Less returns a mask that is set in every lane where the element of x is below that of y.
+func (x Int32s) Less(y Int32s) Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe < ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// LessEqual returns a mask that is set in every lane where the element of x is at most that of y.
+func (x Int32s) LessEqual(y Int32s) Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe <= ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// NotEqual returns a mask that is set in every lane where x and y hold different elements.
+func (x Int32s) NotEqual(y Int32s) Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		if xe, ye := x.get(lane), y.get(lane); xe != ye {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Len returns the number of elements in the vector, which is always 4.
+func (x Int32s) Len() int {
+	return 128 / 32 // vector bits / element bits
+}
+
+// Masked returns x with every bit cleared where the corresponding bit of mask is clear.
+func (x Int32s) Masked(mask Mask32s) Int32s {
+	return Int32s{a: mask.a & x.a, b: mask.b & x.b}
+}
+
+// Max returns the element-wise maximum of x and y: every element of the
+// result is the larger of the two corresponding input elements, compared
+// as signed 32-bit integers.
+func (x Int32s) Max(y Int32s) Int32s {
+	var out Int32s
+	for lane := 0; lane < 4; lane++ {
+		xe := x.get(lane)
+		ye := y.get(lane)
+		// The built-in max on integers is exactly the explicit
+		// "pick the larger operand" comparison.
+		out.set(lane, max(xe, ye))
+	}
+	return out
+}
+
+// IfElse returns a vector that takes each bit from x where the matching
+// bit of mask is set and from y where it is clear.
+func (x Int32s) IfElse(mask Mask32s, y Int32s) Int32s {
+	x.a = (x.a & mask.a) | (y.a &^ mask.a)
+	x.b = (x.b & mask.b) | (y.b &^ mask.b)
+	return x
+}
+
+// Min returns the element-wise minimum of x and y: every element of the
+// result is the smaller of the two corresponding input elements, compared
+// as signed 32-bit integers.
+func (x Int32s) Min(y Int32s) Int32s {
+	var out Int32s
+	for lane := 0; lane < 4; lane++ {
+		xe := x.get(lane)
+		ye := y.get(lane)
+		// The built-in min on integers is exactly the explicit
+		// "pick the smaller operand" comparison.
+		out.set(lane, min(xe, ye))
+	}
+	return out
+}
+
+// Mul returns the element-wise product of x and y.
+func (x Int32s) Mul(y Int32s) Int32s {
+	var res Int32s
+	for i := 0; i < 4; i++ {
+		res.set(i, x.get(i)*y.get(i))
+	}
+	return res
+}
+
+// Neg returns the element-wise negation of x.
+func (x Int32s) Neg() Int32s {
+	var res Int32s
+	for i := 0; i < 4; i++ {
+		res.set(i, -x.get(i))
+	}
+	return res
+}
+
+// Not returns the bitwise NOT of x: every bit of both 64-bit words is inverted.
+func (x Int32s) Not() Int32s {
+	return Int32s{a: ^x.a, b: ^x.b}
+}
+
+// Or returns the bitwise OR of x and y, computed on both 64-bit words.
+func (x Int32s) Or(y Int32s) Int32s {
+	return Int32s{a: y.a | x.a, b: y.b | x.b}
+}
+
+// ShiftAllLeft shifts all elements left by y bits.
+func (x Int32s) ShiftAllLeft(y uint8) Int32s {
+	var res Int32s
+	for i := 0; i < 4; i++ {
+		res.set(i, x.get(i)<<y)
+	}
+	return res
+}
+
+// ShiftAllRight shifts all elements right by y bits.
+func (x Int32s) ShiftAllRight(y uint8) Int32s {
+	var res Int32s
+	for i := 0; i < 4; i++ {
+		res.set(i, x.get(i)>>y)
+	}
+	return res
+}
+
+// RotateAllLeft rotates all elements left by dist bits.
+func (x Int32s) RotateAllLeft(dist uint64) Int32s {
+	var res Int32s
+	d := dist & 31
+	for i := 0; i < 4; i++ {
+		u := uint32(x.get(i))
+		r := (u << d) | (u >> ((32 - d) & 31))
+		res.set(i, int32(r))
+	}
+	return res
+}
+
+// RotateAllRight rotates all elements right by dist bits.
+func (x Int32s) RotateAllRight(dist uint64) Int32s {
+	var res Int32s
+	d := dist & 31
+	for i := 0; i < 4; i++ {
+		u := uint32(x.get(i))
+		r := (u >> d) | (u << ((32 - d) & 31))
+		res.set(i, int32(r))
+	}
+	return res
+}
+
+// Store writes the elements of x to s, stopping after 4 elements or at the end of s, whichever comes first.
+func (x Int32s) Store(s []int32) {
+	for i := range s[:min(len(s), 4)] {
+		s[i] = x.get(i)
+	}
+}
+
+// StorePart stores as many leading elements of x as fit in s; it simply calls Store.
+func (x Int32s) StorePart(s []int32) {
+	x.Store(s)
+}
+
+// String formats the 4 elements of x, in index order, as fmt prints a [4]int32 array.
+func (x Int32s) String() string {
+	var elems [4]int32
+	for i := range elems {
+		elems[i] = x.get(i)
+	}
+	return fmt.Sprint(elems)
+}
+
+// Sub returns the element-wise difference of x and y.
+func (x Int32s) Sub(y Int32s) Int32s {
+	var res Int32s
+	for i := 0; i < 4; i++ {
+		res.set(i, x.get(i)-y.get(i))
+	}
+	return res
+}
+
+// ToMask returns a mask that is set in every lane where the element of x is non-zero.
+func (x Int32s) ToMask() Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		if e := x.get(lane); e != 0 {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Xor returns the bitwise XOR of x and y, computed on both 64-bit words.
+func (x Int32s) Xor(y Int32s) Int32s {
+	return Int32s{a: y.a ^ x.a, b: y.b ^ x.b}
+}
+
+// ConvertToUint32 converts the elements to uint32 by reusing their bit patterns, exactly as ToBits does.
+func (x Int32s) ConvertToUint32() Uint32s {
+	return x.ToBits()
+}
+
+// ToBits reinterprets the vector bits as a Uint32s vector; both 64-bit words are copied unchanged.
+func (x Int32s) ToBits() Uint32s {
+	return Uint32s{a: x.a, b: x.b}
+}
+
+// Int64s represents a 128-bit vector of 2 int64 elements.
+type Int64s struct {
+	_    _simd
+	a, b uint64 // a holds the bits of element 0 and b those of element 1 (see get/set)
+}
+
+// LoadInt64s loads the first 2 elements of s into an Int64s vector; it panics if len(s) < 2.
+func LoadInt64s(s []int64) Int64s {
+	var v Int64s
+	v.a = uint64(s[0])
+	v.b = uint64(s[1])
+	return v
+}
+
+// LoadInt64sPart loads at most 2 elements from the front of s into an
+// Int64s vector and returns it with the number of elements loaded,
+// which never exceeds 2. Lanes beyond that count are zero.
+func LoadInt64sPart(s []int64) (Int64s, int) {
+	var v Int64s
+	n := min(len(s), 2)
+	for i, e := range s[:n] {
+		v.set(i, e)
+	}
+	return v, n
+}
+
+func (x Int64s) get(i int) int64 {
+	if i != 0 { // any non-zero index selects the second element
+		return int64(x.b)
+	}
+	return int64(x.a)
+}
+
+func (x *Int64s) set(i int, v int64) {
+	// Index 0 selects a; every other index selects b.
+	word := &x.b
+	if i == 0 {
+		word = &x.a
+	}
+	*word = uint64(v)
+}
+
+// Add returns the element-wise sum of x and y; each element wraps around on overflow.
+func (x Int64s) Add(y Int64s) Int64s {
+	return Int64s{a: y.a + x.a, b: y.b + x.b}
+}
+
+// And returns the bitwise AND of x and y, computed on both 64-bit words.
+func (x Int64s) And(y Int64s) Int64s {
+	return Int64s{a: y.a & x.a, b: y.b & x.b}
+}
+
+// AndNot returns x with every bit cleared that is set in y (x AND NOT y).
+func (x Int64s) AndNot(y Int64s) Int64s {
+	return Int64s{a: x.a & ^y.a, b: x.b & ^y.b}
+}
+
+// Equal returns a mask whose lane is all ones wherever x and y hold equal elements.
+func (x Int64s) Equal(y Int64s) Mask64s {
+	var m Mask64s
+	if y.a == x.a {
+		m.a = math.MaxUint64
+	}
+	if y.b == x.b {
+		m.b = math.MaxUint64
+	}
+	return m
+}
+
+// Greater returns a mask whose lane is all ones wherever x exceeds y (signed comparison).
+func (x Int64s) Greater(y Int64s) Mask64s {
+	var m Mask64s
+	if x.get(0) > y.get(0) {
+		m.a = math.MaxUint64
+	}
+	if x.get(1) > y.get(1) {
+		m.b = math.MaxUint64
+	}
+	return m
+}
+
+// GreaterEqual returns a mask whose lane is all ones wherever x is at least y (signed comparison).
+func (x Int64s) GreaterEqual(y Int64s) Mask64s {
+	var m Mask64s
+	if x.get(0) >= y.get(0) {
+		m.a = math.MaxUint64
+	}
+	if x.get(1) >= y.get(1) {
+		m.b = math.MaxUint64
+	}
+	return m
+}
+
+// Less returns a mask whose lane is all ones wherever x is below y (signed comparison).
+func (x Int64s) Less(y Int64s) Mask64s {
+	var m Mask64s
+	if x.get(0) < y.get(0) {
+		m.a = math.MaxUint64
+	}
+	if x.get(1) < y.get(1) {
+		m.b = math.MaxUint64
+	}
+	return m
+}
+
+// LessEqual returns a mask whose lane is all ones wherever x is at most y (signed comparison).
+func (x Int64s) LessEqual(y Int64s) Mask64s {
+	var m Mask64s
+	if x.get(0) <= y.get(0) {
+		m.a = math.MaxUint64
+	}
+	if x.get(1) <= y.get(1) {
+		m.b = math.MaxUint64
+	}
+	return m
+}
+
+// NotEqual returns a mask whose lane is all ones wherever x and y hold different elements.
+func (x Int64s) NotEqual(y Int64s) Mask64s {
+	var m Mask64s
+	if y.a != x.a {
+		m.a = math.MaxUint64
+	}
+	if y.b != x.b {
+		m.b = math.MaxUint64
+	}
+	return m
+}
+
+// Len returns the number of elements in the vector, which is always 2.
+func (x Int64s) Len() int {
+	return 128 / 64 // vector bits / element bits
+}
+
+// Masked returns x with every bit cleared where the corresponding bit of mask is clear.
+func (x Int64s) Masked(mask Mask64s) Int64s {
+	return Int64s{a: mask.a & x.a, b: mask.b & x.b}
+}
+
+// IfElse returns a vector that takes each bit from x where the matching
+// bit of mask is set and from y where it is clear.
+func (x Int64s) IfElse(mask Mask64s, y Int64s) Int64s {
+	x.a = (x.a & mask.a) | (y.a &^ mask.a)
+	x.b = (x.b & mask.b) | (y.b &^ mask.b)
+	return x
+}
+
+// Neg returns the element-wise negation of x, computed as the two's complement of each 64-bit word.
+func (x Int64s) Neg() Int64s {
+	return Int64s{a: -x.a, b: -x.b}
+}
+
+// Not returns x with every bit inverted.
+func (x Int64s) Not() Int64s {
+	x.a = ^x.a
+	x.b = ^x.b
+	return x
+}
+
+// Or returns the bitwise OR of x and y.
+func (x Int64s) Or(y Int64s) Int64s {
+	x.a |= y.a
+	x.b |= y.b
+	return x
+}
+
+// ShiftAllLeft shifts every element left by y bits; a count of 64 or more yields zero.
+func (x Int64s) ShiftAllLeft(y uint8) Int64s {
+	x.a <<= y
+	x.b <<= y
+	return x
+}
+
+// RotateAllLeft rotates every element left by dist bits; only dist modulo 64 matters.
+func (x Int64s) RotateAllLeft(dist uint64) Int64s {
+	l := dist & 63
+	r := (64 - l) & 63 // complementary right shift; 0 when l is 0
+	x.a = x.a<<l | x.a>>r
+	x.b = x.b<<l | x.b>>r
+	return x
+}
+
+// RotateAllRight rotates every element right by dist bits; only dist modulo 64 matters.
+func (x Int64s) RotateAllRight(dist uint64) Int64s {
+	r := dist & 63
+	l := (64 - r) & 63 // complementary left shift; 0 when r is 0
+	x.a = x.a>>r | x.a<<l
+	x.b = x.b>>r | x.b<<l
+	return x
+}
+
+// Store writes the elements of x to the front of s; elements that do not fit in s are dropped.
+func (x Int64s) Store(s []int64) {
+	if n := len(s); n >= 1 {
+		s[0] = int64(x.a)
+		if n >= 2 {
+			s[1] = int64(x.b)
+		}
+	}
+}
+
+// StorePart stores a partial vector into the slice s.
+// It delegates to Store, which writes only the elements that fit in s.
+func (x Int64s) StorePart(s []int64) {
+	x.Store(s)
+}
+
+// String formats the two elements of x the way fmt.Sprint formats a [2]int64.
+func (x Int64s) String() string {
+	elems := [...]int64{int64(x.a), int64(x.b)}
+	return fmt.Sprint(elems)
+}
+
+// Sub returns the element-wise difference x - y, wrapping on overflow.
+func (x Int64s) Sub(y Int64s) Int64s {
+	x.a -= y.a
+	x.b -= y.b
+	return x
+}
+
+// ToMask returns a mask representation of the vector.
+func (x Int64s) ToMask() Mask64s {
+	var res Mask64s
+	if x.a != 0 {
+		res.a = ^uint64(0)
+	}
+	if x.b != 0 {
+		res.b = ^uint64(0)
+	}
+	return res
+}
+
+// Xor returns the bitwise exclusive OR of x and y.
+func (x Int64s) Xor(y Int64s) Int64s {
+	x.a ^= y.a
+	x.b ^= y.b
+	return x
+}
+
+// ConvertToUint64 converts the vector elements to uint64.
+func (x Int64s) ConvertToUint64() Uint64s {
+	return Uint64s{a: x.a, b: x.b}
+}
+
+// ToBits reinterprets the vector bits as a Uint64s vector.
+func (x Int64s) ToBits() Uint64s {
+	return Uint64s{a: x.a, b: x.b}
+}
+
+// Uint8s represents a 128-bit vector of 16 uint8 elements.
+// Elements 0-7 are packed into a and elements 8-15 into b, lowest element in the lowest byte.
+type Uint8s struct {
+	_    _simd
+	a, b uint64
+}
+
+// LoadUint8s loads the first 16 elements of s into a Uint8s vector, element i into lane i.
+// It panics if len(s) < 16.
+func LoadUint8s(s []uint8) Uint8s {
+	var v Uint8s
+	for i := range 16 {
+		lane := uint64(s[i])
+		if shift := 8 * (i % 8); i >= 8 {
+			v.b |= lane << shift
+		} else {
+			v.a |= lane << shift
+		}
+	}
+	return v
+}
+
+// LoadUint8sPart loads up to 16 leading elements of s into a Uint8s vector; lanes past len(s) are zero.
+// The second result is the number of elements loaded, len(s) capped at 16.
+func LoadUint8sPart(s []uint8) (Uint8s, int) {
+	var v Uint8s
+	n := min(len(s), 16)
+	for i, e := range s[:n] {
+		lane := uint64(e)
+		if shift := 8 * (i % 8); i >= 8 {
+			v.b |= lane << shift
+		} else {
+			v.a |= lane << shift
+		}
+	}
+	return v, n
+}
+
+// get returns element i of x.
+func (x Uint8s) get(i int) uint8 {
+	word, shift := x.a, 8*i
+	if i >= 8 {
+		word, shift = x.b, 8*(i-8)
+	}
+	return uint8(word >> shift)
+}
+
+// set replaces element i of x with v and leaves every other element unchanged.
+func (x *Uint8s) set(i int, v uint8) {
+	word, shift := &x.a, 8*i
+	if i >= 8 {
+		word, shift = &x.b, 8*(i-8)
+	}
+	*word = *word&^(uint64(0xff)<<shift) | uint64(v)<<shift
+}
+
+// Add returns the element-wise sum of x and y, wrapping modulo 256.
+func (x Uint8s) Add(y Uint8s) Uint8s {
+	var sum Uint8s
+	for lane := range 16 {
+		sum.set(lane, x.get(lane)+y.get(lane))
+	}
+	return sum
+}
+
+// AddSaturated returns the element-wise sum of x and y, clamped to math.MaxUint8.
+func (x Uint8s) AddSaturated(y Uint8s) Uint8s {
+	var out Uint8s
+	for lane := range 16 {
+		// Add in int so the carry is visible before clamping.
+		wide := int(x.get(lane)) + int(y.get(lane))
+		out.set(lane, uint8(min(wide, math.MaxUint8)))
+	}
+	return out
+}
+
+// And returns the bitwise AND of x and y.
+func (x Uint8s) And(y Uint8s) Uint8s {
+	x.a &= y.a
+	x.b &= y.b
+	return x
+}
+
+// AndNot returns x with every bit that is set in y cleared (x &^ y).
+func (x Uint8s) AndNot(y Uint8s) Uint8s {
+	x.a &^= y.a
+	x.b &^= y.b
+	return x
+}
+
+// Average returns the element-wise average of x and y, rounded up: (x + y + 1) / 2.
+func (x Uint8s) Average(y Uint8s) Uint8s {
+	var avg Uint8s
+	for lane := range 16 {
+		wide := uint(x.get(lane)) + uint(y.get(lane)) + 1
+		avg.set(lane, uint8(wide/2))
+	}
+	return avg
+}
+
+// Equal compares x and y lane by lane and marks the lanes that are equal.
+func (x Uint8s) Equal(y Uint8s) Mask8s {
+	var m Mask8s
+	for lane := range 16 {
+		if y.get(lane) == x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// NotEqual compares x and y lane by lane and marks the lanes that differ.
+func (x Uint8s) NotEqual(y Uint8s) Mask8s {
+	var m Mask8s
+	for lane := range 16 {
+		if y.get(lane) != x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Len returns the number of elements in the vector, which is always 16.
+func (x Uint8s) Len() int {
+	return 16
+}
+
+// Masked keeps the bits of x selected by mask and zeroes the rest.
+func (x Uint8s) Masked(mask Mask8s) Uint8s {
+	x.a &= mask.a
+	x.b &= mask.b
+	return x
+}
+
+// Max returns the element-wise unsigned maximum of x and y.
+func (x Uint8s) Max(y Uint8s) Uint8s {
+	var hi Uint8s
+	for lane := range 16 {
+		hi.set(lane, max(x.get(lane), y.get(lane)))
+	}
+	return hi
+}
+
+// IfElse blends x and y bit by bit: bits set in mask come from x, the others from y.
+func (x Uint8s) IfElse(mask Mask8s, y Uint8s) Uint8s {
+	// y ^ ((x ^ y) & m) yields x where m is 1 and y where m is 0.
+	return Uint8s{
+		a: y.a ^ ((x.a ^ y.a) & mask.a),
+		b: y.b ^ ((x.b ^ y.b) & mask.b),
+	}
+}
+
+// Min returns the element-wise unsigned minimum of x and y.
+func (x Uint8s) Min(y Uint8s) Uint8s {
+	var lo Uint8s
+	for lane := range 16 {
+		lo.set(lane, min(x.get(lane), y.get(lane)))
+	}
+	return lo
+}
+
+// Mul returns the element-wise product of x and y, keeping the low 8 bits of each product.
+func (x Uint8s) Mul(y Uint8s) Uint8s {
+	var prod Uint8s
+	for lane := range 16 {
+		prod.set(lane, x.get(lane)*y.get(lane))
+	}
+	return prod
+}
+
+// Not returns x with every bit inverted.
+func (x Uint8s) Not() Uint8s {
+	x.a = ^x.a
+	x.b = ^x.b
+	return x
+}
+
+// Or returns the bitwise OR of x and y.
+func (x Uint8s) Or(y Uint8s) Uint8s {
+	x.a |= y.a
+	x.b |= y.b
+	return x
+}
+
+// Store writes the elements of x to the front of s; elements that do not fit in s are dropped.
+func (x Uint8s) Store(s []uint8) {
+	for i := range s[:min(len(s), 16)] {
+		s[i] = x.get(i)
+	}
+}
+
+// StorePart stores a partial vector into the slice s.
+// It delegates to Store, which writes only the elements that fit in s.
+func (x Uint8s) StorePart(s []uint8) {
+	x.Store(s)
+}
+
+// String formats the 16 elements of x the way fmt.Sprint formats a [16]uint8.
+func (x Uint8s) String() string {
+	var elems [16]uint8
+	for i := range elems {
+		elems[i] = x.get(i)
+	}
+	return fmt.Sprint(elems)
+}
+
+// Sub returns the element-wise difference x - y, wrapping modulo 256.
+func (x Uint8s) Sub(y Uint8s) Uint8s {
+	var diff Uint8s
+	for lane := range 16 {
+		diff.set(lane, x.get(lane)-y.get(lane))
+	}
+	return diff
+}
+
+// SubSaturated returns the element-wise difference x - y, clamped at zero.
+func (x Uint8s) SubSaturated(y Uint8s) Uint8s {
+	var diff Uint8s
+	for lane := range 16 {
+		a, b := x.get(lane), y.get(lane)
+		// a - min(a, b) is 0 when a < b and a - b otherwise.
+		diff.set(lane, a-min(a, b))
+	}
+	return diff
+}
+
+// Xor returns the bitwise exclusive OR of x and y.
+func (x Uint8s) Xor(y Uint8s) Uint8s {
+	x.a ^= y.a
+	x.b ^= y.b
+	return x
+}
+
+// BitsToInt8 reinterprets the vector bits as an Int8s vector.
+func (x Uint8s) BitsToInt8() Int8s {
+	return Int8s{a: x.a, b: x.b}
+}
+
+// ConvertToInt8 converts the vector elements to int8.
+func (x Uint8s) ConvertToInt8() Int8s {
+	return Int8s{a: x.a, b: x.b}
+}
+
+// ReshapeToUint16s returns the same 128 bits viewed as a Uint16s vector.
+func (x Uint8s) ReshapeToUint16s() Uint16s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint16s(x)
+}
+
+// ReshapeToUint32s returns the same 128 bits viewed as a Uint32s vector.
+func (x Uint8s) ReshapeToUint32s() Uint32s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint32s(x)
+}
+
+// ReshapeToUint64s returns the same 128 bits viewed as a Uint64s vector.
+func (x Uint8s) ReshapeToUint64s() Uint64s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint64s(x)
+}
+
+// Uint16s represents a 128-bit vector of 8 uint16 elements.
+// Elements 0-3 are packed into a and elements 4-7 into b, lowest element in the lowest 16 bits.
+type Uint16s struct {
+	_    _simd
+	a, b uint64
+}
+
+// LoadUint16s loads the first 8 elements of s into a Uint16s vector, element i into lane i.
+// It panics if len(s) < 8.
+func LoadUint16s(s []uint16) Uint16s {
+	var v Uint16s
+	for i := range 8 {
+		lane := uint64(s[i])
+		if shift := 16 * (i % 4); i >= 4 {
+			v.b |= lane << shift
+		} else {
+			v.a |= lane << shift
+		}
+	}
+	return v
+}
+
+// LoadUint16sPart loads up to 8 leading elements of s into a Uint16s vector; lanes past len(s) are zero.
+// The second result is the number of elements loaded, len(s) capped at 8.
+func LoadUint16sPart(s []uint16) (Uint16s, int) {
+	var v Uint16s
+	n := min(len(s), 8)
+	for i, e := range s[:n] {
+		lane := uint64(e)
+		if shift := 16 * (i % 4); i >= 4 {
+			v.b |= lane << shift
+		} else {
+			v.a |= lane << shift
+		}
+	}
+	return v, n
+}
+
+// get returns element i of x.
+func (x Uint16s) get(i int) uint16 {
+	word, shift := x.a, 16*i
+	if i >= 4 {
+		word, shift = x.b, 16*(i-4)
+	}
+	return uint16(word >> shift)
+}
+
+// set replaces element i of x with v and leaves every other element unchanged.
+func (x *Uint16s) set(i int, v uint16) {
+	word, shift := &x.a, 16*i
+	if i >= 4 {
+		word, shift = &x.b, 16*(i-4)
+	}
+	*word = *word&^(uint64(0xffff)<<shift) | uint64(v)<<shift
+}
+
+// Add returns the element-wise sum of x and y, wrapping modulo 1<<16.
+func (x Uint16s) Add(y Uint16s) Uint16s {
+	var sum Uint16s
+	for lane := range 8 {
+		sum.set(lane, x.get(lane)+y.get(lane))
+	}
+	return sum
+}
+
+// AddSaturated returns the element-wise sum of x and y, clamped to math.MaxUint16.
+func (x Uint16s) AddSaturated(y Uint16s) Uint16s {
+	var out Uint16s
+	for lane := range 8 {
+		// Add in int so the carry is visible before clamping.
+		wide := int(x.get(lane)) + int(y.get(lane))
+		out.set(lane, uint16(min(wide, math.MaxUint16)))
+	}
+	return out
+}
+
+// And returns the bitwise AND of x and y.
+func (x Uint16s) And(y Uint16s) Uint16s {
+	x.a &= y.a
+	x.b &= y.b
+	return x
+}
+
+// AndNot returns x with every bit that is set in y cleared (x &^ y).
+func (x Uint16s) AndNot(y Uint16s) Uint16s {
+	x.a &^= y.a
+	x.b &^= y.b
+	return x
+}
+
+// Average returns the element-wise average of x and y, rounded up: (x + y + 1) / 2.
+func (x Uint16s) Average(y Uint16s) Uint16s {
+	var avg Uint16s
+	for lane := range 8 {
+		wide := uint(x.get(lane)) + uint(y.get(lane)) + 1
+		avg.set(lane, uint16(wide/2))
+	}
+	return avg
+}
+
+// Equal compares x and y lane by lane and marks the lanes that are equal.
+func (x Uint16s) Equal(y Uint16s) Mask16s {
+	var m Mask16s
+	for lane := range 8 {
+		if y.get(lane) == x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Greater compares x and y lane by lane (unsigned) and marks the lanes where x > y.
+func (x Uint16s) Greater(y Uint16s) Mask16s {
+	var m Mask16s
+	for lane := range 8 {
+		if y.get(lane) < x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// GreaterEqual compares x and y lane by lane (unsigned) and marks the lanes where x >= y.
+func (x Uint16s) GreaterEqual(y Uint16s) Mask16s {
+	var m Mask16s
+	for lane := range 8 {
+		if y.get(lane) <= x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Less compares x and y lane by lane (unsigned) and marks the lanes where x < y.
+func (x Uint16s) Less(y Uint16s) Mask16s {
+	var m Mask16s
+	for lane := range 8 {
+		if y.get(lane) > x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// LessEqual compares x and y lane by lane (unsigned) and marks the lanes where x <= y.
+func (x Uint16s) LessEqual(y Uint16s) Mask16s {
+	var m Mask16s
+	for lane := range 8 {
+		if y.get(lane) >= x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// NotEqual compares x and y lane by lane and marks the lanes that differ.
+func (x Uint16s) NotEqual(y Uint16s) Mask16s {
+	var m Mask16s
+	for lane := range 8 {
+		if y.get(lane) != x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Len returns the number of elements in the vector, which is always 8.
+func (x Uint16s) Len() int {
+	return 8
+}
+
+// Masked keeps the bits of x selected by mask and zeroes the rest.
+func (x Uint16s) Masked(mask Mask16s) Uint16s {
+	x.a &= mask.a
+	x.b &= mask.b
+	return x
+}
+
+// Max returns the element-wise unsigned maximum of x and y.
+func (x Uint16s) Max(y Uint16s) Uint16s {
+	var hi Uint16s
+	for lane := range 8 {
+		hi.set(lane, max(x.get(lane), y.get(lane)))
+	}
+	return hi
+}
+
+// IfElse blends x and y bit by bit: bits set in mask come from x, the others from y.
+func (x Uint16s) IfElse(mask Mask16s, y Uint16s) Uint16s {
+	// y ^ ((x ^ y) & m) yields x where m is 1 and y where m is 0.
+	return Uint16s{
+		a: y.a ^ ((x.a ^ y.a) & mask.a),
+		b: y.b ^ ((x.b ^ y.b) & mask.b),
+	}
+}
+
+// Min returns the element-wise unsigned minimum of x and y.
+func (x Uint16s) Min(y Uint16s) Uint16s {
+	var lo Uint16s
+	for lane := range 8 {
+		lo.set(lane, min(x.get(lane), y.get(lane)))
+	}
+	return lo
+}
+
+// Mul returns the element-wise product of x and y, keeping the low 16 bits of each product.
+func (x Uint16s) Mul(y Uint16s) Uint16s {
+	var prod Uint16s
+	for lane := range 8 {
+		prod.set(lane, x.get(lane)*y.get(lane))
+	}
+	return prod
+}
+
+// Not returns x with every bit inverted.
+func (x Uint16s) Not() Uint16s {
+	x.a = ^x.a
+	x.b = ^x.b
+	return x
+}
+
+// Or returns the bitwise OR of x and y.
+func (x Uint16s) Or(y Uint16s) Uint16s {
+	x.a |= y.a
+	x.b |= y.b
+	return x
+}
+
+// ShiftAllLeft shifts every element left by y bits; a count of 16 or more yields zero.
+func (x Uint16s) ShiftAllLeft(y uint8) Uint16s {
+	var out Uint16s
+	for lane := range 8 {
+		out.set(lane, x.get(lane)<<y)
+	}
+	return out
+}
+
+// ShiftAllRight shifts every element right (logically) by y bits; a count of 16 or more yields zero.
+func (x Uint16s) ShiftAllRight(y uint8) Uint16s {
+	var out Uint16s
+	for lane := range 8 {
+		out.set(lane, x.get(lane)>>y)
+	}
+	return out
+}
+
+// RotateAllLeft rotates every element left by dist bits; only dist modulo 16 matters.
+func (x Uint16s) RotateAllLeft(dist uint64) Uint16s {
+	l := dist & 15
+	r := (16 - l) & 15 // complementary right shift; 0 when l is 0
+	var out Uint16s
+	for lane := range 8 {
+		e := x.get(lane)
+		out.set(lane, e<<l|e>>r)
+	}
+	return out
+}
+
+// RotateAllRight rotates every element right by dist bits; only dist modulo 16 matters.
+func (x Uint16s) RotateAllRight(dist uint64) Uint16s {
+	r := dist & 15
+	l := (16 - r) & 15 // complementary left shift; 0 when r is 0
+	var out Uint16s
+	for lane := range 8 {
+		e := x.get(lane)
+		out.set(lane, e>>r|e<<l)
+	}
+	return out
+}
+
+// Store writes the elements of x to the front of s; elements that do not fit in s are dropped.
+func (x Uint16s) Store(s []uint16) {
+	for i := range s[:min(len(s), 8)] {
+		s[i] = x.get(i)
+	}
+}
+
+// StorePart stores a partial vector into the slice s.
+// It delegates to Store, which writes only the elements that fit in s.
+func (x Uint16s) StorePart(s []uint16) {
+	x.Store(s)
+}
+
+// String formats the 8 elements of x the way fmt.Sprint formats a [8]uint16.
+func (x Uint16s) String() string {
+	var elems [8]uint16
+	for i := range elems {
+		elems[i] = x.get(i)
+	}
+	return fmt.Sprint(elems)
+}
+
+// Sub returns the element-wise difference x - y, wrapping modulo 1<<16.
+func (x Uint16s) Sub(y Uint16s) Uint16s {
+	var diff Uint16s
+	for lane := range 8 {
+		diff.set(lane, x.get(lane)-y.get(lane))
+	}
+	return diff
+}
+
+// SubSaturated returns the element-wise difference x - y, clamped at zero.
+func (x Uint16s) SubSaturated(y Uint16s) Uint16s {
+	var diff Uint16s
+	for lane := range 8 {
+		a, b := x.get(lane), y.get(lane)
+		// a - min(a, b) is 0 when a < b and a - b otherwise.
+		diff.set(lane, a-min(a, b))
+	}
+	return diff
+}
+
+// Xor returns the bitwise exclusive OR of x and y.
+func (x Uint16s) Xor(y Uint16s) Uint16s {
+	x.a ^= y.a
+	x.b ^= y.b
+	return x
+}
+
+// BitsToInt16 reinterprets the vector bits as an Int16s vector.
+func (x Uint16s) BitsToInt16() Int16s {
+	return Int16s{a: x.a, b: x.b}
+}
+
+// ConvertToInt16 converts the vector elements to int16.
+func (x Uint16s) ConvertToInt16() Int16s {
+	return Int16s{a: x.a, b: x.b}
+}
+
+// ReshapeToUint32s returns the same 128 bits viewed as a Uint32s vector.
+func (x Uint16s) ReshapeToUint32s() Uint32s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint32s(x)
+}
+
+// ReshapeToUint64s returns the same 128 bits viewed as a Uint64s vector.
+func (x Uint16s) ReshapeToUint64s() Uint64s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint64s(x)
+}
+
+// ReshapeToUint8s returns the same 128 bits viewed as a Uint8s vector.
+func (x Uint16s) ReshapeToUint8s() Uint8s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint8s(x)
+}
+
+// Uint32s represents a 128-bit vector of 4 uint32 elements.
+// Elements 0-1 are packed into a and elements 2-3 into b, lowest element in the lowest 32 bits.
+type Uint32s struct {
+	_    _simd
+	a, b uint64
+}
+
+// LoadUint32s loads the first 4 elements of s into a Uint32s vector, element i into lane i.
+// It panics if len(s) < 4.
+func LoadUint32s(s []uint32) Uint32s {
+	var v Uint32s
+	for i := range 4 {
+		lane := uint64(s[i])
+		if shift := 32 * (i % 2); i >= 2 {
+			v.b |= lane << shift
+		} else {
+			v.a |= lane << shift
+		}
+	}
+	return v
+}
+
+// LoadUint32sPart loads up to 4 leading elements of s into a Uint32s vector; lanes past len(s) are zero.
+// The second result is the number of elements loaded, len(s) capped at 4.
+func LoadUint32sPart(s []uint32) (Uint32s, int) {
+	var v Uint32s
+	n := min(len(s), 4)
+	for i, e := range s[:n] {
+		lane := uint64(e)
+		if shift := 32 * (i % 2); i >= 2 {
+			v.b |= lane << shift
+		} else {
+			v.a |= lane << shift
+		}
+	}
+	return v, n
+}
+
+// get returns element i of x.
+func (x Uint32s) get(i int) uint32 {
+	word, shift := x.a, 32*i
+	if i >= 2 {
+		word, shift = x.b, 32*(i-2)
+	}
+	return uint32(word >> shift)
+}
+
+// set replaces element i of x with v and leaves every other element unchanged.
+func (x *Uint32s) set(i int, v uint32) {
+	word, shift := &x.a, 32*i
+	if i >= 2 {
+		word, shift = &x.b, 32*(i-2)
+	}
+	*word = *word&^(uint64(0xffffffff)<<shift) | uint64(v)<<shift
+}
+
+// Add returns the element-wise sum of x and y, wrapping modulo 1<<32.
+func (x Uint32s) Add(y Uint32s) Uint32s {
+	var sum Uint32s
+	for lane := range 4 {
+		sum.set(lane, x.get(lane)+y.get(lane))
+	}
+	return sum
+}
+
+// And returns the bitwise AND of x and y.
+func (x Uint32s) And(y Uint32s) Uint32s {
+	x.a &= y.a
+	x.b &= y.b
+	return x
+}
+
+// AndNot returns x with every bit that is set in y cleared (x &^ y).
+func (x Uint32s) AndNot(y Uint32s) Uint32s {
+	x.a &^= y.a
+	x.b &^= y.b
+	return x
+}
+
+// Equal compares x and y lane by lane and marks the lanes that are equal.
+func (x Uint32s) Equal(y Uint32s) Mask32s {
+	var m Mask32s
+	for lane := range 4 {
+		if y.get(lane) == x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Greater compares x and y lane by lane (unsigned) and marks the lanes where x > y.
+func (x Uint32s) Greater(y Uint32s) Mask32s {
+	var m Mask32s
+	for lane := range 4 {
+		if y.get(lane) < x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// GreaterEqual compares x and y lane by lane (unsigned) and marks the lanes where x >= y.
+func (x Uint32s) GreaterEqual(y Uint32s) Mask32s {
+	var m Mask32s
+	for lane := range 4 {
+		if y.get(lane) <= x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Less compares x and y lane by lane (unsigned) and marks the lanes where x < y.
+func (x Uint32s) Less(y Uint32s) Mask32s {
+	var m Mask32s
+	for lane := range 4 {
+		if y.get(lane) > x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// LessEqual compares x and y lane by lane (unsigned) and marks the lanes where x <= y.
+func (x Uint32s) LessEqual(y Uint32s) Mask32s {
+	var m Mask32s
+	for lane := range 4 {
+		if y.get(lane) >= x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// NotEqual compares x and y lane by lane and marks the lanes that differ.
+func (x Uint32s) NotEqual(y Uint32s) Mask32s {
+	var m Mask32s
+	for lane := range 4 {
+		if y.get(lane) != x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Len returns the number of elements in the vector, which is always 4.
+func (x Uint32s) Len() int {
+	return 4
+}
+
+// Masked keeps the bits of x selected by mask and zeroes the rest.
+func (x Uint32s) Masked(mask Mask32s) Uint32s {
+	x.a &= mask.a
+	x.b &= mask.b
+	return x
+}
+
+// Max returns the element-wise unsigned maximum of x and y.
+func (x Uint32s) Max(y Uint32s) Uint32s {
+	var hi Uint32s
+	for lane := range 4 {
+		hi.set(lane, max(x.get(lane), y.get(lane)))
+	}
+	return hi
+}
+
+// IfElse blends x and y bit by bit: bits set in mask come from x, the others from y.
+func (x Uint32s) IfElse(mask Mask32s, y Uint32s) Uint32s {
+	// y ^ ((x ^ y) & m) yields x where m is 1 and y where m is 0.
+	return Uint32s{
+		a: y.a ^ ((x.a ^ y.a) & mask.a),
+		b: y.b ^ ((x.b ^ y.b) & mask.b),
+	}
+}
+
+// Min returns the element-wise unsigned minimum of x and y.
+func (x Uint32s) Min(y Uint32s) Uint32s {
+	var lo Uint32s
+	for lane := range 4 {
+		lo.set(lane, min(x.get(lane), y.get(lane)))
+	}
+	return lo
+}
+
+// Mul returns the element-wise product of x and y, keeping the low 32 bits of each product.
+func (x Uint32s) Mul(y Uint32s) Uint32s {
+	var prod Uint32s
+	for lane := range 4 {
+		prod.set(lane, x.get(lane)*y.get(lane))
+	}
+	return prod
+}
+
+// Not returns x with every bit inverted.
+func (x Uint32s) Not() Uint32s {
+	x.a = ^x.a
+	x.b = ^x.b
+	return x
+}
+
+// Or returns the bitwise OR of x and y.
+func (x Uint32s) Or(y Uint32s) Uint32s {
+	x.a |= y.a
+	x.b |= y.b
+	return x
+}
+
+// ShiftAllLeft shifts every element left by y bits; a count of 32 or more yields zero.
+func (x Uint32s) ShiftAllLeft(y uint8) Uint32s {
+	var out Uint32s
+	for lane := range 4 {
+		out.set(lane, x.get(lane)<<y)
+	}
+	return out
+}
+
+// ShiftAllRight shifts every element right (logically) by y bits; a count of 32 or more yields zero.
+func (x Uint32s) ShiftAllRight(y uint8) Uint32s {
+	var out Uint32s
+	for lane := range 4 {
+		out.set(lane, x.get(lane)>>y)
+	}
+	return out
+}
+
+// RotateAllLeft rotates every element left by dist bits; only dist modulo 32 matters.
+func (x Uint32s) RotateAllLeft(dist uint64) Uint32s {
+	l := dist & 31
+	r := (32 - l) & 31 // complementary right shift; 0 when l is 0
+	var out Uint32s
+	for lane := range 4 {
+		e := x.get(lane)
+		out.set(lane, e<<l|e>>r)
+	}
+	return out
+}
+
+// RotateAllRight rotates every element right by dist bits; only dist modulo 32 matters.
+func (x Uint32s) RotateAllRight(dist uint64) Uint32s {
+	r := dist & 31
+	l := (32 - r) & 31 // complementary left shift; 0 when r is 0
+	var out Uint32s
+	for lane := range 4 {
+		e := x.get(lane)
+		out.set(lane, e>>r|e<<l)
+	}
+	return out
+}
+
+// Store writes the elements of x to the front of s; elements that do not fit in s are dropped.
+func (x Uint32s) Store(s []uint32) {
+	for i := range s[:min(len(s), 4)] {
+		s[i] = x.get(i)
+	}
+}
+
+// StorePart stores a partial vector into the slice s.
+// It delegates to Store, which writes only the elements that fit in s.
+func (x Uint32s) StorePart(s []uint32) {
+	x.Store(s)
+}
+
+// String formats the 4 elements of x the way fmt.Sprint formats a [4]uint32.
+func (x Uint32s) String() string {
+	var elems [4]uint32
+	for i := range elems {
+		elems[i] = x.get(i)
+	}
+	return fmt.Sprint(elems)
+}
+
+// Sub returns the element-wise difference x - y, wrapping modulo 1<<32.
+func (x Uint32s) Sub(y Uint32s) Uint32s {
+	var diff Uint32s
+	for lane := range 4 {
+		diff.set(lane, x.get(lane)-y.get(lane))
+	}
+	return diff
+}
+
+// Xor returns the bitwise exclusive OR of x and y.
+func (x Uint32s) Xor(y Uint32s) Uint32s {
+	x.a ^= y.a
+	x.b ^= y.b
+	return x
+}
+
+// BitsToFloat32 returns the same 128 bits viewed as a Float32s vector.
+func (x Uint32s) BitsToFloat32() Float32s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Float32s(x)
+}
+
+// BitsToInt32 reinterprets the vector bits as an Int32s vector.
+func (x Uint32s) BitsToInt32() Int32s {
+	return Int32s{a: x.a, b: x.b}
+}
+
+// ConvertToInt32 converts the vector elements to int32.
+func (x Uint32s) ConvertToInt32() Int32s {
+	return Int32s{a: x.a, b: x.b}
+}
+
+// ReshapeToUint16s returns the same 128 bits viewed as a Uint16s vector.
+func (x Uint32s) ReshapeToUint16s() Uint16s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint16s(x)
+}
+
+// ReshapeToUint64s returns the same 128 bits viewed as a Uint64s vector.
+func (x Uint32s) ReshapeToUint64s() Uint64s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint64s(x)
+}
+
+// ReshapeToUint8s returns the same 128 bits viewed as a Uint8s vector.
+func (x Uint32s) ReshapeToUint8s() Uint8s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint8s(x)
+}
+
+// Uint64s represents a 128-bit vector of 2 uint64 elements.
+// Element 0 is stored in a and element 1 in b.
+type Uint64s struct {
+	_    _simd
+	a, b uint64
+}
+
+// LoadUint64s loads a slice of uint64 into an Uint64s vector.
+func LoadUint64s(s []uint64) Uint64s {
+	var a, b uint64
+	a = s[0]
+	b = s[1]
+	return Uint64s{a: a, b: b}
+}
+
+// LoadUint64sPart loads up to 2 leading elements of s into a Uint64s vector; lanes past len(s) are zero.
+// The second result is the number of elements loaded, len(s) capped at 2.
+func LoadUint64sPart(s []uint64) (Uint64s, int) {
+	// Cap the count at the lane count, as the other Load*Part functions do;
+	// reporting the raw len(s) would claim more elements than were loaded.
+	n := min(len(s), 2)
+	var v Uint64s
+	if n > 0 {
+		v.a = s[0]
+	}
+	if n > 1 {
+		v.b = s[1]
+	}
+	return v, n
+}
+
+// get returns element i of x: a for index 0 and b for any other index.
+func (x Uint64s) get(i int) uint64 {
+	switch i {
+	case 0:
+		return x.a
+	default:
+		return x.b
+	}
+}
+
+// set replaces element i of x with v: a for index 0 and b for any other index.
+func (x *Uint64s) set(i int, v uint64) {
+	switch i {
+	case 0:
+		x.a = v
+	default:
+		x.b = v
+	}
+}
+
+// Add returns the element-wise sum of x and y, wrapping on overflow.
+func (x Uint64s) Add(y Uint64s) Uint64s {
+	x.a += y.a
+	x.b += y.b
+	return x
+}
+
+// And returns the bitwise AND of x and y.
+func (x Uint64s) And(y Uint64s) Uint64s {
+	x.a &= y.a
+	x.b &= y.b
+	return x
+}
+
+// AndNot returns x with every bit that is set in y cleared (x &^ y).
+func (x Uint64s) AndNot(y Uint64s) Uint64s {
+	x.a &^= y.a
+	x.b &^= y.b
+	return x
+}
+
+// Equal compares x and y lane by lane; a mask lane is all ones where the two lanes are equal.
+func (x Uint64s) Equal(y Uint64s) Mask64s {
+	var m Mask64s
+	if x.a^y.a == 0 {
+		m.a = ^uint64(0)
+	}
+	if x.b^y.b == 0 {
+		m.b = ^uint64(0)
+	}
+	return m
+}
+
+// Greater compares x and y lane by lane (unsigned) and marks the lanes where x > y.
+func (x Uint64s) Greater(y Uint64s) Mask64s {
+	var m Mask64s
+	for lane := range 2 {
+		if y.get(lane) < x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// GreaterEqual compares x and y lane by lane (unsigned) and marks the lanes where x >= y.
+func (x Uint64s) GreaterEqual(y Uint64s) Mask64s {
+	var m Mask64s
+	for lane := range 2 {
+		if y.get(lane) <= x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Less compares x and y lane by lane (unsigned) and marks the lanes where x < y.
+func (x Uint64s) Less(y Uint64s) Mask64s {
+	var m Mask64s
+	for lane := range 2 {
+		if y.get(lane) > x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// LessEqual compares x and y lane by lane (unsigned) and marks the lanes where x <= y.
+func (x Uint64s) LessEqual(y Uint64s) Mask64s {
+	var m Mask64s
+	for lane := range 2 {
+		if y.get(lane) >= x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// NotEqual compares x and y lane by lane; a mask lane is all ones where the two lanes differ.
+func (x Uint64s) NotEqual(y Uint64s) Mask64s {
+	var m Mask64s
+	if x.a^y.a != 0 {
+		m.a = ^uint64(0)
+	}
+	if x.b^y.b != 0 {
+		m.b = ^uint64(0)
+	}
+	return m
+}
+
+// Len returns the number of elements in the vector, which is always 2.
+func (x Uint64s) Len() int {
+	return 2
+}
+
+// Masked keeps the bits of x selected by mask and zeroes the rest.
+func (x Uint64s) Masked(mask Mask64s) Uint64s {
+	x.a &= mask.a
+	x.b &= mask.b
+	return x
+}
+
+// IfElse blends x and y bit by bit: bits set in mask come from x, the others from y.
+func (x Uint64s) IfElse(mask Mask64s, y Uint64s) Uint64s {
+	// y ^ ((x ^ y) & m) yields x where m is 1 and y where m is 0.
+	return Uint64s{
+		a: y.a ^ ((x.a ^ y.a) & mask.a),
+		b: y.b ^ ((x.b ^ y.b) & mask.b),
+	}
+}
+
+// Not returns x with every bit inverted.
+func (x Uint64s) Not() Uint64s {
+	x.a = ^x.a
+	x.b = ^x.b
+	return x
+}
+
+// Or returns the bitwise OR of x and y.
+func (x Uint64s) Or(y Uint64s) Uint64s {
+	x.a |= y.a
+	x.b |= y.b
+	return x
+}
+
+// ShiftAllLeft shifts every element left by y bits; a count of 64 or more yields zero.
+func (x Uint64s) ShiftAllLeft(y uint8) Uint64s {
+	x.a <<= y
+	x.b <<= y
+	return x
+}
+
+// ShiftAllRight shifts every element right (logically) by y bits; a count of 64 or more yields zero.
+func (x Uint64s) ShiftAllRight(y uint8) Uint64s {
+	x.a >>= y
+	x.b >>= y
+	return x
+}
+
+// RotateAllLeft rotates every element left by dist bits; only dist modulo 64 matters.
+func (x Uint64s) RotateAllLeft(dist uint64) Uint64s {
+	l := dist & 63
+	r := (64 - l) & 63 // complementary right shift; 0 when l is 0
+	x.a = x.a<<l | x.a>>r
+	x.b = x.b<<l | x.b>>r
+	return x
+}
+
+// RotateAllRight rotates every element right by dist bits; only dist modulo 64 matters.
+func (x Uint64s) RotateAllRight(dist uint64) Uint64s {
+	r := dist & 63
+	l := (64 - r) & 63 // complementary left shift; 0 when r is 0
+	x.a = x.a>>r | x.a<<l
+	x.b = x.b>>r | x.b<<l
+	return x
+}
+
+// Store writes the elements of x to the front of s; elements that do not fit in s are dropped.
+func (x Uint64s) Store(s []uint64) {
+	if n := len(s); n >= 1 {
+		s[0] = x.a
+		if n >= 2 {
+			s[1] = x.b
+		}
+	}
+}
+
+// StorePart stores a partial vector into the slice s.
+// It delegates to Store, which writes only the elements that fit in s.
+func (x Uint64s) StorePart(s []uint64) {
+	x.Store(s)
+}
+
+// String formats the two elements of x the way fmt.Sprint formats a [2]uint64.
+func (x Uint64s) String() string {
+	elems := [...]uint64{x.a, x.b}
+	return fmt.Sprint(elems)
+}
+
+// Sub returns the element-wise difference x - y, wrapping on underflow.
+func (x Uint64s) Sub(y Uint64s) Uint64s {
+	x.a -= y.a
+	x.b -= y.b
+	return x
+}
+
+// Xor returns the bitwise exclusive OR of x and y.
+func (x Uint64s) Xor(y Uint64s) Uint64s {
+	x.a ^= y.a
+	x.b ^= y.b
+	return x
+}
+
+// BitsToFloat64 reinterprets the vector bits as a Float64s vector.
+func (x Uint64s) BitsToFloat64() Float64s {
+	return Float64s{a: x.a, b: x.b}
+}
+
+// BitsToInt64 reinterprets the vector bits as an Int64s vector.
+func (x Uint64s) BitsToInt64() Int64s {
+	return Int64s{a: x.a, b: x.b}
+}
+
+// ConvertToInt64 converts the vector elements to int64.
+func (x Uint64s) ConvertToInt64() Int64s {
+	return Int64s{a: x.a, b: x.b}
+}
+
+// ReshapeToUint16s returns the same 128 bits viewed as a Uint16s vector.
+func (x Uint64s) ReshapeToUint16s() Uint16s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint16s(x)
+}
+
+// ReshapeToUint32s returns the same 128 bits viewed as a Uint32s vector.
+func (x Uint64s) ReshapeToUint32s() Uint32s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint32s(x)
+}
+
+// ReshapeToUint8s returns the same 128 bits viewed as a Uint8s vector.
+func (x Uint64s) ReshapeToUint8s() Uint8s {
+	// The two vector types share one struct layout, so a conversion suffices.
+	return Uint8s(x)
+}
+
+// Float32s represents a 128-bit vector of 4 float32 elements.
+// Each element is kept as its IEEE 754 bit pattern: elements 0-1 in a, elements 2-3 in b, lowest element in the lowest 32 bits.
+type Float32s struct {
+	_    _simd
+	a, b uint64
+}
+
+// LoadFloat32s loads the first 4 elements of s into a Float32s vector, element i into lane i.
+// It panics if len(s) < 4.
+func LoadFloat32s(s []float32) Float32s {
+	var v Float32s
+	for i := range 4 {
+		lane := uint64(math.Float32bits(s[i]))
+		if shift := 32 * (i % 2); i >= 2 {
+			v.b |= lane << shift
+		} else {
+			v.a |= lane << shift
+		}
+	}
+	return v
+}
+
+// LoadFloat32sPart loads up to 4 leading elements of s into a Float32s vector; lanes past len(s) are zero.
+// The second result is the number of elements loaded, len(s) capped at 4.
+func LoadFloat32sPart(s []float32) (Float32s, int) {
+	var v Float32s
+	n := min(len(s), 4)
+	for i, e := range s[:n] {
+		lane := uint64(math.Float32bits(e))
+		if shift := 32 * (i % 2); i >= 2 {
+			v.b |= lane << shift
+		} else {
+			v.a |= lane << shift
+		}
+	}
+	return v, n
+}
+
+// get returns element i of x, decoded from its stored bit pattern.
+func (x Float32s) get(i int) float32 {
+	word, shift := x.a, 32*i
+	if i >= 2 {
+		word, shift = x.b, 32*(i-2)
+	}
+	return math.Float32frombits(uint32(word >> shift))
+}
+
+// set replaces element i of x with the bit pattern of v and leaves every other element unchanged.
+func (x *Float32s) set(i int, v float32) {
+	word, shift := &x.a, 32*i
+	if i >= 2 {
+		word, shift = &x.b, 32*(i-2)
+	}
+	*word = *word&^(uint64(0xffffffff)<<shift) | uint64(math.Float32bits(v))<<shift
+}
+
+// Abs returns the element-wise absolute value of x.
+//
+// The sign bit of every 32-bit lane is cleared, so -0 becomes +0 and a NaN
+// keeps its payload but loses its sign. A compare-and-negate would return
+// -0 unchanged, because -0 < 0 is false.
+func (x Float32s) Abs() Float32s {
+	// Bit 31 of each lane is the IEEE 754 sign bit; each word holds two lanes.
+	const signBits = 0x80000000_80000000
+	return Float32s{a: x.a &^ signBits, b: x.b &^ signBits}
+}
+
+// Add returns the element-wise float32 sum of x and y.
+func (x Float32s) Add(y Float32s) Float32s {
+	var sum Float32s
+	for lane := range 4 {
+		sum.set(lane, x.get(lane)+y.get(lane))
+	}
+	return sum
+}
+
+// ConvertToInt32 converts each element to int32 with Go's float-to-integer conversion, which truncates toward zero.
+// NOTE(review): Go leaves the result for NaN and out-of-range elements implementation-specific — confirm callers do not rely on it.
+func (x Float32s) ConvertToInt32() Int32s {
+	var out Int32s
+	for lane := range 4 {
+		out.set(lane, int32(x.get(lane)))
+	}
+	return out
+}
+
+// Div returns the element-wise float32 quotient x / y.
+func (x Float32s) Div(y Float32s) Float32s {
+	var quo Float32s
+	for lane := range 4 {
+		quo.set(lane, x.get(lane)/y.get(lane))
+	}
+	return quo
+}
+
+// Equal compares x and y lane by lane and marks the lanes that are equal; a NaN lane is never marked.
+func (x Float32s) Equal(y Float32s) Mask32s {
+	var m Mask32s
+	for lane := range 4 {
+		if y.get(lane) == x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Greater compares x and y lane by lane and marks the lanes where x > y; a NaN lane is never marked.
+func (x Float32s) Greater(y Float32s) Mask32s {
+	var m Mask32s
+	for lane := range 4 {
+		if y.get(lane) < x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// GreaterEqual compares x and y lane by lane and marks the lanes where x >= y; a NaN lane is never marked.
+func (x Float32s) GreaterEqual(y Float32s) Mask32s {
+	var m Mask32s
+	for lane := range 4 {
+		if y.get(lane) <= x.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Len returns the number of float32 elements in the vector, which is always 4.
+func (x Float32s) Len() int {
+	return 4
+}
+
+// Less compares x and y lane by lane; a lane of the result is all ones where x[i] < y[i].
+func (x Float32s) Less(y Float32s) Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		// Mask32s.set ignores false, so lanes that fail the
+		// comparison keep their zero value.
+		m.set(lane, x.get(lane) < y.get(lane))
+	}
+	return m
+}
+
+// LessEqual compares x and y lane by lane; a lane of the result is all ones where x[i] <= y[i].
+func (x Float32s) LessEqual(y Float32s) Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		// Mask32s.set ignores false, so lanes that fail the
+		// comparison keep their zero value.
+		m.set(lane, x.get(lane) <= y.get(lane))
+	}
+	return m
+}
+
+// Masked keeps the bits of x that the mask selects and zeroes every other bit.
+func (x Float32s) Masked(mask Mask32s) Float32s {
+	return Float32s{a: mask.a & x.a, b: mask.b & x.b}
+}
+
+// Max returns, per lane, x's element if it is strictly greater than y's element and y's element otherwise.
+func (x Float32s) Max(y Float32s) Float32s {
+	var out Float32s
+	for lane := 0; lane < 4; lane++ {
+		// Default to y's element; ties and NaN comparisons (never
+		// true) therefore select y's element.
+		pick := y.get(lane)
+		if cand := x.get(lane); cand > pick {
+			pick = cand
+		}
+		out.set(lane, pick)
+	}
+	return out
+}
+
+// IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
+func (x Float32s) IfElse(mask Mask32s, y Float32s) Float32s {
+	return Float32s{
+		a: (x.a & mask.a) | (y.a &^ mask.a),
+		b: (x.b & mask.b) | (y.b &^ mask.b),
+	}
+}
+
+// Min returns, per lane, x's element if it is strictly less than y's element and y's element otherwise.
+func (x Float32s) Min(y Float32s) Float32s {
+	var out Float32s
+	for lane := 0; lane < 4; lane++ {
+		// Default to y's element; ties and NaN comparisons (never
+		// true) therefore select y's element.
+		pick := y.get(lane)
+		if cand := x.get(lane); cand < pick {
+			pick = cand
+		}
+		out.set(lane, pick)
+	}
+	return out
+}
+
+// Mul computes x[i] * y[i] for each of the four float32 lanes.
+func (x Float32s) Mul(y Float32s) Float32s {
+	var prod Float32s
+	for lane := 0; lane < 4; lane++ {
+		prod.set(lane, x.get(lane)*y.get(lane))
+	}
+	return prod
+}
+
+// MulAdd returns x * y + z element-wise: each lane is the product of x and y, plus z.
+func (x Float32s) MulAdd(y, z Float32s) Float32s {
+	var res Float32s
+	for i := 0; i < 4; i++ {
+		res.set(i, x.get(i)*y.get(i)+z.get(i)) // multiply x by y first, then add z
+	}
+	return res
+}
+
+// Neg computes -x[i] for each of the four float32 lanes.
+func (x Float32s) Neg() Float32s {
+	var out Float32s
+	for lane := 0; lane < 4; lane++ {
+		out.set(lane, -x.get(lane))
+	}
+	return out
+}
+
+// NotEqual compares x and y lane by lane; a lane of the result is all ones where x[i] != y[i].
+func (x Float32s) NotEqual(y Float32s) Mask32s {
+	var m Mask32s
+	for lane := 0; lane < 4; lane++ {
+		// Mask32s.set ignores false, so lanes that fail the
+		// comparison keep their zero value.
+		m.set(lane, x.get(lane) != y.get(lane))
+	}
+	return m
+}
+
+// Sqrt computes the square root of each lane, going through math.Sqrt in float64 and narrowing back.
+func (x Float32s) Sqrt() Float32s {
+	var out Float32s
+	for lane := 0; lane < 4; lane++ {
+		out.set(lane, float32(math.Sqrt(float64(x.get(lane)))))
+	}
+	return out
+}
+
+// Store writes the lanes of x to s[0], s[1], ...; a slice shorter than four elements receives only its first len(s) lanes.
+func (x Float32s) Store(s []float32) {
+	for lane := 0; lane < len(s) && lane < 4; lane++ {
+		s[lane] = x.get(lane)
+	}
+}
+
+// StorePart stores a partial vector into the slice s by calling Store, which writes at most len(s) elements.
+func (x Float32s) StorePart(s []float32) {
+	x.Store(s)
+}
+
+// String returns a string representation of the vector.
+func (x Float32s) String() string {
+	var parts [4]float32
+	for i := 0; i < 4; i++ {
+		parts[i] = x.get(i)
+	}
+	return fmt.Sprint(parts)
+}
+
+// Sub computes x[i] - y[i] for each of the four float32 lanes.
+func (x Float32s) Sub(y Float32s) Float32s {
+	var diff Float32s
+	for lane := 0; lane < 4; lane++ {
+		diff.set(lane, x.get(lane)-y.get(lane))
+	}
+	return diff
+}
+
+// ToBits reinterprets the vector bits as a Uint32s vector; both 64-bit words are copied unchanged.
+func (x Float32s) ToBits() Uint32s {
+	return Uint32s{a: x.a, b: x.b}
+}
+
+// Float64s represents a 128-bit vector of 2 float64 elements.
+type Float64s struct {
+	_    _simd
+	a, b uint64 // IEEE 754 bit patterns: a is element 0, b is element 1
+}
+
+// LoadFloat64s returns a Float64s holding s[0] and s[1]; indexing panics if s has fewer than two elements.
+func LoadFloat64s(s []float64) Float64s {
+	var v Float64s
+	v.a = math.Float64bits(s[0])
+	v.b = math.Float64bits(s[1])
+	return v
+}
+
+// LoadFloat64sPart loads up to two elements of s into a Float64s, leaving missing lanes zero, and returns len(s).
+func LoadFloat64sPart(s []float64) (Float64s, int) {
+	var v Float64s
+	if len(s) >= 1 {
+		v.a = math.Float64bits(s[0])
+	}
+	if len(s) >= 2 {
+		v.b = math.Float64bits(s[1])
+	}
+	// The reported count is len(s) itself, even when it exceeds the two lanes loaded.
+	return v, len(s)
+}
+
+func (x Float64s) get(i int) float64 {
+	if i != 0 {
+		return math.Float64frombits(x.b) // any nonzero index selects b
+	}
+	return math.Float64frombits(x.a)
+}
+
+func (x *Float64s) set(i int, v float64) {
+	word := &x.b // any nonzero index selects b
+	if i == 0 {
+		word = &x.a
+	}
+	*word = math.Float64bits(v)
+}
+
+// Abs returns the element-wise absolute value of x.
+func (x Float64s) Abs() Float64s {
+	var res Float64s
+	// Float64s has exactly two lanes, so iterate over indices 0 and 1 only.
+	for i := 0; i < 2; i++ {
+		v := x.get(i)
+		if v < 0 {
+			v = -v
+		}
+		res.set(i, v)
+	}
+	return res
+}
+
+// Add returns the element-wise sum of x and y.
+func (x Float64s) Add(y Float64s) Float64s {
+	var res Float64s
+	res.set(0, x.get(0)+y.get(0))
+	res.set(1, x.get(1)+y.get(1))
+	return res
+}
+
+// Div returns the element-wise quotient of x and y.
+func (x Float64s) Div(y Float64s) Float64s {
+	var res Float64s
+	res.set(0, x.get(0)/y.get(0))
+	res.set(1, x.get(1)/y.get(1))
+	return res
+}
+
+// Equal compares x and y lane by lane; a lane of the result is all ones where x[i] == y[i].
+func (x Float64s) Equal(y Float64s) Mask64s {
+	var m Mask64s
+	for lane := 0; lane < 2; lane++ {
+		// Mask64s.set turns on all 64 bits of the lane.
+		if x.get(lane) == y.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Greater compares x and y lane by lane; a lane of the result is all ones where x[i] > y[i].
+func (x Float64s) Greater(y Float64s) Mask64s {
+	var m Mask64s
+	for lane := 0; lane < 2; lane++ {
+		// Mask64s.set turns on all 64 bits of the lane.
+		if x.get(lane) > y.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// GreaterEqual compares x and y lane by lane; a lane of the result is all ones where x[i] >= y[i].
+func (x Float64s) GreaterEqual(y Float64s) Mask64s {
+	var m Mask64s
+	for lane := 0; lane < 2; lane++ {
+		// Mask64s.set turns on all 64 bits of the lane.
+		if x.get(lane) >= y.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Len returns the number of float64 elements in the vector, which is always 2.
+func (x Float64s) Len() int {
+	return 2
+}
+
+// Less compares x and y lane by lane; a lane of the result is all ones where x[i] < y[i].
+func (x Float64s) Less(y Float64s) Mask64s {
+	var m Mask64s
+	for lane := 0; lane < 2; lane++ {
+		// Mask64s.set turns on all 64 bits of the lane.
+		if x.get(lane) < y.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// LessEqual compares x and y lane by lane; a lane of the result is all ones where x[i] <= y[i].
+func (x Float64s) LessEqual(y Float64s) Mask64s {
+	var m Mask64s
+	for lane := 0; lane < 2; lane++ {
+		// Mask64s.set turns on all 64 bits of the lane.
+		if x.get(lane) <= y.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Masked keeps the bits of x that the mask selects and zeroes every other bit.
+func (x Float64s) Masked(mask Mask64s) Float64s {
+	return Float64s{a: mask.a & x.a, b: mask.b & x.b}
+}
+
+// Max returns the element-wise maximum of x and y.
+//
+// Each element of the result is x's element when it compares strictly
+// greater than y's element, and y's element otherwise. Ties, and
+// comparisons involving NaN (which are never true), therefore select
+// y's element.
+func (x Float64s) Max(y Float64s) Float64s {
+	var out Float64s
+	for lane := 0; lane < 2; lane++ {
+		// Default to y's element and replace it only when x's element
+		// wins the strict comparison.
+		pick := y.get(lane)
+		if cand := x.get(lane); cand > pick {
+			pick = cand
+		}
+		out.set(lane, pick)
+	}
+	return out
+}
+
+// IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
+func (x Float64s) IfElse(mask Mask64s, y Float64s) Float64s {
+	return Float64s{
+		a: (x.a & mask.a) | (y.a &^ mask.a),
+		b: (x.b & mask.b) | (y.b &^ mask.b),
+	}
+}
+
+// Min returns the element-wise minimum of x and y.
+//
+// Each element of the result is x's element when it compares strictly
+// less than y's element, and y's element otherwise. Ties, and
+// comparisons involving NaN (which are never true), therefore select
+// y's element.
+func (x Float64s) Min(y Float64s) Float64s {
+	var out Float64s
+	for lane := 0; lane < 2; lane++ {
+		// Default to y's element and replace it only when x's element
+		// wins the strict comparison.
+		pick := y.get(lane)
+		if cand := x.get(lane); cand < pick {
+			pick = cand
+		}
+		out.set(lane, pick)
+	}
+	return out
+}
+
+// Mul returns the element-wise product of x and y.
+func (x Float64s) Mul(y Float64s) Float64s {
+	var res Float64s
+	res.set(0, x.get(0)*y.get(0))
+	res.set(1, x.get(1)*y.get(1))
+	return res
+}
+
+// MulAdd returns x * y + z element-wise: each lane is the product of x and y, plus z.
+func (x Float64s) MulAdd(y, z Float64s) Float64s {
+	var res Float64s
+	res.set(0, x.get(0)*y.get(0)+z.get(0)) // multiply x by y first, then add z
+	res.set(1, x.get(1)*y.get(1)+z.get(1))
+	return res
+}
+
+// Neg returns the element-wise negation of x.
+func (x Float64s) Neg() Float64s {
+	var res Float64s
+	for i := 0; i < 2; i++ { // Float64s has two lanes, not four
+		res.set(i, -(x.get(i)))
+	}
+	return res
+}
+
+// NotEqual compares x and y lane by lane; a lane of the result is all ones where x[i] != y[i].
+func (x Float64s) NotEqual(y Float64s) Mask64s {
+	var m Mask64s
+	for lane := 0; lane < 2; lane++ {
+		// Mask64s.set turns on all 64 bits of the lane.
+		if x.get(lane) != y.get(lane) {
+			m.set(lane, true)
+		}
+	}
+	return m
+}
+
+// Sqrt returns the element-wise square root of x.
+func (x Float64s) Sqrt() Float64s {
+	var res Float64s
+	res.set(0, math.Sqrt(x.get(0)))
+	res.set(1, math.Sqrt(x.get(1)))
+	return res
+}
+
+// Store writes the lanes of x to the front of s.
+func (x Float64s) Store(s []float64) {
+	// Write as many lanes as fit: a slice shorter than two
+	// elements receives only its first len(s) lanes, and an
+	// empty slice is left untouched.
+	for lane := 0; lane < len(s) && lane < 2; lane++ {
+		s[lane] = x.get(lane)
+	}
+}
+
+// StorePart stores a partial vector into the slice s by calling Store, which writes at most len(s) elements.
+func (x Float64s) StorePart(s []float64) {
+	x.Store(s)
+}
+
+// String formats both lanes, lane 0 first, the way fmt.Sprint prints a [2]float64.
+func (x Float64s) String() string {
+	return fmt.Sprint([...]float64{x.get(0), x.get(1)})
+}
+
+// Sub returns the element-wise difference of x and y.
+func (x Float64s) Sub(y Float64s) Float64s {
+	var res Float64s
+	res.set(0, x.get(0)-y.get(0))
+	res.set(1, x.get(1)-y.get(1))
+	return res
+}
+
+// ToBits reinterprets the vector bits as a Uint64s vector; both 64-bit words are copied unchanged.
+func (x Float64s) ToBits() Uint64s {
+	return Uint64s{a: x.a, b: x.b}
+}
+
+// Mask8s is a 128-bit mask vector with one 8-bit lane for each of 16 int8/uint8 elements.
+type Mask8s struct {
+	_    _simd
+	a, b uint64 // a covers lanes 0-7, b covers lanes 8-15
+}
+
+func (x *Mask8s) set(i int, v bool) {
+	// A false value is a no-op: set only ever turns lanes on.
+	if !v {
+		return
+	}
+	if i < 8 {
+		x.a |= uint64(0xff) << (8 * i)
+		return
+	}
+	x.b |= uint64(0xff) << (8 * (i - 8))
+}
+
+// And returns the mask whose bits are set in both x and y.
+func (x Mask8s) And(y Mask8s) Mask8s {
+	return Mask8s{a: y.a & x.a, b: y.b & x.b}
+}
+
+// Or returns the mask whose bits are set in x, in y, or in both.
+func (x Mask8s) Or(y Mask8s) Mask8s {
+	return Mask8s{a: y.a | x.a, b: y.b | x.b}
+}
+
+// String prints the two 64-bit words of the mask in hexadecimal.
+func (x Mask8s) String() string {
+	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
+}
+
+// ToInt8s returns an Int8s vector with the same bits as the mask.
+func (x Mask8s) ToInt8s() Int8s {
+	return Int8s{a: x.a, b: x.b}
+}
+
+// Mask16s is a 128-bit mask vector with one 16-bit lane for each of 8 int16/uint16 elements.
+type Mask16s struct {
+	_    _simd
+	a, b uint64 // a covers lanes 0-3, b covers lanes 4-7
+}
+
+func (x *Mask16s) set(i int, v bool) {
+	// A false value is a no-op: set only ever turns lanes on.
+	if !v {
+		return
+	}
+	if i < 4 {
+		x.a |= uint64(0xffff) << (16 * i)
+		return
+	}
+	x.b |= uint64(0xffff) << (16 * (i - 4))
+}
+
+// And returns the mask whose bits are set in both x and y.
+func (x Mask16s) And(y Mask16s) Mask16s {
+	return Mask16s{a: y.a & x.a, b: y.b & x.b}
+}
+
+// Or returns the mask whose bits are set in x, in y, or in both.
+func (x Mask16s) Or(y Mask16s) Mask16s {
+	return Mask16s{a: y.a | x.a, b: y.b | x.b}
+}
+
+// String prints the two 64-bit words of the mask in hexadecimal.
+func (x Mask16s) String() string {
+	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
+}
+
+// ToInt16s returns an Int16s vector with the same bits as the mask.
+func (x Mask16s) ToInt16s() Int16s {
+	return Int16s{a: x.a, b: x.b}
+}
+
+// Mask32s is a 128-bit mask vector with one 32-bit lane for each of 4 int32/uint32/float32 elements.
+type Mask32s struct {
+	_    _simd
+	a, b uint64 // a covers lanes 0-1, b covers lanes 2-3
+}
+
+func (x *Mask32s) set(i int, v bool) {
+	// A false value is a no-op: set only ever turns lanes on.
+	if !v {
+		return
+	}
+	if i < 2 {
+		x.a |= uint64(0xffffffff) << (32 * i)
+		return
+	}
+	x.b |= uint64(0xffffffff) << (32 * (i - 2))
+}
+
+// And returns the mask whose bits are set in both x and y.
+func (x Mask32s) And(y Mask32s) Mask32s {
+	return Mask32s{a: y.a & x.a, b: y.b & x.b}
+}
+
+// Or returns the mask whose bits are set in x, in y, or in both.
+func (x Mask32s) Or(y Mask32s) Mask32s {
+	return Mask32s{a: y.a | x.a, b: y.b | x.b}
+}
+
+// String prints the two 64-bit words of the mask in hexadecimal.
+func (x Mask32s) String() string {
+	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
+}
+
+// ToInt32s returns an Int32s vector with the same bits as the mask.
+func (x Mask32s) ToInt32s() Int32s {
+	return Int32s{a: x.a, b: x.b}
+}
+
+// Mask64s is a 128-bit mask vector with one 64-bit lane for each of 2 int64/uint64/float64 elements.
+type Mask64s struct {
+	_    _simd
+	a, b uint64 // a is lane 0, b is lane 1
+}
+
+func (x *Mask64s) set(i int, v bool) {
+	switch {
+	case !v: // false is a no-op: set only ever turns lanes on
+	case i == 0:
+		x.a = ^uint64(0)
+	default:
+		x.b = ^uint64(0)
+	}
+}
+
+// And returns the mask whose bits are set in both x and y.
+func (x Mask64s) And(y Mask64s) Mask64s {
+	return Mask64s{a: y.a & x.a, b: y.b & x.b}
+}
+
+// Or returns the mask whose bits are set in x, in y, or in both.
+func (x Mask64s) Or(y Mask64s) Mask64s {
+	return Mask64s{a: y.a | x.a, b: y.b | x.b}
+}
+
+// String prints the two 64-bit words of the mask in hexadecimal.
+func (x Mask64s) String() string {
+	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
+}
+
+// ToInt64s returns an Int64s vector with the same bits as the mask.
+func (x Mask64s) ToInt64s() Int64s {
+	return Int64s{a: x.a, b: x.b}
+}
+
+func newT(lo, hi uint64) Uint64s {
+	return Uint64s{a: lo, b: hi} // field a receives the low word, field b the high word
+}
+
+// mwl multiplies x.a by y.a as unsigned integers and returns the full 128-bit product (low word in a, high word in b).
+func (x Uint64s) mwl(y Uint64s) (p Uint64s) {
+	p.b, p.a = bits.Mul64(x.a, y.a)
+	return p
+}
+
+var (
+	m1 = newT(0x1084210842108421, 0x2108421084210842) // bits J of the 128-bit value (low word first) with J mod 5 == 0
+	m2 = newT(0x2108421084210842, 0x4210842108421084) // J mod 5 == 1
+	m3 = newT(0x4210842108421084, 0x8421084210842108) // J mod 5 == 2
+	m4 = newT(0x8421084210842108, 0x0842108421084210) // J mod 5 == 3
+	m5 = newT(0x0842108421084210, 0x1084210842108421) // J mod 5 == 4
+)
+
+func (x Uint64s) clmul(y Uint64s) Uint64s {
+	x1 := x.And(m1) // xK keeps only the bits of x selected by mask mK (every fifth bit)
+	x2 := x.And(m2)
+	x3 := x.And(m3)
+	x4 := x.And(m4)
+	x5 := x.And(m5)
+
+	y1 := y.And(m1) // yK is the same split applied to y
+	y2 := y.And(m2)
+	y3 := y.And(m3)
+	y4 := y.And(m4)
+	y5 := y.And(m5)
+	// The kept bits are 5 apart, so carries in mwl's integer product never reach the next kept bit.
+	// sum of x, y indices == K mod 5; mask index = K-1
+	z := (x1.mwl(y1)).Xor(x2.mwl(y5)).Xor(x5.mwl(y2)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).And(m1)
+	z = (x4.mwl(y4)).Xor(x3.mwl(y5)).Xor(x5.mwl(y3)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m2).Or(z)
+	z = (x2.mwl(y2)).Xor(x4.mwl(y5)).Xor(x5.mwl(y4)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m3).Or(z)
+	z = (x5.mwl(y5)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m4).Or(z)
+	z = (x3.mwl(y3)).Xor(x1.mwl(y5)).Xor(x5.mwl(y1)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).And(m5).Or(z)
+
+	return z
+}
+
+// CarrylessMultiplyEven computes the 128-bit carryless product of the
+// low 64-bit elements (field a) of x and y, via clmul.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+func (x Uint64s) CarrylessMultiplyEven(y Uint64s) Uint64s {
+	return x.clmul(y)
+}
+
+// CarrylessMultiplyOdd computes the 128-bit carryless product of the
+// high 64-bit elements (field b) of x and y, via clmul.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+func (x Uint64s) CarrylessMultiplyOdd(y Uint64s) Uint64s {
+	x.a = x.b // x and y are copies, so overwriting their low words is invisible to the caller
+	y.a = y.b // clmul multiplies the a fields, so move the high words there
+	return x.clmul(y)
+}
diff --git a/src/simd/internal/bridge/tofrom_amd64.go b/src/simd/internal/bridge/tofrom_amd64.go
index 6e814f4..fa2878f 100644
--- a/src/simd/internal/bridge/tofrom_amd64.go
+++ b/src/simd/internal/bridge/tofrom_amd64.go
@@ -8,6 +8,8 @@
 
 import "simd/archsimd"
 
+// For amd64, handle the larger types not mentioned in tofrom_128.go
+
 func (x Float32x16) ToArch() any {
 	return archsimd.Float32x16(x)
 }
diff --git a/src/simd/internal/bridge/tofrom_emulated.go b/src/simd/internal/bridge/tofrom_emulated.go
new file mode 100644
index 0000000..4dc3bea
--- /dev/null
+++ b/src/simd/internal/bridge/tofrom_emulated.go
@@ -0,0 +1,63 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && (amd64 || wasm || arm64)
+
+package bridge
+
+func (x Float32s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Float64s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Int16s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Int32s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Int64s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Int8s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Mask16s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Mask32s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Mask64s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Mask8s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Uint16s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Uint32s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Uint64s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
+
+func (x Uint8s) ToArch() any {
+	return x // no conversion: x itself is returned, boxed in an interface value
+}
diff --git a/src/simd/midway_amd64.go b/src/simd/midway_amd64.go
index 78acde3..8f37f65 100644
--- a/src/simd/midway_amd64.go
+++ b/src/simd/midway_amd64.go
@@ -7,58 +7,28 @@
 package simd
 
 import (
-	"fmt"
-	"os"
+	"internal/cpu"
 	"simd/archsimd"
-	"strconv"
 )
 
-var maxVectorSize int
+const archHasHwClmul = true // ANDed with hwClmul by HasHardwareCarrylessMultiply in midway_common.go
 
-func init() {
-	actualMax := archMaxVectorSize()
-	if gosimd := os.Getenv("GOSIMD"); gosimd != "" {
-		val, err := strconv.Atoi(gosimd)
-		if err != nil {
-			panic(fmt.Errorf("Could not parse GOSIMD(='%s') as a decimal number, %v", gosimd, err))
-		}
-		if val > actualMax {
-			panic(fmt.Errorf("Requested GOSIMD(='%d') is larger than the simd length (%d) supported on this cpu ", val, actualMax))
-		}
-		if val < 0 {
-			panic(fmt.Errorf("Requested GOSIMD(='%d') is negative", val))
-		}
-		maxVectorSize = val
-		return
-	}
-	maxVectorSize = actualMax
-}
-
-// VectorBitSize returns the bit length of the longest vector available
-// on the current hardware.  For amd64, this is 128, 256, or 512, depending
-// on the hardware.  It can be artificially reduced by setting the
-// GOSIMD environment variable before running a program.
-func VectorBitSize() int {
-	return maxVectorSize
-}
-
-// Emulated returns whether simd operations are emulated or
-// running on actual vector hardware.
-func Emulated() bool {
-	return false
-}
-
-func archMaxVectorSize() int {
-	if archsimd.X86.AVX512() {
-		return 512
+func archMaxVectorSize() (size, allFeatureSize int) {
+	if archsimd.X86.AVX() {
+		size = 128
+		allFeatureSize = 128 // NOTE(review): no carryless-multiply feature check at 128 bits, unlike the wider sizes — confirm this is intended
 	}
 	if archsimd.X86.AVX2() {
-		return 256
+		size = 256
+		if cpu.X86.HasVPCLMULQDQ {
+			allFeatureSize = 256
+		}
 	}
-	// AVX has 256 bit float ops but only 128-bit integer ops
-	// therefore it is 128.
-	if archsimd.X86.AVX() {
-		return 128
+	if archsimd.X86.AVX512() {
+		size = 512
+		if cpu.X86.HasAVX512VPCLMULQDQ {
+			allFeatureSize = 512
+		}
 	}
-	return 0
+	return // naked return of the named results; both are 0 if no check above passed
 }
diff --git a/src/simd/midway_arm64.go b/src/simd/midway_arm64.go
index 80f24cd..a138131 100644
--- a/src/simd/midway_arm64.go
+++ b/src/simd/midway_arm64.go
@@ -6,14 +6,17 @@
 
 package simd
 
-// VectorBitSize returns the bit length of the longest vector available
-// on the current hardware.  For arm64-neon, this is 128.
-func VectorBitSize() int {
-	return 128
-}
+import (
+	"internal/cpu"
+)
 
-// Emulated returns whether simd operations are emulated or
-// running on actual vector hardware.
-func Emulated() bool {
-	return false
+const archHasHwClmul = true // ANDed with hwClmul by HasHardwareCarrylessMultiply in midway_common.go
+
+func archMaxVectorSize() (size, allFeatureSize int) {
+	// This describes Neon (128-bit vectors); SVE is still TBD.
+	size = 128
+	if cpu.ARM64.HasPMULL { // PMULL backs the carryless-multiply intrinsic added in this merge
+		allFeatureSize = 128
+	}
+	return // naked return of the named results; allFeatureSize stays 0 when PMULL is absent
 }
diff --git a/src/simd/midway_common.go b/src/simd/midway_common.go
new file mode 100644
index 0000000..aa6e509
--- /dev/null
+++ b/src/simd/midway_common.go
@@ -0,0 +1,138 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && (amd64 || arm64 || wasm)
+
+package simd
+
+import (
+	"fmt"
+	"internal/godebug"
+	"strconv"
+)
+
+// The `simd` package provides an architecture and vector-length agnostic API
+// for single-instruction-multiple-data "SIMD" vectors and operations. The
+// functions and methods in this package are those that can be mostly supported
+// in hardware, combined with an emulation for those platforms that are not yet
+// supported.
+//
+// Users can also control emulation and vector length with the 'simd' GODEBUG
+// setting.  GODEBUG=simd=0 requests emulation, not hardware SIMD, even if
+// hardware is available.  On platforms that may support multiple vector
+// lengths, GODEBUG=simd=N (N=128, 256, or 512) requests a specific vector
+// length.  If the request cannot be satisfied, the simd package panics
+// informatively.
+//
+// Some platforms may support vectors of a particular length, but not all of the
+// expected operations (those appearing in this package) are available at that
+// length.  In that case, the default is to automatically downgrade to a length
+// where the operations are supported, perhaps even to emulated-only
+// (size=0).  If a size is requested that is not compatible with the available
+// features, the simd package will panic (and note the reason).  To override
+// the feature check, in the case that the user knows that the missing
+// operations will not be used, prefix the size request with a '+', for
+// example "GODEBUG=simd=+256".  A plain '+' will override the feature check at
+// whatever the hardware's default vector size happens to be.
+
+var simd = godebug.New("#simd") // GODEBUG setting handle; init reads its value
+
+var maxVectorSize int // reported by VectorBitSize
+var emulated = false  // reported by Emulated
+var hwClmul = true    // ANDed with archHasHwClmul by HasHardwareCarrylessMultiply
+
+// init derives maxVectorSize, emulated and hwClmul from the hardware
+// probe (archMaxVectorSize) and the GODEBUG=simd setting.
+func init() {
+	actualMax, allFeatureSize := archMaxVectorSize() // zero == no simd, zero == features unavailable
+	gosimd := simd.Value()
+	explicitRequest := false
+
+	// No SIMD, must emulate
+	if actualMax == 0 {
+		maxVectorSize = 128
+		emulated = true
+		hwClmul = false
+		return
+	}
+
+	maxVectorSize = actualMax
+
+	// If gosimd begins with a '+' then override any hardware feature
+	// check that would otherwise reduce or disable hardware SIMD.
+	// The '+' may be followed by a size, expected to be 0, 128, 256, 512.
+	// If it is zero (e.g., "0" or "+0") then hardware SIMD is still disabled.
+	if len(gosimd) > 0 && gosimd[0] == '+' {
+		// override feature reduction
+		// keep maxVectorSize
+		// emulated remains false
+		// hwClmul is true only if no feature is missing at the full size.
+		hwClmul = allFeatureSize >= actualMax
+		gosimd = gosimd[1:]
+		explicitRequest = true
+	} else if allFeatureSize < actualMax {
+		if allFeatureSize > 0 {
+			maxVectorSize = allFeatureSize
+			hwClmul = true
+			emulated = false
+		} else {
+			maxVectorSize = 128
+			hwClmul = false
+			emulated = true
+		}
+	}
+
+	if gosimd == "" {
+		return
+	}
+
+	// possible adjustment to chosen size
+	val, err := strconv.Atoi(gosimd)
+	if err != nil {
+		panic(fmt.Errorf("Could not parse GODEBUG=simd='%s' as a decimal number, %v", gosimd, err))
+	}
+	if val > actualMax {
+		panic(fmt.Errorf("Requested GODEBUG=simd=%d is larger than the simd length (%d) supported on this cpu ", val, actualMax))
+	}
+	if !explicitRequest && val > allFeatureSize {
+		panic(fmt.Errorf("Requested GODEBUG=simd=%d is larger than the simd length required for expected features (%d) on this cpu. GODEBUG=simd='+%d' will skip this check.", val, allFeatureSize, val))
+	}
+	if val < 0 {
+		panic(fmt.Errorf("Requested GODEBUG=simd=%d is negative", val))
+	}
+	// user-requested emulation
+	if val == 0 {
+		maxVectorSize = 128
+		hwClmul = false
+		emulated = true
+		return
+	}
+
+	hwClmul = allFeatureSize >= val
+	maxVectorSize = val
+	emulated = false
+}
+
+// VectorBitSize returns the vector bit length chosen at package
+// initialization (128 when operations are emulated).  It can be reduced
+// by setting GODEBUG=simd=<smaller size> before running a program.
+func VectorBitSize() int {
+	return maxVectorSize
+}
+
+// Emulated reports whether simd operations are emulated rather than
+// running on actual vector hardware, as decided at package initialization.
+func Emulated() bool {
+	return emulated
+}
+
+// HasHardwareCarrylessMultiply reports whether this platform
+// has a hardware-implemented version of carryless multiply.
+// With default GODEBUG=simd settings, if this is false,
+// it is emulated and merely slow, but with non-default settings
+// this can indicate the possibility of a missing instruction
+// that will fail ("SIGILL") if it is executed.
+func HasHardwareCarrylessMultiply() bool {
+	return hwClmul && archHasHwClmul
+}
diff --git a/src/simd/midway_wasm.go b/src/simd/midway_wasm.go
index e3c1ce0..8b0673f 100644
--- a/src/simd/midway_wasm.go
+++ b/src/simd/midway_wasm.go
@@ -6,14 +6,8 @@
 
 package simd
 
-// VectorBitSize returns the bit length of the longest vector available
-// on the current hardware.  For wasm, this is 128.
-func VectorBitSize() int {
-	return 128
-}
+const archHasHwClmul = false // makes HasHardwareCarrylessMultiply (midway_common.go) always report false here
 
-// Emulated returns whether simd operations are emulated or
-// running on actual vector hardware.
-func Emulated() bool {
-	return false
+func archMaxVectorSize() (size, allFeatureSize int) {
+	return 128, 128 // both sizes are 128 bits, so init applies no feature downgrade
 }
diff --git a/src/simd/simd.go b/src/simd/simd.go
index a85181f..b4a298e 100644
--- a/src/simd/simd.go
+++ b/src/simd/simd.go
@@ -758,6 +758,38 @@
 // BitsToInt64 reinterprets the vector bits as an Int64s vector.
 func (x Uint64s) BitsToInt64() Int64s
 
+// CarrylessMultiplyEven computes the carryless
+// multiplications of selected even indexed elements of x and y.
+// Each product is 128 bits wide and fills the corresponding
+// even-odd pairs in the result.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+func (x Uint64s) CarrylessMultiplyEven(y Uint64s) Uint64s
+
+// CarrylessMultiplyOdd computes the carryless
+// multiplications of selected odd indexed elements of x and y.
+// Each product is 128 bits wide and fills the corresponding
+// even-odd pairs in the result.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+func (x Uint64s) CarrylessMultiplyOdd(y Uint64s) Uint64s
+
 // ConvertToInt64 converts the vector elements to int64.
 func (x Uint64s) ConvertToInt64() Int64s
 
diff --git a/src/simd/simd_emulated.go b/src/simd/simd_emulated.go
index 128a091..995f6fa 100644
--- a/src/simd/simd_emulated.go
+++ b/src/simd/simd_emulated.go
@@ -9,6 +9,7 @@
 import (
 	"fmt"
 	"math"
+	"math/bits"
 )
 
 // VectorSize returns the bit length of the emulated vector (fixed to 128).
@@ -21,6 +22,14 @@
 	return true
 }
 
+// EmulatedCarrylessMultiply reports whether CarrylessMultiply is emulated;
+// this implementation always returns true.  This sometimes matters to the
+// choice of algorithm (e.g., when computing CRC).  The emulation's execution
+// time does not depend on its inputs, so it is okay in that sense.
+func EmulatedCarrylessMultiply() bool {
+	return true
+}
+
 type _simd struct {
 	_ [0]func(*_simd) *_simd
 }
@@ -3144,3 +3153,81 @@
 func (x Mask64s) ToInt64s() Int64s {
 	return Int64s{a: x.a, b: x.b}
 }
+
+func newT(lo, hi uint64) Uint64s {
+	return Uint64s{a: lo, b: hi} // field a receives the low word, field b the high word
+}
+
+// mwl multiplies x.a by y.a as unsigned integers and returns the full 128-bit product (low word in a, high word in b).
+func (x Uint64s) mwl(y Uint64s) (p Uint64s) {
+	p.b, p.a = bits.Mul64(x.a, y.a)
+	return p
+}
+
+var (
+	// For mK, bit J of the 128-bit value (low word first) is set exactly when J mod 5 == K.
+	m0 = newT(0x1084210842108421, 0x2108421084210842)
+	m1 = newT(0x2108421084210842, 0x4210842108421084)
+	m2 = newT(0x4210842108421084, 0x8421084210842108)
+	m3 = newT(0x8421084210842108, 0x0842108421084210)
+	m4 = newT(0x0842108421084210, 0x1084210842108421)
+)
+
+func (x Uint64s) clmul(y Uint64s) Uint64s {
+	x0 := x.And(m0) // xK keeps only the bits of x selected by mask mK
+	x1 := x.And(m1)
+	x2 := x.And(m2)
+	x3 := x.And(m3)
+	x4 := x.And(m4)
+
+	y0 := y.And(m0) // yK is the same split applied to y
+	y1 := y.And(m1)
+	y2 := y.And(m2)
+	y3 := y.And(m3)
+	y4 := y.And(m4)
+	// Each line XORs the five mwl products whose index sum is K mod 5 and keeps only the bits of mK.
+	// sum of x, y indices == K mod 5; mask index = K
+	z := (x0.mwl(y0)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m0)
+	z = (x3.mwl(y3)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).Xor(x0.mwl(y1)).Xor(x1.mwl(y0)).And(m1).Or(z)
+	z = (x1.mwl(y1)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).Xor(x0.mwl(y2)).Xor(x2.mwl(y0)).And(m2).Or(z)
+	z = (x4.mwl(y4)).Xor(x0.mwl(y3)).Xor(x3.mwl(y0)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m3).Or(z)
+	z = (x2.mwl(y2)).Xor(x0.mwl(y4)).Xor(x4.mwl(y0)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m4).Or(z)
+
+	return z
+}
+
+// CarrylessMultiplyEven computes the carryless product of the
+// low 64-bit elements (field a) of x and y, via clmul.
+// The result fills the 128 bits of each even-odd pair.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+func (x Uint64s) CarrylessMultiplyEven(y Uint64s) Uint64s {
+	return x.clmul(y)
+}
+
+// CarrylessMultiplyOdd computes the carryless product of the
+// high 64-bit elements (field b) of x and y, via clmul.
+// The result fills the 128 bits of each even-odd pair.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+func (x Uint64s) CarrylessMultiplyOdd(y Uint64s) Uint64s {
+	x.a = x.b // x and y are copies, so overwriting their low words is invisible to the caller
+	y.a = y.b // clmul multiplies the a fields, so move the high words there
+	return x.clmul(y)
+}
diff --git a/src/simd/testdata/ip/sum_amd64.go b/src/simd/testdata/ip/sum_amd64.go
index 5c936b5..64f7657 100644
--- a/src/simd/testdata/ip/sum_amd64.go
+++ b/src/simd/testdata/ip/sum_amd64.go
@@ -24,6 +24,8 @@
 		return b.GetLo().GetElem(0) + b.GetHi().GetElem(0)
 	case archsimd.Float32x4:
 		return boringSum(simd.Float32sFromArch(a))
+	default:
+		return boringSum(x)
 	}
 	panic("nope")
 }
diff --git a/test/codegen/simd_arm64.go b/test/codegen/simd_arm64.go
index 5386ad2..1acbb1d 100644
--- a/test/codegen/simd_arm64.go
+++ b/test/codegen/simd_arm64.go
@@ -95,10 +95,10 @@
 }
 
 func foldGetHiSetHiShifts(x archsimd.Uint32x4) archsimd.Uint16x8 {
-	shrN := x.ShiftRightNarrowConst(16)        // arm64: `VSHRN [$]16, V0.S4, V[0-9]+.H4`
-	trunc := x.ShiftRightNarrowConst(0)        // arm64: `VXTN V0.S4, V[0-9]+.H4` -`VSHRN`
-	shlLo := x.ShiftLeftLoLongConst(1)         // arm64: `VUSHLL [$]1, V0.S2, V[0-9]+.D2`
-	shlHi := x.GetHi().ShiftLeftLoLongConst(1) // arm64: `VUSHLL2 [$]1, V0.S4, V[0-9]+.D2` -`VDUP`
+	shrN := x.ShiftRightNarrowConst(16)         // arm64: `VSHRN [$]16, V0.S4, V[0-9]+.H4`
+	trunc := x.ShiftRightNarrowConst(0)         // arm64: `VXTN V0.S4, V[0-9]+.H4` -`VSHRN`
+	shlLo := x.ShiftLeftWidenLoConst(1)         // arm64: `VUSHLL [$]1, V0.S2, V[0-9]+.D2`
+	shlHi := x.GetHi().ShiftLeftWidenLoConst(1) // arm64: `VUSHLL2 [$]1, V0.S4, V[0-9]+.D2` -`VDUP`
 	sum := shrN.Add(trunc)
 	combined := sum.SetHi(x.ShiftRightNarrowConst(15)) // arm64: `VSHRN2 [$]15, V0.S4, V[0-9]+.H8` -`VMOV.*D\[`
 	sinkU64 = shlLo.Sub(shlHi)
@@ -106,13 +106,19 @@
 }
 
 func foldGetHiSetHiMuls(a, b archsimd.Uint16x8) archsimd.Uint16x8 {
-	wLo := a.MulLoLong(b)                     // arm64: `VUMULL V0.H4, V1.H4, V[0-9].S4`
-	wHi := a.GetHi().MulLoLong(b.GetHi())     // arm64: `VUMULL2 V1.H8, V0.H8, V[0-9].S4` -`VDUP`
+	wLo := a.MulWidenLo(b)                    // arm64: `VUMULL V0.H4, V1.H4, V[0-9].S4`
+	wHi := a.GetHi().MulWidenLo(b.GetHi())    // arm64: `VUMULL2 V1.H8, V0.H8, V[0-9].S4` -`VDUP`
 	wHiRight := wHi.ShiftRightNarrowConst(16) // arm64: -`.*`
 	wLoRight := wLo.ShiftRightNarrowConst(16) // arm64: `VSHRN [$]16, V[0-9]+.S4, V0.H4`
 	return wLoRight.SetHi(wHiRight)           // arm64: `VSHRN2 [$]16, V[0-9]+.S4, V0.H8` -`VMOV.*D\[`
 }
 
+func carrylessMultiplies(x, y archsimd.Uint64x2) archsimd.Uint64x2 {
+	even := x.CarrylessMultiplyEven(y) // arm64:`VPMULL V` -`VPMULL2`
+	odd := x.CarrylessMultiplyOdd(y)   // arm64:`VPMULL2 V` -`VPMULL `
+	return even.Xor(odd)
+}
+
 func mergeWithNotMask(x, y archsimd.Int8x16, mask archsimd.Mask8x16, f1, f2 archsimd.Float32x4) {
 	// arm64:`VBIF` -`VBIT` -`VNOT`
 	sinkI8 = x.IfElse(mask.Not(), y)