all: REVERSE MERGE dev.simd (0e5948d) into master This commit is a REVERSE MERGE. It merges dev.simd back into its parent branch, master, for Go 1.27 one last time. Merge List: + 2026-06-01 0e5948dc59 [dev.simd] all: merge master (1cae25d) into dev.simd + 2026-06-01 a92cf0ee94 [dev.simd] simd, cmd/compile: add Midway GODEBUG=simd=0 emulation switch + 2026-06-01 e3afca43e2 [dev.simd] simd: attempting to pin down the simdgen output-order glitch + 2026-06-01 cea3788e05 [dev.simd] simd: add carryless multiply for wasm and for midway + 2026-06-01 627bc968ea [dev.simd] simd: add ARM64 PMULL (carrylessMultiplyWidenLo) intrinsic + 2026-05-30 80ab7bc1fa [dev.simd] simd: rename LoLong intrinsics to WidenLo + 2026-05-30 9764721859 [dev.simd] simdgen: filter arrangement symbols for shaped ARM64 instructions Change-Id: Id0010ed9bd8e5d7da033d9506f48d367e1232194
diff --git a/src/cmd/compile/internal/arm64/simdssa.go b/src/cmd/compile/internal/arm64/simdssa.go index 891a381..642d985 100644 --- a/src/cmd/compile/internal/arm64/simdssa.go +++ b/src/cmd/compile/internal/arm64/simdssa.go
@@ -373,6 +373,9 @@ ssa.OpARM64VUMULL16B: p = simdV21Long(s, v, arm64.ARNG_16B) + case ssa.OpARM64VPMULL2D: + p = simdV21Long(s, v, arm64.ARNG_2D) + case ssa.OpARM64VSMULL4S, ssa.OpARM64VUMULL4S: p = simdV21Long(s, v, arm64.ARNG_4S) @@ -438,6 +441,9 @@ ssa.OpARM64VUMULL2_16B: p = simdV21Long2(s, v, arm64.ARNG_16B) + case ssa.OpARM64VPMULL2_2D: + p = simdV21Long2(s, v, arm64.ARNG_2D) + case ssa.OpARM64VSMULL2_4S, ssa.OpARM64VUMULL2_4S: p = simdV21Long2(s, v, arm64.ARNG_4S)
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go index 5c526cb..44bf01d 100644 --- a/src/cmd/compile/internal/arm64/ssa.go +++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -240,6 +240,8 @@ return arm64.ARNG_4S case arm64.ARNG_2S: return arm64.ARNG_2D + case arm64.ARNG_1D: + return arm64.ARNG_1Q default: base.Fatalf("unsupported long input arrangement: %d", arng) return 0 @@ -256,6 +258,8 @@ return arm64.ARNG_4H case arm64.ARNG_4S: return arm64.ARNG_2S + case arm64.ARNG_2D: + return arm64.ARNG_1D default: base.Fatalf("unsupported halfLanes input arrangement: %d", arng) return 0
diff --git a/src/cmd/compile/internal/midway/deepcopy.go b/src/cmd/compile/internal/midway/deepcopy.go index 95fc1f2..088ff23 100644 --- a/src/cmd/compile/internal/midway/deepcopy.go +++ b/src/cmd/compile/internal/midway/deepcopy.go
@@ -98,10 +98,20 @@ name := id.Value width := nameToElemBitWidth(name) if width > 0 { + archsimdId := syntax.NewName(id.Pos(), archPkg) + if c.VecLen == 0 { + // special case for emulation + newSel := &syntax.SelectorExpr{ + X: archsimdId, + Sel: id, // name is unchanged for emulation + } + newSel.SetPos(id.Pos()) + return newSel + } + count := c.VecLen / width base := name[:len(name)-1] newName := fmt.Sprintf("%sx%d", base, count) - archsimdId := syntax.NewName(id.Pos(), archPkg) newSelId := syntax.NewName(id.Pos(), newName) newSel := &syntax.SelectorExpr{ X: archsimdId, @@ -144,6 +154,17 @@ } width := nameToElemBitWidth(name) if width > 0 { + archsimdId := syntax.NewName(se.Pos(), archPkg) + if c.VecLen == 0 { + // emulated instead, name is unchanged + newSel := &syntax.SelectorExpr{ + X: archsimdId, + Sel: se.Sel, + } + newSel.SetPos(se.Pos()) + return newSel + } + count := c.VecLen / width base := name[:len(name)-1] newName := fmt.Sprintf("%sx%d", base, count) @@ -151,7 +172,6 @@ newName = "Load" + newName + nameSuffix } - archsimdId := syntax.NewName(se.Pos(), archPkg) newSelId := syntax.NewName(se.Sel.Pos(), newName) newSel := &syntax.SelectorExpr{
diff --git a/src/cmd/compile/internal/midway/midway.go b/src/cmd/compile/internal/midway/midway.go index d737a30..9adeec8 100644 --- a/src/cmd/compile/internal/midway/midway.go +++ b/src/cmd/compile/internal/midway/midway.go
@@ -11,11 +11,11 @@ func rewriteSizes() []int { switch buildcfg.GOARCH { case "wasm": - return []int{128} + return []int{0, 128} case "amd64": - return []int{128, 256, 512} + return []int{0, 128, 256, 512} case "arm64": - return []int{128} // this will change for SVE and cannot just be a size-based choice. + return []int{0, 128} // this will change for SVE and cannot just be a size-based choice. } return nil } @@ -24,6 +24,7 @@ const archFullPkg = "simd/internal/bridge" const archPkg = "bridge" const vectorSizeFn = "VectorBitSize" +const emulatedFn = "Emulated" func isSimdTypeName(s string) bool { switch s {
diff --git a/src/cmd/compile/internal/midway/rewrite.go b/src/cmd/compile/internal/midway/rewrite.go index 27685e2..083761c 100644 --- a/src/cmd/compile/internal/midway/rewrite.go +++ b/src/cmd/compile/internal/midway/rewrite.go
@@ -201,10 +201,13 @@ // switch ast node. // the goal is something like (for now, till there are finer-grained choices) // switch simd.VectorSize() { - // case 128: call the specialize-for-128-code(args) + // case 128: if simd.Emulated() { call the specialize-for-emulation-code(args) } + // else { call the specialize-for-128-code(args) } // case 256: call the specialize-for-256-code(args) // etc // } + // + // the cases above deal with the usual `return call(...)` vs `call(...); return` switchStmt := &syntax.SwitchStmt{ Tag: pe(&syntax.CallExpr{ Fun: pe(&syntax.SelectorExpr{ @@ -215,6 +218,8 @@ Body: []*syntax.CaseClause{}, } + var emulation syntax.Stmt + for _, k := range r.sizes { fnName := fmt.Sprintf("%s@simd%d", d.Name.Value, k) fnIdent := syntax.NewName(d.Pos(), fnName) @@ -224,22 +229,57 @@ ArgList: args(), }) - var branchStmt syntax.Stmt + // callReturnStmt is either `return call(...)` or `call(...); return` + var callReturnStmt syntax.Stmt if d.Type.ResultList != nil && len(d.Type.ResultList) > 0 { - branchStmt = &syntax.ReturnStmt{Results: callExpr} + callReturnStmt = &syntax.ReturnStmt{Results: callExpr} } else { - branchStmt = &syntax.BlockStmt{ + callReturnStmt = &syntax.BlockStmt{ List: []syntax.Stmt{ ps(&syntax.ExprStmt{X: callExpr}), ps(&syntax.ReturnStmt{}), }, + Rbrace: d.Pos(), } } - branchStmt.SetPos(d.Pos()) + callReturnStmt.SetPos(d.Pos()) + + if k == 0 { + // emulation == `if simd.Emulated() { callReturnStmt }` + // save it for the first part of the 128 case. 
+ cond := pe(&syntax.CallExpr{ + Fun: pe(&syntax.SelectorExpr{ + X: syntax.NewName(d.Pos(), simdPkg), // Assume this is resolvable + Sel: syntax.NewName(d.Pos(), emulatedFn), + })}) + + blockStmt, ok := callReturnStmt.(*syntax.BlockStmt) + if !ok { + blockStmt = &syntax.BlockStmt{ + List: []syntax.Stmt{callReturnStmt}, + Rbrace: d.Pos(), + } + blockStmt.SetPos(d.Pos()) + } + + emulation = ps(&syntax.IfStmt{ + Cond: cond, + Then: blockStmt, + }) + continue + } + + var caseBody []syntax.Stmt + // assume that 128 is a case; when we do scalable simd, this may change. + // For now, if there is emulation, it is 128-bit (only). + if emulation != nil && k == 128 { + caseBody = append(caseBody, emulation) + emulation = nil + } caseClause := &syntax.CaseClause{ Cases: pe(&syntax.BasicLit{Kind: syntax.IntLit, Value: fmt.Sprintf("%d", k)}), - Body: []syntax.Stmt{branchStmt}, + Body: append(caseBody, callReturnStmt), } caseClause.SetPos(d.Pos()) switchStmt.Body = append(switchStmt.Body, caseClause)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules index 5de91a3..b7dc48e 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
@@ -17,6 +17,7 @@ (VMOVDins0 [1] dst (VDUPDextr [0] (VXTN2D y))) => (VXTN2_2D dst y) (VMOVDins0 [1] dst (VDUPDextr [0] (VXTN4S y))) => (VXTN2_4S dst y) (VMOVDins0 [1] dst (VDUPDextr [0] (VXTN8H y))) => (VXTN2_8H dst y) +(VPMULL2D (VDUPDextr [1] x) (VDUPDextr [1] y)) => (VPMULL2_2D x y) (VSMULL16B (VDUPDextr [1] x) (VDUPDextr [1] y)) => (VSMULL2_16B x y) (VSMULL4S (VDUPDextr [1] x) (VDUPDextr [1] y)) => (VSMULL2_4S x y) (VSMULL8H (VDUPDextr [1] x) (VDUPDextr [1] y)) => (VSMULL2_8H x y) @@ -232,12 +233,12 @@ (MulAddUint8x16 x y z) => (VMLA16B z x y) // earlyMatchRule (MulAddUint16x8 x y z) => (VMLA8H z x y) // earlyMatchRule (MulAddUint32x4 x y z) => (VMLA4S z x y) // earlyMatchRule -(MulLoLongInt8x16 ...) => (VSMULL16B ...) // pureVreg -(MulLoLongInt16x8 ...) => (VSMULL8H ...) // pureVreg -(MulLoLongInt32x4 ...) => (VSMULL4S ...) // pureVreg -(MulLoLongUint8x16 ...) => (VUMULL16B ...) // pureVreg -(MulLoLongUint16x8 ...) => (VUMULL8H ...) // pureVreg -(MulLoLongUint32x4 ...) => (VUMULL4S ...) // pureVreg +(MulWidenLoInt8x16 ...) => (VSMULL16B ...) // pureVreg +(MulWidenLoInt16x8 ...) => (VSMULL8H ...) // pureVreg +(MulWidenLoInt32x4 ...) => (VSMULL4S ...) // pureVreg +(MulWidenLoUint8x16 ...) => (VUMULL16B ...) // pureVreg +(MulWidenLoUint16x8 ...) => (VUMULL8H ...) // pureVreg +(MulWidenLoUint32x4 ...) => (VUMULL4S ...) // pureVreg (NegFloat32x4 ...) => (VFNEG4S ...) // pureVreg (NegFloat64x2 ...) => (VFNEG2D ...) // pureVreg (NegInt8x16 ...) => (VNEG16B ...) // pureVreg @@ -349,12 +350,6 @@ (ShiftLeftConstUint16x8 ...) => (VSHL8H ...) // pureVreg (ShiftLeftConstUint32x4 ...) => (VSHL4S ...) // pureVreg (ShiftLeftConstUint64x2 ...) => (VSHL2D ...) // pureVreg -(ShiftLeftLoLongConstInt8x16 ...) => (VSSHLL16B ...) // pureVreg -(ShiftLeftLoLongConstInt16x8 ...) => (VSSHLL8H ...) // pureVreg -(ShiftLeftLoLongConstInt32x4 ...) => (VSSHLL4S ...) // pureVreg -(ShiftLeftLoLongConstUint8x16 ...) => (VUSHLL16B ...) // pureVreg -(ShiftLeftLoLongConstUint16x8 ...) 
=> (VUSHLL8H ...) // pureVreg -(ShiftLeftLoLongConstUint32x4 ...) => (VUSHLL4S ...) // pureVreg (ShiftLeftSaturatedConstInt8x16 ...) => (VSQSHL16Bconst ...) // pureVreg (VSQSHL16Bconst [a] x) && a==0 => x // asmRule (ShiftLeftSaturatedConstInt16x8 ...) => (VSQSHL8Hconst ...) // pureVreg @@ -371,6 +366,12 @@ (VUQSHL4Sconst [a] x) && a==0 => x // asmRule (ShiftLeftSaturatedConstUint64x2 ...) => (VUQSHL2Dconst ...) // pureVreg (VUQSHL2Dconst [a] x) && a==0 => x // asmRule +(ShiftLeftWidenLoConstInt8x16 ...) => (VSSHLL16B ...) // pureVreg +(ShiftLeftWidenLoConstInt16x8 ...) => (VSSHLL8H ...) // pureVreg +(ShiftLeftWidenLoConstInt32x4 ...) => (VSSHLL4S ...) // pureVreg +(ShiftLeftWidenLoConstUint8x16 ...) => (VUSHLL16B ...) // pureVreg +(ShiftLeftWidenLoConstUint16x8 ...) => (VUSHLL8H ...) // pureVreg +(ShiftLeftWidenLoConstUint32x4 ...) => (VUSHLL4S ...) // pureVreg (ShiftRightConstInt8x16 ...) => (VSSHR16B ...) // pureVreg (VSSHR16B [a] x) && a==0 => x // asmRule (ShiftRightConstInt16x8 ...) => (VSSHR8H ...) // pureVreg @@ -468,3 +469,4 @@ (broadcast1To16Int8x16 x) => (VDUPBbcast [0] x) // pureVreg (VDUPBbcast [i] (VMOVBins [j] _ (MOVDconst [c]))) && i == j && c>=-128 && c<=255 => (VMOVI16B [uint8(c)]) // argsMatchRule (broadcast1To16Uint8x16 x) => (VDUPBbcast [0] x) // pureVreg +(carrylessMultiplyWidenLoUint64x2 ...) => (VPMULL2D ...) // pureVreg
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go index d3c03ec..3d4907a 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
@@ -109,6 +109,8 @@ {name: "VNOT16B", argLength: 1, reg: v11, asm: "VNOT", typ: "Vec128"}, {name: "VORN16B", argLength: 2, reg: v21, asm: "VORN", typ: "Vec128"}, {name: "VORR16B", argLength: 2, reg: v21, asm: "VORR", commutative: true, typ: "Vec128"}, + {name: "VPMULL2D", argLength: 2, reg: v21, asm: "VPMULL", commutative: true, typ: "Vec128"}, + {name: "VPMULL2_2D", argLength: 2, reg: v21, asm: "VPMULL2", commutative: true, typ: "Vec128"}, {name: "VSCVTF2D", argLength: 1, reg: v11, asm: "VSCVTF", typ: "Vec128"}, {name: "VSCVTF4S", argLength: 1, reg: v11, asm: "VSCVTF", typ: "Vec128"}, {name: "VSMAX4S", argLength: 2, reg: v21, asm: "VSMAX", commutative: true, typ: "Vec128"},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index d42b19e..e06c3bea 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -736,12 +736,6 @@ {name: "MulInt64x2", argLength: 2, commutative: true}, // ARCH:amd64,wasm {name: "MulInt64x4", argLength: 2, commutative: true}, // ARCH:amd64 {name: "MulInt64x8", argLength: 2, commutative: true}, // ARCH:amd64 - {name: "MulLoLongInt8x16", argLength: 2, commutative: true}, // ARCH:arm64 - {name: "MulLoLongInt16x8", argLength: 2, commutative: true}, // ARCH:arm64 - {name: "MulLoLongInt32x4", argLength: 2, commutative: true}, // ARCH:arm64 - {name: "MulLoLongUint8x16", argLength: 2, commutative: true}, // ARCH:arm64 - {name: "MulLoLongUint16x8", argLength: 2, commutative: true}, // ARCH:arm64 - {name: "MulLoLongUint32x4", argLength: 2, commutative: true}, // ARCH:arm64 {name: "MulSignInt8x16", argLength: 2}, // ARCH:amd64 {name: "MulSignInt8x32", argLength: 2}, // ARCH:amd64 {name: "MulSignInt16x8", argLength: 2}, // ARCH:amd64 @@ -764,12 +758,12 @@ {name: "MulWidenHiUint8x16", argLength: 2, commutative: true}, // ARCH:wasm {name: "MulWidenHiUint16x8", argLength: 2, commutative: true}, // ARCH:wasm {name: "MulWidenHiUint32x4", argLength: 2, commutative: true}, // ARCH:wasm - {name: "MulWidenLoInt8x16", argLength: 2, commutative: true}, // ARCH:wasm - {name: "MulWidenLoInt16x8", argLength: 2, commutative: true}, // ARCH:wasm - {name: "MulWidenLoInt32x4", argLength: 2, commutative: true}, // ARCH:wasm - {name: "MulWidenLoUint8x16", argLength: 2, commutative: true}, // ARCH:wasm - {name: "MulWidenLoUint16x8", argLength: 2, commutative: true}, // ARCH:wasm - {name: "MulWidenLoUint32x4", argLength: 2, commutative: true}, // ARCH:wasm + {name: "MulWidenLoInt8x16", argLength: 2, commutative: true}, // ARCH:arm64,wasm + {name: "MulWidenLoInt16x8", argLength: 2, commutative: true}, // ARCH:arm64,wasm + {name: "MulWidenLoInt32x4", argLength: 2, commutative: true}, // ARCH:arm64,wasm + {name: "MulWidenLoUint8x16", argLength: 2, commutative: true}, // ARCH:arm64,wasm + {name: "MulWidenLoUint16x8", argLength: 2, commutative: true}, // ARCH:arm64,wasm + 
{name: "MulWidenLoUint32x4", argLength: 2, commutative: true}, // ARCH:arm64,wasm {name: "NegFloat32x4", argLength: 1}, // ARCH:arm64,wasm {name: "NegFloat64x2", argLength: 1}, // ARCH:arm64,wasm {name: "NegInt8x16", argLength: 1}, // ARCH:arm64,wasm @@ -1396,6 +1390,7 @@ {name: "broadcast1To64MaskedInt8x16", argLength: 2}, // ARCH:amd64 {name: "broadcast1To64MaskedUint8x16", argLength: 2}, // ARCH:amd64 {name: "broadcast1To64Uint8x16", argLength: 1}, // ARCH:amd64 + {name: "carrylessMultiplyWidenLoUint64x2", argLength: 2, commutative: true}, // ARCH:arm64 {name: "AESRoundKeyGenAssistUint32x4", argLength: 1, aux: "UInt8"}, // ARCH:amd64 {name: "CeilScaledFloat32x4", argLength: 1, aux: "UInt8"}, // ARCH:amd64 {name: "CeilScaledFloat32x8", argLength: 1, aux: "UInt8"}, // ARCH:amd64 @@ -1517,12 +1512,6 @@ {name: "ShiftLeftConstUint16x8", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: "ShiftLeftConstUint32x4", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: "ShiftLeftConstUint64x2", argLength: 1, aux: "UInt8"}, // ARCH:arm64 - {name: "ShiftLeftLoLongConstInt8x16", argLength: 1, aux: "UInt8"}, // ARCH:arm64 - {name: "ShiftLeftLoLongConstInt16x8", argLength: 1, aux: "UInt8"}, // ARCH:arm64 - {name: "ShiftLeftLoLongConstInt32x4", argLength: 1, aux: "UInt8"}, // ARCH:arm64 - {name: "ShiftLeftLoLongConstUint8x16", argLength: 1, aux: "UInt8"}, // ARCH:arm64 - {name: "ShiftLeftLoLongConstUint16x8", argLength: 1, aux: "UInt8"}, // ARCH:arm64 - {name: "ShiftLeftLoLongConstUint32x4", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: "ShiftLeftSaturatedConstInt8x16", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: "ShiftLeftSaturatedConstInt16x8", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: "ShiftLeftSaturatedConstInt32x4", argLength: 1, aux: "UInt8"}, // ARCH:arm64 @@ -1531,6 +1520,12 @@ {name: "ShiftLeftSaturatedConstUint16x8", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: "ShiftLeftSaturatedConstUint32x4", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: 
"ShiftLeftSaturatedConstUint64x2", argLength: 1, aux: "UInt8"}, // ARCH:arm64 + {name: "ShiftLeftWidenLoConstInt8x16", argLength: 1, aux: "UInt8"}, // ARCH:arm64 + {name: "ShiftLeftWidenLoConstInt16x8", argLength: 1, aux: "UInt8"}, // ARCH:arm64 + {name: "ShiftLeftWidenLoConstInt32x4", argLength: 1, aux: "UInt8"}, // ARCH:arm64 + {name: "ShiftLeftWidenLoConstUint8x16", argLength: 1, aux: "UInt8"}, // ARCH:arm64 + {name: "ShiftLeftWidenLoConstUint16x8", argLength: 1, aux: "UInt8"}, // ARCH:arm64 + {name: "ShiftLeftWidenLoConstUint32x4", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: "ShiftRightConstInt8x16", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: "ShiftRightConstInt16x8", argLength: 1, aux: "UInt8"}, // ARCH:arm64 {name: "ShiftRightConstInt32x4", argLength: 1, aux: "UInt8"}, // ARCH:arm64
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index b2705c9..c113437 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -5106,6 +5106,8 @@ OpARM64VNOT16B OpARM64VORN16B OpARM64VORR16B + OpARM64VPMULL2D + OpARM64VPMULL2_2D OpARM64VSCVTF2D OpARM64VSCVTF4S OpARM64VSMAX4S @@ -7933,12 +7935,6 @@ OpMulInt64x2 OpMulInt64x4 OpMulInt64x8 - OpMulLoLongInt8x16 - OpMulLoLongInt16x8 - OpMulLoLongInt32x4 - OpMulLoLongUint8x16 - OpMulLoLongUint16x8 - OpMulLoLongUint32x4 OpMulSignInt8x16 OpMulSignInt8x32 OpMulSignInt16x8 @@ -8593,6 +8589,7 @@ Opbroadcast1To64MaskedInt8x16 Opbroadcast1To64MaskedUint8x16 Opbroadcast1To64Uint8x16 + OpcarrylessMultiplyWidenLoUint64x2 OpAESRoundKeyGenAssistUint32x4 OpCeilScaledFloat32x4 OpCeilScaledFloat32x8 @@ -8714,12 +8711,6 @@ OpShiftLeftConstUint16x8 OpShiftLeftConstUint32x4 OpShiftLeftConstUint64x2 - OpShiftLeftLoLongConstInt8x16 - OpShiftLeftLoLongConstInt16x8 - OpShiftLeftLoLongConstInt32x4 - OpShiftLeftLoLongConstUint8x16 - OpShiftLeftLoLongConstUint16x8 - OpShiftLeftLoLongConstUint32x4 OpShiftLeftSaturatedConstInt8x16 OpShiftLeftSaturatedConstInt16x8 OpShiftLeftSaturatedConstInt32x4 @@ -8728,6 +8719,12 @@ OpShiftLeftSaturatedConstUint16x8 OpShiftLeftSaturatedConstUint32x4 OpShiftLeftSaturatedConstUint64x2 + OpShiftLeftWidenLoConstInt8x16 + OpShiftLeftWidenLoConstInt16x8 + OpShiftLeftWidenLoConstInt32x4 + OpShiftLeftWidenLoConstUint8x16 + OpShiftLeftWidenLoConstUint16x8 + OpShiftLeftWidenLoConstUint32x4 OpShiftRightConstInt8x16 OpShiftRightConstInt16x8 OpShiftRightConstInt32x4 @@ -81330,6 +81327,36 @@ }, }, { + name: "VPMULL2D", + argLen: 2, + commutative: true, + asm: arm64.AVPMULL, + reg: regInfo{ + inputs: []inputInfo{ + {0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 
F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "VPMULL2_2D", + argLen: 2, + commutative: true, + asm: arm64.AVPMULL2, + reg: regInfo{ + inputs: []inputInfo{ + {0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { name: "VSCVTF2D", argLen: 1, asm: arm64.AVSCVTF, @@ -111007,42 +111034,6 @@ generic: true, }, { - name: "MulLoLongInt8x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "MulLoLongInt16x8", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "MulLoLongInt32x4", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "MulLoLongUint8x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "MulLoLongUint16x8", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "MulLoLongUint32x4", - argLen: 2, - commutative: true, - generic: true, - }, - { name: "MulSignInt8x16", argLen: 2, generic: true, @@ -114405,6 +114396,12 @@ generic: true, }, { + name: "carrylessMultiplyWidenLoUint64x2", + argLen: 2, + commutative: true, + generic: true, + }, + { name: "AESRoundKeyGenAssistUint32x4", auxType: auxUInt8, argLen: 1, @@ -115131,42 +115128,6 @@ generic: true, }, { - name: "ShiftLeftLoLongConstInt8x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "ShiftLeftLoLongConstInt16x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "ShiftLeftLoLongConstInt32x4", - auxType: auxUInt8, - argLen: 
1, - generic: true, - }, - { - name: "ShiftLeftLoLongConstUint8x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "ShiftLeftLoLongConstUint16x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "ShiftLeftLoLongConstUint32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { name: "ShiftLeftSaturatedConstInt8x16", auxType: auxUInt8, argLen: 1, @@ -115215,6 +115176,42 @@ generic: true, }, { + name: "ShiftLeftWidenLoConstInt8x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "ShiftLeftWidenLoConstInt16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "ShiftLeftWidenLoConstInt32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "ShiftLeftWidenLoConstUint8x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "ShiftLeftWidenLoConstUint16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "ShiftLeftWidenLoConstUint32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { name: "ShiftRightConstInt8x16", auxType: auxUInt8, argLen: 1,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index 5564c5b..428d313 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -412,6 +412,8 @@ return rewriteValueARM64_OpARM64VMOVDins0(v) case OpARM64VMOVSins0: return rewriteValueARM64_OpARM64VMOVSins0(v) + case OpARM64VPMULL2D: + return rewriteValueARM64_OpARM64VPMULL2D(v) case OpARM64VSHL16B: return rewriteValueARM64_OpARM64VSHL16B(v) case OpARM64VSHL2D: @@ -1579,24 +1581,6 @@ case OpMulInt8x16: v.Op = OpARM64VMUL16B return true - case OpMulLoLongInt16x8: - v.Op = OpARM64VSMULL8H - return true - case OpMulLoLongInt32x4: - v.Op = OpARM64VSMULL4S - return true - case OpMulLoLongInt8x16: - v.Op = OpARM64VSMULL16B - return true - case OpMulLoLongUint16x8: - v.Op = OpARM64VUMULL8H - return true - case OpMulLoLongUint32x4: - v.Op = OpARM64VUMULL4S - return true - case OpMulLoLongUint8x16: - v.Op = OpARM64VUMULL16B - return true case OpMulUint16x8: v.Op = OpARM64VMUL8H return true @@ -1606,6 +1590,24 @@ case OpMulUint8x16: v.Op = OpARM64VMUL16B return true + case OpMulWidenLoInt16x8: + v.Op = OpARM64VSMULL8H + return true + case OpMulWidenLoInt32x4: + v.Op = OpARM64VSMULL4S + return true + case OpMulWidenLoInt8x16: + v.Op = OpARM64VSMULL16B + return true + case OpMulWidenLoUint16x8: + v.Op = OpARM64VUMULL8H + return true + case OpMulWidenLoUint32x4: + v.Op = OpARM64VUMULL4S + return true + case OpMulWidenLoUint8x16: + v.Op = OpARM64VUMULL16B + return true case OpNeg16: v.Op = OpARM64NEG return true @@ -2055,24 +2057,6 @@ case OpShiftLeftConstUint8x16: v.Op = OpARM64VSHL16B return true - case OpShiftLeftLoLongConstInt16x8: - v.Op = OpARM64VSSHLL8H - return true - case OpShiftLeftLoLongConstInt32x4: - v.Op = OpARM64VSSHLL4S - return true - case OpShiftLeftLoLongConstInt8x16: - v.Op = OpARM64VSSHLL16B - return true - case OpShiftLeftLoLongConstUint16x8: - v.Op = OpARM64VUSHLL8H - return true - case OpShiftLeftLoLongConstUint32x4: - v.Op = OpARM64VUSHLL4S - return true - case OpShiftLeftLoLongConstUint8x16: - v.Op = OpARM64VUSHLL16B - return true case OpShiftLeftSaturatedConstInt16x8: v.Op = OpARM64VSQSHL8Hconst return true @@ -2097,6 +2081,24 
@@ case OpShiftLeftSaturatedConstUint8x16: v.Op = OpARM64VUQSHL16Bconst return true + case OpShiftLeftWidenLoConstInt16x8: + v.Op = OpARM64VSSHLL8H + return true + case OpShiftLeftWidenLoConstInt32x4: + v.Op = OpARM64VSSHLL4S + return true + case OpShiftLeftWidenLoConstInt8x16: + v.Op = OpARM64VSSHLL16B + return true + case OpShiftLeftWidenLoConstUint16x8: + v.Op = OpARM64VUSHLL8H + return true + case OpShiftLeftWidenLoConstUint32x4: + v.Op = OpARM64VUSHLL4S + return true + case OpShiftLeftWidenLoConstUint8x16: + v.Op = OpARM64VUSHLL16B + return true case OpShiftRightConstInt16x8: v.Op = OpARM64VSSHR8H return true @@ -2464,6 +2466,9 @@ return rewriteValueARM64_Opbroadcast1To8Int16x8(v) case Opbroadcast1To8Uint16x8: return rewriteValueARM64_Opbroadcast1To8Uint16x8(v) + case OpcarrylessMultiplyWidenLoUint64x2: + v.Op = OpARM64VPMULL2D + return true } return false } @@ -18509,6 +18514,29 @@ } return false } +func rewriteValueARM64_OpARM64VPMULL2D(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPMULL2D (VDUPDextr [1] x) (VDUPDextr [1] y)) + // result: (VPMULL2_2D x y) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpARM64VDUPDextr || auxIntToUint8(v_0.AuxInt) != 1 { + continue + } + x := v_0.Args[0] + if v_1.Op != OpARM64VDUPDextr || auxIntToUint8(v_1.AuxInt) != 1 { + continue + } + y := v_1.Args[0] + v.reset(OpARM64VPMULL2_2D) + v.AddArg2(x, y) + return true + } + break + } + return false +} func rewriteValueARM64_OpARM64VSHL16B(v *Value) bool { v_0 := v.Args[0] // match: (VSHL16B [a] x)
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go index 81857d7..c05c5d7 100644 --- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -103,9 +103,9 @@ addF(simdPackage, "Uint32x4.And", opLen2(ssa.OpAndUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.And", opLen2(ssa.OpAndUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x16.And", opLen2(ssa.OpAndUint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x2.And", opLen2(ssa.OpAndUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.And", opLen2(ssa.OpAndUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.And", opLen2(ssa.OpAndUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.And", opLen2(ssa.OpAndUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.AndNot", opLen2_21(ssa.OpAndNotInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.AndNot", opLen2_21(ssa.OpAndNotInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.AndNot", opLen2_21(ssa.OpAndNotInt8x64, types.TypeVec512), sys.AMD64) @@ -127,9 +127,9 @@ addF(simdPackage, "Uint32x4.AndNot", opLen2_21(ssa.OpAndNotUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.AndNot", opLen2_21(ssa.OpAndNotUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x16.AndNot", opLen2_21(ssa.OpAndNotUint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x2.AndNot", opLen2_21(ssa.OpAndNotUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.AndNot", opLen2_21(ssa.OpAndNotUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.AndNot", opLen2_21(ssa.OpAndNotUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.AndNot", opLen2_21(ssa.OpAndNotUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Average", opLen2(ssa.OpAverageUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.Average", opLen2(ssa.OpAverageUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x64.Average", opLen2(ssa.OpAverageUint8x64, types.TypeVec512), sys.AMD64) @@ -774,9 +774,9 @@ addF(simdPackage, 
"Uint32x4.Or", opLen2(ssa.OpOrUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.Or", opLen2(ssa.OpOrUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x16.Or", opLen2(ssa.OpOrUint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64) @@ -1232,9 +1232,9 @@ addF(simdPackage, "Uint32x4.Xor", opLen2(ssa.OpXorUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.Xor", opLen2(ssa.OpXorUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x16.Xor", opLen2(ssa.OpXorUint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x2.Xor", opLen2(ssa.OpXorUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Xor", opLen2(ssa.OpXorUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Xor", opLen2(ssa.OpXorUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.Xor", opLen2(ssa.OpXorUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.blend", opLen3(ssa.OpblendInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.blend", opLen3(ssa.OpblendInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.blendMasked", opLen3(ssa.OpblendMaskedInt8x64, types.TypeVec512), sys.AMD64)
diff --git a/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go b/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go index 31dddb2..9a29e74 100644 --- a/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
@@ -207,12 +207,12 @@ addF(simdPackage, "Uint8x16.MulAdd", opLen3(ssa.OpMulAddUint8x16, types.TypeVec128), sys.ARM64) addF(simdPackage, "Uint16x8.MulAdd", opLen3(ssa.OpMulAddUint16x8, types.TypeVec128), sys.ARM64) addF(simdPackage, "Uint32x4.MulAdd", opLen3(ssa.OpMulAddUint32x4, types.TypeVec128), sys.ARM64) - addF(simdPackage, "Int8x16.MulLoLong", opLen2(ssa.OpMulLoLongInt8x16, types.TypeVec128), sys.ARM64) - addF(simdPackage, "Int16x8.MulLoLong", opLen2(ssa.OpMulLoLongInt16x8, types.TypeVec128), sys.ARM64) - addF(simdPackage, "Int32x4.MulLoLong", opLen2(ssa.OpMulLoLongInt32x4, types.TypeVec128), sys.ARM64) - addF(simdPackage, "Uint8x16.MulLoLong", opLen2(ssa.OpMulLoLongUint8x16, types.TypeVec128), sys.ARM64) - addF(simdPackage, "Uint16x8.MulLoLong", opLen2(ssa.OpMulLoLongUint16x8, types.TypeVec128), sys.ARM64) - addF(simdPackage, "Uint32x4.MulLoLong", opLen2(ssa.OpMulLoLongUint32x4, types.TypeVec128), sys.ARM64) + addF(simdPackage, "Int8x16.MulWidenLo", opLen2(ssa.OpMulWidenLoInt8x16, types.TypeVec128), sys.ARM64) + addF(simdPackage, "Int16x8.MulWidenLo", opLen2(ssa.OpMulWidenLoInt16x8, types.TypeVec128), sys.ARM64) + addF(simdPackage, "Int32x4.MulWidenLo", opLen2(ssa.OpMulWidenLoInt32x4, types.TypeVec128), sys.ARM64) + addF(simdPackage, "Uint8x16.MulWidenLo", opLen2(ssa.OpMulWidenLoUint8x16, types.TypeVec128), sys.ARM64) + addF(simdPackage, "Uint16x8.MulWidenLo", opLen2(ssa.OpMulWidenLoUint16x8, types.TypeVec128), sys.ARM64) + addF(simdPackage, "Uint32x4.MulWidenLo", opLen2(ssa.OpMulWidenLoUint32x4, types.TypeVec128), sys.ARM64) addF(simdPackage, "Float32x4.Neg", opLen1(ssa.OpNegFloat32x4, types.TypeVec128), sys.ARM64) addF(simdPackage, "Float64x2.Neg", opLen1(ssa.OpNegFloat64x2, types.TypeVec128), sys.ARM64) addF(simdPackage, "Int8x16.Neg", opLen1(ssa.OpNegInt8x16, types.TypeVec128), sys.ARM64) @@ -318,12 +318,6 @@ addF(simdPackage, "Uint16x8.ShiftLeftConst", opLen1Imm(ssa.OpShiftLeftConstUint16x8, types.TypeVec128, 0, 15), sys.ARM64) addF(simdPackage, 
"Uint32x4.ShiftLeftConst", opLen1Imm(ssa.OpShiftLeftConstUint32x4, types.TypeVec128, 0, 31), sys.ARM64) addF(simdPackage, "Uint64x2.ShiftLeftConst", opLen1Imm(ssa.OpShiftLeftConstUint64x2, types.TypeVec128, 0, 63), sys.ARM64) - addF(simdPackage, "Int8x16.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstInt8x16, types.TypeVec128, 0, 7), sys.ARM64) - addF(simdPackage, "Int16x8.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstInt16x8, types.TypeVec128, 0, 15), sys.ARM64) - addF(simdPackage, "Int32x4.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstInt32x4, types.TypeVec128, 0, 31), sys.ARM64) - addF(simdPackage, "Uint8x16.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstUint8x16, types.TypeVec128, 0, 7), sys.ARM64) - addF(simdPackage, "Uint16x8.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstUint16x8, types.TypeVec128, 0, 15), sys.ARM64) - addF(simdPackage, "Uint32x4.ShiftLeftLoLongConst", opLen1Imm(ssa.OpShiftLeftLoLongConstUint32x4, types.TypeVec128, 0, 31), sys.ARM64) addF(simdPackage, "Int8x16.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstInt8x16, types.TypeVec128, 0, 7), sys.ARM64) addF(simdPackage, "Int16x8.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstInt16x8, types.TypeVec128, 0, 15), sys.ARM64) addF(simdPackage, "Int32x4.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstInt32x4, types.TypeVec128, 0, 31), sys.ARM64) @@ -332,6 +326,12 @@ addF(simdPackage, "Uint16x8.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstUint16x8, types.TypeVec128, 0, 15), sys.ARM64) addF(simdPackage, "Uint32x4.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstUint32x4, types.TypeVec128, 0, 31), sys.ARM64) addF(simdPackage, "Uint64x2.ShiftLeftSaturatedConst", opLen1Imm(ssa.OpShiftLeftSaturatedConstUint64x2, types.TypeVec128, 0, 63), sys.ARM64) + addF(simdPackage, "Int8x16.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstInt8x16, 
types.TypeVec128, 0, 7), sys.ARM64) + addF(simdPackage, "Int16x8.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstInt16x8, types.TypeVec128, 0, 15), sys.ARM64) + addF(simdPackage, "Int32x4.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstInt32x4, types.TypeVec128, 0, 31), sys.ARM64) + addF(simdPackage, "Uint8x16.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstUint8x16, types.TypeVec128, 0, 7), sys.ARM64) + addF(simdPackage, "Uint16x8.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstUint16x8, types.TypeVec128, 0, 15), sys.ARM64) + addF(simdPackage, "Uint32x4.ShiftLeftWidenLoConst", opLen1Imm(ssa.OpShiftLeftWidenLoConstUint32x4, types.TypeVec128, 0, 31), sys.ARM64) addF(simdPackage, "Int8x16.ShiftRightConst", opLen1Imm(ssa.OpShiftRightConstInt8x16, types.TypeVec128, 0, 7), sys.ARM64) addF(simdPackage, "Int16x8.ShiftRightConst", opLen1Imm(ssa.OpShiftRightConstInt16x8, types.TypeVec128, 0, 15), sys.ARM64) addF(simdPackage, "Int32x4.ShiftRightConst", opLen1Imm(ssa.OpShiftRightConstInt32x4, types.TypeVec128, 0, 31), sys.ARM64) @@ -415,6 +415,7 @@ addF(simdPackage, "Uint16x8.broadcast1To8", opLen1(ssa.Opbroadcast1To8Uint16x8, types.TypeVec128), sys.ARM64) addF(simdPackage, "Int8x16.broadcast1To16", opLen1(ssa.Opbroadcast1To16Int8x16, types.TypeVec128), sys.ARM64) addF(simdPackage, "Uint8x16.broadcast1To16", opLen1(ssa.Opbroadcast1To16Uint8x16, types.TypeVec128), sys.ARM64) + addF(simdPackage, "Uint64x2.carrylessMultiplyWidenLo", opLen2(ssa.OpcarrylessMultiplyWidenLoUint64x2, types.TypeVec128), sys.ARM64) addF(simdPackage, "Float32x4.AsFloat64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64) addF(simdPackage, "Float32x4.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64) addF(simdPackage, "Float32x4.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go index 7e0d2d3..5942544 100644 --- a/src/internal/cpu/cpu.go +++ b/src/internal/cpu/cpu.go
@@ -57,6 +57,7 @@ HasSSE41 bool HasSSE42 bool HasVAES bool + HasVPCLMULQDQ bool _ CacheLinePad }
diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go index 3c0a0ad..515b2c7 100644 --- a/src/internal/cpu/cpu_x86.go +++ b/src/internal/cpu/cpu_x86.go
@@ -57,6 +57,7 @@ cpuid_AVX512_VBMI = 1 << 1 cpuid_AVX512_VBMI2 = 1 << 6 cpuid_GFNI = 1 << 8 + cpuid_VPCLMULQDQ = 1 << 10 // applies not just to AVX512 cpuid_AVX512VPCLMULQDQ = 1 << 10 cpuid_AVX512_BITALG = 1 << 12 @@ -174,6 +175,7 @@ X86.HasADX = isSet(ebx7, cpuid_ADX) X86.HasSHA = isSet(ebx7, cpuid_SHA) X86.HasVAES = isSet(ecx7, cpuid_VAES) && X86.HasAVX + X86.HasVPCLMULQDQ = isSet(ecx7, cpuid_VPCLMULQDQ) X86.HasAVX512F = isSet(ebx7, cpuid_AVX512F) && osSupportsAVX512 if X86.HasAVX512F {
diff --git a/src/simd/archsimd/_gen/midway/comments.yaml b/src/simd/archsimd/_gen/midway/comments.yaml index c40440d..a300664 100644 --- a/src/simd/archsimd/_gen/midway/comments.yaml +++ b/src/simd/archsimd/_gen/midway/comments.yaml
@@ -64,6 +64,38 @@ ToInt32s: "ToInt32s converts the mask to an Int32s vector." ToInt64s: "ToInt64s converts the mask to an Int64s vector." + CarrylessMultiplyEven: |- + CarrylessMultiplyEven computes the carryless + // multiplications of selected even indexed elements of x and y. + // Each product is 128 bits wide and fills the corresponding + // even-odd pairs in the result. + // + // A carryless multiplication uses bitwise XOR instead of + // add-with-carry, for example (in base two): + // + // 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 + // + // This also models multiplication of polynomials with coefficients + // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = + // x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds + // polynomial terms, but coefficients "add" with XOR.) + + CarrylessMultiplyOdd: |- + CarrylessMultiplyOdd computes the carryless + // multiplications of selected odd indexed elements of x and y. + // Each product is 128 bits wide and fills the corresponding + // even-odd pairs in the result. + // + // A carryless multiplication uses bitwise XOR instead of + // add-with-carry, for example (in base two): + // + // 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 + // + // This also models multiplication of polynomials with coefficients + // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = + // x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds + // polynomial terms, but coefficients "add" with XOR.) + + types: _simd: "internal SIMD marker." Int8s: "Int8s represents a vector of 8-bit signed integers."
diff --git a/src/simd/archsimd/_gen/midway/intersect_simd_ops.go b/src/simd/archsimd/_gen/midway/intersect_simd_ops.go index 90434db..67aa790 100644 --- a/src/simd/archsimd/_gen/midway/intersect_simd_ops.go +++ b/src/simd/archsimd/_gen/midway/intersect_simd_ops.go
@@ -103,9 +103,16 @@ archSimdPath := *goRoot + "/src/simd/archsimd" // Hardcoded list of files - amd64Files := []string{"ops_amd64.go", "compare_gen_amd64.go", "types_amd64.go", "other_gen_amd64.go", "extra_amd64.go", "maskmerge_gen_amd64.go", "shuffles_amd64.go", "slice_gen_amd64.go", "slicepart_amd64.go", "slicepart_128.go", "string.go"} - wasmFiles := []string{"ops_wasm.go", "types_wasm.go", "slicepart_wasm.go", "string.go", "slicepart_128.go", "ops_emulated_wasm.go"} - neonFiles := []string{"compare_gen_arm64.go", "maskmerge_gen_arm64.go", "ops_arm64.go", "slicepart_128.go", "ops_internal_arm64.go", "other_gen_arm64.go", "slice_gen_arm64.go", "slicepart_arm64.go", "types_arm64.go"} + amd64Files := []string{"ops_amd64.go", "compare_gen_amd64.go", "types_amd64.go", + "other_gen_amd64.go", "extra_amd64.go", "maskmerge_gen_amd64.go", + "shuffles_amd64.go", "slice_gen_amd64.go", "slicepart_amd64.go", + "slicepart_128.go", "string.go", "ops_emulated_amd64.go"} + wasmFiles := []string{"ops_wasm.go", "types_wasm.go", "slicepart_wasm.go", + "string.go", "slicepart_128.go", "ops_emulated_wasm.go"} + neonFiles := []string{"clmul_arm64.go", "compare_gen_arm64.go", + "maskmerge_gen_arm64.go", "ops_arm64.go", "slicepart_128.go", + "ops_internal_arm64.go", "other_gen_arm64.go", "slice_gen_arm64.go", + "slicepart_arm64.go", "types_arm64.go"} emulatedFile := *goRoot + "/src/simd/simd_emulated.go"
diff --git a/src/simd/archsimd/_gen/sgutil/compare_natural.go b/src/simd/archsimd/_gen/sgutil/compare_natural.go index f8ca9fa..8d25e58 100644 --- a/src/simd/archsimd/_gen/sgutil/compare_natural.go +++ b/src/simd/archsimd/_gen/sgutil/compare_natural.go
@@ -50,6 +50,10 @@ if num1 > num2 { return 1 } + // "1" < "01". Don't expect it in simdgen, but just in case. + if ln1, ln2 := i-numStart1, j-numStart2; ln1 != ln2 { + return ln1 - ln2 + } // If numbers are equal, continue to the next segment. } else { // Non-digit comparison.
diff --git a/src/simd/archsimd/_gen/sgutil/sort_test.go b/src/simd/archsimd/_gen/sgutil/sort_test.go index 9f74296..c86baf2 100644 --- a/src/simd/archsimd/_gen/sgutil/sort_test.go +++ b/src/simd/archsimd/_gen/sgutil/sort_test.go
@@ -13,7 +13,7 @@ }{ {"a1", "a2", -1}, {"a11a", "a11b", -1}, - {"a01a1", "a1a01", -1}, + {"a01a1", "a1a01", 1}, {"a2", "a1", 1}, {"a10", "a2", 1}, {"a1", "a10", -1}, @@ -24,7 +24,7 @@ {"file1", "file1", 0}, {"file", "file1", -1}, {"file1", "file", 1}, - {"a01", "a1", -1}, + {"a01", "a1", 1}, {"a1a", "a1b", -1}, }
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/instruction.go b/src/simd/archsimd/_gen/simdgen/arm64/instruction.go index 0e62e51..c968b2c 100644 --- a/src/simd/archsimd/_gen/simdgen/arm64/instruction.go +++ b/src/simd/archsimd/_gen/simdgen/arm64/instruction.go
@@ -7,6 +7,7 @@ import ( "fmt" "regexp" + "sort" "strconv" "strings" @@ -348,26 +349,98 @@ return arrangements, DefaultArngs } + // Determine the arrangement shape and which symbol to extract from. + // For LongArngs and NarrowArngs, we need only the source-side symbol. + // For WideArngs, we need only the wide-side symbol. + ashape = instruction.ArngShape() + var targetSymbol string + if ashape == LongArngs || ashape == NarrowArngs { + symbols := instruction.arrangementSymbols() + if len(symbols) >= 2 { + targetSymbol = "<" + symbols[len(symbols)-1] + ">" + } + } else if ashape == WideArngs { + symbols := instruction.arrangementSymbols() + if len(symbols) >= 2 { + targetSymbol = "<" + symbols[0] + ">" + } + } + + nonTarget := map[string]bool{} for _, Explanation := range instruction.Explanations.Explanations { Definition := Explanation.Definition if Definition.Table.TGroup.TBody.Row != nil { + isTarget := targetSymbol == "" || targetSymbol == strings.TrimSpace(Explanation.Symbol.Value) for _, Row := range Definition.Table.TGroup.TBody.Row { for _, Entry := range Row.Entries { if Entry.Class == "symbol" { - arrangements = append(arrangements, strings.TrimSpace(Entry.Value)) + v := strings.TrimSpace(Entry.Value) + if isTarget { + arrangements = append(arrangements, v) + } else if eb, _, _ := parseArrangement(v); eb > 0 { + nonTarget[v] = false + } } } } } } + verifyNonTargetArrangements(instruction.Mnemonic(), ashape, arrangements, nonTarget) + fixedArrangements := instruction.extractFixedArrangements() arrangements = append(arrangements, fixedArrangements...) arrangements = removeDuplicates(arrangements) - ashape = instruction.ArngShape() return arrangements, ashape } +// verifyNonTargetArrangements checks that non-target symbol arrangements are the +// expected transformed versions of the target arrangements (half/double elemBits). 
+func verifyNonTargetArrangements(mnemonic string, ashape ArngShape, target []string, nonTarget map[string]bool) { + if ashape == DefaultArngs || len(nonTarget) == 0 { + return + } + // FCVTN has a FEAT_FP8 variant not covered by NarrowArngs. + // The other variants are covered. + switch mnemonic { + case "FCVTN", "FCVTXN": + return + } + for _, t := range target { + eb, _, _ := parseArrangement(t) + if eb == 0 { + continue + } + var expectedElemBits int + switch ashape { + case LongArngs: + expectedElemBits = eb * 2 + case NarrowArngs, WideArngs: + expectedElemBits = eb / 2 + } + if expectedElemBits == 0 { + continue + } + for nt := range nonTarget { + ntEb, _, _ := parseArrangement(nt) + if ntEb == expectedElemBits { + nonTarget[nt] = true + } + } + } + var unexplained []string + for nt, explained := range nonTarget { + if !explained { + unexplained = append(unexplained, nt) + } + } + if len(unexplained) > 0 { + sort.Strings(unexplained) + panic(fmt.Sprintf("%s: non-target arrangements not explained by target: %v\ntarget: %v", + mnemonic, unexplained, target)) + } +} + // regDiagramArngShape returns the expected arrangement shape based on RegDiagram for NEON. // Used for cross-check verification by ArngShape() only. func (instruction *Instruction) regDiagramArngShape() ArngShape {
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/instruction_test.go b/src/simd/archsimd/_gen/simdgen/arm64/instruction_test.go index 1b3b92a..0524e54 100644 --- a/src/simd/archsimd/_gen/simdgen/arm64/instruction_test.go +++ b/src/simd/archsimd/_gen/simdgen/arm64/instruction_test.go
@@ -121,6 +121,8 @@ integerUpTo8Bits = []string{"int8:16B", "int8:8B", "uint8:16B", "uint8:8B"} integerUpTo16Bits = append([]string{"int16:4H", "int16:8H", "uint16:4H", "uint16:8H"}, integerUpTo8Bits...) integerUpTo32Bits = append([]string{"int32:2S", "int32:4S", "uint32:2S", "uint32:4S"}, integerUpTo16Bits...) + integerWideOnly = []string{"int16:8H", "int32:4S", "int64:2D", "uint16:8H", "uint32:4S", "uint64:2D"} + polynomialArrngs = []string{"int8:8B", "int8:16B", "int64:1D", "int64:2D", "uint8:8B", "uint8:16B", "uint64:1D", "uint64:2D"} integer32And8Bits = append([]string{"int32:2S", "int32:4S", "uint32:2S", "uint32:4S"}, integerUpTo8Bits...) addvArngs = append([]string{"int32:4S", "uint32:4S"}, integerUpTo16Bits...) integer = append([]string{"int64:2D", "uint64:2D"}, integerUpTo32Bits...) @@ -153,7 +155,7 @@ {"^FADDP$", matchOps(binary), requireArngs(floating, DefaultArngs), emitsDefs(3)}, {"^FADDP$", matchOps(unary), requireArngs([]string{"float32:2S", "float64:2D"}, DefaultArngs), emitsDefs(2)}, {"^SABA$", matchOps(threeArgsResultInArg0), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)}, - {"^SABAL$", matchOps(threeArgsResultInArg0), requireArngs(integer, LongArngs), emitsDefs(14)}, + {"^SABAL$", matchOps(threeArgsResultInArg0), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)}, {"^F(ADD|SUB|DIV)$", requireOps(binary), requireArngs(floating, DefaultArngs), emitsDefs(3)}, {"^(AND|ORR|EOR|BIC|ORN)$", matchOps(binary), requireArngs(bitwise, DefaultArngs), emitsDefs(14)}, {"^NOT$", requireOps(unary), requireArngs(bitwise, DefaultArngs), emitsDefs(14)}, @@ -193,11 +195,11 @@ {"^SHL$", requireOps(unaryWithImm), requireArngs(integer, DefaultArngs), emitsDefs(14)}, {"^(S|U)SHR$", requireOps(unaryWithImm), requireArngs(integer, DefaultArngs), emitsDefs(14)}, {"^(S|U)SRA$", requireOps(unaryWithImmResultInArg0), requireArngs(integer, DefaultArngs), emitsDefs(14)}, - {"^(S|U)SHLL$", requireOps(unaryWithImm), requireArngs(integer, LongArngs), 
emitsDefs(14)}, - {"^SADALP$", matchOps(twoArgsResultInArg0), requireArngs(integerWith1D, LongArngs), emitsDefs(16)}, - {"^((S|U)ADDLP)$", requireOps(unary), requireArngs(integerWith1D, LongArngs), emitsDefs(16)}, - {"^(R?(ADD|SUB)HN)$", requireOps(binary), requireArngs(integer, NarrowArngs), emitsDefs(14)}, - {"^SHRN$", requireOps(unaryWithImm), requireArngs(integer, NarrowArngs), emitsDefs(14)}, + {"^(S|U)SHLL$", requireOps(unaryWithImm), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)}, + {"^SADALP$", matchOps(twoArgsResultInArg0), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)}, + {"^((S|U)ADDLP)$", requireOps(unary), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)}, + {"^(R?(ADD|SUB)HN)$", requireOps(binary), requireArngs(integerWideOnly, NarrowArngs), emitsDefs(6)}, + {"^SHRN$", requireOps(unaryWithImm), requireArngs(integerWideOnly, NarrowArngs), emitsDefs(6)}, {"^(CLZ|CLS)$", requireOps(unary), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)}, {"^(CNT|RBIT)$", requireOps(unary), requireArngs(integerUpTo8Bits, DefaultArngs), emitsDefs(4)}, {"^(S|U)R?HADD$", matchOps(binary), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)}, @@ -205,16 +207,17 @@ {"^FMUL$", matchOps(binary), requireArngs(floating, DefaultArngs), emitsDefs(3)}, {"^F(MLA|MLS)$", matchOps(threeArgsResultInArg0), requireArngs(floating, DefaultArngs), emitsDefs(3)}, {"^MUL$", matchOps(binary), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)}, - {"^((S|U)MULL)$", matchOps(binary), requireArngs(integer, LongArngs), emitsDefs(14)}, + {"^((S|U)MULL)$", matchOps(binary), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)}, {"^(MLA|MLS)$", matchOps(threeArgsResultInArg0), requireArngs(integerUpTo32Bits, DefaultArngs), emitsDefs(12)}, - {"^((S|U)Q)?XTN$", requireOps(unary), requireArngs(integer, NarrowArngs), emitsDefs(14)}, - {"^(S|U)XTL$", requireOps(unary), requireArngs(integer, LongArngs), emitsDefs(14)}, + 
{"^((S|U)Q)?XTN$", requireOps(unary), requireArngs(integerWideOnly, NarrowArngs), emitsDefs(6)}, + {"^(S|U)XTL$", requireOps(unary), requireArngs(integerUpTo32Bits, LongArngs), emitsDefs(12)}, {"^FCVT[NMPZ](S|U)$", matchOps(unary), requireArngs(floating, DefaultArngs), emitsDefs(3)}, {"^(S|U)CVTF$", matchOps(unary), requireArngs(floating, DefaultArngs), emitsDefs(3)}, - {"^(S|U)ADDW$", requireOps(binary), requireArngs(integer, WideArngs), emitsDefs(14)}, - {"^(S|U)SUBW$", requireOps(binary), requireArngs(integer, WideArngs), emitsDefs(14)}, - {"^FCVTL$", requireOps(unary), requireArngs(floating, LongArngs), emitsDefs(3)}, + {"^(S|U)ADDW$", requireOps(binary), requireArngs(integerWideOnly, WideArngs), emitsDefs(6)}, + {"^(S|U)SUBW$", requireOps(binary), requireArngs(integerWideOnly, WideArngs), emitsDefs(6)}, + {"^FCVTL$", requireOps(unary), requireArngs([]string{"float32:2S", "float32:4S"}, LongArngs), emitsDefs(2)}, {"^USDOT$", matchOps(threeArgsResultInArg0), requireArngs(integer32And8Bits, UnsupportedArngs), emitsDefs(0)}, + {"^PMULL$", matchOps(binary), requireArngs(polynomialArrngs, LongArngs), emitsDefs(8)}, } func TestArm64Instructions(t *testing.T) {
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/operands.go b/src/simd/archsimd/_gen/simdgen/arm64/operands.go index 30cba28..aa9e84a 100644 --- a/src/simd/archsimd/_gen/simdgen/arm64/operands.go +++ b/src/simd/archsimd/_gen/simdgen/arm64/operands.go
@@ -93,8 +93,8 @@ op.Lanes = arrangement.bits / op.ElemBits case ashape == WideArngs && vregPos == 2: op.ElemBits = arrangement.elemBits / 2 - op.Bits = arrangement.bits / 2 - op.Lanes = arrangement.lanes + op.Bits = arrangement.bits + op.Lanes = arrangement.bits / op.ElemBits default: op.ElemBits = arrangement.elemBits op.Bits = arrangement.bits
diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go index 4aba7f3..4f0751c 100644 --- a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
@@ -514,6 +514,10 @@ processArg(arg) } } + for _, v := range ret { + slices.SortFunc(v, compareSimdTypes) + } + return ret }
diff --git a/src/simd/archsimd/_gen/simdgen/godefs.go b/src/simd/archsimd/_gen/simdgen/godefs.go index 732d685..0cbe42e 100644 --- a/src/simd/archsimd/_gen/simdgen/godefs.go +++ b/src/simd/archsimd/_gen/simdgen/godefs.go
@@ -7,6 +7,7 @@ import ( "fmt" "log" + "math/rand/v2" "regexp" "slices" "strconv" @@ -430,6 +431,10 @@ if num1 > num2 { return 1 } + // "1" < "01". Don't expect it in simdgen, but just in case. + if ln1, ln2 := i-numStart1, j-numStart2; ln1 != ln2 { + return ln1 - ln2 + } // If numbers are equal, continue to the next segment. } else { // Non-digit comparison. @@ -472,6 +477,11 @@ op.adjustAsm() ops = append(ops, op) } + + rand.Shuffle(len(ops), func(i, j int) { + ops[i], ops[j] = ops[j], ops[i] + }) + slices.SortFunc(ops, compareOperations) // The parsed XED data might contain duplicates, like // 512 bits VPADDP. @@ -479,7 +489,7 @@ slices.SortFunc(deduped, compareOperations) if *Verbose { - log.Printf("dedup len: %d\n", len(ops)) + log.Printf("dedup len: %d, ops len: %d\n", len(deduped), len(ops)) } var err error if err = overwrite(deduped); err != nil { @@ -507,6 +517,10 @@ log.Printf("dedup len: %d\n", len(deduped)) } reportXEDInconsistency(deduped) + + // Sorting again, just in case. + slices.SortFunc(deduped, compareOperations) + typeMap := parseSIMDTypes(deduped) archInfo := CurrentArch()
diff --git a/src/simd/archsimd/_gen/simdgen/ops/GaloisField/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/GaloisField/categories.yaml index 9fbe3f0..19e13be 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/GaloisField/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/GaloisField/categories.yaml
@@ -23,3 +23,19 @@ // where the characteristic polynomial P is x^8 + x^4 + x^3 + x + 1. - go: carrylessMultiply commutative: false + +- go: carrylessMultiplyWidenLo + commutative: true + documentation: !string |- + // NAME returns the carryless (polynomial) product of the low halves + // of x and y. + // + // A carryless multiplication uses bitwise XOR instead of + // add-with-carry, for example (in base two): + // + // 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 + // + // This also models multiplication of polynomials with coefficients + // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = + // x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds + // polynomial terms, but coefficients "add" with XOR.)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/GaloisField/go_arm64.yaml b/src/simd/archsimd/_gen/simdgen/ops/GaloisField/go_arm64.yaml new file mode 100644 index 0000000..7659884 --- /dev/null +++ b/src/simd/archsimd/_gen/simdgen/ops/GaloisField/go_arm64.yaml
@@ -0,0 +1,10 @@ +!sum +# Polynomial (carryless) multiply long, P64 variant (2D→1Q). +- go: carrylessMultiplyWidenLo + asm: VPMULL + hiHalfAsm: VPMULL2 + in: + - go: Uint64x2 + - go: Uint64x2 + out: + - go: Uint64x2
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml index d4a5886..646c0e5 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml
@@ -12,7 +12,7 @@ commutative: true documentation: !string |- // NAME multiplies elements and stores the high part of the result. -- go: MulLoLong +- go: MulWidenLo commutative: true documentation: !string |- // NAME multiplies corresponding low-indexed elements and produces a result with double the element width.
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Mul/go_arm64.yaml b/src/simd/archsimd/_gen/simdgen/ops/Mul/go_arm64.yaml index 42f00db..7dc7669 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Mul/go_arm64.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Mul/go_arm64.yaml
@@ -10,7 +10,7 @@ - *any # Multiply long signed (SMULL) -- go: MulLoLong +- go: MulWidenLo signed: true asm: "VSMULL" hiHalfAsm: "VSMULL2" @@ -24,7 +24,7 @@ base: int # Multiply long unsigned (UMULL) -- go: MulLoLong +- go: MulWidenLo signed: false asm: "VUMULL" hiHalfAsm: "VUMULL2"
diff --git a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml index 364b347..673ebc7 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml
@@ -51,13 +51,13 @@ documentation: !string |- // NAME performs a right shift on each element in x by the constant number of bits // and narrows the result to half the element width. -- go: ShiftLeftLoLongConst +- go: ShiftLeftWidenLoConst signed: false commutative: false documentation: !string |- // NAME performs a left shift on each unsigned low-indexed element in x by the constant number of bits // and widens the result to double the element width. -- go: ShiftLeftLoLongConst +- go: ShiftLeftWidenLoConst signed: true commutative: false documentation: !string |-
diff --git a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go_arm64.yaml b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go_arm64.yaml index 3c4c637..ed51011 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go_arm64.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go_arm64.yaml
@@ -175,7 +175,7 @@ - go: $u base: uint -- go: ShiftLeftLoLongConst +- go: ShiftLeftWidenLoConst signed: false asm: "VUSHLL" hiHalfAsm: "VUSHLL2" @@ -187,7 +187,7 @@ - go: $u base: uint -- go: ShiftLeftLoLongConst +- go: ShiftLeftWidenLoConst signed: true asm: "VSSHLL" hiHalfAsm: "VSSHLL2"
diff --git a/src/simd/archsimd/_gen/simdgen/sort_test.go b/src/simd/archsimd/_gen/simdgen/sort_test.go index 399acf0..a743477 100644 --- a/src/simd/archsimd/_gen/simdgen/sort_test.go +++ b/src/simd/archsimd/_gen/simdgen/sort_test.go
@@ -13,7 +13,7 @@ }{ {"a1", "a2", -1}, {"a11a", "a11b", -1}, - {"a01a1", "a1a01", -1}, + {"a01a1", "a1a01", 1}, {"a2", "a1", 1}, {"a10", "a2", 1}, {"a1", "a10", -1}, @@ -24,7 +24,7 @@ {"file1", "file1", 0}, {"file", "file1", -1}, {"file1", "file", 1}, - {"a01", "a1", -1}, + {"a01", "a1", 1}, {"a1a", "a1b", -1}, }
diff --git a/src/simd/archsimd/_gen/simdgen/types.yaml b/src/simd/archsimd/_gen/simdgen/types.yaml index 54b08c8..0e876d3 100644 --- a/src/simd/archsimd/_gen/simdgen/types.yaml +++ b/src/simd/archsimd/_gen/simdgen/types.yaml
@@ -85,6 +85,8 @@ # Special for carryless multiply - {class: vreg, go: Uint64x8, base: "uint", elemBits: 128, bits: 512, lanes: 8} +# Result type of ARM64 carryless multiply, e.g. VPMULL V2.D1, V1.D1, V3.Q1 + - {class: vreg, go: Uint64x2, base: "uint", elemBits: 128, bits: 128, lanes: 1} # Special shapes just to make VAES(ENC|DEC)(LAST)?512 work. # The elemBits field of these shapes are wrong, it would be overwritten by overwriteElemBits.
diff --git a/src/simd/archsimd/clmul_arm64.go b/src/simd/archsimd/clmul_arm64.go new file mode 100644 index 0000000..5c5fa81 --- /dev/null +++ b/src/simd/archsimd/clmul_arm64.go
@@ -0,0 +1,79 @@ +// Copyright 2026 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd + +package archsimd + +// CarrylessMultiplyEven computes the carryless +// multiplications of selected even halves of the elements of x and y. +// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +// +// Asm: PMULL, CPU Feature: PMULL +func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 { + return x.carrylessMultiplyWidenLo(y) +} + +// CarrylessMultiplyOdd computes the carryless +// multiplications of selected odd halves of the elements of x and y. +// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +// +// Asm: PMULL, CPU Feature: PMULL +func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 { + return x.GetHi().carrylessMultiplyWidenLo(y.GetHi()) +} + +// CarrylessMultiplyOddEven computes the carryless +// multiplications of selected odd half of x's elements and even half of y's elements. 
+// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +// +// Asm: PMULL, CPU Feature: PMULL +func (x Uint64x2) CarrylessMultiplyOddEven(y Uint64x2) Uint64x2 { + return x.GetHi().carrylessMultiplyWidenLo(y) +} + +// CarrylessMultiplyEvenOdd computes the carryless +// multiplications of selected even half of x's elements and odd half of y's elements. +// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +// +// Asm: PMULL, CPU Feature: PMULL +func (x Uint64x2) CarrylessMultiplyEvenOdd(y Uint64x2) Uint64x2 { + return x.carrylessMultiplyWidenLo(y.GetHi()) +}
diff --git a/src/simd/archsimd/clmul_emulated.go b/src/simd/archsimd/clmul_emulated.go new file mode 100644 index 0000000..b78af61 --- /dev/null +++ b/src/simd/archsimd/clmul_emulated.go
@@ -0,0 +1,108 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && (arm64 || wasm)
+
+package archsimd
+
+// new64x2 returns a Uint64x2 whose element 0 is lo and element 1 is hi.
+func new64x2(lo, hi uint64) Uint64x2 {
+	return Uint64x2{}.SetElem(0, lo).SetElem(1, hi)
+}
+
+// These masks all have 4 zeroes between 1s: mK has a 1 at every bit
+// position congruent to K mod 5, counting across all 128 bits.
+var m0 = new64x2(0x1084210842108421, 0x2108421084210842)
+var m1 = new64x2(0x2108421084210842, 0x4210842108421084)
+var m2 = new64x2(0x4210842108421084, 0x8421084210842108)
+var m3 = new64x2(0x8421084210842108, 0x0842108421084210)
+var m4 = new64x2(0x0842108421084210, 0x1084210842108421)
+
+// middle selects the middle 64 bits (bits 32 through 95) of a 128-bit SIMD value.
+var middle = new64x2(0xffffffff00000000, 0x00000000ffffffff)
+
+// mwl is a 64x64 into 128 multiply that is missing
+// some carries that we don't need for CLMUL emulation.
+// The high 64 bits of each input are ignored.
+// Also just for fun, accumulate sums with Xor.
+func (x Uint64x2) mwl(y Uint64x2) Uint64x2 {
+	// reshape input into Uint32x4
+	// input is {a b _ _}.mwl{c d _ _}
+	// need the sum of
+	// ac0_ac1
+	//   0     ad0_ad1
+	//   0     bc0_bc1
+	//   0     0       bd0_bd1
+	// This "sum" is where the carries (not propagated
+	// across lanes) are lost.
+	ab__ := x.ReshapeToUint32s()
+	cd__ := y.ReshapeToUint32s()
+	ac0_ac1_bd0_bd1 := ab__.MulWidenLo(cd__)
+
+	dc__ := y.RotateAllLeft(32).ReshapeToUint32s()
+	ad0_ad1_bc0_bc1 := ab__.MulWidenLo(dc__)
+	//
+	// have ad0, ad1, bc0, bc1
+	// want 0, ad0+bc0, ad1+bc1, 0
+	// to add to ac0_ac1_bd0_bd1
+	//
+	// swap 64-bit halves of ad0_ad1_bc0_bc1
+	// to get bc0_bc1_ad0_ad1
+	bc0_bc1_ad0_ad1 := Uint64x2{}.SetElem(0, ad0_ad1_bc0_bc1.GetElem(1)).SetElem(1, ad0_ad1_bc0_bc1.GetElem(0))
+
+	// added to ad0_ad1_bc0_bc1 yields
+	// bc0+ad0, bc1+ad1, bc0+ad0, bc1+ad1
+	// rotate 32 (within the two 64-bit elements) yields
+	// bc1+ad1, bc0+ad0, bc1+ad1, bc0+ad0
+	// and then intersect with mask:
+	// 0      , bc0+ad0, bc1+ad1, 0
+	//
+	// use xor to make it a worse multiply
+	zzz_adPbc0_adPbc1_zzz := bc0_bc1_ad0_ad1.Xor(ad0_ad1_bc0_bc1).RotateAllLeft(32).And(middle)
+	return ac0_ac1_bd0_bd1.Xor(zzz_adPbc0_adPbc1_zzz)
+}
+
+// carrylessMultiply is a constant-time carryless multiply of the low
+// 64-bit elements of x and y (mwl ignores the high elements), returning
+// the 128-bit product. It is implemented with an absurd number of
+// multiplications; given that the emulation platforms only have
+// 32x32 into 64, it might make sense to rework this into that primitive,
+// but for now this works and is easily tested in scalar Go.
+func (x Uint64x2) carrylessMultiply(y Uint64x2) Uint64x2 {
+
+	// This works by masking the two inputs into 5 thinned inputs, with
+	// 4 zeroes separating any 2 set bits. Multiply will potentially
+	// set more bits with addition of overlapping terms, however this
+	// technique allows as many as 31 additions (filling all 4 separation
+	// positions with 1) without perturbing the bits we care about. Since
+	// there's at most 13 set bits in a thinned input, 31 is not a problem.
+	// If there were only 3 separating zeroes, there would be 16 1s per
+	// thinned input and only 15 additions could be tolerated -- so that's
+	// not possible.
+
+	// This is also discussed at
+	// https://timtaubert.de/blog/2017/06/verified-binary-multiplication-for-ghash/
+
+	x0 := x.And(m0)
+	x1 := x.And(m1)
+	x2 := x.And(m2)
+	x3 := x.And(m3)
+	x4 := x.And(m4)
+
+	y0 := y.And(m0)
+	y1 := y.And(m1)
+	y2 := y.And(m2)
+	y3 := y.And(m3)
+	y4 := y.And(m4)
+
+	var z Uint64x2
+	// for a given line, combining (xI).mwl(yJ) terms, I+J == K mod 5; mask index = K
+	z = (x0.mwl(y0)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m0)
+	z = (x3.mwl(y3)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).Xor(x0.mwl(y1)).Xor(x1.mwl(y0)).And(m1).Or(z)
+	z = (x1.mwl(y1)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).Xor(x0.mwl(y2)).Xor(x2.mwl(y0)).And(m2).Or(z)
+	z = (x4.mwl(y4)).Xor(x0.mwl(y3)).Xor(x3.mwl(y0)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m3).Or(z)
+	z = (x2.mwl(y2)).Xor(x0.mwl(y4)).Xor(x4.mwl(y0)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m4).Or(z)
+
+	return z
+}
diff --git a/src/simd/archsimd/cpu_other.go b/src/simd/archsimd/cpu_other.go new file mode 100644 index 0000000..326040f --- /dev/null +++ b/src/simd/archsimd/cpu_other.go
@@ -0,0 +1,25 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+import "internal/cpu"
+
+// ARM64Features is the receiver type for methods that report
+// ARM64 CPU feature support.
+type ARM64Features struct{}
+
+// ARM64 is the value on which the ARM64 feature-query methods are called.
+var ARM64 ARM64Features
+
+// PMULL returns whether the CPU supports the PMULL feature,
+// as reported by internal/cpu's ARM64.HasPMULL.
+//
+// PMULL is defined on all GOARCHes, but will only return true on
+// GOARCH arm64.
+func (ARM64Features) PMULL() bool {
+	return cpu.ARM64.HasPMULL
+}
diff --git a/src/simd/archsimd/extra_amd64.go b/src/simd/archsimd/extra_amd64.go index 9f23c22..b0dba6d 100644 --- a/src/simd/archsimd/extra_amd64.go +++ b/src/simd/archsimd/extra_amd64.go
@@ -179,201 +179,3 @@ // // Asm: VCMPPD, CPU Feature: AVX512 func (x Float64x8) IsNaN() Mask64x8 - -// Abs returns the absolute values of the elements of x -// -// Emulated, CPU Feature AVX -func (x Float32x4) Abs() Float32x4 { - mask := BroadcastUint32x4(0x80000000) - return x.ToBits().AndNot(mask).BitsToFloat32() -} - -// Abs returns the absolute values of the elements of x -// -// Emulated, CPU Feature AVX2 -func (x Float32x8) Abs() Float32x8 { - // mask will have a 1 in the sign bit UNLESS x is NaN - mask := BroadcastUint32x8(0x80000000) - return x.ToBits().AndNot(mask).BitsToFloat32() -} - -// Abs returns the absolute values of the elements of x -// -// Emulated, CPU Feature AVX512 -func (x Float32x16) Abs() Float32x16 { - mask := BroadcastUint32x16(0x80000000) - return x.ToBits().AndNot(mask).BitsToFloat32() -} - -// Abs returns the absolute values of the elements of x -// -// Emulated, CPU Feature AVX -func (x Float64x2) Abs() Float64x2 { - // mask will have a 1 in the sign bit UNLESS x is NaN - mask := BroadcastUint64x2(0x8000000000000000) - return x.ToBits().AndNot(mask).BitsToFloat64() -} - -// Abs returns the absolute values of the elements of x -// -// Emulated, CPU Feature AVX2 -func (x Float64x4) Abs() Float64x4 { - mask := BroadcastUint64x4(0x8000000000000000) - return x.ToBits().AndNot(mask).BitsToFloat64() -} - -// Abs returns the absolute values of the elements of x -// -// Emulated, CPU Feature AVX512 -func (x Float64x8) Abs() Float64x8 { - mask := BroadcastUint64x8(0x8000000000000000) - return x.ToBits().AndNot(mask).BitsToFloat64() -} - -// Neg returns the negation of the elements of x -// -// Emulated, CPU Feature AVX -func (x Float32x4) Neg() Float32x4 { - mask := BroadcastUint32x4(0x80000000) - return x.ToBits().Xor(mask).BitsToFloat32() -} - -// Neg returns the negation of the elements of x -// -// Emulated, CPU Feature AVX2 -func (x Float32x8) Neg() Float32x8 { - // mask will have a 1 in the sign bit UNLESS x is NaN - mask := 
BroadcastUint32x8(0x80000000) - return x.ToBits().Xor(mask).BitsToFloat32() -} - -// Neg returns the negation of the elements of x -// -// Emulated, CPU Feature AVX512 -func (x Float32x16) Neg() Float32x16 { - mask := BroadcastUint32x16(0x80000000) - return x.ToBits().Xor(mask).BitsToFloat32() -} - -// Neg returns the negation of the elements of x -// -// Emulated, CPU Feature AVX -func (x Float64x2) Neg() Float64x2 { - // mask will have a 1 in the sign bit UNLESS x is NaN - mask := BroadcastUint64x2(0x8000000000000000) - return x.ToBits().Xor(mask).BitsToFloat64() -} - -// Neg returns the negation of the elements of x -// -// Emulated, CPU Feature AVX2 -func (x Float64x4) Neg() Float64x4 { - mask := BroadcastUint64x4(0x8000000000000000) - return x.ToBits().Xor(mask).BitsToFloat64() -} - -// Neg returns the negation of the elements of x -// -// Emulated, CPU Feature AVX512 -func (x Float64x8) Neg() Float64x8 { - mask := BroadcastUint64x8(0x8000000000000000) - return x.ToBits().Xor(mask).BitsToFloat64() -} - -var f0x16 = [16]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0} -var f0x32 = [32]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0} -var f0x64 = [64]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0} - -// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ. 
-// -// Emulated, CPU Feature: AVX -func (x Int8x16) Mul(y Int8x16) Int8x16 { - mask := LoadInt8x16Array(&f0x16) - mask16 := mask.ToBits().ReshapeToUint16s() - xe := x.And(mask).ToBits().ReshapeToUint16s() - xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8) - ye := y.And(mask).ToBits().ReshapeToUint16s() - yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8) - pe := xe.Mul(ye).And(mask16) - po := xo.Mul(yo).And(mask16).ShiftAllLeft(8) - return pe.Or(po).ReshapeToUint8s().BitsToInt8() -} - -// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ. -// -// Emulated, CPU Feature: AVX -func (x Uint8x16) Mul(y Uint8x16) Uint8x16 { - mask := LoadInt8x16Array(&f0x16).ToBits() - mask16 := mask.ReshapeToUint16s() - xe := x.And(mask).ReshapeToUint16s() - xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8) - ye := y.And(mask).ReshapeToUint16s() - yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8) - pe := xe.Mul(ye).And(mask16) - po := xo.Mul(yo).And(mask16).ShiftAllLeft(8) - return pe.Or(po).ReshapeToUint8s() -} - -// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ. -// -// Emulated, CPU Feature: AVX2 -func (x Int8x32) Mul(y Int8x32) Int8x32 { - mask := LoadInt8x32Array(&f0x32) - mask16 := mask.ToBits().ReshapeToUint16s() - xe := x.And(mask).ToBits().ReshapeToUint16s() - xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8) - ye := y.And(mask).ToBits().ReshapeToUint16s() - yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8) - pe := xe.Mul(ye).And(mask16) - po := xo.Mul(yo).And(mask16).ShiftAllLeft(8) - return pe.Or(po).ReshapeToUint8s().BitsToInt8() -} - -// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ. 
-// -// Emulated, CPU Feature: AVX512 -func (x Int8x64) Mul(y Int8x64) Int8x64 { - mask := LoadInt8x64Array(&f0x64) - mask16 := mask.ToBits().ReshapeToUint16s() - xe := x.And(mask).ToBits().ReshapeToUint16s() - xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8) - ye := y.And(mask).ToBits().ReshapeToUint16s() - yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8) - pe := xe.Mul(ye).And(mask16) - po := xo.Mul(yo).And(mask16).ShiftAllLeft(8) - return pe.Or(po).ReshapeToUint8s().BitsToInt8() -} - -// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ. -// -// Emulated, CPU Feature: AVX2 -func (x Uint8x32) Mul(y Uint8x32) Uint8x32 { - mask := LoadInt8x32Array(&f0x32).ToBits() - mask16 := mask.ReshapeToUint16s() - xe := x.And(mask).ReshapeToUint16s() - xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8) - ye := y.And(mask).ReshapeToUint16s() - yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8) - pe := xe.Mul(ye).And(mask16) - po := xo.Mul(yo).And(mask16).ShiftAllLeft(8) - return pe.Or(po).ReshapeToUint8s() -} - -// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ. -// -// Emulated, CPU Feature: AVX512 -func (x Uint8x64) Mul(y Uint8x64) Uint8x64 { - mask := LoadInt8x64Array(&f0x64).ToBits() - mask16 := mask.ReshapeToUint16s() - xe := x.And(mask).ReshapeToUint16s() - xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8) - ye := y.And(mask).ReshapeToUint16s() - yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8) - pe := xe.Mul(ye).And(mask16) - po := xo.Mul(yo).And(mask16).ShiftAllLeft(8) - return pe.Or(po).ReshapeToUint8s() -}
diff --git a/src/simd/archsimd/internal/simd_test/arm64_simd_test.go b/src/simd/archsimd/internal/simd_test/arm64_simd_test.go index f9082c9..9c48a36 100644 --- a/src/simd/archsimd/internal/simd_test/arm64_simd_test.go +++ b/src/simd/archsimd/internal/simd_test/arm64_simd_test.go
@@ -125,3 +125,25 @@
 	).Store(got)
 	checkSlices(t, got, want)
 }
+
+// TestClMul checks the CarrylessMultiply variants on small operands
+// whose carryless products are easy to verify by hand.
+func TestClMul(t *testing.T) {
+	x := archsimd.LoadUint64x2([]uint64{1, 5})
+	y := archsimd.LoadUint64x2([]uint64{3, 9})
+
+	// expect stores v into a fresh slice and compares it with want.
+	expect := func(v archsimd.Uint64x2, want []uint64) {
+		got := make([]uint64, 2)
+		v.Store(got)
+		checkSlices[uint64](t, got, want)
+	}
+
+	// In binary: 1*11 = 11, 1*1001 = 1001, 101*11 = 1111,
+	// 101*1001 = 101101 and 11*11 = 101; the upper element is always zero.
+	expect(x.CarrylessMultiplyEven(y), []uint64{3, 0})
+	expect(x.CarrylessMultiplyEvenOdd(y), []uint64{9, 0})
+	expect(x.CarrylessMultiplyOddEven(y), []uint64{15, 0})
+	expect(x.CarrylessMultiplyOdd(y), []uint64{45, 0})
+	expect(y.CarrylessMultiplyEven(y), []uint64{5, 0})
+}
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go index 15cab5a..eadf945 100644 --- a/src/simd/archsimd/ops_amd64.go +++ b/src/simd/archsimd/ops_amd64.go
@@ -539,11 +539,6 @@ // And performs a bitwise x & y. // -// Asm: VPAND, CPU Feature: AVX -func (x Uint64x2) And(y Uint64x2) Uint64x2 - -// And performs a bitwise x & y. -// // Asm: VPAND, CPU Feature: AVX2 func (x Uint64x4) And(y Uint64x4) Uint64x4 @@ -552,6 +547,11 @@ // Asm: VPANDQ, CPU Feature: AVX512 func (x Uint64x8) And(y Uint64x8) Uint64x8 +// And performs a bitwise x & y. +// +// Asm: VPAND, CPU Feature: AVX +func (x Uint64x2) And(y Uint64x2) Uint64x2 + /* AndNot */ // AndNot performs a bitwise x &^ y. @@ -661,11 +661,6 @@ // AndNot performs a bitwise x &^ y. // -// Asm: VPANDN, CPU Feature: AVX -func (x Uint64x2) AndNot(y Uint64x2) Uint64x2 - -// AndNot performs a bitwise x &^ y. -// // Asm: VPANDN, CPU Feature: AVX2 func (x Uint64x4) AndNot(y Uint64x4) Uint64x4 @@ -674,6 +669,11 @@ // Asm: VPANDNQ, CPU Feature: AVX512 func (x Uint64x8) AndNot(y Uint64x8) Uint64x8 +// AndNot performs a bitwise x &^ y. +// +// Asm: VPANDN, CPU Feature: AVX +func (x Uint64x2) AndNot(y Uint64x2) Uint64x2 + /* Average */ // Average computes the rounded average of corresponding elements. @@ -4584,11 +4584,6 @@ // Or performs a bitwise x | y. // -// Asm: VPOR, CPU Feature: AVX -func (x Uint64x2) Or(y Uint64x2) Uint64x2 - -// Or performs a bitwise x | y. -// // Asm: VPOR, CPU Feature: AVX2 func (x Uint64x4) Or(y Uint64x4) Uint64x4 @@ -4597,6 +4592,11 @@ // Asm: VPORQ, CPU Feature: AVX512 func (x Uint64x8) Or(y Uint64x8) Uint64x8 +// Or performs a bitwise x | y. +// +// Asm: VPOR, CPU Feature: AVX +func (x Uint64x2) Or(y Uint64x2) Uint64x2 + /* Permute */ // Permute permutes x. @@ -7610,11 +7610,6 @@ // Xor performs a bitwise x ^ y. // -// Asm: VPXOR, CPU Feature: AVX -func (x Uint64x2) Xor(y Uint64x2) Uint64x2 - -// Xor performs a bitwise x ^ y. -// // Asm: VPXOR, CPU Feature: AVX2 func (x Uint64x4) Xor(y Uint64x4) Uint64x4 @@ -7623,6 +7618,11 @@ // Asm: VPXORQ, CPU Feature: AVX512 func (x Uint64x8) Xor(y Uint64x8) Uint64x8 +// Xor performs a bitwise x ^ y. 
+// +// Asm: VPXOR, CPU Feature: AVX +func (x Uint64x2) Xor(y Uint64x2) Uint64x2 + // AsFloat64x2 reinterprets the bits of a Float32x4 vector as a Float64x2 vector // // Deprecated: use combinations of ToBits, BitsTo{Int<N>,Float<N>}, ReshapeToUint<N>
diff --git a/src/simd/archsimd/ops_arm64.go b/src/simd/archsimd/ops_arm64.go index 2d60c69..fe6f010 100644 --- a/src/simd/archsimd/ops_arm64.go +++ b/src/simd/archsimd/ops_arm64.go
@@ -1173,55 +1173,55 @@ // Asm: VMLA, CPU Feature: NEON func (x Uint32x4) MulAdd(y Uint32x4, z Uint32x4) Uint32x4 -/* MulLoLong */ +/* MulWidenLo */ -// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width. +// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width. // For the high-indexed elements, use GetHi: // -// x.GetHi().MulLoLong(y.GetHi()) +// x.GetHi().MulWidenLo(y.GetHi()) // // Asm: VSMULL, CPU Feature: NEON -func (x Int8x16) MulLoLong(y Int8x16) Int16x8 +func (x Int8x16) MulWidenLo(y Int8x16) Int16x8 -// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width. +// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width. // For the high-indexed elements, use GetHi: // -// x.GetHi().MulLoLong(y.GetHi()) +// x.GetHi().MulWidenLo(y.GetHi()) // // Asm: VSMULL, CPU Feature: NEON -func (x Int16x8) MulLoLong(y Int16x8) Int32x4 +func (x Int16x8) MulWidenLo(y Int16x8) Int32x4 -// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width. +// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width. // For the high-indexed elements, use GetHi: // -// x.GetHi().MulLoLong(y.GetHi()) +// x.GetHi().MulWidenLo(y.GetHi()) // // Asm: VSMULL, CPU Feature: NEON -func (x Int32x4) MulLoLong(y Int32x4) Int64x2 +func (x Int32x4) MulWidenLo(y Int32x4) Int64x2 -// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width. +// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width. 
// For the high-indexed elements, use GetHi: // -// x.GetHi().MulLoLong(y.GetHi()) +// x.GetHi().MulWidenLo(y.GetHi()) // // Asm: VUMULL, CPU Feature: NEON -func (x Uint8x16) MulLoLong(y Uint8x16) Uint16x8 +func (x Uint8x16) MulWidenLo(y Uint8x16) Uint16x8 -// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width. +// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width. // For the high-indexed elements, use GetHi: // -// x.GetHi().MulLoLong(y.GetHi()) +// x.GetHi().MulWidenLo(y.GetHi()) // // Asm: VUMULL, CPU Feature: NEON -func (x Uint16x8) MulLoLong(y Uint16x8) Uint32x4 +func (x Uint16x8) MulWidenLo(y Uint16x8) Uint32x4 -// MulLoLong multiplies corresponding low-indexed elements and produces a result with double the element width. +// MulWidenLo multiplies corresponding low-indexed elements and produces a result with double the element width. // For the high-indexed elements, use GetHi: // -// x.GetHi().MulLoLong(y.GetHi()) +// x.GetHi().MulWidenLo(y.GetHi()) // // Asm: VUMULL, CPU Feature: NEON -func (x Uint32x4) MulLoLong(y Uint32x4) Uint64x2 +func (x Uint32x4) MulWidenLo(y Uint32x4) Uint64x2 /* Neg */ @@ -1875,74 +1875,6 @@ // Asm: VSHL, CPU Feature: NEON func (x Uint64x2) ShiftLeftConst(constant uint64) Uint64x2 -/* ShiftLeftLoLongConst */ - -// ShiftLeftLoLongConst performs a left shift on each signed low-indexed element in x by the constant number of bits -// and widens the result to double the element width. -// For the high-indexed elements, use GetHi: -// -// x.GetHi().ShiftLeftLoLongConst(...) -// -// A non-constant value of constant may result in significantly worse performance for this operation. 
-// -// Asm: VSSHLL, CPU Feature: NEON -func (x Int8x16) ShiftLeftLoLongConst(constant uint64) Int16x8 - -// ShiftLeftLoLongConst performs a left shift on each signed low-indexed element in x by the constant number of bits -// and widens the result to double the element width. -// For the high-indexed elements, use GetHi: -// -// x.GetHi().ShiftLeftLoLongConst(...) -// -// A non-constant value of constant may result in significantly worse performance for this operation. -// -// Asm: VSSHLL, CPU Feature: NEON -func (x Int16x8) ShiftLeftLoLongConst(constant uint64) Int32x4 - -// ShiftLeftLoLongConst performs a left shift on each signed low-indexed element in x by the constant number of bits -// and widens the result to double the element width. -// For the high-indexed elements, use GetHi: -// -// x.GetHi().ShiftLeftLoLongConst(...) -// -// A non-constant value of constant may result in significantly worse performance for this operation. -// -// Asm: VSSHLL, CPU Feature: NEON -func (x Int32x4) ShiftLeftLoLongConst(constant uint64) Int64x2 - -// ShiftLeftLoLongConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits -// and widens the result to double the element width. -// For the high-indexed elements, use GetHi: -// -// x.GetHi().ShiftLeftLoLongConst(...) -// -// A non-constant value of constant may result in significantly worse performance for this operation. -// -// Asm: VUSHLL, CPU Feature: NEON -func (x Uint8x16) ShiftLeftLoLongConst(constant uint64) Uint16x8 - -// ShiftLeftLoLongConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits -// and widens the result to double the element width. -// For the high-indexed elements, use GetHi: -// -// x.GetHi().ShiftLeftLoLongConst(...) -// -// A non-constant value of constant may result in significantly worse performance for this operation. 
-// -// Asm: VUSHLL, CPU Feature: NEON -func (x Uint16x8) ShiftLeftLoLongConst(constant uint64) Uint32x4 - -// ShiftLeftLoLongConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits -// and widens the result to double the element width. -// For the high-indexed elements, use GetHi: -// -// x.GetHi().ShiftLeftLoLongConst(...) -// -// A non-constant value of constant may result in significantly worse performance for this operation. -// -// Asm: VUSHLL, CPU Feature: NEON -func (x Uint32x4) ShiftLeftLoLongConst(constant uint64) Uint64x2 - /* ShiftLeftSaturatedConst */ // ShiftLeftSaturatedConst performs a saturating left shift on each element in x by the constant number of bits specified by y. @@ -2009,6 +1941,74 @@ // Asm: VUQSHL, CPU Feature: NEON func (x Uint64x2) ShiftLeftSaturatedConst(constant uint64) Uint64x2 +/* ShiftLeftWidenLoConst */ + +// ShiftLeftWidenLoConst performs a left shift on each signed low-indexed element in x by the constant number of bits +// and widens the result to double the element width. +// For the high-indexed elements, use GetHi: +// +// x.GetHi().ShiftLeftWidenLoConst(...) +// +// A non-constant value of constant may result in significantly worse performance for this operation. +// +// Asm: VSSHLL, CPU Feature: NEON +func (x Int8x16) ShiftLeftWidenLoConst(constant uint64) Int16x8 + +// ShiftLeftWidenLoConst performs a left shift on each signed low-indexed element in x by the constant number of bits +// and widens the result to double the element width. +// For the high-indexed elements, use GetHi: +// +// x.GetHi().ShiftLeftWidenLoConst(...) +// +// A non-constant value of constant may result in significantly worse performance for this operation. 
+// +// Asm: VSSHLL, CPU Feature: NEON +func (x Int16x8) ShiftLeftWidenLoConst(constant uint64) Int32x4 + +// ShiftLeftWidenLoConst performs a left shift on each signed low-indexed element in x by the constant number of bits +// and widens the result to double the element width. +// For the high-indexed elements, use GetHi: +// +// x.GetHi().ShiftLeftWidenLoConst(...) +// +// A non-constant value of constant may result in significantly worse performance for this operation. +// +// Asm: VSSHLL, CPU Feature: NEON +func (x Int32x4) ShiftLeftWidenLoConst(constant uint64) Int64x2 + +// ShiftLeftWidenLoConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits +// and widens the result to double the element width. +// For the high-indexed elements, use GetHi: +// +// x.GetHi().ShiftLeftWidenLoConst(...) +// +// A non-constant value of constant may result in significantly worse performance for this operation. +// +// Asm: VUSHLL, CPU Feature: NEON +func (x Uint8x16) ShiftLeftWidenLoConst(constant uint64) Uint16x8 + +// ShiftLeftWidenLoConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits +// and widens the result to double the element width. +// For the high-indexed elements, use GetHi: +// +// x.GetHi().ShiftLeftWidenLoConst(...) +// +// A non-constant value of constant may result in significantly worse performance for this operation. +// +// Asm: VUSHLL, CPU Feature: NEON +func (x Uint16x8) ShiftLeftWidenLoConst(constant uint64) Uint32x4 + +// ShiftLeftWidenLoConst performs a left shift on each unsigned low-indexed element in x by the constant number of bits +// and widens the result to double the element width. +// For the high-indexed elements, use GetHi: +// +// x.GetHi().ShiftLeftWidenLoConst(...) +// +// A non-constant value of constant may result in significantly worse performance for this operation. 
+// +// Asm: VUSHLL, CPU Feature: NEON +func (x Uint32x4) ShiftLeftWidenLoConst(constant uint64) Uint64x2 + /* ShiftRightConst */ // ShiftRightConst performs an arithmetic right shift on each element in x by the constant number of bits specified by y.
diff --git a/src/simd/archsimd/ops_emulated_amd64.go b/src/simd/archsimd/ops_emulated_amd64.go new file mode 100644 index 0000000..cc45326 --- /dev/null +++ b/src/simd/archsimd/ops_emulated_amd64.go
@@ -0,0 +1,231 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package archsimd
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature: AVX
+func (x Float32x4) Abs() Float32x4 {
+	mask := BroadcastUint32x4(0x80000000)
+	return x.ToBits().AndNot(mask).BitsToFloat32()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature: AVX2
+func (x Float32x8) Abs() Float32x8 {
+	// mask has only the sign bit set in each element; AndNot clears it
+	mask := BroadcastUint32x8(0x80000000)
+	return x.ToBits().AndNot(mask).BitsToFloat32()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature: AVX512
+func (x Float32x16) Abs() Float32x16 {
+	mask := BroadcastUint32x16(0x80000000)
+	return x.ToBits().AndNot(mask).BitsToFloat32()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature: AVX
+func (x Float64x2) Abs() Float64x2 {
+	// mask has only the sign bit set in each element; AndNot clears it
+	mask := BroadcastUint64x2(0x8000000000000000)
+	return x.ToBits().AndNot(mask).BitsToFloat64()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature: AVX2
+func (x Float64x4) Abs() Float64x4 {
+	mask := BroadcastUint64x4(0x8000000000000000)
+	return x.ToBits().AndNot(mask).BitsToFloat64()
+}
+
+// Abs returns the absolute values of the elements of x
+//
+// Emulated, CPU Feature: AVX512
+func (x Float64x8) Abs() Float64x8 {
+	mask := BroadcastUint64x8(0x8000000000000000)
+	return x.ToBits().AndNot(mask).BitsToFloat64()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature: AVX
+func (x Float32x4) Neg() Float32x4 {
+	mask := BroadcastUint32x4(0x80000000)
+	return x.ToBits().Xor(mask).BitsToFloat32()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature: AVX2
+func (x Float32x8) Neg() Float32x8 {
+	// mask has only the sign bit set in each element; Xor flips it
+	mask := BroadcastUint32x8(0x80000000)
+	return x.ToBits().Xor(mask).BitsToFloat32()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature: AVX512
+func (x Float32x16) Neg() Float32x16 {
+	mask := BroadcastUint32x16(0x80000000)
+	return x.ToBits().Xor(mask).BitsToFloat32()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature: AVX
+func (x Float64x2) Neg() Float64x2 {
+	// mask has only the sign bit set in each element; Xor flips it
+	mask := BroadcastUint64x2(0x8000000000000000)
+	return x.ToBits().Xor(mask).BitsToFloat64()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature: AVX2
+func (x Float64x4) Neg() Float64x4 {
+	mask := BroadcastUint64x4(0x8000000000000000)
+	return x.ToBits().Xor(mask).BitsToFloat64()
+}
+
+// Neg returns the negation of the elements of x
+//
+// Emulated, CPU Feature: AVX512
+func (x Float64x8) Neg() Float64x8 {
+	mask := BroadcastUint64x8(0x8000000000000000)
+	return x.ToBits().Xor(mask).BitsToFloat64()
+}
+
+// f0x16, f0x32 and f0x64 are byte masks with every bit set in the
+// even-indexed bytes and every bit clear in the odd-indexed bytes.
+var f0x16 = [16]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}
+var f0x32 = [32]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}
+var f0x64 = [64]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// The even- and odd-indexed bytes are multiplied separately in 16-bit
+// lanes; each product is masked with the even-byte mask and the
+// odd-byte products are shifted back up by 8 bits before merging.
+//
+// Emulated, CPU Feature: AVX
+func (x Int8x16) Mul(y Int8x16) Int8x16 {
+	mask := LoadInt8x16Array(&f0x16)
+	mask16 := mask.ToBits().ReshapeToUint16s()
+	xe := x.And(mask).ToBits().ReshapeToUint16s()
+	xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ToBits().ReshapeToUint16s()
+	yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s().BitsToInt8()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// The even- and odd-indexed bytes are multiplied separately in 16-bit
+// lanes; each product is masked with the even-byte mask and the
+// odd-byte products are shifted back up by 8 bits before merging.
+//
+// Emulated, CPU Feature: AVX
+func (x Uint8x16) Mul(y Uint8x16) Uint8x16 {
+	mask := LoadInt8x16Array(&f0x16).ToBits()
+	mask16 := mask.ReshapeToUint16s()
+	xe := x.And(mask).ReshapeToUint16s()
+	xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ReshapeToUint16s()
+	yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// The even- and odd-indexed bytes are multiplied separately in 16-bit
+// lanes; each product is masked with the even-byte mask and the
+// odd-byte products are shifted back up by 8 bits before merging.
+//
+// Emulated, CPU Feature: AVX2
+func (x Int8x32) Mul(y Int8x32) Int8x32 {
+	mask := LoadInt8x32Array(&f0x32)
+	mask16 := mask.ToBits().ReshapeToUint16s()
+	xe := x.And(mask).ToBits().ReshapeToUint16s()
+	xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ToBits().ReshapeToUint16s()
+	yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s().BitsToInt8()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// The even- and odd-indexed bytes are multiplied separately in 16-bit
+// lanes; each product is masked with the even-byte mask and the
+// odd-byte products are shifted back up by 8 bits before merging.
+//
+// Emulated, CPU Feature: AVX512
+func (x Int8x64) Mul(y Int8x64) Int8x64 {
+	mask := LoadInt8x64Array(&f0x64)
+	mask16 := mask.ToBits().ReshapeToUint16s()
+	xe := x.And(mask).ToBits().ReshapeToUint16s()
+	xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ToBits().ReshapeToUint16s()
+	yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s().BitsToInt8()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// The even- and odd-indexed bytes are multiplied separately in 16-bit
+// lanes; each product is masked with the even-byte mask and the
+// odd-byte products are shifted back up by 8 bits before merging.
+//
+// Emulated, CPU Feature: AVX2
+func (x Uint8x32) Mul(y Uint8x32) Uint8x32 {
+	mask := LoadInt8x32Array(&f0x32).ToBits()
+	mask16 := mask.ReshapeToUint16s()
+	xe := x.And(mask).ReshapeToUint16s()
+	xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ReshapeToUint16s()
+	yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s()
+}
+
+// Mul multiplies corresponding elements of two vectors, modulo 2ⁿ.
+//
+// The even- and odd-indexed bytes are multiplied separately in 16-bit
+// lanes; each product is masked with the even-byte mask and the
+// odd-byte products are shifted back up by 8 bits before merging.
+//
+// Emulated, CPU Feature: AVX512
+func (x Uint8x64) Mul(y Uint8x64) Uint8x64 {
+	mask := LoadInt8x64Array(&f0x64).ToBits()
+	mask16 := mask.ReshapeToUint16s()
+	xe := x.And(mask).ReshapeToUint16s()
+	xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	ye := y.And(mask).ReshapeToUint16s()
+	yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8)
+	pe := xe.Mul(ye).And(mask16)
+	po := xo.Mul(yo).And(mask16).ShiftAllLeft(8)
+	return pe.Or(po).ReshapeToUint8s()
+}
diff --git a/src/simd/archsimd/ops_emulated_wasm.go b/src/simd/archsimd/ops_emulated_wasm.go index b8dbe50..46180f1 100644 --- a/src/simd/archsimd/ops_emulated_wasm.go +++ b/src/simd/archsimd/ops_emulated_wasm.go
@@ -162,3 +162,41 @@
 func (x Uint64x2) OnesCount() Uint64x2 {
 	return x.BitsToInt64().OnesCount().ToBits()
 }
+
+// CarrylessMultiplyEven computes the carryless
+// multiplications of selected even halves of the elements of x and y.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Emulated
+func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 {
+	return x.carrylessMultiply(y)
+}
+
+// CarrylessMultiplyOdd computes the carryless
+// multiplications of selected odd halves of the elements of x and y.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Emulated
+func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 {
+	x = x.SetElem(0, x.GetElem(1)) // copy x's element 1 into element 0
+	y = y.SetElem(0, y.GetElem(1)) // same for y, reading y's own element 1 (not x's)
+	return x.carrylessMultiply(y)
+}
diff --git a/src/simd/archsimd/ops_internal_arm64.go b/src/simd/archsimd/ops_internal_arm64.go index 277e581..69da701 100644 --- a/src/simd/archsimd/ops_internal_arm64.go +++ b/src/simd/archsimd/ops_internal_arm64.go
@@ -85,3 +85,24 @@ // // Asm: VDUP, CPU Feature: NEON func (x Uint8x16) broadcast1To16() Uint8x16 + +/* carrylessMultiplyWidenLo */ + +// carrylessMultiplyWidenLo returns the carryless (polynomial) product of the low halves +// of x and y. +// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +// For the high-indexed elements, use GetHi: +// +// x.GetHi().carrylessMultiplyWidenLo(y.GetHi()) +// +// Asm: VPMULL, CPU Feature: NEON +func (x Uint64x2) carrylessMultiplyWidenLo(y Uint64x2) Uint64x2
diff --git a/src/simd/internal/bridge/decls_amd64.go b/src/simd/internal/bridge/decls_amd64.go index f8d3921..c8d41ea 100644 --- a/src/simd/internal/bridge/decls_amd64.go +++ b/src/simd/internal/bridge/decls_amd64.go
@@ -2926,6 +2926,30 @@ return Int64x8((archsimd.Uint64x8(x)).BitsToInt64()) } +func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 { + return Uint64x2((archsimd.Uint64x2(x)).CarrylessMultiplyEven(archsimd.Uint64x2(y))) +} + +func (x Uint64x4) CarrylessMultiplyEven(y Uint64x4) Uint64x4 { + return Uint64x4((archsimd.Uint64x4(x)).CarrylessMultiplyEven(archsimd.Uint64x4(y))) +} + +func (x Uint64x8) CarrylessMultiplyEven(y Uint64x8) Uint64x8 { + return Uint64x8((archsimd.Uint64x8(x)).CarrylessMultiplyEven(archsimd.Uint64x8(y))) +} + +func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 { + return Uint64x2((archsimd.Uint64x2(x)).CarrylessMultiplyOdd(archsimd.Uint64x2(y))) +} + +func (x Uint64x4) CarrylessMultiplyOdd(y Uint64x4) Uint64x4 { + return Uint64x4((archsimd.Uint64x4(x)).CarrylessMultiplyOdd(archsimd.Uint64x4(y))) +} + +func (x Uint64x8) CarrylessMultiplyOdd(y Uint64x8) Uint64x8 { + return Uint64x8((archsimd.Uint64x8(x)).CarrylessMultiplyOdd(archsimd.Uint64x8(y))) +} + func (x Uint64x2) ConvertToInt64() Int64x2 { return Int64x2((archsimd.Uint64x2(x)).ConvertToInt64()) }
diff --git a/src/simd/internal/bridge/decls_arm64.go b/src/simd/internal/bridge/decls_arm64.go index bdf2a87..b2f7c07 100644 --- a/src/simd/internal/bridge/decls_arm64.go +++ b/src/simd/internal/bridge/decls_arm64.go
@@ -982,6 +982,14 @@ return Int64x2((archsimd.Uint64x2(x)).BitsToInt64()) } +func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 { + return Uint64x2((archsimd.Uint64x2(x)).CarrylessMultiplyEven(archsimd.Uint64x2(y))) +} + +func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 { + return Uint64x2((archsimd.Uint64x2(x)).CarrylessMultiplyOdd(archsimd.Uint64x2(y))) +} + func (x Uint64x2) ConvertToInt64() Int64x2 { return Int64x2((archsimd.Uint64x2(x)).ConvertToInt64()) }
diff --git a/src/simd/internal/bridge/decls_wasm.go b/src/simd/internal/bridge/decls_wasm.go index 0818cac..daba992 100644 --- a/src/simd/internal/bridge/decls_wasm.go +++ b/src/simd/internal/bridge/decls_wasm.go
@@ -982,6 +982,14 @@ return Int64x2((archsimd.Uint64x2(x)).BitsToInt64()) } +func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 { + return Uint64x2((archsimd.Uint64x2(x)).CarrylessMultiplyEven(archsimd.Uint64x2(y))) +} + +func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 { + return Uint64x2((archsimd.Uint64x2(x)).CarrylessMultiplyOdd(archsimd.Uint64x2(y))) +} + func (x Uint64x2) ConvertToInt64() Int64x2 { return Int64x2((archsimd.Uint64x2(x)).ConvertToInt64()) }
diff --git a/src/simd/internal/bridge/simd_emulated.go b/src/simd/internal/bridge/simd_emulated.go new file mode 100644 index 0000000..64a728b --- /dev/null +++ b/src/simd/internal/bridge/simd_emulated.go
@@ -0,0 +1,3222 @@ +// Copyright 2026 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && (amd64 || wasm || arm64) + +package bridge + +import ( + "fmt" + "math" + "math/bits" +) + +// VectorBitSize returns the bit length of the emulated vector (fixed to 128). +func VectorBitSize() int { + return 128 +} + +// Emulated returns whether simd is emulated. +func Emulated() bool { + return true +} + +type _simd struct { + _ [0]func(*_simd) *_simd +} + +// Int8s represents a 128-bit vector of 16 int8 elements. +type Int8s struct { + _ _simd + a, b uint64 +} + +// LoadInt8s loads a slice of int8 into an Int8s vector. +func LoadInt8s(s []int8) Int8s { + var a, b uint64 + for i := 0; i < 16; i++ { + val := uint64(uint8(s[i])) + if i < 8 { + a |= val << (8 * i) + } else { + b |= val << (8 * (i - 8)) + } + } + return Int8s{a: a, b: b} +} + +// LoadInt8sPart loads a partial slice of int8 into an Int8s vector. +func LoadInt8sPart(s []int8) (Int8s, int) { + var a, b uint64 + n := len(s) + if n > 16 { + n = 16 + } + for i := 0; i < n; i++ { + val := uint64(uint8(s[i])) + if i < 8 { + a |= val << (8 * i) + } else { + b |= val << (8 * (i - 8)) + } + } + return Int8s{a: a, b: b}, n +} + +func (x Int8s) get(i int) int8 { + if i < 8 { + return int8(x.a >> (8 * i)) + } + return int8(x.b >> (8 * (i - 8))) +} + +func (x *Int8s) set(i int, v int8) { + val := uint64(uint8(v)) + if i < 8 { + mask := uint64(0xff) << (8 * i) + x.a = (x.a &^ mask) | (val << (8 * i)) + } else { + mask := uint64(0xff) << (8 * (i - 8)) + x.b = (x.b &^ mask) | (val << (8 * (i - 8))) + } +} + +// Abs returns the element-wise absolute value of x. +func (x Int8s) Abs() Int8s { + var res Int8s + for i := 0; i < 16; i++ { + v := x.get(i) + if v < 0 { + res.set(i, -v) + } else { + res.set(i, v) + } + } + return res +} + +// Add returns the element-wise sum of x and y. 
+func (x Int8s) Add(y Int8s) Int8s { + var res Int8s + for i := 0; i < 16; i++ { + res.set(i, x.get(i)+y.get(i)) + } + return res +} + +// AddSaturated returns the element-wise saturated sum of x and y. +func (x Int8s) AddSaturated(y Int8s) Int8s { + var res Int8s + for i := 0; i < 16; i++ { + sum := int(x.get(i)) + int(y.get(i)) + if sum > math.MaxInt8 { + res.set(i, math.MaxInt8) + } else if sum < math.MinInt8 { + res.set(i, math.MinInt8) + } else { + res.set(i, int8(sum)) + } + } + return res +} + +// And returns the bitwise AND of x and y. +func (x Int8s) And(y Int8s) Int8s { + return Int8s{a: x.a & y.a, b: x.b & y.b} +} + +// AndNot returns the bitwise AND NOT of x and y. +func (x Int8s) AndNot(y Int8s) Int8s { + return Int8s{a: x.a &^ y.a, b: x.b &^ y.b} +} + +// Equal returns a mask indicating where x and y are equal. +func (x Int8s) Equal(y Int8s) Mask8s { + var res Mask8s + for i := 0; i < 16; i++ { + if x.get(i) == y.get(i) { + res.set(i, true) + } + } + return res +} + +// Greater returns a mask indicating where x is greater than y. +func (x Int8s) Greater(y Int8s) Mask8s { + var res Mask8s + for i := 0; i < 16; i++ { + if x.get(i) > y.get(i) { + res.set(i, true) + } + } + return res +} + +// GreaterEqual returns a mask indicating where x is greater than or equal to y. +func (x Int8s) GreaterEqual(y Int8s) Mask8s { + var res Mask8s + for i := 0; i < 16; i++ { + if x.get(i) >= y.get(i) { + res.set(i, true) + } + } + return res +} + +// Less returns a mask indicating where x is less than y. +func (x Int8s) Less(y Int8s) Mask8s { + var res Mask8s + for i := 0; i < 16; i++ { + if x.get(i) < y.get(i) { + res.set(i, true) + } + } + return res +} + +// LessEqual returns a mask indicating where x is less than or equal to y. +func (x Int8s) LessEqual(y Int8s) Mask8s { + var res Mask8s + for i := 0; i < 16; i++ { + if x.get(i) <= y.get(i) { + res.set(i, true) + } + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. 
+func (x Int8s) NotEqual(y Int8s) Mask8s { + var res Mask8s + for i := 0; i < 16; i++ { + if x.get(i) != y.get(i) { + res.set(i, true) + } + } + return res +} + +// Len returns the number of elements in the vector. +func (x Int8s) Len() int { + return 16 +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. +func (x Int8s) Masked(mask Mask8s) Int8s { + return Int8s{a: x.a & mask.a, b: x.b & mask.b} +} + +// Max returns the element-wise maximum of x and y. +func (x Int8s) Max(y Int8s) Int8s { + var res Int8s + for i := 0; i < 16; i++ { + vx := x.get(i) + vy := y.get(i) + if vx > vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// Mul returns the element-wise product of x and y. +func (x Int8s) Mul(y Int8s) Int8s { + var res Int8s + for i := 0; i < 16; i++ { + res.set(i, x.get(i)*y.get(i)) + } + return res +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Int8s) IfElse(mask Mask8s, y Int8s) Int8s { + return Int8s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Min returns the element-wise minimum of x and y. +func (x Int8s) Min(y Int8s) Int8s { + var res Int8s + for i := 0; i < 16; i++ { + vx := x.get(i) + vy := y.get(i) + if vx < vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// Neg returns the element-wise negation of x. +func (x Int8s) Neg() Int8s { + var res Int8s + for i := 0; i < 16; i++ { + res.set(i, -x.get(i)) + } + return res +} + +// Not returns the bitwise NOT of x. +func (x Int8s) Not() Int8s { + return Int8s{a: ^x.a, b: ^x.b} +} + +// Or returns the bitwise OR of x and y. +func (x Int8s) Or(y Int8s) Int8s { + return Int8s{a: x.a | y.a, b: x.b | y.b} +} + +// Store stores the vector elements into the slice s. 
+func (x Int8s) Store(s []int8) { + for i := 0; i < 16 && i < len(s); i++ { + s[i] = x.get(i) + } +} + +// StorePart stores a partial vector into the slice s. +func (x Int8s) StorePart(s []int8) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Int8s) String() string { + var parts [16]int8 + for i := 0; i < 16; i++ { + parts[i] = x.get(i) + } + return fmt.Sprint(parts) +} + +// Sub returns the element-wise difference of x and y. +func (x Int8s) Sub(y Int8s) Int8s { + var res Int8s + for i := 0; i < 16; i++ { + res.set(i, x.get(i)-y.get(i)) + } + return res +} + +// SubSaturated returns the element-wise saturated difference of x and y. +func (x Int8s) SubSaturated(y Int8s) Int8s { + var res Int8s + for i := 0; i < 16; i++ { + diff := int(x.get(i)) - int(y.get(i)) + if diff > math.MaxInt8 { + res.set(i, math.MaxInt8) + } else if diff < math.MinInt8 { + res.set(i, math.MinInt8) + } else { + res.set(i, int8(diff)) + } + } + return res +} + +// ToMask returns a mask representation of the vector. +func (x Int8s) ToMask() Mask8s { + var res Mask8s + for i := 0; i < 16; i++ { + if x.get(i) != 0 { + res.set(i, true) + } + } + return res +} + +// Xor returns the bitwise XOR of x and y. +func (x Int8s) Xor(y Int8s) Int8s { + return Int8s{a: x.a ^ y.a, b: x.b ^ y.b} +} + +// ConvertToUint8 converts the vector elements to uint8. +func (x Int8s) ConvertToUint8() Uint8s { + return Uint8s{a: x.a, b: x.b} +} + +// ToBits reinterprets the vector bits as a Uint8s vector. +func (x Int8s) ToBits() Uint8s { + return Uint8s{a: x.a, b: x.b} +} + +// Int16s represents a 128-bit vector of 8 int16 elements. +type Int16s struct { + _ _simd + a, b uint64 +} + +// LoadInt16s loads a slice of int16 into an Int16s vector. 
+func LoadInt16s(s []int16) Int16s { + var a, b uint64 + for i := 0; i < 8; i++ { + val := uint64(uint16(s[i])) + if i < 4 { + a |= val << (16 * i) + } else { + b |= val << (16 * (i - 4)) + } + } + return Int16s{a: a, b: b} +} + +// LoadInt16sPart loads a partial slice of int16 into an Int16s vector. +func LoadInt16sPart(s []int16) (Int16s, int) { + var a, b uint64 + n := len(s) + if n > 8 { + n = 8 + } + for i := 0; i < n; i++ { + val := uint64(uint16(s[i])) + if i < 4 { + a |= val << (16 * i) + } else { + b |= val << (16 * (i - 4)) + } + } + return Int16s{a: a, b: b}, n +} + +func (x Int16s) get(i int) int16 { + if i < 4 { + return int16(x.a >> (16 * i)) + } + return int16(x.b >> (16 * (i - 4))) +} + +func (x *Int16s) set(i int, v int16) { + val := uint64(uint16(v)) + if i < 4 { + mask := uint64(0xffff) << (16 * i) + x.a = (x.a &^ mask) | (val << (16 * i)) + } else { + mask := uint64(0xffff) << (16 * (i - 4)) + x.b = (x.b &^ mask) | (val << (16 * (i - 4))) + } +} + +// Abs returns the element-wise absolute value of x. +func (x Int16s) Abs() Int16s { + var res Int16s + for i := 0; i < 8; i++ { + v := x.get(i) + if v < 0 { + res.set(i, -v) + } else { + res.set(i, v) + } + } + return res +} + +// Add returns the element-wise sum of x and y. +func (x Int16s) Add(y Int16s) Int16s { + var res Int16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)+y.get(i)) + } + return res +} + +// AddSaturated returns the element-wise saturated sum of x and y. +func (x Int16s) AddSaturated(y Int16s) Int16s { + var res Int16s + for i := 0; i < 8; i++ { + sum := int(x.get(i)) + int(y.get(i)) + if sum > math.MaxInt16 { + res.set(i, math.MaxInt16) + } else if sum < math.MinInt16 { + res.set(i, math.MinInt16) + } else { + res.set(i, int16(sum)) + } + } + return res +} + +// And returns the bitwise AND of x and y. +func (x Int16s) And(y Int16s) Int16s { + return Int16s{a: x.a & y.a, b: x.b & y.b} +} + +// AndNot returns the bitwise AND NOT of x and y. 
+func (x Int16s) AndNot(y Int16s) Int16s { + return Int16s{a: x.a &^ y.a, b: x.b &^ y.b} +} + +// Equal returns a mask indicating where x and y are equal. +func (x Int16s) Equal(y Int16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) == y.get(i) { + res.set(i, true) + } + } + return res +} + +// Greater returns a mask indicating where x is greater than y. +func (x Int16s) Greater(y Int16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) > y.get(i) { + res.set(i, true) + } + } + return res +} + +// GreaterEqual returns a mask indicating where x is greater than or equal to y. +func (x Int16s) GreaterEqual(y Int16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) >= y.get(i) { + res.set(i, true) + } + } + return res +} + +// Less returns a mask indicating where x is less than y. +func (x Int16s) Less(y Int16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) < y.get(i) { + res.set(i, true) + } + } + return res +} + +// LessEqual returns a mask indicating where x is less than or equal to y. +func (x Int16s) LessEqual(y Int16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) <= y.get(i) { + res.set(i, true) + } + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. +func (x Int16s) NotEqual(y Int16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) != y.get(i) { + res.set(i, true) + } + } + return res +} + +// Len returns the number of elements in the vector. +func (x Int16s) Len() int { + return 8 +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. +func (x Int16s) Masked(mask Mask16s) Int16s { + return Int16s{a: x.a & mask.a, b: x.b & mask.b} +} + +// Max returns the element-wise maximum of x and y. 
+func (x Int16s) Max(y Int16s) Int16s { + var res Int16s + for i := 0; i < 8; i++ { + vx := x.get(i) + vy := y.get(i) + if vx > vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Int16s) IfElse(mask Mask16s, y Int16s) Int16s { + return Int16s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Min returns the element-wise minimum of x and y. +func (x Int16s) Min(y Int16s) Int16s { + var res Int16s + for i := 0; i < 8; i++ { + vx := x.get(i) + vy := y.get(i) + if vx < vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// Mul returns the element-wise product of x and y. +func (x Int16s) Mul(y Int16s) Int16s { + var res Int16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)*y.get(i)) + } + return res +} + +// Neg returns the element-wise negation of x. +func (x Int16s) Neg() Int16s { + var res Int16s + for i := 0; i < 8; i++ { + res.set(i, -x.get(i)) + } + return res +} + +// Not returns the bitwise NOT of x. +func (x Int16s) Not() Int16s { + return Int16s{a: ^x.a, b: ^x.b} +} + +// Or returns the bitwise OR of x and y. +func (x Int16s) Or(y Int16s) Int16s { + return Int16s{a: x.a | y.a, b: x.b | y.b} +} + +// ShiftAllLeft shifts all elements left by y bits. +func (x Int16s) ShiftAllLeft(y uint8) Int16s { + var res Int16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)<<y) + } + return res +} + +// ShiftAllRight shifts all elements right by y bits. +func (x Int16s) ShiftAllRight(y uint8) Int16s { + var res Int16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)>>y) + } + return res +} + +// RotateAllLeft rotates all elements left by dist bits. 
+func (x Int16s) RotateAllLeft(dist uint64) Int16s { + var res Int16s + d := dist & 15 + for i := 0; i < 8; i++ { + u := uint16(x.get(i)) + r := (u << d) | (u >> ((16 - d) & 15)) + res.set(i, int16(r)) + } + return res +} + +// RotateAllRight rotates all elements right by dist bits. +func (x Int16s) RotateAllRight(dist uint64) Int16s { + var res Int16s + d := dist & 15 + for i := 0; i < 8; i++ { + u := uint16(x.get(i)) + r := (u >> d) | (u << ((16 - d) & 15)) + res.set(i, int16(r)) + } + return res +} + +// Store stores the vector elements into the slice s. +func (x Int16s) Store(s []int16) { + for i := 0; i < 8 && i < len(s); i++ { + s[i] = x.get(i) + } +} + +// StorePart stores a partial vector into the slice s. +func (x Int16s) StorePart(s []int16) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Int16s) String() string { + var parts [8]int16 + for i := 0; i < 8; i++ { + parts[i] = x.get(i) + } + return fmt.Sprint(parts) +} + +// Sub returns the element-wise difference of x and y. +func (x Int16s) Sub(y Int16s) Int16s { + var res Int16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)-y.get(i)) + } + return res +} + +// SubSaturated returns the element-wise saturated difference of x and y. +func (x Int16s) SubSaturated(y Int16s) Int16s { + var res Int16s + for i := 0; i < 8; i++ { + diff := int(x.get(i)) - int(y.get(i)) + if diff > math.MaxInt16 { + res.set(i, math.MaxInt16) + } else if diff < math.MinInt16 { + res.set(i, math.MinInt16) + } else { + res.set(i, int16(diff)) + } + } + return res +} + +// ToMask returns a mask representation of the vector. +func (x Int16s) ToMask() Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) != 0 { + res.set(i, true) + } + } + return res +} + +// Xor returns the bitwise XOR of x and y. +func (x Int16s) Xor(y Int16s) Int16s { + return Int16s{a: x.a ^ y.a, b: x.b ^ y.b} +} + +// ConvertToUint16 converts the vector elements to uint16. 
+func (x Int16s) ConvertToUint16() Uint16s { + return Uint16s{a: x.a, b: x.b} +} + +// ToBits reinterprets the vector bits as a Uint16s vector. +func (x Int16s) ToBits() Uint16s { + return Uint16s{a: x.a, b: x.b} +} + +// Int32s represents a 128-bit vector of 4 int32 elements. +type Int32s struct { + _ _simd + a, b uint64 +} + +// LoadInt32s loads a slice of int32 into an Int32s vector. +func LoadInt32s(s []int32) Int32s { + var a, b uint64 + for i := 0; i < 4; i++ { + val := uint64(uint32(s[i])) + if i < 2 { + a |= val << (32 * i) + } else { + b |= val << (32 * (i - 2)) + } + } + return Int32s{a: a, b: b} +} + +// LoadInt32sPart loads a partial slice of int32 into an Int32s vector. +func LoadInt32sPart(s []int32) (Int32s, int) { + var a, b uint64 + n := len(s) + if n > 4 { + n = 4 + } + for i := 0; i < n; i++ { + val := uint64(uint32(s[i])) + if i < 2 { + a |= val << (32 * i) + } else { + b |= val << (32 * (i - 2)) + } + } + return Int32s{a: a, b: b}, n +} + +func (x Int32s) get(i int) int32 { + if i < 2 { + return int32(x.a >> (32 * i)) + } + return int32(x.b >> (32 * (i - 2))) +} + +func (x *Int32s) set(i int, v int32) { + val := uint64(uint32(v)) + if i < 2 { + mask := uint64(0xffffffff) << (32 * i) + x.a = (x.a &^ mask) | (val << (32 * i)) + } else { + mask := uint64(0xffffffff) << (32 * (i - 2)) + x.b = (x.b &^ mask) | (val << (32 * (i - 2))) + } +} + +// Abs returns the element-wise absolute value of x. +func (x Int32s) Abs() Int32s { + var res Int32s + for i := 0; i < 4; i++ { + v := x.get(i) + if v < 0 { + res.set(i, -v) + } else { + res.set(i, v) + } + } + return res +} + +// Add returns the element-wise sum of x and y. +func (x Int32s) Add(y Int32s) Int32s { + var res Int32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)+y.get(i)) + } + return res +} + +// And returns the bitwise AND of x and y. +func (x Int32s) And(y Int32s) Int32s { + return Int32s{a: x.a & y.a, b: x.b & y.b} +} + +// AndNot returns the bitwise AND NOT of x and y. 
+func (x Int32s) AndNot(y Int32s) Int32s { + return Int32s{a: x.a &^ y.a, b: x.b &^ y.b} +} + +// ConvertToFloat32 converts the vector elements to float32. +func (x Int32s) ConvertToFloat32() Float32s { + var res Float32s + for i := 0; i < 4; i++ { + res.set(i, float32(x.get(i))) + } + return res +} + +// Equal returns a mask indicating where x and y are equal. +func (x Int32s) Equal(y Int32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) == y.get(i) { + res.set(i, true) + } + } + return res +} + +// Greater returns a mask indicating where x is greater than y. +func (x Int32s) Greater(y Int32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) > y.get(i) { + res.set(i, true) + } + } + return res +} + +// GreaterEqual returns a mask indicating where x is greater than or equal to y. +func (x Int32s) GreaterEqual(y Int32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) >= y.get(i) { + res.set(i, true) + } + } + return res +} + +// Less returns a mask indicating where x is less than y. +func (x Int32s) Less(y Int32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) < y.get(i) { + res.set(i, true) + } + } + return res +} + +// LessEqual returns a mask indicating where x is less than or equal to y. +func (x Int32s) LessEqual(y Int32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) <= y.get(i) { + res.set(i, true) + } + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. +func (x Int32s) NotEqual(y Int32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) != y.get(i) { + res.set(i, true) + } + } + return res +} + +// Len returns the number of elements in the vector. +func (x Int32s) Len() int { + return 4 +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. 
+func (x Int32s) Masked(mask Mask32s) Int32s { + return Int32s{a: x.a & mask.a, b: x.b & mask.b} +} + +// Max returns the element-wise maximum of x and y. +func (x Int32s) Max(y Int32s) Int32s { + var res Int32s + for i := 0; i < 4; i++ { + vx := x.get(i) + vy := y.get(i) + if vx > vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Int32s) IfElse(mask Mask32s, y Int32s) Int32s { + return Int32s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Min returns the element-wise minimum of x and y. +func (x Int32s) Min(y Int32s) Int32s { + var res Int32s + for i := 0; i < 4; i++ { + vx := x.get(i) + vy := y.get(i) + if vx < vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// Mul returns the element-wise product of x and y. +func (x Int32s) Mul(y Int32s) Int32s { + var res Int32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)*y.get(i)) + } + return res +} + +// Neg returns the element-wise negation of x. +func (x Int32s) Neg() Int32s { + var res Int32s + for i := 0; i < 4; i++ { + res.set(i, -x.get(i)) + } + return res +} + +// Not returns the bitwise NOT of x. +func (x Int32s) Not() Int32s { + return Int32s{a: ^x.a, b: ^x.b} +} + +// Or returns the bitwise OR of x and y. +func (x Int32s) Or(y Int32s) Int32s { + return Int32s{a: x.a | y.a, b: x.b | y.b} +} + +// ShiftAllLeft shifts all elements left by y bits. +func (x Int32s) ShiftAllLeft(y uint8) Int32s { + var res Int32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)<<y) + } + return res +} + +// ShiftAllRight shifts all elements right by y bits. +func (x Int32s) ShiftAllRight(y uint8) Int32s { + var res Int32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)>>y) + } + return res +} + +// RotateAllLeft rotates all elements left by dist bits. 
+func (x Int32s) RotateAllLeft(dist uint64) Int32s { + var res Int32s + d := dist & 31 + for i := 0; i < 4; i++ { + u := uint32(x.get(i)) + r := (u << d) | (u >> ((32 - d) & 31)) + res.set(i, int32(r)) + } + return res +} + +// RotateAllRight rotates all elements right by dist bits. +func (x Int32s) RotateAllRight(dist uint64) Int32s { + var res Int32s + d := dist & 31 + for i := 0; i < 4; i++ { + u := uint32(x.get(i)) + r := (u >> d) | (u << ((32 - d) & 31)) + res.set(i, int32(r)) + } + return res +} + +// Store stores the vector elements into the slice s. +func (x Int32s) Store(s []int32) { + for i := 0; i < 4 && i < len(s); i++ { + s[i] = x.get(i) + } +} + +// StorePart stores a partial vector into the slice s. +func (x Int32s) StorePart(s []int32) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Int32s) String() string { + var parts [4]int32 + for i := 0; i < 4; i++ { + parts[i] = x.get(i) + } + return fmt.Sprint(parts) +} + +// Sub returns the element-wise difference of x and y. +func (x Int32s) Sub(y Int32s) Int32s { + var res Int32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)-y.get(i)) + } + return res +} + +// ToMask returns a mask representation of the vector. +func (x Int32s) ToMask() Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) != 0 { + res.set(i, true) + } + } + return res +} + +// Xor returns the bitwise XOR of x and y. +func (x Int32s) Xor(y Int32s) Int32s { + return Int32s{a: x.a ^ y.a, b: x.b ^ y.b} +} + +// ConvertToUint32 converts the vector elements to uint32. +func (x Int32s) ConvertToUint32() Uint32s { + return Uint32s{a: x.a, b: x.b} +} + +// ToBits reinterprets the vector bits as a Uint32s vector. +func (x Int32s) ToBits() Uint32s { + return Uint32s{a: x.a, b: x.b} +} + +// Int64s represents a 128-bit vector of 2 int64 elements. +type Int64s struct { + _ _simd + a, b uint64 +} + +// LoadInt64s loads a slice of int64 into an Int64s vector. 
+func LoadInt64s(s []int64) Int64s {
+	var a, b uint64
+	a = uint64(s[0])
+	b = uint64(s[1])
+	return Int64s{a: a, b: b}
+}
+
+// LoadInt64sPart loads up to 2 elements of s (missing ones stay zero) and returns the vector and the count loaded.
+func LoadInt64sPart(s []int64) (Int64s, int) {
+	var a, b uint64
+	if len(s) > 0 {
+		a = uint64(s[0])
+	}
+	if len(s) > 1 {
+		b = uint64(s[1])
+	}
+	return Int64s{a: a, b: b}, min(len(s), 2) // count loaded, not len(s), matching the other Load*Part helpers
+}
+
+func (x Int64s) get(i int) int64 {
+	if i == 0 {
+		return int64(x.a)
+	}
+	return int64(x.b)
+}
+
+func (x *Int64s) set(i int, v int64) {
+	if i == 0 {
+		x.a = uint64(v)
+	} else {
+		x.b = uint64(v)
+	}
+}
+
+// Add returns the element-wise sum of x and y.
+func (x Int64s) Add(y Int64s) Int64s {
+	return Int64s{a: x.a + y.a, b: x.b + y.b}
+}
+
+// And returns the bitwise AND of x and y.
+func (x Int64s) And(y Int64s) Int64s {
+	return Int64s{a: x.a & y.a, b: x.b & y.b}
+}
+
+// AndNot returns the bitwise AND NOT of x and y.
+func (x Int64s) AndNot(y Int64s) Int64s {
+	return Int64s{a: x.a &^ y.a, b: x.b &^ y.b}
+}
+
+// Equal returns a mask indicating where x and y are equal.
+func (x Int64s) Equal(y Int64s) Mask64s {
+	var res Mask64s
+	if x.a == y.a {
+		res.a = ^uint64(0)
+	}
+	if x.b == y.b {
+		res.b = ^uint64(0)
+	}
+	return res
+}
+
+// Greater returns a mask indicating where x is greater than y.
+func (x Int64s) Greater(y Int64s) Mask64s {
+	var res Mask64s
+	if int64(x.a) > int64(y.a) {
+		res.a = ^uint64(0)
+	}
+	if int64(x.b) > int64(y.b) {
+		res.b = ^uint64(0)
+	}
+	return res
+}
+
+// GreaterEqual returns a mask indicating where x is greater than or equal to y.
+func (x Int64s) GreaterEqual(y Int64s) Mask64s {
+	var res Mask64s
+	if int64(x.a) >= int64(y.a) {
+		res.a = ^uint64(0)
+	}
+	if int64(x.b) >= int64(y.b) {
+		res.b = ^uint64(0)
+	}
+	return res
+}
+
+// Less returns a mask indicating where x is less than y.
+func (x Int64s) Less(y Int64s) Mask64s { + var res Mask64s + if int64(x.a) < int64(y.a) { + res.a = ^uint64(0) + } + if int64(x.b) < int64(y.b) { + res.b = ^uint64(0) + } + return res +} + +// LessEqual returns a mask indicating where x is less than or equal to y. +func (x Int64s) LessEqual(y Int64s) Mask64s { + var res Mask64s + if int64(x.a) <= int64(y.a) { + res.a = ^uint64(0) + } + if int64(x.b) <= int64(y.b) { + res.b = ^uint64(0) + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. +func (x Int64s) NotEqual(y Int64s) Mask64s { + var res Mask64s + if x.a != y.a { + res.a = ^uint64(0) + } + if x.b != y.b { + res.b = ^uint64(0) + } + return res +} + +// Len returns the number of elements in the vector. +func (x Int64s) Len() int { + return 2 +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. +func (x Int64s) Masked(mask Mask64s) Int64s { + return Int64s{a: x.a & mask.a, b: x.b & mask.b} +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Int64s) IfElse(mask Mask64s, y Int64s) Int64s { + return Int64s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Neg returns the element-wise negation of x. +func (x Int64s) Neg() Int64s { + return Int64s{a: uint64(-int64(x.a)), b: uint64(-int64(x.b))} +} + +// Not returns the bitwise NOT of x. +func (x Int64s) Not() Int64s { + return Int64s{a: ^x.a, b: ^x.b} +} + +// Or returns the bitwise OR of x and y. +func (x Int64s) Or(y Int64s) Int64s { + return Int64s{a: x.a | y.a, b: x.b | y.b} +} + +// ShiftAllLeft shifts all elements left by y bits. +func (x Int64s) ShiftAllLeft(y uint8) Int64s { + return Int64s{a: x.a << y, b: x.b << y} +} + +// RotateAllLeft rotates all elements left by dist bits. 
+func (x Int64s) RotateAllLeft(dist uint64) Int64s { + d := dist & 63 + return Int64s{ + a: (x.a << d) | (x.a >> ((64 - d) & 63)), + b: (x.b << d) | (x.b >> ((64 - d) & 63)), + } +} + +// RotateAllRight rotates all elements right by dist bits. +func (x Int64s) RotateAllRight(dist uint64) Int64s { + d := dist & 63 + return Int64s{ + a: (x.a >> d) | (x.a << ((64 - d) & 63)), + b: (x.b >> d) | (x.b << ((64 - d) & 63)), + } +} + +// Store stores the vector elements into the slice s. +func (x Int64s) Store(s []int64) { + if len(s) > 0 { + s[0] = int64(x.a) + } + if len(s) > 1 { + s[1] = int64(x.b) + } +} + +// StorePart stores a partial vector into the slice s. +func (x Int64s) StorePart(s []int64) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Int64s) String() string { + return fmt.Sprint([2]int64{int64(x.a), int64(x.b)}) +} + +// Sub returns the element-wise difference of x and y. +func (x Int64s) Sub(y Int64s) Int64s { + return Int64s{a: x.a - y.a, b: x.b - y.b} +} + +// ToMask returns a mask representation of the vector. +func (x Int64s) ToMask() Mask64s { + var res Mask64s + if x.a != 0 { + res.a = ^uint64(0) + } + if x.b != 0 { + res.b = ^uint64(0) + } + return res +} + +// Xor returns the bitwise XOR of x and y. +func (x Int64s) Xor(y Int64s) Int64s { + return Int64s{a: x.a ^ y.a, b: x.b ^ y.b} +} + +// ConvertToUint64 converts the vector elements to uint64. +func (x Int64s) ConvertToUint64() Uint64s { + return Uint64s{a: x.a, b: x.b} +} + +// ToBits reinterprets the vector bits as a Uint64s vector. +func (x Int64s) ToBits() Uint64s { + return Uint64s{a: x.a, b: x.b} +} + +// Uint8s represents a 128-bit vector of 16 uint8 elements. +type Uint8s struct { + _ _simd + a, b uint64 +} + +// LoadUint8s loads a slice of uint8 into an Uint8s vector. 
+func LoadUint8s(s []uint8) Uint8s { + var a, b uint64 + for i := 0; i < 16; i++ { + val := uint64(s[i]) + if i < 8 { + a |= val << (8 * i) + } else { + b |= val << (8 * (i - 8)) + } + } + return Uint8s{a: a, b: b} +} + +// LoadUint8sPart loads a partial slice of uint8 into an Uint8s vector. +func LoadUint8sPart(s []uint8) (Uint8s, int) { + var a, b uint64 + n := len(s) + if n > 16 { + n = 16 + } + for i := 0; i < n; i++ { + val := uint64(s[i]) + if i < 8 { + a |= val << (8 * i) + } else { + b |= val << (8 * (i - 8)) + } + } + return Uint8s{a: a, b: b}, n +} + +func (x Uint8s) get(i int) uint8 { + if i < 8 { + return uint8(x.a >> (8 * i)) + } + return uint8(x.b >> (8 * (i - 8))) +} + +func (x *Uint8s) set(i int, v uint8) { + val := uint64(v) + if i < 8 { + mask := uint64(0xff) << (8 * i) + x.a = (x.a &^ mask) | (val << (8 * i)) + } else { + mask := uint64(0xff) << (8 * (i - 8)) + x.b = (x.b &^ mask) | (val << (8 * (i - 8))) + } +} + +// Add returns the element-wise sum of x and y. +func (x Uint8s) Add(y Uint8s) Uint8s { + var res Uint8s + for i := 0; i < 16; i++ { + res.set(i, x.get(i)+y.get(i)) + } + return res +} + +// AddSaturated returns the element-wise saturated sum of x and y. +func (x Uint8s) AddSaturated(y Uint8s) Uint8s { + var res Uint8s + for i := 0; i < 16; i++ { + sum := int(x.get(i)) + int(y.get(i)) + if sum > math.MaxUint8 { + res.set(i, math.MaxUint8) + } else { + res.set(i, uint8(sum)) + } + } + return res +} + +// And returns the bitwise AND of x and y. +func (x Uint8s) And(y Uint8s) Uint8s { + return Uint8s{a: x.a & y.a, b: x.b & y.b} +} + +// AndNot returns the bitwise AND NOT of x and y. +func (x Uint8s) AndNot(y Uint8s) Uint8s { + return Uint8s{a: x.a &^ y.a, b: x.b &^ y.b} +} + +// Average returns the element-wise average of x and y. 
+func (x Uint8s) Average(y Uint8s) Uint8s { + var res Uint8s + for i := 0; i < 16; i++ { + res.set(i, uint8((int(x.get(i))+int(y.get(i))+1)>>1)) + } + return res +} + +// Equal returns a mask indicating where x and y are equal. +func (x Uint8s) Equal(y Uint8s) Mask8s { + var res Mask8s + for i := 0; i < 16; i++ { + if x.get(i) == y.get(i) { + res.set(i, true) + } + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. +func (x Uint8s) NotEqual(y Uint8s) Mask8s { + var res Mask8s + for i := 0; i < 16; i++ { + if x.get(i) != y.get(i) { + res.set(i, true) + } + } + return res +} + +// Len returns the number of elements in the vector. +func (x Uint8s) Len() int { + return 16 +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. +func (x Uint8s) Masked(mask Mask8s) Uint8s { + return Uint8s{a: x.a & mask.a, b: x.b & mask.b} +} + +// Max returns the element-wise maximum of x and y. +func (x Uint8s) Max(y Uint8s) Uint8s { + var res Uint8s + for i := 0; i < 16; i++ { + vx := x.get(i) + vy := y.get(i) + if vx > vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Uint8s) IfElse(mask Mask8s, y Uint8s) Uint8s { + return Uint8s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Min returns the element-wise minimum of x and y. +func (x Uint8s) Min(y Uint8s) Uint8s { + var res Uint8s + for i := 0; i < 16; i++ { + vx := x.get(i) + vy := y.get(i) + if vx < vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// Mul returns the element-wise product of x and y. +func (x Uint8s) Mul(y Uint8s) Uint8s { + var res Uint8s + for i := 0; i < 16; i++ { + res.set(i, x.get(i)*y.get(i)) + } + return res +} + +// Not returns the bitwise NOT of x. 
+func (x Uint8s) Not() Uint8s { + return Uint8s{a: ^x.a, b: ^x.b} +} + +// Or returns the bitwise OR of x and y. +func (x Uint8s) Or(y Uint8s) Uint8s { + return Uint8s{a: x.a | y.a, b: x.b | y.b} +} + +// Store stores the vector elements into the slice s. +func (x Uint8s) Store(s []uint8) { + for i := 0; i < 16 && i < len(s); i++ { + s[i] = x.get(i) + } +} + +// StorePart stores a partial vector into the slice s. +func (x Uint8s) StorePart(s []uint8) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Uint8s) String() string { + var parts [16]uint8 + for i := 0; i < 16; i++ { + parts[i] = x.get(i) + } + return fmt.Sprint(parts) +} + +// Sub returns the element-wise difference of x and y. +func (x Uint8s) Sub(y Uint8s) Uint8s { + var res Uint8s + for i := 0; i < 16; i++ { + res.set(i, x.get(i)-y.get(i)) + } + return res +} + +// SubSaturated returns the element-wise saturated difference of x and y. +func (x Uint8s) SubSaturated(y Uint8s) Uint8s { + var res Uint8s + for i := 0; i < 16; i++ { + vx := x.get(i) + vy := y.get(i) + if vx < vy { + res.set(i, 0) + } else { + res.set(i, vx-vy) + } + } + return res +} + +// Xor returns the bitwise XOR of x and y. +func (x Uint8s) Xor(y Uint8s) Uint8s { + return Uint8s{a: x.a ^ y.a, b: x.b ^ y.b} +} + +// BitsToInt8 reinterprets the vector bits as an Int8s vector. +func (x Uint8s) BitsToInt8() Int8s { + return Int8s{a: x.a, b: x.b} +} + +// ConvertToInt8 converts the vector elements to int8. +func (x Uint8s) ConvertToInt8() Int8s { + return Int8s{a: x.a, b: x.b} +} + +// ReshapeToUint16s reinterprets the vector bits as a Uint16s vector. +func (x Uint8s) ReshapeToUint16s() Uint16s { + return Uint16s{a: x.a, b: x.b} +} + +// ReshapeToUint32s reinterprets the vector bits as a Uint32s vector. +func (x Uint8s) ReshapeToUint32s() Uint32s { + return Uint32s{a: x.a, b: x.b} +} + +// ReshapeToUint64s reinterprets the vector bits as a Uint64s vector. 
+func (x Uint8s) ReshapeToUint64s() Uint64s { + return Uint64s{a: x.a, b: x.b} +} + +// Uint16s represents a 128-bit vector of 8 uint16 elements. +type Uint16s struct { + _ _simd + a, b uint64 +} + +// LoadUint16s loads a slice of uint16 into an Uint16s vector. +func LoadUint16s(s []uint16) Uint16s { + var a, b uint64 + for i := 0; i < 8; i++ { + val := uint64(s[i]) + if i < 4 { + a |= val << (16 * i) + } else { + b |= val << (16 * (i - 4)) + } + } + return Uint16s{a: a, b: b} +} + +// LoadUint16sPart loads a partial slice of uint16 into an Uint16s vector. +func LoadUint16sPart(s []uint16) (Uint16s, int) { + var a, b uint64 + n := len(s) + if n > 8 { + n = 8 + } + for i := 0; i < n; i++ { + val := uint64(s[i]) + if i < 4 { + a |= val << (16 * i) + } else { + b |= val << (16 * (i - 4)) + } + } + return Uint16s{a: a, b: b}, n +} + +func (x Uint16s) get(i int) uint16 { + if i < 4 { + return uint16(x.a >> (16 * i)) + } + return uint16(x.b >> (16 * (i - 4))) +} + +func (x *Uint16s) set(i int, v uint16) { + val := uint64(v) + if i < 4 { + mask := uint64(0xffff) << (16 * i) + x.a = (x.a &^ mask) | (val << (16 * i)) + } else { + mask := uint64(0xffff) << (16 * (i - 4)) + x.b = (x.b &^ mask) | (val << (16 * (i - 4))) + } +} + +// Add returns the element-wise sum of x and y. +func (x Uint16s) Add(y Uint16s) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)+y.get(i)) + } + return res +} + +// AddSaturated returns the element-wise saturated sum of x and y. +func (x Uint16s) AddSaturated(y Uint16s) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + sum := int(x.get(i)) + int(y.get(i)) + if sum > math.MaxUint16 { + res.set(i, math.MaxUint16) + } else { + res.set(i, uint16(sum)) + } + } + return res +} + +// And returns the bitwise AND of x and y. +func (x Uint16s) And(y Uint16s) Uint16s { + return Uint16s{a: x.a & y.a, b: x.b & y.b} +} + +// AndNot returns the bitwise AND NOT of x and y. 
+func (x Uint16s) AndNot(y Uint16s) Uint16s { + return Uint16s{a: x.a &^ y.a, b: x.b &^ y.b} +} + +// Average returns the element-wise average of x and y. +func (x Uint16s) Average(y Uint16s) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + res.set(i, uint16((int(x.get(i))+int(y.get(i))+1)>>1)) + } + return res +} + +// Equal returns a mask indicating where x and y are equal. +func (x Uint16s) Equal(y Uint16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) == y.get(i) { + res.set(i, true) + } + } + return res +} + +// Greater returns a mask indicating where x is greater than y. +func (x Uint16s) Greater(y Uint16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) > y.get(i) { + res.set(i, true) + } + } + return res +} + +// GreaterEqual returns a mask indicating where x is greater than or equal to y. +func (x Uint16s) GreaterEqual(y Uint16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) >= y.get(i) { + res.set(i, true) + } + } + return res +} + +// Less returns a mask indicating where x is less than y. +func (x Uint16s) Less(y Uint16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) < y.get(i) { + res.set(i, true) + } + } + return res +} + +// LessEqual returns a mask indicating where x is less than or equal to y. +func (x Uint16s) LessEqual(y Uint16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) <= y.get(i) { + res.set(i, true) + } + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. +func (x Uint16s) NotEqual(y Uint16s) Mask16s { + var res Mask16s + for i := 0; i < 8; i++ { + if x.get(i) != y.get(i) { + res.set(i, true) + } + } + return res +} + +// Len returns the number of elements in the vector. +func (x Uint16s) Len() int { + return 8 +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. 
+func (x Uint16s) Masked(mask Mask16s) Uint16s { + return Uint16s{a: x.a & mask.a, b: x.b & mask.b} +} + +// Max returns the element-wise maximum of x and y. +func (x Uint16s) Max(y Uint16s) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + vx := x.get(i) + vy := y.get(i) + if vx > vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Uint16s) IfElse(mask Mask16s, y Uint16s) Uint16s { + return Uint16s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Min returns the element-wise minimum of x and y. +func (x Uint16s) Min(y Uint16s) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + vx := x.get(i) + vy := y.get(i) + if vx < vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// Mul returns the element-wise product of x and y. +func (x Uint16s) Mul(y Uint16s) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)*y.get(i)) + } + return res +} + +// Not returns the bitwise NOT of x. +func (x Uint16s) Not() Uint16s { + return Uint16s{a: ^x.a, b: ^x.b} +} + +// Or returns the bitwise OR of x and y. +func (x Uint16s) Or(y Uint16s) Uint16s { + return Uint16s{a: x.a | y.a, b: x.b | y.b} +} + +// ShiftAllLeft shifts all elements left by y bits. +func (x Uint16s) ShiftAllLeft(y uint8) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)<<y) + } + return res +} + +// ShiftAllRight shifts all elements right by y bits. +func (x Uint16s) ShiftAllRight(y uint8) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)>>y) + } + return res +} + +// RotateAllLeft rotates all elements left by dist bits. 
+func (x Uint16s) RotateAllLeft(dist uint64) Uint16s { + var res Uint16s + d := dist & 15 + for i := 0; i < 8; i++ { + u := x.get(i) + r := (u << d) | (u >> ((16 - d) & 15)) + res.set(i, r) + } + return res +} + +// RotateAllRight rotates all elements right by dist bits. +func (x Uint16s) RotateAllRight(dist uint64) Uint16s { + var res Uint16s + d := dist & 15 + for i := 0; i < 8; i++ { + u := x.get(i) + r := (u >> d) | (u << ((16 - d) & 15)) + res.set(i, r) + } + return res +} + +// Store stores the vector elements into the slice s. +func (x Uint16s) Store(s []uint16) { + for i := 0; i < 8 && i < len(s); i++ { + s[i] = x.get(i) + } +} + +// StorePart stores a partial vector into the slice s. +func (x Uint16s) StorePart(s []uint16) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Uint16s) String() string { + var parts [8]uint16 + for i := 0; i < 8; i++ { + parts[i] = x.get(i) + } + return fmt.Sprint(parts) +} + +// Sub returns the element-wise difference of x and y. +func (x Uint16s) Sub(y Uint16s) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + res.set(i, x.get(i)-y.get(i)) + } + return res +} + +// SubSaturated returns the element-wise saturated difference of x and y. +func (x Uint16s) SubSaturated(y Uint16s) Uint16s { + var res Uint16s + for i := 0; i < 8; i++ { + vx := x.get(i) + vy := y.get(i) + if vx < vy { + res.set(i, 0) + } else { + res.set(i, vx-vy) + } + } + return res +} + +// Xor returns the bitwise XOR of x and y. +func (x Uint16s) Xor(y Uint16s) Uint16s { + return Uint16s{a: x.a ^ y.a, b: x.b ^ y.b} +} + +// BitsToInt16 reinterprets the vector bits as an Int16s vector. +func (x Uint16s) BitsToInt16() Int16s { + return Int16s{a: x.a, b: x.b} +} + +// ConvertToInt16 converts the vector elements to int16. +func (x Uint16s) ConvertToInt16() Int16s { + return Int16s{a: x.a, b: x.b} +} + +// ReshapeToUint32s reinterprets the vector bits as a Uint32s vector. 
+func (x Uint16s) ReshapeToUint32s() Uint32s { + return Uint32s{a: x.a, b: x.b} +} + +// ReshapeToUint64s reinterprets the vector bits as a Uint64s vector. +func (x Uint16s) ReshapeToUint64s() Uint64s { + return Uint64s{a: x.a, b: x.b} +} + +// ReshapeToUint8s reinterprets the vector bits as a Uint8s vector. +func (x Uint16s) ReshapeToUint8s() Uint8s { + return Uint8s{a: x.a, b: x.b} +} + +// Uint32s represents a 128-bit vector of 4 uint32 elements. +type Uint32s struct { + _ _simd + a, b uint64 +} + +// LoadUint32s loads a slice of uint32 into an Uint32s vector. +func LoadUint32s(s []uint32) Uint32s { + var a, b uint64 + for i := 0; i < 4; i++ { + val := uint64(s[i]) + if i < 2 { + a |= val << (32 * i) + } else { + b |= val << (32 * (i - 2)) + } + } + return Uint32s{a: a, b: b} +} + +// LoadUint32sPart loads a partial slice of uint32 into an Uint32s vector. +func LoadUint32sPart(s []uint32) (Uint32s, int) { + var a, b uint64 + n := len(s) + if n > 4 { + n = 4 + } + for i := 0; i < n; i++ { + val := uint64(s[i]) + if i < 2 { + a |= val << (32 * i) + } else { + b |= val << (32 * (i - 2)) + } + } + return Uint32s{a: a, b: b}, n +} + +func (x Uint32s) get(i int) uint32 { + if i < 2 { + return uint32(x.a >> (32 * i)) + } + return uint32(x.b >> (32 * (i - 2))) +} + +func (x *Uint32s) set(i int, v uint32) { + val := uint64(v) + if i < 2 { + mask := uint64(0xffffffff) << (32 * i) + x.a = (x.a &^ mask) | (val << (32 * i)) + } else { + mask := uint64(0xffffffff) << (32 * (i - 2)) + x.b = (x.b &^ mask) | (val << (32 * (i - 2))) + } +} + +// Add returns the element-wise sum of x and y. +func (x Uint32s) Add(y Uint32s) Uint32s { + var res Uint32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)+y.get(i)) + } + return res +} + +// And returns the bitwise AND of x and y. +func (x Uint32s) And(y Uint32s) Uint32s { + return Uint32s{a: x.a & y.a, b: x.b & y.b} +} + +// AndNot returns the bitwise AND NOT of x and y. 
+func (x Uint32s) AndNot(y Uint32s) Uint32s { + return Uint32s{a: x.a &^ y.a, b: x.b &^ y.b} +} + +// Equal returns a mask indicating where x and y are equal. +func (x Uint32s) Equal(y Uint32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) == y.get(i) { + res.set(i, true) + } + } + return res +} + +// Greater returns a mask indicating where x is greater than y. +func (x Uint32s) Greater(y Uint32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) > y.get(i) { + res.set(i, true) + } + } + return res +} + +// GreaterEqual returns a mask indicating where x is greater than or equal to y. +func (x Uint32s) GreaterEqual(y Uint32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) >= y.get(i) { + res.set(i, true) + } + } + return res +} + +// Less returns a mask indicating where x is less than y. +func (x Uint32s) Less(y Uint32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) < y.get(i) { + res.set(i, true) + } + } + return res +} + +// LessEqual returns a mask indicating where x is less than or equal to y. +func (x Uint32s) LessEqual(y Uint32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) <= y.get(i) { + res.set(i, true) + } + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. +func (x Uint32s) NotEqual(y Uint32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) != y.get(i) { + res.set(i, true) + } + } + return res +} + +// Len returns the number of elements in the vector. +func (x Uint32s) Len() int { + return 4 +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. +func (x Uint32s) Masked(mask Mask32s) Uint32s { + return Uint32s{a: x.a & mask.a, b: x.b & mask.b} +} + +// Max returns the element-wise maximum of x and y. 
+func (x Uint32s) Max(y Uint32s) Uint32s { + var res Uint32s + for i := 0; i < 4; i++ { + vx := x.get(i) + vy := y.get(i) + if vx > vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Uint32s) IfElse(mask Mask32s, y Uint32s) Uint32s { + return Uint32s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Min returns the element-wise minimum of x and y. +func (x Uint32s) Min(y Uint32s) Uint32s { + var res Uint32s + for i := 0; i < 4; i++ { + vx := x.get(i) + vy := y.get(i) + if vx < vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// Mul returns the element-wise product of x and y. +func (x Uint32s) Mul(y Uint32s) Uint32s { + var res Uint32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)*y.get(i)) + } + return res +} + +// Not returns the bitwise NOT of x. +func (x Uint32s) Not() Uint32s { + return Uint32s{a: ^x.a, b: ^x.b} +} + +// Or returns the bitwise OR of x and y. +func (x Uint32s) Or(y Uint32s) Uint32s { + return Uint32s{a: x.a | y.a, b: x.b | y.b} +} + +// ShiftAllLeft shifts all elements left by y bits. +func (x Uint32s) ShiftAllLeft(y uint8) Uint32s { + var res Uint32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)<<y) + } + return res +} + +// ShiftAllRight shifts all elements right by y bits. +func (x Uint32s) ShiftAllRight(y uint8) Uint32s { + var res Uint32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)>>y) + } + return res +} + +// RotateAllLeft rotates all elements left by dist bits. +func (x Uint32s) RotateAllLeft(dist uint64) Uint32s { + var res Uint32s + d := dist & 31 + for i := 0; i < 4; i++ { + u := x.get(i) + r := (u << d) | (u >> ((32 - d) & 31)) + res.set(i, r) + } + return res +} + +// RotateAllRight rotates all elements right by dist bits. 
+func (x Uint32s) RotateAllRight(dist uint64) Uint32s { + var res Uint32s + d := dist & 31 + for i := 0; i < 4; i++ { + u := x.get(i) + r := (u >> d) | (u << ((32 - d) & 31)) + res.set(i, r) + } + return res +} + +// Store stores the vector elements into the slice s. +func (x Uint32s) Store(s []uint32) { + for i := 0; i < 4 && i < len(s); i++ { + s[i] = x.get(i) + } +} + +// StorePart stores a partial vector into the slice s. +func (x Uint32s) StorePart(s []uint32) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Uint32s) String() string { + var parts [4]uint32 + for i := 0; i < 4; i++ { + parts[i] = x.get(i) + } + return fmt.Sprint(parts) +} + +// Sub returns the element-wise difference of x and y. +func (x Uint32s) Sub(y Uint32s) Uint32s { + var res Uint32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)-y.get(i)) + } + return res +} + +// Xor returns the bitwise XOR of x and y. +func (x Uint32s) Xor(y Uint32s) Uint32s { + return Uint32s{a: x.a ^ y.a, b: x.b ^ y.b} +} + +// BitsToFloat32 reinterprets the vector bits as a Float32s vector. +func (x Uint32s) BitsToFloat32() Float32s { + return Float32s{a: x.a, b: x.b} +} + +// BitsToInt32 reinterprets the vector bits as an Int32s vector. +func (x Uint32s) BitsToInt32() Int32s { + return Int32s{a: x.a, b: x.b} +} + +// ConvertToInt32 converts the vector elements to int32. +func (x Uint32s) ConvertToInt32() Int32s { + return Int32s{a: x.a, b: x.b} +} + +// ReshapeToUint16s reinterprets the vector bits as a Uint16s vector. +func (x Uint32s) ReshapeToUint16s() Uint16s { + return Uint16s{a: x.a, b: x.b} +} + +// ReshapeToUint64s reinterprets the vector bits as a Uint64s vector. +func (x Uint32s) ReshapeToUint64s() Uint64s { + return Uint64s{a: x.a, b: x.b} +} + +// ReshapeToUint8s reinterprets the vector bits as a Uint8s vector. +func (x Uint32s) ReshapeToUint8s() Uint8s { + return Uint8s{a: x.a, b: x.b} +} + +// Uint64s represents a 128-bit vector of 2 uint64 elements. 
+type Uint64s struct { + _ _simd + a, b uint64 +} + +// LoadUint64s loads a slice of uint64 into an Uint64s vector. +func LoadUint64s(s []uint64) Uint64s { + var a, b uint64 + a = s[0] + b = s[1] + return Uint64s{a: a, b: b} +} + +// LoadUint64sPart loads a partial slice of uint64 into an Uint64s vector. +func LoadUint64sPart(s []uint64) (Uint64s, int) { + n := len(s) + var a, b uint64 + if n > 0 { + a = s[0] + } + if n > 1 { + b = s[1] + } + return Uint64s{a: a, b: b}, n +} + +func (x Uint64s) get(i int) uint64 { + if i == 0 { + return x.a + } + return x.b +} + +func (x *Uint64s) set(i int, v uint64) { + if i == 0 { + x.a = v + } else { + x.b = v + } +} + +// Add returns the element-wise sum of x and y. +func (x Uint64s) Add(y Uint64s) Uint64s { + return Uint64s{a: x.a + y.a, b: x.b + y.b} +} + +// And returns the bitwise AND of x and y. +func (x Uint64s) And(y Uint64s) Uint64s { + return Uint64s{a: x.a & y.a, b: x.b & y.b} +} + +// AndNot returns the bitwise AND NOT of x and y. +func (x Uint64s) AndNot(y Uint64s) Uint64s { + return Uint64s{a: x.a &^ y.a, b: x.b &^ y.b} +} + +// Equal returns a mask indicating where x and y are equal. +func (x Uint64s) Equal(y Uint64s) Mask64s { + var res Mask64s + if x.a == y.a { + res.a = ^uint64(0) + } + if x.b == y.b { + res.b = ^uint64(0) + } + return res +} + +// Greater returns a mask indicating where x is greater than y. +func (x Uint64s) Greater(y Uint64s) Mask64s { + var res Mask64s + for i := 0; i < 2; i++ { + if x.get(i) > y.get(i) { + res.set(i, true) + } + } + return res +} + +// GreaterEqual returns a mask indicating where x is greater than or equal to y. +func (x Uint64s) GreaterEqual(y Uint64s) Mask64s { + var res Mask64s + for i := 0; i < 2; i++ { + if x.get(i) >= y.get(i) { + res.set(i, true) + } + } + return res +} + +// Less returns a mask indicating where x is less than y. 
+func (x Uint64s) Less(y Uint64s) Mask64s { + var res Mask64s + for i := 0; i < 2; i++ { + if x.get(i) < y.get(i) { + res.set(i, true) + } + } + return res +} + +// LessEqual returns a mask indicating where x is less than or equal to y. +func (x Uint64s) LessEqual(y Uint64s) Mask64s { + var res Mask64s + for i := 0; i < 2; i++ { + if x.get(i) <= y.get(i) { + res.set(i, true) + } + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. +func (x Uint64s) NotEqual(y Uint64s) Mask64s { + var res Mask64s + if x.a != y.a { + res.a = ^uint64(0) + } + if x.b != y.b { + res.b = ^uint64(0) + } + return res +} + +// Len returns the number of elements in the vector. +func (x Uint64s) Len() int { + return 2 +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. +func (x Uint64s) Masked(mask Mask64s) Uint64s { + return Uint64s{a: x.a & mask.a, b: x.b & mask.b} +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Uint64s) IfElse(mask Mask64s, y Uint64s) Uint64s { + return Uint64s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Not returns the bitwise NOT of x. +func (x Uint64s) Not() Uint64s { + return Uint64s{a: ^x.a, b: ^x.b} +} + +// Or returns the bitwise OR of x and y. +func (x Uint64s) Or(y Uint64s) Uint64s { + return Uint64s{a: x.a | y.a, b: x.b | y.b} +} + +// ShiftAllLeft shifts all elements left by y bits. +func (x Uint64s) ShiftAllLeft(y uint8) Uint64s { + return Uint64s{a: x.a << y, b: x.b << y} +} + +// ShiftAllRight shifts all elements right by y bits. +func (x Uint64s) ShiftAllRight(y uint8) Uint64s { + return Uint64s{a: x.a >> y, b: x.b >> y} +} + +// RotateAllLeft rotates all elements left by dist bits. 
+func (x Uint64s) RotateAllLeft(dist uint64) Uint64s { + d := dist & 63 + return Uint64s{ + a: (x.a << d) | (x.a >> ((64 - d) & 63)), + b: (x.b << d) | (x.b >> ((64 - d) & 63)), + } +} + +// RotateAllRight rotates all elements right by dist bits. +func (x Uint64s) RotateAllRight(dist uint64) Uint64s { + d := dist & 63 + return Uint64s{ + a: (x.a >> d) | (x.a << ((64 - d) & 63)), + b: (x.b >> d) | (x.b << ((64 - d) & 63)), + } +} + +// Store stores the vector elements into the slice s. +func (x Uint64s) Store(s []uint64) { + if len(s) > 0 { + s[0] = x.a + } + if len(s) > 1 { + s[1] = x.b + } +} + +// StorePart stores a partial vector into the slice s. +func (x Uint64s) StorePart(s []uint64) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Uint64s) String() string { + return fmt.Sprint([2]uint64{x.a, x.b}) +} + +// Sub returns the element-wise difference of x and y. +func (x Uint64s) Sub(y Uint64s) Uint64s { + return Uint64s{a: x.a - y.a, b: x.b - y.b} +} + +// Xor returns the bitwise XOR of x and y. +func (x Uint64s) Xor(y Uint64s) Uint64s { + return Uint64s{a: x.a ^ y.a, b: x.b ^ y.b} +} + +// BitsToFloat64 reinterprets the vector bits as a Float64s vector. +func (x Uint64s) BitsToFloat64() Float64s { + return Float64s{a: x.a, b: x.b} +} + +// BitsToInt64 reinterprets the vector bits as an Int64s vector. +func (x Uint64s) BitsToInt64() Int64s { + return Int64s{a: x.a, b: x.b} +} + +// ConvertToInt64 converts the vector elements to int64. +func (x Uint64s) ConvertToInt64() Int64s { + return Int64s{a: x.a, b: x.b} +} + +// ReshapeToUint16s reinterprets the vector bits as a Uint16s vector. +func (x Uint64s) ReshapeToUint16s() Uint16s { + return Uint16s{a: x.a, b: x.b} +} + +// ReshapeToUint32s reinterprets the vector bits as a Uint32s vector. +func (x Uint64s) ReshapeToUint32s() Uint32s { + return Uint32s{a: x.a, b: x.b} +} + +// ReshapeToUint8s reinterprets the vector bits as a Uint8s vector. 
+func (x Uint64s) ReshapeToUint8s() Uint8s { + return Uint8s{a: x.a, b: x.b} +} + +// Float32s represents a 128-bit vector of 4 float32 elements. +type Float32s struct { + _ _simd + a, b uint64 +} + +// LoadFloat32s loads a slice of float32 into an Float32s vector. +func LoadFloat32s(s []float32) Float32s { + var a, b uint64 + for i := 0; i < 4; i++ { + val := uint64(math.Float32bits(s[i])) + if i < 2 { + a |= val << (32 * i) + } else { + b |= val << (32 * (i - 2)) + } + } + return Float32s{a: a, b: b} +} + +// LoadFloat32sPart loads a partial slice of float32 into an Float32s vector. +func LoadFloat32sPart(s []float32) (Float32s, int) { + var a, b uint64 + n := len(s) + if n > 4 { + n = 4 + } + for i := 0; i < n; i++ { + val := uint64(math.Float32bits(s[i])) + if i < 2 { + a |= val << (32 * i) + } else { + b |= val << (32 * (i - 2)) + } + } + return Float32s{a: a, b: b}, n +} + +func (x Float32s) get(i int) float32 { + if i < 2 { + return math.Float32frombits(uint32(x.a >> (32 * i))) + } + return math.Float32frombits(uint32(x.b >> (32 * (i - 2)))) +} + +func (x *Float32s) set(i int, v float32) { + val := uint64(math.Float32bits(v)) + if i < 2 { + mask := uint64(0xffffffff) << (32 * i) + x.a = (x.a &^ mask) | (val << (32 * i)) + } else { + mask := uint64(0xffffffff) << (32 * (i - 2)) + x.b = (x.b &^ mask) | (val << (32 * (i - 2))) + } +} + +// Abs returns the element-wise absolute value of x. +func (x Float32s) Abs() Float32s { + var res Float32s + for i := 0; i < 4; i++ { + v := x.get(i) + if v < 0 { + res.set(i, -v) + } else { + res.set(i, v) + } + } + return res +} + +// Add returns the element-wise sum of x and y. +func (x Float32s) Add(y Float32s) Float32s { + var res Float32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)+y.get(i)) + } + return res +} + +// ConvertToInt32 converts the vector elements to int32. 
+func (x Float32s) ConvertToInt32() Int32s { + var res Int32s + for i := 0; i < 4; i++ { + res.set(i, int32(x.get(i))) + } + return res +} + +// Div returns the element-wise quotient of x and y. +func (x Float32s) Div(y Float32s) Float32s { + var res Float32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)/y.get(i)) + } + return res +} + +// Equal returns a mask indicating where x and y are equal. +func (x Float32s) Equal(y Float32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) == y.get(i) { + res.set(i, true) + } + } + return res +} + +// Greater returns a mask indicating where x is greater than y. +func (x Float32s) Greater(y Float32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) > y.get(i) { + res.set(i, true) + } + } + return res +} + +// GreaterEqual returns a mask indicating where x is greater than or equal to y. +func (x Float32s) GreaterEqual(y Float32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) >= y.get(i) { + res.set(i, true) + } + } + return res +} + +// Len returns the number of elements in the vector. +func (x Float32s) Len() int { + return 4 +} + +// Less returns a mask indicating where x is less than y. +func (x Float32s) Less(y Float32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) < y.get(i) { + res.set(i, true) + } + } + return res +} + +// LessEqual returns a mask indicating where x is less than or equal to y. +func (x Float32s) LessEqual(y Float32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) <= y.get(i) { + res.set(i, true) + } + } + return res +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. +func (x Float32s) Masked(mask Mask32s) Float32s { + return Float32s{a: x.a & mask.a, b: x.b & mask.b} +} + +// Max returns the element-wise maximum of x and y. 
+func (x Float32s) Max(y Float32s) Float32s { + var res Float32s + for i := 0; i < 4; i++ { + vx := x.get(i) + vy := y.get(i) + if vx > vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Float32s) IfElse(mask Mask32s, y Float32s) Float32s { + return Float32s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Min returns the element-wise minimum of x and y. +func (x Float32s) Min(y Float32s) Float32s { + var res Float32s + for i := 0; i < 4; i++ { + vx := x.get(i) + vy := y.get(i) + if vx < vy { + res.set(i, vx) + } else { + res.set(i, vy) + } + } + return res +} + +// Mul returns the element-wise product of x and y. +func (x Float32s) Mul(y Float32s) Float32s { + var res Float32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)*y.get(i)) + } + return res +} + +// MulAdd returns x * y + z element-wise. +func (x Float32s) MulAdd(y, z Float32s) Float32s { + var res Float32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)+y.get(i)*z.get(i)) + } + return res +} + +// Neg returns the element-wise negation of x. +func (x Float32s) Neg() Float32s { + var res Float32s + for i := 0; i < 4; i++ { + res.set(i, -(x.get(i))) + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. +func (x Float32s) NotEqual(y Float32s) Mask32s { + var res Mask32s + for i := 0; i < 4; i++ { + if x.get(i) != y.get(i) { + res.set(i, true) + } + } + return res +} + +// Sqrt returns the element-wise square root of x. +func (x Float32s) Sqrt() Float32s { + var res Float32s + for i := 0; i < 4; i++ { + res.set(i, float32(math.Sqrt(float64(x.get(i))))) + } + return res +} + +// Store stores the vector elements into the slice s. +func (x Float32s) Store(s []float32) { + for i := 0; i < 4 && i < len(s); i++ { + s[i] = x.get(i) + } +} + +// StorePart stores a partial vector into the slice s. 
+func (x Float32s) StorePart(s []float32) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Float32s) String() string { + var parts [4]float32 + for i := 0; i < 4; i++ { + parts[i] = x.get(i) + } + return fmt.Sprint(parts) +} + +// Sub returns the element-wise difference of x and y. +func (x Float32s) Sub(y Float32s) Float32s { + var res Float32s + for i := 0; i < 4; i++ { + res.set(i, x.get(i)-y.get(i)) + } + return res +} + +// ToBits reinterprets the vector bits as a Uint32s vector. +func (x Float32s) ToBits() Uint32s { + return Uint32s{a: x.a, b: x.b} +} + +// Float64s represents a 128-bit vector of 2 float64 elements. +type Float64s struct { + _ _simd + a, b uint64 +} + +// LoadFloat64s loads a slice of float64 into an Float64s vector. +func LoadFloat64s(s []float64) Float64s { + var a, b uint64 + a = math.Float64bits(s[0]) + b = math.Float64bits(s[1]) + return Float64s{a: a, b: b} +} + +// LoadFloat64sPart loads a partial slice of float64 into an Float64s vector. +func LoadFloat64sPart(s []float64) (Float64s, int) { + n := len(s) + var a, b uint64 + if n > 0 { + a = math.Float64bits(s[0]) + } + if n > 1 { + b = math.Float64bits(s[1]) + } + return Float64s{a: a, b: b}, n +} + +func (x Float64s) get(i int) float64 { + if i == 0 { + return math.Float64frombits(x.a) + } + return math.Float64frombits(x.b) +} + +func (x *Float64s) set(i int, v float64) { + if i == 0 { + x.a = math.Float64bits(v) + } else { + x.b = math.Float64bits(v) + } +} + +// Abs returns the element-wise absolute value of x. +func (x Float64s) Abs() Float64s { + var res Float64s + for i := 0; i < 4; i++ { + v := x.get(i) + if v < 0 { + res.set(i, -v) + } else { + res.set(i, v) + } + } + return res +} + +// Add returns the element-wise sum of x and y. +func (x Float64s) Add(y Float64s) Float64s { + var res Float64s + res.set(0, x.get(0)+y.get(0)) + res.set(1, x.get(1)+y.get(1)) + return res +} + +// Div returns the element-wise quotient of x and y. 
+func (x Float64s) Div(y Float64s) Float64s { + var res Float64s + res.set(0, x.get(0)/y.get(0)) + res.set(1, x.get(1)/y.get(1)) + return res +} + +// Equal returns a mask indicating where x and y are equal. +func (x Float64s) Equal(y Float64s) Mask64s { + var res Mask64s + if x.get(0) == y.get(0) { + res.a = ^uint64(0) + } + if x.get(1) == y.get(1) { + res.b = ^uint64(0) + } + return res +} + +// Greater returns a mask indicating where x is greater than y. +func (x Float64s) Greater(y Float64s) Mask64s { + var res Mask64s + if x.get(0) > y.get(0) { + res.a = ^uint64(0) + } + if x.get(1) > y.get(1) { + res.b = ^uint64(0) + } + return res +} + +// GreaterEqual returns a mask indicating where x is greater than or equal to y. +func (x Float64s) GreaterEqual(y Float64s) Mask64s { + var res Mask64s + if x.get(0) >= y.get(0) { + res.a = ^uint64(0) + } + if x.get(1) >= y.get(1) { + res.b = ^uint64(0) + } + return res +} + +// Len returns the number of elements in the vector. +func (x Float64s) Len() int { + return 2 +} + +// Less returns a mask indicating where x is less than y. +func (x Float64s) Less(y Float64s) Mask64s { + var res Mask64s + if x.get(0) < y.get(0) { + res.a = ^uint64(0) + } + if x.get(1) < y.get(1) { + res.b = ^uint64(0) + } + return res +} + +// LessEqual returns a mask indicating where x is less than or equal to y. +func (x Float64s) LessEqual(y Float64s) Mask64s { + var res Mask64s + if x.get(0) <= y.get(0) { + res.a = ^uint64(0) + } + if x.get(1) <= y.get(1) { + res.b = ^uint64(0) + } + return res +} + +// Masked returns a new vector with elements from x where mask is true, and zero elsewhere. +func (x Float64s) Masked(mask Mask64s) Float64s { + return Float64s{a: x.a & mask.a, b: x.b & mask.b} +} + +// Max returns the element-wise maximum of x and y. 
+func (x Float64s) Max(y Float64s) Float64s { + var res Float64s + vx := x.get(0) + vy := y.get(0) + if vx > vy { + res.set(0, vx) + } else { + res.set(0, vy) + } + vx = x.get(1) + vy = y.get(1) + if vx > vy { + res.set(1, vx) + } else { + res.set(1, vy) + } + return res +} + +// IfElse returns a new vector with elements from x where mask is true, and y where mask is false. +func (x Float64s) IfElse(mask Mask64s, y Float64s) Float64s { + return Float64s{ + a: (x.a & mask.a) | (y.a &^ mask.a), + b: (x.b & mask.b) | (y.b &^ mask.b), + } +} + +// Min returns the element-wise minimum of x and y. +func (x Float64s) Min(y Float64s) Float64s { + var res Float64s + vx := x.get(0) + vy := y.get(0) + if vx < vy { + res.set(0, vx) + } else { + res.set(0, vy) + } + vx = x.get(1) + vy = y.get(1) + if vx < vy { + res.set(1, vx) + } else { + res.set(1, vy) + } + return res +} + +// Mul returns the element-wise product of x and y. +func (x Float64s) Mul(y Float64s) Float64s { + var res Float64s + res.set(0, x.get(0)*y.get(0)) + res.set(1, x.get(1)*y.get(1)) + return res +} + +// MulAdd returns x * y + z element-wise. +func (x Float64s) MulAdd(y, z Float64s) Float64s { + var res Float64s + res.set(0, x.get(0)+y.get(0)*z.get(0)) + res.set(1, x.get(1)+y.get(1)*z.get(1)) + return res +} + +// Neg returns the element-wise negation of x. +func (x Float64s) Neg() Float64s { + var res Float64s + for i := 0; i < 4; i++ { + res.set(i, -(x.get(i))) + } + return res +} + +// NotEqual returns a mask indicating where x and y are not equal. +func (x Float64s) NotEqual(y Float64s) Mask64s { + var res Mask64s + if x.get(0) != y.get(0) { + res.a = ^uint64(0) + } + if x.get(1) != y.get(1) { + res.b = ^uint64(0) + } + return res +} + +// Sqrt returns the element-wise square root of x. +func (x Float64s) Sqrt() Float64s { + var res Float64s + res.set(0, math.Sqrt(x.get(0))) + res.set(1, math.Sqrt(x.get(1))) + return res +} + +// Store stores the vector elements into the slice s. 
+func (x Float64s) Store(s []float64) { + if len(s) > 0 { + s[0] = x.get(0) + } + if len(s) > 1 { + s[1] = x.get(1) + } +} + +// StorePart stores a partial vector into the slice s. +func (x Float64s) StorePart(s []float64) { + x.Store(s) +} + +// String returns a string representation of the vector. +func (x Float64s) String() string { + return fmt.Sprint([2]float64{x.get(0), x.get(1)}) +} + +// Sub returns the element-wise difference of x and y. +func (x Float64s) Sub(y Float64s) Float64s { + var res Float64s + res.set(0, x.get(0)-y.get(0)) + res.set(1, x.get(1)-y.get(1)) + return res +} + +// ToBits reinterprets the vector bits as a Uint64s vector. +func (x Float64s) ToBits() Uint64s { + return Uint64s{a: x.a, b: x.b} +} + +// Mask8s represents a 128-bit mask vector for 16 int8/uint8 elements. +type Mask8s struct { + _ _simd + a, b uint64 +} + +func (x *Mask8s) set(i int, v bool) { + if v { + if i < 8 { + mask := uint64(0xff) << (8 * i) + x.a |= mask + } else { + mask := uint64(0xff) << (8 * (i - 8)) + x.b |= mask + } + } +} + +// And returns the bitwise AND of x and y. +func (x Mask8s) And(y Mask8s) Mask8s { + return Mask8s{a: x.a & y.a, b: x.b & y.b} +} + +// Or returns the bitwise OR of x and y. +func (x Mask8s) Or(y Mask8s) Mask8s { + return Mask8s{a: x.a | y.a, b: x.b | y.b} +} + +// String returns a string representation of the vector. +func (x Mask8s) String() string { + return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b) +} + +// ToInt8s converts the mask to an Int8s vector. +func (x Mask8s) ToInt8s() Int8s { + return Int8s{a: x.a, b: x.b} +} + +// Mask16s represents a 128-bit mask vector for 8 int16/uint16 elements. +type Mask16s struct { + _ _simd + a, b uint64 +} + +func (x *Mask16s) set(i int, v bool) { + if v { + if i < 4 { + mask := uint64(0xffff) << (16 * i) + x.a |= mask + } else { + mask := uint64(0xffff) << (16 * (i - 4)) + x.b |= mask + } + } +} + +// And returns the bitwise AND of x and y. 
+func (x Mask16s) And(y Mask16s) Mask16s { + return Mask16s{a: x.a & y.a, b: x.b & y.b} +} + +// Or returns the bitwise OR of x and y. +func (x Mask16s) Or(y Mask16s) Mask16s { + return Mask16s{a: x.a | y.a, b: x.b | y.b} +} + +// String returns a string representation of the vector. +func (x Mask16s) String() string { + return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b) +} + +// ToInt16s converts the mask to an Int16s vector. +func (x Mask16s) ToInt16s() Int16s { + return Int16s{a: x.a, b: x.b} +} + +// Mask32s represents a 128-bit mask vector for 4 int32/uint32/float32 elements. +type Mask32s struct { + _ _simd + a, b uint64 +} + +func (x *Mask32s) set(i int, v bool) { + if v { + if i < 2 { + mask := uint64(0xffffffff) << (32 * i) + x.a |= mask + } else { + mask := uint64(0xffffffff) << (32 * (i - 2)) + x.b |= mask + } + } +} + +// And returns the bitwise AND of x and y. +func (x Mask32s) And(y Mask32s) Mask32s { + return Mask32s{a: x.a & y.a, b: x.b & y.b} +} + +// Or returns the bitwise OR of x and y. +func (x Mask32s) Or(y Mask32s) Mask32s { + return Mask32s{a: x.a | y.a, b: x.b | y.b} +} + +// String returns a string representation of the vector. +func (x Mask32s) String() string { + return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b) +} + +// ToInt32s converts the mask to an Int32s vector. +func (x Mask32s) ToInt32s() Int32s { + return Int32s{a: x.a, b: x.b} +} + +// Mask64s represents a 128-bit mask vector for 2 int64/uint64/float64 elements. +type Mask64s struct { + _ _simd + a, b uint64 +} + +func (x *Mask64s) set(i int, v bool) { + if v { + if i == 0 { + x.a = ^uint64(0) + } else { + x.b = ^uint64(0) + } + } +} + +// And returns the bitwise AND of x and y. +func (x Mask64s) And(y Mask64s) Mask64s { + return Mask64s{a: x.a & y.a, b: x.b & y.b} +} + +// Or returns the bitwise OR of x and y. +func (x Mask64s) Or(y Mask64s) Mask64s { + return Mask64s{a: x.a | y.a, b: x.b | y.b} +} + +// String returns a string representation of the vector. 
+func (x Mask64s) String() string { + return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b) +} + +// ToInt64s converts the mask to an Int64s vector. +func (x Mask64s) ToInt64s() Int64s { + return Int64s{a: x.a, b: x.b} +} + +func newT(lo, hi uint64) Uint64s { + return Uint64s{a: lo, b: hi} +} + +// mwl returns the 128-bit product of the lower halves of x and y +func (x Uint64s) mwl(y Uint64s) Uint64s { + hi, lo := bits.Mul64(x.a, y.a) + return Uint64s{a: lo, b: hi} +} + +var ( + m1 = newT(0x1084210842108421, 0x2108421084210842) + m2 = newT(0x2108421084210842, 0x4210842108421084) + m3 = newT(0x4210842108421084, 0x8421084210842108) + m4 = newT(0x8421084210842108, 0x0842108421084210) + m5 = newT(0x0842108421084210, 0x1084210842108421) +) + +func (x Uint64s) clmul(y Uint64s) Uint64s { + x1 := x.And(m1) + x2 := x.And(m2) + x3 := x.And(m3) + x4 := x.And(m4) + x5 := x.And(m5) + + y1 := y.And(m1) + y2 := y.And(m2) + y3 := y.And(m3) + y4 := y.And(m4) + y5 := y.And(m5) + + // sum of x, y indices == K mod 5; mask index = K-1 + z := (x1.mwl(y1)).Xor(x2.mwl(y5)).Xor(x5.mwl(y2)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).And(m1) + z = (x4.mwl(y4)).Xor(x3.mwl(y5)).Xor(x5.mwl(y3)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m2).Or(z) + z = (x2.mwl(y2)).Xor(x4.mwl(y5)).Xor(x5.mwl(y4)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m3).Or(z) + z = (x5.mwl(y5)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m4).Or(z) + z = (x3.mwl(y3)).Xor(x1.mwl(y5)).Xor(x5.mwl(y1)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).And(m5).Or(z) + + return z +} + +// CarrylessMultiplyEven computes the carryless +// multiplications of selected even halves of the elements of x and y. 
+// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +func (x Uint64s) CarrylessMultiplyEven(y Uint64s) Uint64s { + return x.clmul(y) +} + +// CarrylessMultiplyOdd computes the carryless +// multiplications of selected odd halves of the elements of x and y. +// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +func (x Uint64s) CarrylessMultiplyOdd(y Uint64s) Uint64s { + x.a = x.b + y.a = y.b + return x.clmul(y) +}
diff --git a/src/simd/internal/bridge/tofrom_amd64.go b/src/simd/internal/bridge/tofrom_amd64.go index 6e814f4..fa2878f 100644 --- a/src/simd/internal/bridge/tofrom_amd64.go +++ b/src/simd/internal/bridge/tofrom_amd64.go
@@ -8,6 +8,8 @@ import "simd/archsimd" +// For amd64, handle the larger types not mentioned in tofrom_128.go + func (x Float32x16) ToArch() any { return archsimd.Float32x16(x) }
diff --git a/src/simd/internal/bridge/tofrom_emulated.go b/src/simd/internal/bridge/tofrom_emulated.go new file mode 100644 index 0000000..4dc3bea --- /dev/null +++ b/src/simd/internal/bridge/tofrom_emulated.go
@@ -0,0 +1,63 @@ +// Copyright 2026 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && (amd64 || wasm || arm64) + +package bridge + +func (x Float32s) ToArch() any { + return x +} + +func (x Float64s) ToArch() any { + return x +} + +func (x Int16s) ToArch() any { + return x +} + +func (x Int32s) ToArch() any { + return x +} + +func (x Int64s) ToArch() any { + return x +} + +func (x Int8s) ToArch() any { + return x +} + +func (x Mask16s) ToArch() any { + return x +} + +func (x Mask32s) ToArch() any { + return x +} + +func (x Mask64s) ToArch() any { + return x +} + +func (x Mask8s) ToArch() any { + return x +} + +func (x Uint16s) ToArch() any { + return x +} + +func (x Uint32s) ToArch() any { + return x +} + +func (x Uint64s) ToArch() any { + return x +} + +func (x Uint8s) ToArch() any { + return x +}
diff --git a/src/simd/midway_amd64.go b/src/simd/midway_amd64.go index 78acde3..8f37f65 100644 --- a/src/simd/midway_amd64.go +++ b/src/simd/midway_amd64.go
@@ -7,58 +7,28 @@ package simd import ( - "fmt" - "os" + "internal/cpu" "simd/archsimd" - "strconv" ) -var maxVectorSize int +const archHasHwClmul = true -func init() { - actualMax := archMaxVectorSize() - if gosimd := os.Getenv("GOSIMD"); gosimd != "" { - val, err := strconv.Atoi(gosimd) - if err != nil { - panic(fmt.Errorf("Could not parse GOSIMD(='%s') as a decimal number, %v", gosimd, err)) - } - if val > actualMax { - panic(fmt.Errorf("Requested GOSIMD(='%d') is larger than the simd length (%d) supported on this cpu ", val, actualMax)) - } - if val < 0 { - panic(fmt.Errorf("Requested GOSIMD(='%d') is negative", val)) - } - maxVectorSize = val - return - } - maxVectorSize = actualMax -} - -// VectorBitSize returns the bit length of the longest vector available -// on the current hardware. For amd64, this is 128, 256, or 512, depending -// on the hardware. It can be artificially reduced by setting the -// GOSIMD environment variable before running a program. -func VectorBitSize() int { - return maxVectorSize -} - -// Emulated returns whether simd operations are emulated or -// running on actual vector hardware. -func Emulated() bool { - return false -} - -func archMaxVectorSize() int { - if archsimd.X86.AVX512() { - return 512 +func archMaxVectorSize() (size, allFeatureSize int) { + if archsimd.X86.AVX() { + size = 128 + allFeatureSize = 128 } if archsimd.X86.AVX2() { - return 256 + size = 256 + if cpu.X86.HasVPCLMULQDQ { + allFeatureSize = 256 + } } - // AVX has 256 bit float ops but only 128-bit integer ops - // therefore it is 128. - if archsimd.X86.AVX() { - return 128 + if archsimd.X86.AVX512() { + size = 512 + if cpu.X86.HasAVX512VPCLMULQDQ { + allFeatureSize = 512 + } } - return 0 + return }
diff --git a/src/simd/midway_arm64.go b/src/simd/midway_arm64.go index 80f24cd..a138131 100644 --- a/src/simd/midway_arm64.go +++ b/src/simd/midway_arm64.go
@@ -6,14 +6,17 @@ package simd -// VectorBitSize returns the bit length of the longest vector available -// on the current hardware. For arm64-neon, this is 128. -func VectorBitSize() int { - return 128 -} +import ( + "internal/cpu" +) -// Emulated returns whether simd operations are emulated or -// running on actual vector hardware. -func Emulated() bool { - return false +const archHasHwClmul = true + +func archMaxVectorSize() (size, allFeatureSize int) { + // This describes Neon, SVE is still TBD. + size = 128 + if cpu.ARM64.HasPMULL { + allFeatureSize = 128 + } + return }
diff --git a/src/simd/midway_common.go b/src/simd/midway_common.go new file mode 100644 index 0000000..aa6e509 --- /dev/null +++ b/src/simd/midway_common.go
@@ -0,0 +1,138 @@ +// Copyright 2026 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && (amd64 || arm64 || wasm) + +package simd + +import ( + "fmt" + "internal/godebug" + "strconv" +) + +// The `simd` package provides an architecture and vector-length agnostic API +// for single-instruction-multiple-data "SIMD" vectors and operations. The +// functions and methods in this package are those that can be mostly supported +// in hardware, combined with an emulation for those platforms that are not yet +// supported. +// +// Users can also control emulation and vector length with the 'simd' GODEBUG +// setting. GODEBUG=simd=0 requests emulation, not hardware SIMD, even if +// hardware is available. On platforms that may support multiple vector +// lengths, GODEBUG=simd=N (N=128, 256, or 512) requests a specific vector +// length. If the request cannot be satisfied, the simd package panics +// informatively. +// +// Some platforms may support vectors of a particular length, but not all of the +// expected operations (those appearing in this package) are available at that +// length. In that case, the default is to automatically downgrade to a length +// where the operations are supported, perhaps even to emulated-only +// (size=0). If a size is requested that is not compatible with the available +// features, the simd package will panic (and note the reason). To override +// the feature check, in the case that the user knows that the missing +// operations will not be used, prefix the size request with a '+', for +// example "GODEBUG=simd=+256". A plain '+' will override the feature check at +// whatever the hardware's default vector size happens to be. 
+ +var simd = godebug.New("#simd") + +var maxVectorSize int +var emulated = false +var hwClmul = true + +func init() { + actualMax, allFeatureSize := archMaxVectorSize() // zero == no simd, zero == features unavailable + gosimd := simd.Value() + explicitRequest := false + + // No SIMD, must emulate + if actualMax == 0 { + maxVectorSize = 128 + emulated = true + hwClmul = false + return + } + + maxVectorSize = actualMax + + // If gosimd begins with a '+' or is a single '1' then override + // any hardware feature check disabling of hardware SIMD. + // The '+' may be followed by a size, expected to be 0, 128, 256, 512. + // If it is zero (e.g., "0" or +0") then hardware SIMD is still disabled. + if len(gosimd) > 0 && gosimd[0] == '+' { + // override feature reduction + // keep maxVectorSize + // emulated remains false + // note if features missing. + hwClmul = allFeatureSize < actualMax + gosimd = gosimd[1:] + explicitRequest = true + + } else if allFeatureSize < actualMax { + if allFeatureSize > 0 { + maxVectorSize = allFeatureSize + hwClmul = true + emulated = false + } else { + maxVectorSize = 128 + hwClmul = false + emulated = true + } + } + + if gosimd == "" { + return + } + + // possible adjustment to chosen size + val, err := strconv.Atoi(gosimd) + if err != nil { + panic(fmt.Errorf("Could not parse GODEBUG=gosimd='%s' as a decimal number, %v", gosimd, err)) + } + if val > actualMax { + panic(fmt.Errorf("Requested GODEBUG=gosimd=%d is larger than the simd length (%d) supported on this cpu ", val, actualMax)) + } + if !explicitRequest && val > allFeatureSize { + panic(fmt.Errorf("Requested GODEBUG=gosimd=%d is larger than the simd length required for expected features (%d) on this cpu. 
GODEBUG=gosimd='+%d' will skip this check.", val, allFeatureSize, val)) + } + if val < 0 { + panic(fmt.Errorf("Requested GODEBUG=gosimd=%d is negative", val)) + } + // user-requested emulation + if val == 0 { + maxVectorSize = 128 + hwClmul = false + emulated = true + return + } + + hwClmul = allFeatureSize >= val + maxVectorSize = val + emulated = false + return +} + +// VectorBitSize returns the bit length of the longest vector available +// on the current hardware. It can be artificially reduced by setting +// GODEBUG=simd=<smaller size> environment variable before running a program. +func VectorBitSize() int { + return maxVectorSize +} + +// Emulated returns whether simd operations are emulated or +// running on actual vector hardware. +func Emulated() bool { + return emulated +} + +// HasHardwareCarrylessMultiply returns whether this platform +// as a hardware-implemented version of carryless multiply. +// With default GODEBUG=simd settings, if this is false, +// it is emulated and merely slow, but with non-default settings +// this can indicate the possibility of a missing instruction +// that will fail ("SIGILL") if it is executed. +func HasHardwareCarrylessMultiply() bool { + return hwClmul && archHasHwClmul +}
diff --git a/src/simd/midway_wasm.go b/src/simd/midway_wasm.go index e3c1ce0..8b0673f 100644 --- a/src/simd/midway_wasm.go +++ b/src/simd/midway_wasm.go
@@ -6,14 +6,8 @@ package simd -// VectorBitSize returns the bit length of the longest vector available -// on the current hardware. For wasm, this is 128. -func VectorBitSize() int { - return 128 -} +const archHasHwClmul = false -// Emulated returns whether simd operations are emulated or -// running on actual vector hardware. -func Emulated() bool { - return false +func archMaxVectorSize() (size, allFeatureSize int) { + return 128, 128 }
diff --git a/src/simd/simd.go b/src/simd/simd.go index a85181f..b4a298e 100644 --- a/src/simd/simd.go +++ b/src/simd/simd.go
@@ -758,6 +758,38 @@ // BitsToInt64 reinterprets the vector bits as an Int64s vector. func (x Uint64s) BitsToInt64() Int64s +// CarrylessMultiplyEven computes the carryless +// multiplications of selected even indexed elements of x and y. +// Each product is 128 bits wide and fills the corresponding +// even-odd pairs in the result. +// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +func (x Uint64s) CarrylessMultiplyEven(y Uint64s) Uint64s + +// CarrylessMultiplyOdd computes the carryless +// multiplications of selected odd indexed elements of x and y. +// Each product is 128 bits wide and fills the corresponding +// even-odd pairs in the result. +// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +func (x Uint64s) CarrylessMultiplyOdd(y Uint64s) Uint64s + // ConvertToInt64 converts the vector elements to int64. func (x Uint64s) ConvertToInt64() Int64s
diff --git a/src/simd/simd_emulated.go b/src/simd/simd_emulated.go index 128a091..995f6fa 100644 --- a/src/simd/simd_emulated.go +++ b/src/simd/simd_emulated.go
@@ -9,6 +9,7 @@
 import (
 	"fmt"
 	"math"
+	"math/bits"
 )
 
 // VectorSize returns the bit length of the emulated vector (fixed to 128).
@@ -21,6 +22,14 @@
 	return true
 }
 
+// EmulatedCarrylessMultiply returns whether CarrylessMultiply is emulated.
+// This sometimes matters to choice of algorithm (e.g., when computing CRC).
+// The emulation's execution time does not depend on its inputs, so it is
+// okay in that sense.
+func EmulatedCarrylessMultiply() bool {
+	return true
+}
+
 type _simd struct {
 	_ [0]func(*_simd) *_simd
 }
@@ -3144,3 +3153,92 @@
 func (x Mask64s) ToInt64s() Int64s {
 	return Int64s{a: x.a, b: x.b}
 }
+
+// newT builds a Uint64s whose low 64 bits are lo and whose high 64 bits are hi.
+func newT(lo, hi uint64) Uint64s {
+	return Uint64s{a: lo, b: hi}
+}
+
+// mwl returns the full 128-bit integer product of the lower halves of x
+// and y: the low word of the product lands in a, the high word in b.
+func (x Uint64s) mwl(y Uint64s) Uint64s {
+	hi, lo := bits.Mul64(x.a, y.a)
+	return Uint64s{a: lo, b: hi}
+}
+
+var (
+	// For mK, bits J such that J mod 5 == K are set, with J running over
+	// all 128 bits (0-63 in the low word, 64-127 in the high word).
+	m0 = newT(0x1084210842108421, 0x2108421084210842)
+	m1 = newT(0x2108421084210842, 0x4210842108421084)
+	m2 = newT(0x4210842108421084, 0x8421084210842108)
+	m3 = newT(0x8421084210842108, 0x0842108421084210)
+	m4 = newT(0x0842108421084210, 0x1084210842108421)
+)
+
+// clmul returns the 128-bit carryless product of the low 64-bit halves of
+// x and y, built from ordinary integer multiplies (mwl).
+//
+// Each operand is split into five pieces holding the bits J with the same
+// J mod 5. A piece has at most 13 bits set, so in the integer product of
+// two pieces at most 13 one-bit products are added at any position and the
+// carries stay inside the four-bit gap below the next position of the same
+// class. The low bit of each sum is the XOR wanted; the masks select it.
+func (x Uint64s) clmul(y Uint64s) Uint64s {
+	x0 := x.And(m0)
+	x1 := x.And(m1)
+	x2 := x.And(m2)
+	x3 := x.And(m3)
+	x4 := x.And(m4)
+
+	y0 := y.And(m0)
+	y1 := y.And(m1)
+	y2 := y.And(m2)
+	y3 := y.And(m3)
+	y4 := y.And(m4)
+
+	// sum of x, y indices == K mod 5; mask index = K
+	z := (x0.mwl(y0)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m0)
+	z = (x3.mwl(y3)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).Xor(x0.mwl(y1)).Xor(x1.mwl(y0)).And(m1).Or(z)
+	z = (x1.mwl(y1)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).Xor(x0.mwl(y2)).Xor(x2.mwl(y0)).And(m2).Or(z)
+	z = (x4.mwl(y4)).Xor(x0.mwl(y3)).Xor(x3.mwl(y0)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m3).Or(z)
+	z = (x2.mwl(y2)).Xor(x0.mwl(y4)).Xor(x4.mwl(y0)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m4).Or(z)
+
+	return z
+}
+
+// CarrylessMultiplyEven computes the carryless
+// multiplications of selected even halves of the elements of x and y.
+// The result fills the 128 bits of each even-odd pair.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+func (x Uint64s) CarrylessMultiplyEven(y Uint64s) Uint64s {
+	return x.clmul(y)
+}
+
+// CarrylessMultiplyOdd computes the carryless
+// multiplications of selected odd halves of the elements of x and y.
+// The result fills the 128 bits of each even-odd pair.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+//
+//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+func (x Uint64s) CarrylessMultiplyOdd(y Uint64s) Uint64s {
+	x.a = x.b
+	y.a = y.b
+	return x.clmul(y)
+}
diff --git a/src/simd/testdata/ip/sum_amd64.go b/src/simd/testdata/ip/sum_amd64.go index 5c936b5..64f7657 100644 --- a/src/simd/testdata/ip/sum_amd64.go +++ b/src/simd/testdata/ip/sum_amd64.go
@@ -24,6 +24,8 @@ return b.GetLo().GetElem(0) + b.GetHi().GetElem(0) case archsimd.Float32x4: return boringSum(simd.Float32sFromArch(a)) + default: + return boringSum(x) } panic("nope") }
diff --git a/test/codegen/simd_arm64.go b/test/codegen/simd_arm64.go index 5386ad2..1acbb1d 100644 --- a/test/codegen/simd_arm64.go +++ b/test/codegen/simd_arm64.go
@@ -95,10 +95,10 @@ } func foldGetHiSetHiShifts(x archsimd.Uint32x4) archsimd.Uint16x8 { - shrN := x.ShiftRightNarrowConst(16) // arm64: `VSHRN [$]16, V0.S4, V[0-9]+.H4` - trunc := x.ShiftRightNarrowConst(0) // arm64: `VXTN V0.S4, V[0-9]+.H4` -`VSHRN` - shlLo := x.ShiftLeftLoLongConst(1) // arm64: `VUSHLL [$]1, V0.S2, V[0-9]+.D2` - shlHi := x.GetHi().ShiftLeftLoLongConst(1) // arm64: `VUSHLL2 [$]1, V0.S4, V[0-9]+.D2` -`VDUP` + shrN := x.ShiftRightNarrowConst(16) // arm64: `VSHRN [$]16, V0.S4, V[0-9]+.H4` + trunc := x.ShiftRightNarrowConst(0) // arm64: `VXTN V0.S4, V[0-9]+.H4` -`VSHRN` + shlLo := x.ShiftLeftWidenLoConst(1) // arm64: `VUSHLL [$]1, V0.S2, V[0-9]+.D2` + shlHi := x.GetHi().ShiftLeftWidenLoConst(1) // arm64: `VUSHLL2 [$]1, V0.S4, V[0-9]+.D2` -`VDUP` sum := shrN.Add(trunc) combined := sum.SetHi(x.ShiftRightNarrowConst(15)) // arm64: `VSHRN2 [$]15, V0.S4, V[0-9]+.H8` -`VMOV.*D\[` sinkU64 = shlLo.Sub(shlHi) @@ -106,13 +106,19 @@ } func foldGetHiSetHiMuls(a, b archsimd.Uint16x8) archsimd.Uint16x8 { - wLo := a.MulLoLong(b) // arm64: `VUMULL V0.H4, V1.H4, V[0-9].S4` - wHi := a.GetHi().MulLoLong(b.GetHi()) // arm64: `VUMULL2 V1.H8, V0.H8, V[0-9].S4` -`VDUP` + wLo := a.MulWidenLo(b) // arm64: `VUMULL V0.H4, V1.H4, V[0-9].S4` + wHi := a.GetHi().MulWidenLo(b.GetHi()) // arm64: `VUMULL2 V1.H8, V0.H8, V[0-9].S4` -`VDUP` wHiRight := wHi.ShiftRightNarrowConst(16) // arm64: -`.*` wLoRight := wLo.ShiftRightNarrowConst(16) // arm64: `VSHRN [$]16, V[0-9]+.S4, V0.H4` return wLoRight.SetHi(wHiRight) // arm64: `VSHRN2 [$]16, V[0-9]+.S4, V0.H8` -`VMOV.*D\[` } +func carrylessMultiplies(x, y archsimd.Uint64x2) archsimd.Uint64x2 { + lo := x.CarrylessMultiplyEven(y) // arm64:`VPMULL V` -`VPMULL2` + hi := x.CarrylessMultiplyOdd(y) // arm64:`VPMULL2 V` -`VPMULL ` + return lo.Xor(hi) +} + func mergeWithNotMask(x, y archsimd.Int8x16, mask archsimd.Mask8x16, f1, f2 archsimd.Float32x4) { // arm64:`VBIF` -`VBIT` -`VNOT` sinkI8 = x.IfElse(mask.Not(), y)