| // Copyright 2026 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build goexperiment.simd |
| |
| package simd_test |
| |
| import ( |
| "fmt" |
| "math/bits" |
| "math/rand" |
| "simd" |
| "testing" |
| ) |
| |
| func vpsumd(xlo, xhi, ylo, yhi uint64) (lo, hi uint64) { |
| lo, hi = clmul64(xhi, yhi) |
| l, h := clmul64(xlo, ylo) |
| hi ^= h |
| lo ^= l |
| return |
| } |
| |
// clmul64 computes the carry-less product of a and b: for every set bit i of
// a, b shifted left by i is XORed into a 128-bit accumulator. The accumulator
// is returned as its low and high 64-bit halves.
func clmul64(a, b uint64) (lo, hi uint64) {
	for shift := uint(0); a != 0; shift, a = shift+1, a>>1 {
		if a&1 == 0 {
			continue
		}
		lo ^= b << shift
		// When shift is 0 the count below is 64; Go defines a uint64
		// shifted by its full width as 0, so hi is left unchanged.
		hi ^= b >> (64 - shift)
	}
	return lo, hi
}
| |
| func em2(xlo, xhi, ylo, yhi uint64) string { |
| lx := newT(xlo, 0) |
| ly := newT(ylo, 0) |
| hx := newT(xhi, 0) |
| hy := newT(yhi, 0) |
| |
| z := (lx.ClMul(ly)).Xor(hx.ClMul(hy)) |
| |
| return fmt.Sprintf("0x%08x%08x", z.b, z.a) |
| } |
| |
| func em1(a, b, c, d uint64) string { |
| lo, hi := vpsumd(a, b, c, d) |
| return fmt.Sprintf("0x%08x%08x", hi, lo) |
| } |
| |
| func set0(v uint64) simd.Uint64s { |
| a := [2]uint64{v, 0} |
| r, _ := simd.LoadUint64sPart(a[:]) |
| return r |
| } |
| |
| func get(v simd.Uint64s) (lo, hi uint64) { |
| var a [2]uint64 |
| v.StorePart(a[:]) |
| return a[0], a[1] |
| } |
| |
| func em3(xlo, xhi, ylo, yhi uint64) string { |
| lx := set0(xlo) |
| ly := set0(ylo) |
| hx := set0(xhi) |
| hy := set0(yhi) |
| |
| z := (lx.CarrylessMultiplyEven(ly)).Xor(hx.CarrylessMultiplyEven(hy)) |
| |
| lo, hi := get(z) |
| return fmt.Sprintf("0x%08x%08x", hi, lo) |
| } |
| |
| func TestClMul(t *testing.T) { |
| fmt.Println("Vector length:", simd.VectorBitSize()) |
| fmt.Println("Emulated:", simd.Emulated()) |
| fmt.Println("HasHWCLMUL:", simd.HasHardwareCarrylessMultiply()) |
| |
| x := uint64(0x0807060504030201) |
| y := uint64(0x0101010101010101) |
| |
| var a, b, c, d uint64 |
| a, b, c, d = 0x66b32838754f59a3, 0xaeba319ab2418c50, 0x45678b3c7f11fc73, 0xd62ef8ae5f7b693 |
| |
| fmt.Println("EMULATION 1") |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", x, x, 1, 16, em1(x, x, 1, 16)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", x, y, 1, 16, em1(x, y, 1, 16)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", x, y, x, y, em1(x, y, x, y)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", 1, 2, y*4, y, em1(1, 2, y*4, y)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", a, b, c, d, em1(a, b, c, d)) |
| |
| fmt.Println("EMULATION 2") |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", x, x, 1, 16, em2(x, x, 1, 16)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", x, y, 1, 16, em2(x, y, 1, 16)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", x, y, x, y, em2(x, y, x, y)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", 1, 2, y*4, y, em2(1, 2, y*4, y)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", a, b, c, d, em2(a, b, c, d)) |
| |
| fmt.Println("EMULATION 3") |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", x, x, 1, 16, em3(x, x, 1, 16)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", x, y, 1, 16, em3(x, y, 1, 16)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", x, y, x, y, em3(x, y, x, y)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", 1, 2, y*4, y, em3(1, 2, y*4, y)) |
| fmt.Printf("clmul(0x%08x, 0x%08x, 0x%08x, 0x%08x) = %s\n", a, b, c, d, em3(a, b, c, d)) |
| |
| for i := range 10000 { |
| a, b, c, d := rand.Uint64(), rand.Uint64(), rand.Uint64(), rand.Uint64() |
| |
| e1 := em1(a, b, c, d) |
| e2 := em2(a, b, c, d) |
| e3 := em3(a, b, c, d) |
| |
| if e1 != e2 || e1 != e3 { |
| t.Errorf("Mismatch at %d, a,b,c,d = 0x%08x, 0x%08x, 0x%08x, 0x%08x; e1=%s, e2=%s, e3=%s", i, a, b, c, d, e1, e2, e3) |
| if i > 5 { |
| return |
| } |
| } |
| |
| } |
| |
| } |
| |
// T is a 128-bit value stored as two 64-bit halves: a is the low half and b
// is the high half.
type T struct {
	a, b uint64
}

// newT returns the T whose low half is lo and whose high half is hi.
func newT(lo, hi uint64) T {
	var v T
	v.a, v.b = lo, hi
	return v
}

// And returns the bitwise AND of x and y, taken half by half.
func (x T) And(y T) T {
	x.a &= y.a
	x.b &= y.b
	return x
}

// Xor returns the bitwise XOR of x and y, taken half by half.
func (x T) Xor(y T) T {
	x.a ^= y.a
	x.b ^= y.b
	return x
}

// Or returns the bitwise OR of x and y, taken half by half.
func (x T) Or(y T) T {
	x.a |= y.a
	x.b |= y.b
	return x
}

// MWL (MulWidenLo) multiplies the low halves of x and y as unsigned integers
// and returns the full 128-bit product. The high halves of x and y are
// ignored.
func (x T) MWL(y T) T {
	var product T
	product.b, product.a = bits.Mul64(x.a, y.a)
	return product
}

// ClMul returns the 128-bit carry-less product of the low halves of x and y,
// built from ordinary integer multiplies. The high halves of x and y do not
// affect the result, because MWL only reads the low halves.
//
// Each operand is split into five slices, where slice r keeps only the bits
// whose position is congruent to r modulo 5. Multiplying slice i of x by
// slice j of y puts every partial product at a position congruent to i+j
// modulo 5. A slice has at most 13 set bits in its low half, so at most 13
// partial products land on one position; that count fits in the four zero
// bits above it and never carries into the next position of the same class.
// The bit at each such position is therefore the parity of the count, which
// is the carry-less product bit. For each residue r the five slice products
// with i+j congruent to r are XORed together, masked to residue r, and ORed
// into the result.
func (x T) ClMul(y T) T {
	// masks[r] selects the bits of a 128-bit value at positions congruent
	// to r modulo 5.
	masks := [5]T{
		newT(0x1084210842108421, 0x2108421084210842),
		newT(0x2108421084210842, 0x4210842108421084),
		newT(0x4210842108421084, 0x8421084210842108),
		newT(0x8421084210842108, 0x0842108421084210),
		newT(0x0842108421084210, 0x1084210842108421),
	}

	var xs, ys [5]T
	for r, m := range masks {
		xs[r] = x.And(m)
		ys[r] = y.And(m)
	}

	var z T
	for r, m := range masks {
		var acc T
		for i := range xs {
			// Pick j so that i+j is congruent to r modulo 5.
			j := (r + 5 - i) % 5
			acc = acc.Xor(xs[i].MWL(ys[j]))
		}
		z = z.Or(acc.And(m))
	}
	return z
}