// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| |
| package main |
| |
| import ( |
| "fmt" |
| "os" |
| "simd/archsimd" |
| "unsafe" |
| ) |
| |
| func load(s []float64) archsimd.Float64x4 { |
| return archsimd.LoadFloat64x4((*[4]float64)(s[:4])) |
| } |
| |
// S1 is an alias for archsimd.Float64x4: it names the identical type, so
// values of S1 have Float64x4's methods directly.
type S1 = archsimd.Float64x4
| |
// S2 is a distinct defined type whose underlying type is that of
// archsimd.Float64x4. As a new defined type it does not carry over
// Float64x4's methods; its own methods convert to Float64x4 to do the work.
type S2 archsimd.Float64x4
| |
| func (s S2) Len() int { |
| return archsimd.Float64x4(s).Len() |
| } |
| |
| func (s S2) Load(a []float64) S2 { |
| return S2(load(a)) |
| } |
| |
| func (s S2) Store(a *[4]float64) { |
| archsimd.Float64x4(s).Store(a) |
| } |
| |
| func (s S2) Add(a S2) S2 { |
| return S2(archsimd.Float64x4(s).Add(archsimd.Float64x4(a))) |
| } |
| |
| func (s S2) Mul(a S2) S2 { |
| return S2(archsimd.Float64x4(s).Mul(archsimd.Float64x4(a))) |
| } |
| |
// S3 wraps archsimd.Float64x4 by embedding it, so the vector's methods are
// promoted and can be called directly on an S3 value.
type S3 struct {
	archsimd.Float64x4
}
| |
// ip64_0 is the scalar reference implementation: it returns the inner
// product of a and b, visiting every index of a. b is indexed with the same
// indices, so it panics if b is shorter than a.
func ip64_0(a, b []float64) float64 {
	var total float64
	for i, x := range a {
		total += x * b[i]
	}
	return total
}
| |
| func ip64_1(a, b []float64) float64 { |
| var z S1 |
| sum := z |
| var i int |
| stride := z.Len() |
| for ; i <= len(a)-stride; i += stride { |
| va := load(a[i:]) |
| vb := load(b[i:]) |
| sum = sum.Add(va.Mul(vb)) |
| } |
| var tmp [4]float64 |
| sum.Store(&tmp) |
| return tmp[0] + tmp[1] + tmp[2] + tmp[3] |
| } |
| |
| func ip64_1a(a, b []float64) float64 { |
| var z S1 |
| sum := z |
| var i int |
| stride := z.Len() |
| for ; i <= len(a)-stride; i += stride { |
| va := load(a[i:]) |
| vb := load(b[i:]) |
| sum = FMA(sum, va, vb) |
| } |
| var tmp [4]float64 |
| sum.Store(&tmp) |
| return tmp[0] + tmp[1] + tmp[2] + tmp[3] |
| } |
| |
| //go:noinline |
| func FMA(a, b, c archsimd.Float64x4) archsimd.Float64x4 { |
| return a.Add(b.Mul(c)) |
| } |
| |
| func ip64_2(a, b []float64) float64 { |
| var z S2 |
| sum := z |
| var i int |
| stride := z.Len() |
| for ; i <= len(a)-stride; i += stride { |
| va := z.Load(a[i:]) |
| vb := z.Load(b[i:]) |
| sum = sum.Add(va.Mul(vb)) |
| } |
| var tmp [4]float64 |
| sum.Store(&tmp) |
| return tmp[0] + tmp[1] + tmp[2] + tmp[3] |
| } |
| |
| func ip64_3(a, b []float64) float64 { |
| var z S3 |
| sum := z |
| var i int |
| stride := z.Len() |
| for ; i <= len(a)-stride; i += stride { |
| va := load(a[i:]) |
| vb := load(b[i:]) |
| sum = S3{sum.Add(va.Mul(vb))} |
| } |
| var tmp [4]float64 |
| sum.Store(&tmp) |
| return tmp[0] + tmp[1] + tmp[2] + tmp[3] |
| } |
| |
| func main() { |
| a := []float64{1, 2, 3, 4, 5, 6, 7, 8} |
| ip0 := ip64_0(a, a) |
| ip1 := ip64_1(a, a) |
| ip1a := ip64_1a(a, a) |
| ip2 := ip64_2(a, a) |
| ip3 := ip64_3(a, a) |
| fmt.Printf("Test IP = %f\n", ip0) |
| fmt.Printf("SIMD IP 1 = %f\n", ip1) |
| fmt.Printf("SIMD IP 1a = %f\n", ip1a) |
| fmt.Printf("SIMD IP 2 = %f\n", ip2) |
| fmt.Printf("SIMD IP 3 = %f\n", ip3) |
| var z1 S1 |
| var z2 S2 |
| var z3 S2 |
| |
| s1, s2, s3 := unsafe.Sizeof(z1), unsafe.Sizeof(z2), unsafe.Sizeof(z3) |
| |
| fmt.Printf("unsafe.Sizeof(z1, z2, z3)=%d, %d, %d\n", s1, s2, s3) |
| |
| fail := false |
| |
| if s1 != 32 || s2 != 32 || s3 != 32 { |
| fmt.Println("Failed a sizeof check, should all be 32") |
| fail = true |
| } |
| |
| if ip1 != ip0 || ip1a != ip0 || ip2 != ip0 || ip3 != ip0 { |
| fmt.Println("Failed an inner product check, should all be", ip0) |
| fail = true |
| } |
| |
| if fail { |
| os.Exit(1) |
| } |
| } |