| // Copyright 2026 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build goexperiment.simd |
| |
| package simd_test |
| |
| import ( |
| "fmt" |
| "math/rand/v2" |
| "simd" |
| "testing" |
| ) |
| |
| func fill(x, y []float32) { |
| for i := range x { |
| x[i] = 2*rand.Float32() - 1 |
| y[i] = 2*rand.Float32() - 1 |
| } |
| } |
| |
// checkErrors reports the mismatch count through b.Logf when errors is
// positive and does nothing otherwise. It only logs; it does not mark
// the benchmark as failed.
func checkErrors(b *testing.B, errors int) {
	b.Helper()
	if errors <= 0 {
		return
	}
	b.Logf("errors = %d", errors)
}
| |
| // BenchmarkIPFMA is simd vector inner product computing using FMA. |
| func BenchmarkIPFMA(b *testing.B) { |
| x := make([]float32, ipBenchLen) |
| y := make([]float32, ipBenchLen) |
| |
| fill(x, y) |
| |
| ip0, _, _ := ipFMA(x, y) |
| |
| var errors int |
| for b.Loop() { |
| z, _, _ := ipFMA(x, y) |
| if z != ip0 { |
| errors++ |
| } |
| } |
| checkErrors(b, errors) |
| } |
| |
| func ipFMA(x, y []float32) (float32, int, bool) { |
| var a simd.Float32s |
| sumWidth := a.Len() * 32 |
| emulated := simd.Emulated() |
| var i int |
| for i = 0; i < len(x)-a.Len()+1; i += a.Len() { |
| u := simd.LoadFloat32s(x[i : i+a.Len()]) |
| v := simd.LoadFloat32s(y[i : i+a.Len()]) |
| a = u.MulAdd(v, a) |
| } |
| if i < len(x) { |
| a = first(simd.LoadFloat32sPart(x[i:])).MulAdd( |
| first(simd.LoadFloat32sPart(y[i:])), a) |
| } |
| |
| return sum(a), sumWidth, emulated |
| } |
| |
| func TestIP(t *testing.T) { |
| |
| var a, b [50]float32 |
| for i := 0; i < 50; i++ { |
| a[i] = float32(i) |
| b[i] = float32(i) |
| } |
| x, sumWidth, emulated := ip(a[:50], b[:50]) |
| |
| if x != 40425 { |
| t.Errorf("Expected 40425, got %f", x) |
| } |
| |
| fmt.Printf("ip: sum was computed in width %d, emulated = %v\n", sumWidth, emulated) |
| } |
| |
| func TestIPGoTo(t *testing.T) { |
| |
| var a, b [50]float32 |
| for i := 0; i < 50; i++ { |
| a[i] = float32(i) |
| b[i] = float32(i) |
| } |
| x, sumWidth, emulated := ipGoTo(a[:50], b[:50]) |
| |
| if x != 40425 { |
| t.Errorf("Expected 40425, got %f", x) |
| } |
| |
| fmt.Printf("ipgoto: sum was computed in width %d, emulated = %v\n", sumWidth, emulated) |
| } |
| |
// first returns its first argument and discards the second. It allows a
// call that yields two results to be used where a single value is
// needed, as in first(f()).
func first[T, U any](t T, _ U) T {
	return t
}
| |
// ipBenchLen is the number of float32 elements in each operand vector
// used by the inner-product benchmarks.
const ipBenchLen = 300_000
| |
| // BenchmarkIP is simd vector inner product, vanilla transcription. |
| func BenchmarkIP(b *testing.B) { |
| x := make([]float32, ipBenchLen) |
| y := make([]float32, ipBenchLen) |
| |
| fill(x, y) |
| |
| ip0, _, _ := ip(x, y) |
| |
| var errors int |
| for b.Loop() { |
| z, _, _ := ip(x, y) |
| if z != ip0 { |
| errors++ |
| } |
| } |
| checkErrors(b, errors) |
| } |
| |
| // BenchmarkIPUnroll is simd vector inner product, unrolled 4x vector ops. |
| func BenchmarkIPUnroll(b *testing.B) { |
| x := make([]float32, ipBenchLen) |
| y := make([]float32, ipBenchLen) |
| |
| fill(x, y) |
| |
| ip0, _, _ := ipU(x, y) |
| |
| var errors int |
| for b.Loop() { |
| z, _, _ := ipU(x, y) |
| if z != ip0 { |
| errors++ |
| } |
| } |
| checkErrors(b, errors) |
| } |
| |
| // BenchmarkIPUnrollMore is simd vector inner product, unrolled 5x vector ops |
| func BenchmarkIPUnrollMore(b *testing.B) { |
| x := make([]float32, ipBenchLen) |
| y := make([]float32, ipBenchLen) |
| |
| fill(x, y) |
| |
| ip0, _, _ := ipUmore(x, y) |
| |
| var errors int |
| for b.Loop() { |
| z, _, _ := ipUmore(x, y) |
| if z != ip0 { |
| errors++ |
| } |
| } |
| checkErrors(b, errors) |
| } |
| |
// ipNosimd computes the inner product of x and y serially, adding the
// terms x[i]*y[i] in index order (to make the check comparison in the
// non-SIMD benchmarks turn out right). y must be at least as long as x;
// elements of y beyond len(x) are ignored.
func ipNosimd(x, y []float32) float32 {
	var acc float32
	for i := range x {
		acc += x[i] * y[i]
	}
	return acc
}
| |
| // BenchmarkIPnosimd1 is serial, just a vanilla inner product. |
| func BenchmarkIPnosimd0(b *testing.B) { |
| x := make([]float32, ipBenchLen) |
| y := make([]float32, ipBenchLen) |
| |
| fill(x, y) |
| |
| ip0 := ipNosimd(x, y) |
| |
| var errors int |
| for b.Loop() { |
| var z float32 |
| for i, a := range x { |
| z += a * y[i] |
| } |
| if z != ip0 { |
| errors++ |
| } |
| } |
| checkErrors(b, errors) |
| } |
| |
| // BenchmarkIPnosimd1 is serial, but with a no-op subslice that |
| // makes it clear that x and y have the same length. |
| func BenchmarkIPnosimd1(b *testing.B) { |
| x := make([]float32, ipBenchLen) |
| y := make([]float32, ipBenchLen) |
| |
| fill(x, y) |
| |
| ip0 := ipNosimd(x, y) |
| |
| var errors int |
| for b.Loop() { |
| var z float32 |
| yy := y[:(len(x))] |
| for i, a := range x { |
| z += a * yy[i] |
| } |
| if z != ip0 { |
| errors++ |
| } |
| } |
| checkErrors(b, errors) |
| } |
| |
| // BenchmarkIPnosimdA is serial, rewritten to use arrays instead of slices, |
| // so no bounds checking, gosh darn it to heck. |
| func BenchmarkIPnosimdA(b *testing.B) { |
| var x, y [ipBenchLen]float32 |
| |
| fill(x[:], y[:]) |
| |
| ip0 := ipNosimd(x[:], y[:]) |
| |
| var errors int |
| for b.Loop() { |
| var z float32 |
| for i, a := range x { |
| z += a * y[i] |
| } |
| if z != ip0 { |
| errors++ |
| } |
| } |
| checkErrors(b, errors) |
| } |
| |
| var x, y [ipBenchLen]float32 |
| var ip0 float32 |
| |
| func initIp0() { |
| fill(x[:], y[:]) |
| ip0 = ipNosimd(x[:], y[:]) |
| } |
| |
| // BenchmarkIPnosimdAnotBloop is serial, rewritten to use arrays instead of slices, |
| // and using a classic iterated loop to see if b.Loop affects subscript inference, |
| // so no bounds checking, gosh darn it to heck, this time, for sure. |
| func BenchmarkIPnosimdAnotBloop(b *testing.B) { |
| if ip0 == 0 { |
| initIp0() |
| } |
| |
| var errors int |
| for range b.N { |
| var z float32 |
| for i, a := range x { |
| z += a * y[i] |
| } |
| if z != ip0 { |
| errors++ |
| } |
| } |
| checkErrors(b, errors) |
| } |
| |
| func ip(x, y []float32) (float32, int, bool) { |
| var a simd.Float32s |
| sumWidth := a.Len() * 32 |
| emulated := simd.Emulated() |
| var i int |
| for i = 0; i < len(x)-a.Len()+1; i += a.Len() { |
| u := simd.LoadFloat32s(x[i : i+a.Len()]) |
| v := simd.LoadFloat32s(y[i : i+a.Len()]) |
| a = a.Add(u.Mul(v)) |
| } |
| if i < len(x) { |
| a = a.Add(first(simd.LoadFloat32sPart(x[i:])). |
| Mul(first(simd.LoadFloat32sPart(y[i:])))) |
| } |
| |
| return sum(a), sumWidth, emulated |
| } |
| |
| func ipU(x, y []float32) (float32, int, bool) { |
| const U = 4 |
| var a, a0, a1, a2, a3 simd.Float32s |
| sumWidth := a.Len() * 32 |
| emulated := simd.Emulated() |
| var i int |
| for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() { |
| i0 := i |
| i1 := i + a.Len() |
| i2 := i + 2*a.Len() |
| i3 := i + 3*a.Len() |
| |
| u := simd.LoadFloat32s(x[i0 : i0+a.Len()]) |
| v := simd.LoadFloat32s(y[i0 : i0+a.Len()]) |
| a0 = a0.Add(u.Mul(v)) |
| |
| u = simd.LoadFloat32s(x[i1 : i1+a.Len()]) |
| v = simd.LoadFloat32s(y[i1 : i1+a.Len()]) |
| a1 = a1.Add(u.Mul(v)) |
| |
| u = simd.LoadFloat32s(x[i2 : i2+a.Len()]) |
| v = simd.LoadFloat32s(y[i2 : i2+a.Len()]) |
| a2 = a2.Add(u.Mul(v)) |
| |
| u = simd.LoadFloat32s(x[i3 : i3+a.Len()]) |
| v = simd.LoadFloat32s(y[i3 : i3+a.Len()]) |
| a3 = a3.Add(u.Mul(v)) |
| } |
| a = a0.Add(a1).Add(a2.Add(a3)) |
| for ; i < len(x)-a.Len()+1; i += a.Len() { |
| u := simd.LoadFloat32s(x[i : i+a.Len()]) |
| v := simd.LoadFloat32s(y[i : i+a.Len()]) |
| a = a.Add(u.Mul(v)) |
| } |
| if i < len(x) { |
| a = a.Add(first(simd.LoadFloat32sPart(x[i:])). |
| Mul(first(simd.LoadFloat32sPart(y[i:])))) |
| } |
| |
| return sum(a), sumWidth, emulated |
| } |
| |
| func ipUmore(x, y []float32) (float32, int, bool) { |
| const U = 5 |
| var a, a0, a1, a2, a3, a4 simd.Float32s |
| sumWidth := a.Len() * 32 |
| emulated := simd.Emulated() |
| var i int |
| for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() { |
| i0 := i |
| i1 := i + a.Len() |
| i2 := i + 2*a.Len() |
| i3 := i + 3*a.Len() |
| i4 := i + 4*a.Len() |
| |
| u := simd.LoadFloat32s(x[i0 : i0+a.Len()]) |
| v := simd.LoadFloat32s(y[i0 : i0+a.Len()]) |
| a0 = a0.Add(u.Mul(v)) |
| |
| u = simd.LoadFloat32s(x[i1 : i1+a.Len()]) |
| v = simd.LoadFloat32s(y[i1 : i1+a.Len()]) |
| a1 = a1.Add(u.Mul(v)) |
| |
| u = simd.LoadFloat32s(x[i2 : i2+a.Len()]) |
| v = simd.LoadFloat32s(y[i2 : i2+a.Len()]) |
| a2 = a2.Add(u.Mul(v)) |
| |
| u = simd.LoadFloat32s(x[i3 : i3+a.Len()]) |
| v = simd.LoadFloat32s(y[i3 : i3+a.Len()]) |
| a3 = a3.Add(u.Mul(v)) |
| |
| u = simd.LoadFloat32s(x[i4 : i4+a.Len()]) |
| v = simd.LoadFloat32s(y[i4 : i4+a.Len()]) |
| a4 = a4.Add(u.Mul(v)) |
| } |
| a = a0.Add(a1).Add(a2.Add(a3)).Add(a4) |
| |
| for ; i < len(x)-a.Len()+1; i += a.Len() { |
| u := simd.LoadFloat32s(x[i : i+a.Len()]) |
| v := simd.LoadFloat32s(y[i : i+a.Len()]) |
| a = a.Add(u.Mul(v)) |
| } |
| if i < len(x) { |
| a = a.Add(first(simd.LoadFloat32sPart(x[i:])). |
| Mul(first(simd.LoadFloat32sPart(y[i:])))) |
| } |
| |
| return sum(a), sumWidth, emulated |
| } |
| |
| func ipGoTo(x, y []float32) (float32, int, bool) { |
| var a simd.Float32s |
| sumWidth := a.Len() * 32 |
| emulated := simd.Emulated() |
| var i int |
| var u, v simd.Float32s |
| loop: |
| if !(i < len(x)-a.Len()+1) { |
| goto done |
| } |
| u = simd.LoadFloat32s(x[i : i+a.Len()]) |
| v = simd.LoadFloat32s(y[i : i+a.Len()]) |
| a = a.Add(u.Mul(v)) |
| i += a.Len() |
| goto loop |
| done: |
| if i < len(x) { |
| a = a.Add(first(simd.LoadFloat32sPart(x[i:])). |
| Mul(first(simd.LoadFloat32sPart(y[i:])))) |
| } |
| |
| return sum(a), sumWidth, emulated |
| } |
| |
| func boringSum(x simd.Float32s) float32 { |
| s := make([]float32, x.Len()) |
| x.Store(s) |
| var r float32 |
| for _, e := range s { |
| r += e |
| } |
| return r |
| } |