vector: add SIMD versions of xxxAccumulateOpSrc.
name                old time/op  new time/op  delta
GlyphAlpha16Src-8   3.37µs ± 0%  3.07µs ± 1%   -8.86%  (p=0.000 n=9+9)
GlyphAlpha32Src-8   6.01µs ± 1%  4.55µs ± 0%  -24.28%  (p=0.000 n=10+9)
GlyphAlpha64Src-8   13.2µs ± 0%   8.1µs ± 0%  -38.69%  (p=0.000 n=10+9)
GlyphAlpha128Src-8  32.9µs ± 0%  16.9µs ± 0%  -48.85%  (p=0.000 n=10+9)
GlyphAlpha256Src-8  98.0µs ± 0%  43.6µs ± 1%  -55.50%  (p=0.000 n=10+10)
A comparison of the non-SIMD and SIMD versions:
name                             time/op
FixedAccumulateOpSrc16-8          368ns ± 0%
FixedAccumulateOpSrcSIMD16-8     86.8ns ± 1%
FloatingAccumulateOpSrc16-8       434ns ± 0%
FloatingAccumulateOpSrcSIMD16-8   119ns ± 0%
FixedAccumulateOpSrc64-8         6.12µs ± 0%
FixedAccumulateOpSrcSIMD64-8     1.17µs ± 0%
FloatingAccumulateOpSrc64-8      7.15µs ± 0%
FloatingAccumulateOpSrcSIMD64-8  1.68µs ± 1%
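
The SIMD kernels vectorize a scalar prefix-sum-and-pack loop. For reference,
a minimal sketch of the fixed-point flavor (an illustrative simplification of
fixedAccumulateOpSrc, not its exact code; with ϕ = 10, the shift by 2*ϕ - 8
is the 12 seen in the assembly):

    func accumulateSketch(dst []uint8, src []uint32) {
        acc := int32(0)
        for i, v := range src {
            acc += int32(v) // Running sum of winding deltas.
            a := acc
            if a < 0 {
                a = -a // y = abs(x)
            }
            a >>= 2*10 - 8 // y >>= 2*ϕ - 8
            if a > 0xff {
                a = 0xff // y = min(y, fxAlmost256)
            }
            dst[i] = uint8(a)
        }
    }

The SIMD versions below do the same work four elements at a time, carrying
the running sum between groups in an XMM register.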
Change-Id: I58e5c7a3ecd12e536aab8e765e94275453d0eac8
Reviewed-on: https://go-review.googlesource.com/30431
Reviewed-by: David Crawshaw <crawshaw@golang.org>
diff --git a/vector/acc_amd64.go b/vector/acc_amd64.go
new file mode 100644
index 0000000..e2149de
--- /dev/null
+++ b/vector/acc_amd64.go
@@ -0,0 +1,21 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package vector
+
+func haveSSE4_1() bool
+
+var haveFixedAccumulateSIMD = haveSSE4_1()
+
+const haveFloatingAccumulateSIMD = true
+
+//go:noescape
+func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
+
+//go:noescape
+func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
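
(Note: haveSSE4_1 and the two SIMD functions are declared without bodies; the
implementations are the assembly in acc_amd64.s below. The //go:noescape
directives assert that the assembly does not let the slice arguments escape,
so the slices can stay on the caller's stack.)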
diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s
new file mode 100644
index 0000000..7ef7aac
--- /dev/null
+++ b/vector/acc_amd64.s
@@ -0,0 +1,321 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// fl is short for floating-point math; fx is short for fixed-point math.
+
+DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff
+DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff
+DATA flOnes<>+0x00(SB)/8, $0x3f8000003f800000
+DATA flOnes<>+0x08(SB)/8, $0x3f8000003f800000
+DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff
+DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff
+DATA shuffleMask<>+0x00(SB)/8, $0x0c0804000c080400
+DATA shuffleMask<>+0x08(SB)/8, $0x0c0804000c080400
+DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff
+DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff
+
+GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16
+GLOBL flOnes<>(SB), (NOPTR+RODATA), $16
+GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
+GLOBL shuffleMask<>(SB), (NOPTR+RODATA), $16
+GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16
+
+// func haveSSE4_1() bool
+TEXT ·haveSSE4_1(SB), NOSPLIT, $0
+ MOVQ $1, AX
+ CPUID
+ SHRQ $19, CX
+ ANDQ $1, CX
+ MOVB CX, ret+0(FP)
+ RET
+
+// ----------------------------------------------------------------------------
+
+// func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
+//
+// XMM registers. Variable names are per
+// https://github.com/google/font-rs/blob/master/src/accumulate.c
+//
+// xmm0 scratch
+// xmm1 x
+// xmm2 y, z
+// xmm3 -
+// xmm4 -
+// xmm5 fxAlmost256
+// xmm6 shuffleMask
+// xmm7 offset
+TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
+ MOVQ dst_base+0(FP), DI
+ MOVQ dst_len+8(FP), BX
+ MOVQ src_base+24(FP), SI
+ MOVQ src_len+32(FP), CX
+
+ // Sanity check that len(dst) >= len(src).
+ CMPQ BX, CX
+ JLT fxAccOpSrcEnd
+
+ // CX = len(src) &^ 3
+ // DX = len(src)
+ MOVQ CX, DX
+ ANDQ $-4, CX
+
+	// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of a uint8.
+ // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask.
+ // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+ MOVOU fxAlmost256<>(SB), X5
+ MOVOU shuffleMask<>(SB), X6
+ XORPS X7, X7
+
+ // i := 0
+ MOVQ $0, AX
+
+fxAccOpSrcLoop4:
+ // for i < (len(src) &^ 3)
+ CMPQ AX, CX
+ JAE fxAccOpSrcLoop1
+
+ // x = XMM(s0, s1, s2, s3)
+ //
+ // Where s0 is src[i+0], s1 is src[i+1], etc.
+ MOVOU (SI), X1
+
+ // scratch = XMM(0, s0, s1, s2)
+ // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
+ MOVOU X1, X0
+ PSLLO $4, X0
+ PADDD X0, X1
+
+ // scratch = XMM(0, 0, 0, 0)
+ // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
+ // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
+ XORPS X0, X0
+ SHUFPS $0x40, X1, X0
+ PADDD X0, X1
+
+ // x += offset
+ PADDD X7, X1
+
+ // y = abs(x)
+ // y >>= 12 // Shift by 2*ϕ - 8.
+ // y = min(y, fxAlmost256)
+ //
+ // pabsd %xmm1,%xmm2
+ // psrld $0xc,%xmm2
+ // pminud %xmm5,%xmm2
+ //
+ // Hopefully we'll get these opcode mnemonics into the assembler for Go
+ // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+ // it's similar.
+ BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+ BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
+ BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+
+ // z = shuffleTheLowBytesOfEach4ByteElement(y)
+ // copy(dst[:4], low4BytesOf(z))
+ PSHUFB X6, X2
+ MOVL X2, (DI)
+
+ // offset = XMM(x@3, x@3, x@3, x@3)
+ MOVOU X1, X7
+ SHUFPS $0xff, X1, X7
+
+ // i += 4
+ // dst = dst[4:]
+ // src = src[4:]
+ ADDQ $4, AX
+ ADDQ $4, DI
+ ADDQ $16, SI
+ JMP fxAccOpSrcLoop4
+
+fxAccOpSrcLoop1:
+ // for i < len(src)
+ CMPQ AX, DX
+ JAE fxAccOpSrcEnd
+
+ // x = src[i] + offset
+ MOVL (SI), X1
+ PADDD X7, X1
+
+ // y = abs(x)
+ // y >>= 12 // Shift by 2*ϕ - 8.
+ // y = min(y, fxAlmost256)
+ //
+ // pabsd %xmm1,%xmm2
+ // psrld $0xc,%xmm2
+ // pminud %xmm5,%xmm2
+ //
+ // Hopefully we'll get these opcode mnemonics into the assembler for Go
+ // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+ // it's similar.
+ BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+ BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
+ BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+
+ // dst[0] = uint8(y)
+ MOVL X2, BX
+ MOVB BX, (DI)
+
+ // offset = x
+ MOVOU X1, X7
+
+ // i += 1
+ // dst = dst[1:]
+ // src = src[1:]
+ ADDQ $1, AX
+ ADDQ $1, DI
+ ADDQ $4, SI
+ JMP fxAccOpSrcLoop1
+
+fxAccOpSrcEnd:
+ RET
+
+// ----------------------------------------------------------------------------
+
+// func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
+//
+// XMM registers. Variable names are per
+// https://github.com/google/font-rs/blob/master/src/accumulate.c
+//
+// xmm0 scratch
+// xmm1 x
+// xmm2 y, z
+// xmm3 flAlmost256
+// xmm4 flOnes
+// xmm5 flSignMask
+// xmm6 shuffleMask
+// xmm7 offset
+TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
+ MOVQ dst_base+0(FP), DI
+ MOVQ dst_len+8(FP), BX
+ MOVQ src_base+24(FP), SI
+ MOVQ src_len+32(FP), CX
+
+ // Sanity check that len(dst) >= len(src).
+ CMPQ BX, CX
+ JLT flAccOpSrcEnd
+
+ // CX = len(src) &^ 3
+ // DX = len(src)
+ MOVQ CX, DX
+ ANDQ $-4, CX
+
+ // Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
+ STMXCSR mxcsrOrig-8(SP)
+ MOVL mxcsrOrig-8(SP), AX
+ ORL $0x6000, AX
+ MOVL AX, mxcsrNew-4(SP)
+ LDMXCSR mxcsrNew-4(SP)
+
+ // flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
+ // flOnes := XMM(0x3f800000 repeated four times) // 1 as a float32.
+ // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
+ // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask.
+ // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+ MOVOU flAlmost256<>(SB), X3
+ MOVOU flOnes<>(SB), X4
+ MOVOU flSignMask<>(SB), X5
+ MOVOU shuffleMask<>(SB), X6
+ XORPS X7, X7
+
+ // i := 0
+ MOVQ $0, AX
+
+flAccOpSrcLoop4:
+ // for i < (len(src) &^ 3)
+ CMPQ AX, CX
+ JAE flAccOpSrcLoop1
+
+ // x = XMM(s0, s1, s2, s3)
+ //
+ // Where s0 is src[i+0], s1 is src[i+1], etc.
+ MOVOU (SI), X1
+
+ // scratch = XMM(0, s0, s1, s2)
+ // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
+ MOVOU X1, X0
+ PSLLO $4, X0
+ ADDPS X0, X1
+
+ // scratch = XMM(0, 0, 0, 0)
+ // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
+ // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
+ XORPS X0, X0
+ SHUFPS $0x40, X1, X0
+ ADDPS X0, X1
+
+ // x += offset
+ ADDPS X7, X1
+
+ // y = x & flSignMask
+ // y = min(y, flOnes)
+ // y = mul(y, flAlmost256)
+ MOVOU X5, X2
+ ANDPS X1, X2
+ MINPS X4, X2
+ MULPS X3, X2
+
+ // z = float32ToInt32(y)
+ // z = shuffleTheLowBytesOfEach4ByteElement(z)
+ // copy(dst[:4], low4BytesOf(z))
+ CVTPS2PL X2, X2
+ PSHUFB X6, X2
+ MOVL X2, (DI)
+
+ // offset = XMM(x@3, x@3, x@3, x@3)
+ MOVOU X1, X7
+ SHUFPS $0xff, X1, X7
+
+ // i += 4
+ // dst = dst[4:]
+ // src = src[4:]
+ ADDQ $4, AX
+ ADDQ $4, DI
+ ADDQ $16, SI
+ JMP flAccOpSrcLoop4
+
+flAccOpSrcLoop1:
+ // for i < len(src)
+ CMPQ AX, DX
+ JAE flAccOpSrcRestoreMXCSR
+
+ // x = src[i] + offset
+ MOVL (SI), X1
+ ADDPS X7, X1
+
+ // y = x & flSignMask
+ // y = min(y, flOnes)
+ // y = mul(y, flAlmost256)
+ MOVOU X5, X2
+ ANDPS X1, X2
+ MINPS X4, X2
+ MULPS X3, X2
+
+ // z = float32ToInt32(y)
+ // dst[0] = uint8(z)
+ CVTPS2PL X2, X2
+ MOVL X2, BX
+ MOVB BX, (DI)
+
+ // offset = x
+ MOVOU X1, X7
+
+ // i += 1
+ // dst = dst[1:]
+ // src = src[1:]
+ ADDQ $1, AX
+ ADDQ $1, DI
+ ADDQ $4, SI
+ JMP flAccOpSrcLoop1
+
+flAccOpSrcRestoreMXCSR:
+ LDMXCSR mxcsrOrig-8(SP)
+
+flAccOpSrcEnd:
+ RET
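
Both kernels compute a four-lane inclusive prefix sum in two shift-and-add
steps, the same log-step scan as font-rs's accumulate.c. A scalar Go model of
those two steps (prefixSum4 is a hypothetical helper for illustration, not
part of the package):

    func prefixSum4(s [4]uint32) [4]uint32 {
        // Step 1, PSLLO then PADDD/ADDPS: x += XMM(0, s0, s1, s2),
        // yielding [s0, s0+s1, s1+s2, s2+s3].
        x := [4]uint32{s[0], s[0] + s[1], s[1] + s[2], s[2] + s[3]}
        // Step 2, SHUFPS then PADDD/ADDPS: x += XMM(0, 0, x@0, x@1),
        // yielding [s0, s0+s1, s0+s1+s2, s0+s1+s2+s3].
        x = [4]uint32{x[0], x[1], x[2] + x[0], x[3] + x[1]}
        return x
    }

The carry between groups of four is the offset register: SHUFPS $0xff
broadcasts lane 3 of the summed vector, which is then added into the next
group.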
diff --git a/vector/acc_other.go b/vector/acc_other.go
new file mode 100644
index 0000000..c0b78f8
--- /dev/null
+++ b/vector/acc_other.go
@@ -0,0 +1,13 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package vector
+
+const haveFixedAccumulateSIMD = false
+const haveFloatingAccumulateSIMD = false
+
+func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {}
+func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {}
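
(Note: the build tags are complementary. acc_other.go's single line
"!amd64 appengine !gc noasm" is an OR of the negations of the conditions
AND-ed across the amd64 files' tag lines and filename, so exactly one
implementation is compiled for any build configuration.)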
diff --git a/vector/acc_test.go b/vector/acc_test.go
index 8ba932e..1c59c2b 100644
--- a/vector/acc_test.go
+++ b/vector/acc_test.go
@@ -10,6 +10,87 @@
"testing"
)
+// TestXxxSIMDUnaligned tests that unaligned SIMD loads/stores don't crash.
+
+func TestFixedAccumulateSIMDUnaligned(t *testing.T) {
+ if !haveFixedAccumulateSIMD {
+		t.Skip("No SIMD implementation")
+ }
+
+ dst := make([]uint8, 64)
+ src := make([]uint32, 64)
+ for d := 0; d < 16; d++ {
+ for s := 0; s < 16; s++ {
+ fixedAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32])
+ }
+ }
+}
+
+func TestFloatingAccumulateSIMDUnaligned(t *testing.T) {
+ if !haveFloatingAccumulateSIMD {
+		t.Skip("No SIMD implementation")
+ }
+
+ dst := make([]uint8, 64)
+ src := make([]float32, 64)
+ for d := 0; d < 16; d++ {
+ for s := 0; s < 16; s++ {
+ floatingAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32])
+ }
+ }
+}
+
+// TestXxxSIMDShortDst tests that the SIMD implementations don't write past the
+// end of the dst buffer.
+
+func TestFixedAccumulateSIMDShortDst(t *testing.T) {
+ if !haveFixedAccumulateSIMD {
+		t.Skip("No SIMD implementation")
+ }
+
+ const oneQuarter = uint32(int2ϕ(fxOne*fxOne)) / 4
+ src := []uint32{oneQuarter, oneQuarter, oneQuarter, oneQuarter}
+ for i := 0; i < 4; i++ {
+ dst := make([]uint8, 4)
+ fixedAccumulateOpSrcSIMD(dst[:i], src[:i])
+ for j := range dst {
+ if j < i {
+ if got := dst[j]; got == 0 {
+ t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got)
+ }
+ } else {
+ if got := dst[j]; got != 0 {
+ t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got)
+ }
+ }
+ }
+ }
+}
+
+func TestFloatingAccumulateSIMDShortDst(t *testing.T) {
+ if !haveFloatingAccumulateSIMD {
+		t.Skip("No SIMD implementation")
+ }
+
+ const oneQuarter = 0.25
+ src := []float32{oneQuarter, oneQuarter, oneQuarter, oneQuarter}
+ for i := 0; i < 4; i++ {
+ dst := make([]uint8, 4)
+ floatingAccumulateOpSrcSIMD(dst[:i], src[:i])
+ for j := range dst {
+ if j < i {
+ if got := dst[j]; got == 0 {
+ t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got)
+ }
+ } else {
+ if got := dst[j]; got != 0 {
+ t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got)
+ }
+ }
+ }
+ }
+}
+
func TestFixedAccumulateOpOverShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "over") }
func TestFixedAccumulateOpSrcShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "src") }
func TestFixedAccumulateMaskShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "mask") }
@@ -25,81 +106,97 @@
func TestFloatingAccumulateMask16(t *testing.T) { testAcc(t, flIn16, flMask16, "mask") }
func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
- maxN := 0
- switch in := in.(type) {
- case []uint32:
- maxN = len(in)
- case []float32:
- maxN = len(in)
- }
-
- for _, n := range []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
- 33, 55, 79, 96, 120, 165, 256, maxN} {
-
- if n > maxN {
- continue
- }
-
- var (
- got8, want8 []uint8
- got32, want32 []uint32
- )
- switch op {
- case "over":
- const background = 0x40
- got8 = make([]uint8, n)
- for i := range got8 {
- got8[i] = background
- }
- want8 = make([]uint8, n)
- for i := range want8 {
- dstA := uint32(background * 0x101)
- maskA := mask[i]
- outA := dstA*(0xffff-maskA)/0xffff + maskA
- want8[i] = uint8(outA >> 8)
- }
-
- case "src":
- got8 = make([]uint8, n)
- want8 = make([]uint8, n)
- for i := range want8 {
- want8[i] = uint8(mask[i] >> 8)
- }
-
- case "mask":
- got32 = make([]uint32, n)
- want32 = mask[:n]
- }
-
+ for _, simd := range []bool{false, true} {
+ maxN := 0
switch in := in.(type) {
case []uint32:
- switch op {
- case "over":
- fixedAccumulateOpOver(got8, in[:n])
- case "src":
- fixedAccumulateOpSrc(got8, in[:n])
- case "mask":
- copy(got32, in[:n])
- fixedAccumulateMask(got32)
+ if simd && !haveFixedAccumulateSIMD {
+ continue
}
+ maxN = len(in)
case []float32:
- switch op {
- case "over":
- floatingAccumulateOpOver(got8, in[:n])
- case "src":
- floatingAccumulateOpSrc(got8, in[:n])
- case "mask":
- floatingAccumulateMask(got32, in[:n])
+ if simd && !haveFloatingAccumulateSIMD {
+ continue
}
+ maxN = len(in)
}
- if op != "mask" {
- if !bytes.Equal(got8, want8) {
- t.Errorf("n=%d:\ngot: % x\nwant: % x", n, got8, want8)
+ for _, n := range []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+ 33, 55, 79, 96, 120, 165, 256, maxN} {
+
+ if n > maxN {
+ continue
}
- } else {
- if !uint32sEqual(got32, want32) {
- t.Errorf("n=%d:\ngot: % x\nwant: % x", n, got32, want32)
+
+ var (
+ got8, want8 []uint8
+ got32, want32 []uint32
+ )
+ switch op {
+ case "over":
+ const background = 0x40
+ got8 = make([]uint8, n)
+ for i := range got8 {
+ got8[i] = background
+ }
+ want8 = make([]uint8, n)
+ for i := range want8 {
+ dstA := uint32(background * 0x101)
+ maskA := mask[i]
+ outA := dstA*(0xffff-maskA)/0xffff + maskA
+ want8[i] = uint8(outA >> 8)
+ }
+
+ case "src":
+ got8 = make([]uint8, n)
+ want8 = make([]uint8, n)
+ for i := range want8 {
+ want8[i] = uint8(mask[i] >> 8)
+ }
+
+ case "mask":
+ got32 = make([]uint32, n)
+ want32 = mask[:n]
+ }
+
+ switch in := in.(type) {
+ case []uint32:
+ switch op {
+ case "over":
+ fixedAccumulateOpOver(got8, in[:n])
+ case "src":
+ if simd {
+ fixedAccumulateOpSrcSIMD(got8, in[:n])
+ } else {
+ fixedAccumulateOpSrc(got8, in[:n])
+ }
+ case "mask":
+ copy(got32, in[:n])
+ fixedAccumulateMask(got32)
+ }
+ case []float32:
+ switch op {
+ case "over":
+ floatingAccumulateOpOver(got8, in[:n])
+ case "src":
+ if simd {
+ floatingAccumulateOpSrcSIMD(got8, in[:n])
+ } else {
+ floatingAccumulateOpSrc(got8, in[:n])
+ }
+ case "mask":
+ floatingAccumulateMask(got32, in[:n])
+ }
+ }
+
+ if op != "mask" {
+ if !bytes.Equal(got8, want8) {
+ t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got8, want8)
+ }
+ } else {
+ if !uint32sEqual(got32, want32) {
+ t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got32, want32)
+ }
}
}
}
@@ -129,45 +226,66 @@
return true
}
-func BenchmarkFixedAccumulateOpOver16(b *testing.B) { benchAcc(b, fxIn16, "over") }
-func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src") }
-func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask") }
-func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over") }
-func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src") }
-func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask") }
+func BenchmarkFixedAccumulateOpOver16(b *testing.B) { benchAcc(b, fxIn16, "over", false) }
+func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src", false) }
+func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, fxIn16, "src", true) }
+func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask", false) }
+func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over", false) }
+func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src", false) }
+func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, flIn16, "src", true) }
+func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask", false) }
-func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over") }
-func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src") }
-func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask") }
-func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over") }
-func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src") }
-func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask") }
+func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over", false) }
+func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src", false) }
+func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, fxIn64, "src", true) }
+func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask", false) }
+func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over", false) }
+func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src", false) }
+func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, flIn64, "src", true) }
+func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask", false) }
-func benchAcc(b *testing.B, in interface{}, op string) {
+func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
var f func()
switch in := in.(type) {
case []uint32:
+ if simd && !haveFixedAccumulateSIMD {
+			b.Skip("No SIMD implementation")
+ }
+
switch op {
case "over":
dst := make([]uint8, len(in))
f = func() { fixedAccumulateOpOver(dst, in) }
case "src":
dst := make([]uint8, len(in))
- f = func() { fixedAccumulateOpSrc(dst, in) }
+ if simd {
+ f = func() { fixedAccumulateOpSrcSIMD(dst, in) }
+ } else {
+ f = func() { fixedAccumulateOpSrc(dst, in) }
+ }
case "mask":
buf := make([]uint32, len(in))
copy(buf, in)
f = func() { fixedAccumulateMask(buf) }
}
+
case []float32:
+ if simd && !haveFloatingAccumulateSIMD {
+			b.Skip("No SIMD implementation")
+ }
+
switch op {
case "over":
dst := make([]uint8, len(in))
f = func() { floatingAccumulateOpOver(dst, in) }
case "src":
dst := make([]uint8, len(in))
- f = func() { floatingAccumulateOpSrc(dst, in) }
+ if simd {
+ f = func() { floatingAccumulateOpSrcSIMD(dst, in) }
+ } else {
+ f = func() { floatingAccumulateOpSrc(dst, in) }
+ }
case "mask":
dst := make([]uint32, len(in))
f = func() { floatingAccumulateMask(dst, in) }
diff --git a/vector/raster_fixed.go b/vector/raster_fixed.go
index 5678bab..40c17aa 100644
--- a/vector/raster_fixed.go
+++ b/vector/raster_fixed.go
@@ -21,9 +21,9 @@
// in the .s files).
ϕ = 10
- one int1ϕ = 1 << ϕ
- oneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1)
- oneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up.
+ fxOne int1ϕ = 1 << ϕ
+ fxOneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1)
+ fxOneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up.
)
// int1ϕ is a signed fixed-point number with 1*ϕ binary digits after the fixed
@@ -56,7 +56,7 @@
}
func fixedFloor(x int1ϕ) int32 { return int32(x >> ϕ) }
-func fixedCeil(x int1ϕ) int32 { return int32((x + oneMinusIota) >> ϕ) }
+func fixedCeil(x int1ϕ) int32 { return int32((x + fxOneMinusIota) >> ϕ) }
func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
a := z.pen
@@ -74,10 +74,10 @@
}
dxdy := (b[0] - a[0]) / (b[1] - a[1])
- ay := int1ϕ(a[1] * float32(one))
- by := int1ϕ(b[1] * float32(one))
+ ay := int1ϕ(a[1] * float32(fxOne))
+ by := int1ϕ(b[1] * float32(fxOne))
- x := int1ϕ(a[0] * float32(one))
+ x := int1ϕ(a[0] * float32(fxOne))
y := fixedFloor(ay)
yMax := fixedCeil(by)
if yMax > int32(z.size.Y) {
@@ -106,7 +106,7 @@
if x1i <= x0i+1 {
xmf := (x+xNext)>>1 - x0Floor
if i := clamp(x0i+0, width); i < uint(len(buf)) {
- buf[i] += uint32(d * (one - xmf))
+ buf[i] += uint32(d * (fxOne - xmf))
}
if i := clamp(x0i+1, width); i < uint(len(buf)) {
buf[i] += uint32(d * xmf)
@@ -115,9 +115,9 @@
oneOverS := x1 - x0
twoOverS := 2 * oneOverS
x0f := x0 - x0Floor
- oneMinusX0f := one - x0f
+ oneMinusX0f := fxOne - x0f
oneMinusX0fSquared := oneMinusX0f * oneMinusX0f
- x1f := x1 - x1Ceil + one
+ x1f := x1 - x1Ceil + fxOne
x1fSquared := x1f * x1f
// These next two variables are unused, as rounding errors are
@@ -139,7 +139,7 @@
if x1i == x0i+2 {
if i := clamp(x0i+1, width); i < uint(len(buf)) {
- // In ideal math: buf[i] += uint32(d * (one - a0 - am))
+ // In ideal math: buf[i] += uint32(d * (fxOne - a0 - am))
D := twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared
D *= d
D /= twoOverS
@@ -148,14 +148,14 @@
} else {
// This is commented out for the same reason as a0 and am.
//
- // a1 := ((oneAndAHalf - x0f) << ϕ) / oneOverS
+ // a1 := ((fxOneAndAHalf - x0f) << ϕ) / oneOverS
if i := clamp(x0i+1, width); i < uint(len(buf)) {
// In ideal math: buf[i] += uint32(d * (a1 - a0))
//
// Convert to int64 to avoid overflow. Without that,
// TestRasterizePolygon fails.
- D := int64((oneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared)
+ D := int64((fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared)
D *= int64(d)
D /= int64(twoOverS)
buf[i] += uint32(D)
@@ -172,12 +172,12 @@
// a2 := a1 + (int1ϕ(x1i-x0i-3)<<(2*ϕ))/oneOverS
if i := clamp(x1i-1, width); i < uint(len(buf)) {
- // In ideal math: buf[i] += uint32(d * (one - a2 - am))
+ // In ideal math: buf[i] += uint32(d * (fxOne - a2 - am))
//
// Convert to int64 to avoid overflow. Without that,
// TestRasterizePolygon fails.
D := int64(twoOverS << ϕ)
- D -= int64((oneAndAHalf - x0f) << (ϕ + 1))
+ D -= int64((fxOneAndAHalf - x0f) << (ϕ + 1))
D -= int64((x1i - x0i - 3) << (2*ϕ + 1))
D -= int64(x1fSquared)
D *= int64(d)
@@ -220,6 +220,11 @@
}
func fixedAccumulateOpSrc(dst []uint8, src []uint32) {
+ // Sanity check that len(dst) >= len(src).
+ if len(dst) < len(src) {
+ return
+ }
+
acc := int2ϕ(0)
for i, v := range src {
acc += int2ϕ(v)
diff --git a/vector/raster_floating.go b/vector/raster_floating.go
index fa3e7b9..50621e1 100644
--- a/vector/raster_floating.go
+++ b/vector/raster_floating.go
@@ -165,6 +165,11 @@
}
func floatingAccumulateOpSrc(dst []uint8, src []float32) {
+ // Sanity check that len(dst) >= len(src).
+ if len(dst) < len(src) {
+ return
+ }
+
acc := float32(0)
for i, v := range src {
acc += v
diff --git a/vector/vector.go b/vector/vector.go
index 4813832..a9f832a 100644
--- a/vector/vector.go
+++ b/vector/vector.go
@@ -312,7 +312,6 @@
}
func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.Rectangle) {
- // TODO: add SIMD implementations.
// TODO: non-zero vs even-odd winding?
if r == dst.Bounds() && r == z.Bounds() {
// We bypass the z.accumulateMask step and convert straight from
@@ -341,15 +340,22 @@
}
func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpSrc(dst *image.Alpha, r image.Rectangle) {
- // TODO: add SIMD implementations.
// TODO: non-zero vs even-odd winding?
if r == dst.Bounds() && r == z.Bounds() {
// We bypass the z.accumulateMask step and convert straight from
// z.bufF32 or z.bufU32 to dst.Pix.
if z.useFloatingPointMath {
- floatingAccumulateOpSrc(dst.Pix, z.bufF32)
+ if haveFloatingAccumulateSIMD {
+ floatingAccumulateOpSrcSIMD(dst.Pix, z.bufF32)
+ } else {
+ floatingAccumulateOpSrc(dst.Pix, z.bufF32)
+ }
} else {
- fixedAccumulateOpSrc(dst.Pix, z.bufU32)
+ if haveFixedAccumulateSIMD {
+ fixedAccumulateOpSrcSIMD(dst.Pix, z.bufU32)
+ } else {
+ fixedAccumulateOpSrc(dst.Pix, z.bufU32)
+ }
}
return
}
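
Callers need no changes to benefit; the dispatch above is automatic. A sketch
of a program that exercises the OpSrc fast path (written against the
package's present-day exported API, so the details are illustrative rather
than part of this CL):

    package main

    import (
        "image"
        "image/draw"

        "golang.org/x/image/vector"
    )

    func main() {
        z := vector.NewRasterizer(256, 256)
        z.DrawOp = draw.Src // Select the OpSrc code paths.
        z.MoveTo(10, 10)
        z.LineTo(246, 10)
        z.LineTo(128, 246)
        z.ClosePath()

        // An opaque uniform source and a rectangle equal to both the dst
        // and rasterizer bounds reach rasterizeDstAlphaSrcOpaqueOpSrc,
        // and from there the SIMD accumulators when the CPU supports them.
        dst := image.NewAlpha(image.Rect(0, 0, 256, 256))
        z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{})
    }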