shiny/driver/internal/swizzle: new package.
On my amd64 desktop machine:
BenchmarkBGRA-8 3000 469214 ns/op
BenchmarkPureGoBGRA-8 500 3267103 ns/op
When swizzling a 1920x1080 RGBA pixel buffer, there's a 7x difference
between 3.27ms and 0.47ms, and that 3-ish millisecond difference is a
noticeable fraction of the 16.67ms that a 60Hz refresh rate gives you.
Thanks to Aaron Jacobs for his help with SIMD assembly.
Change-Id: I8c1a50cc3f038824e07442492f8f0f6b22c83728
Reviewed-on: https://go-review.googlesource.com/13003
Reviewed-by: David Crawshaw <crawshaw@golang.org>
diff --git a/shiny/driver/internal/swizzle/swizzle_amd64.go b/shiny/driver/internal/swizzle/swizzle_amd64.go
new file mode 100644
index 0000000..dac0246
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_amd64.go
@@ -0,0 +1,11 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package swizzle
+
+const (
+ haveSIMD16 = true // BGRA hands whole 16-byte chunks to bgra16 when this is true.
+)
+
+func bgra16(p []byte) // No Go body: implemented by TEXT ·bgra16 in swizzle_amd64.s.
diff --git a/shiny/driver/internal/swizzle/swizzle_amd64.s b/shiny/driver/internal/swizzle/swizzle_amd64.s
new file mode 100644
index 0000000..b1bb67a
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_amd64.s
@@ -0,0 +1,42 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT ·bgra16(SB),NOSPLIT,$0-24 // func bgra16(p []byte)
+ MOVQ p_base+0(FP), SI // SI = address of the first byte of p.
+ MOVQ p_len+8(FP), DI // DI = len(p).
+
+ // Sanity check that len is a multiple of 16; if not, return with p untouched.
+ MOVQ DI, AX
+ ANDQ $15, AX
+ CMPQ AX, $0
+ JNE done
+
+ // Make the shuffle control mask (16-byte register X0) look like this,
+ // where the low order byte comes first:
+ //
+ // 02 01 00 03 06 05 04 07 0a 09 08 0b 0e 0d 0c 0f
+ //
+ // Load the bottom 8 bytes into X0, the top into X1, then interleave them
+ // into X0.
+ MOVQ $0x0704050603000102, AX
+ MOVQ AX, X0
+ MOVQ $0x0f0c0d0e0b08090a, AX
+ MOVQ AX, X1
+ PUNPCKLQDQ X1, X0
+
+ ADDQ SI, DI // DI = address one past the last byte of p.
+loop:
+ CMPQ SI, DI
+ JEQ done
+
+ MOVOU (SI), X1 // Load the next 16 bytes (four 4-byte pixels).
+ PSHUFB X0, X1 // Swap bytes 0 and 2 of each pixel. NOTE(review): PSHUFB is SSSE3; no CPU check is visible here - confirm.
+ MOVOU X1, (SI) // Store the swizzled pixels back in place.
+
+ ADDQ $16, SI
+ JMP loop
+done:
+ RET
diff --git a/shiny/driver/internal/swizzle/swizzle_common.go b/shiny/driver/internal/swizzle/swizzle_common.go
new file mode 100644
index 0000000..be52fd0
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_common.go
@@ -0,0 +1,28 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package swizzle provides functions for converting between RGBA pixel
+// formats.
+package swizzle
+
+// BGRA converts a pixel buffer between Go's RGBA and other systems' BGRA byte
+// orders, in place, by swapping the first and third byte of every 4-byte pixel.
+//
+// It panics if the input slice length is not a multiple of 4.
+func BGRA(p []byte) {
+ if len(p)%4 != 0 {
+ panic("input slice length is not a multiple of 4")
+ }
+
+ // Use SIMD code for the leading whole 16-byte chunks, if supported.
+ if haveSIMD16 {
+ n := len(p) &^ (16 - 1) // Round len(p) down to a multiple of 16.
+ bgra16(p[:n])
+ p = p[n:] // Leave the tail (fewer than 16 bytes) to the loop below.
+ }
+
+ for i := 0; i < len(p); i += 4 {
+ p[i+0], p[i+2] = p[i+2], p[i+0] // Bytes 1 and 3 of the pixel stay where they are.
+ }
+}
diff --git a/shiny/driver/internal/swizzle/swizzle_other.go b/shiny/driver/internal/swizzle/swizzle_other.go
new file mode 100644
index 0000000..ff342d7
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_other.go
@@ -0,0 +1,13 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64
+
+package swizzle
+
+const (
+ haveSIMD16 = false // No SIMD implementation in this build; BGRA uses only its pure Go loop.
+)
+
+func bgra16(p []byte) { panic("unreachable") } // Stub: BGRA only calls bgra16 when haveSIMD16 is true.
diff --git a/shiny/driver/internal/swizzle/swizzle_test.go b/shiny/driver/internal/swizzle/swizzle_test.go
new file mode 100644
index 0000000..83a6d0c
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_test.go
@@ -0,0 +1,86 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package swizzle
+
+import (
+ "bytes"
+ "math/rand"
+ "testing"
+)
+
+func TestBGRAShortInput(t *testing.T) {
+ const s = "012.456.89A.CDE.GHI.KLM.O" // Six 4-byte pixels followed by one extra byte.
+ testCases := []string{ // testCases[i] is s after swizzling only its first i pixels.
+ 0: "012.456.89A.CDE.GHI.KLM.O",
+ 1: "210.456.89A.CDE.GHI.KLM.O",
+ 2: "210.654.89A.CDE.GHI.KLM.O",
+ 3: "210.654.A98.CDE.GHI.KLM.O",
+ 4: "210.654.A98.EDC.GHI.KLM.O",
+ 5: "210.654.A98.EDC.IHG.KLM.O",
+ 6: "210.654.A98.EDC.IHG.MLK.O",
+ }
+ for i, want := range testCases {
+ b := []byte(s) // Fresh copy per case, so cases do not affect each other.
+ BGRA(b[:4*i])
+ got := string(b) // Compare the whole buffer, so writes past the prefix are caught too.
+ if got != want {
+ t.Errorf("i=%d: got %q, want %q", i, got, want)
+ }
+ changed := got != s
+ wantChanged := i != 0 // Only the empty prefix should leave the buffer unchanged.
+ if changed != wantChanged {
+ t.Errorf("i=%d: changed=%t, want %t", i, changed, wantChanged)
+ }
+ }
+}
+
+func TestBGRARandomInput(t *testing.T) {
+ r := rand.New(rand.NewSource(1)) // Fixed seed, so every run exercises the same sequence.
+ fastBuf := make([]byte, 1024)
+ slowBuf := make([]byte, 1024)
+ for i := range fastBuf {
+ fastBuf[i] = uint8(r.Intn(256))
+ }
+ copy(slowBuf, fastBuf) // Both buffers start out with identical random contents.
+
+ for i := 0; i < 100000; i++ { // The buffers are not reset, so the swizzles accumulate across iterations.
+ o := r.Intn(len(fastBuf))
+ n := r.Intn(len(fastBuf)-o) &^ 0x03 // Round down to a multiple of 4, as BGRA requires.
+ BGRA(fastBuf[o : o+n])
+ pureGoBGRA(slowBuf[o : o+n])
+ if bytes.Equal(fastBuf, slowBuf) {
+ continue
+ }
+ for j := range fastBuf { // Locate the first differing byte for the failure message.
+ x := fastBuf[j]
+ y := slowBuf[j]
+ if x != y {
+ t.Fatalf("iter %d: swizzling [%d:%d+%d]: bytes differ at offset %d (aka %d+%d): %#02x vs %#02x",
+ i, o, o, n, j, o, j-o, x, y)
+ }
+ }
+ }
+}
+
+func pureGoBGRA(p []byte) { // Plain Go swizzle that the test and benchmark use as the baseline for BGRA.
+ if len(p)%4 != 0 {
+ return // Unlike BGRA, a length that is not a multiple of 4 is silently ignored.
+ }
+ for i := 0; i < len(p); i += 4 {
+ p[i+0], p[i+2] = p[i+2], p[i+0] // Swap the first and third byte of each 4-byte pixel.
+ }
+}
+
+func benchmarkBGRA(b *testing.B, f func([]byte)) { // Calls f on one 1920x1080 buffer per benchmark iteration.
+ const w, h = 1920, 1080 // 1080p RGBA.
+ buf := make([]byte, 4*w*h) // 4 bytes per pixel.
+ b.ResetTimer() // Keep the buffer allocation out of the measurement.
+ for i := 0; i < b.N; i++ {
+ f(buf) // The same buffer is swizzled in place on every iteration.
+ }
+}
+
+func BenchmarkBGRA(b *testing.B) { benchmarkBGRA(b, BGRA) } // The package entry point, which uses bgra16 when haveSIMD16 is true.
+func BenchmarkPureGoBGRA(b *testing.B) { benchmarkBGRA(b, pureGoBGRA) } // The pure Go baseline.
diff --git a/shiny/driver/x11driver/buffer.go b/shiny/driver/x11driver/buffer.go
index 9d32500..15dc8f0 100644
--- a/shiny/driver/x11driver/buffer.go
+++ b/shiny/driver/x11driver/buffer.go
@@ -11,6 +11,8 @@
"unsafe"
"github.com/BurntSushi/xgb/shm"
+
+ "golang.org/x/exp/shiny/driver/internal/swizzle"
)
type bufferImpl struct {
@@ -42,20 +44,7 @@
b.mu.Unlock()
if needsSwizzle {
- swizzle(b.buf)
- }
-}
-
-// swizzle converts a pixel buffer between Go's RGBA byte order and X11's BGRA
-// byte order.
-//
-// TODO: optimize this.
-func swizzle(p []byte) {
- if len(p)%4 != 0 {
- return
- }
- for i := 0; i < len(p); i += 4 {
- p[i+0], p[i+2] = p[i+2], p[i+0]
+ swizzle.BGRA(b.buf)
}
}
@@ -72,7 +61,7 @@
if released {
b.cleanUp()
} else {
- swizzle(b.buf)
+ swizzle.BGRA(b.buf)
}
}