shiny/driver/internal/swizzle: new package.

On my amd64 desktop machine:
BenchmarkBGRA-8      	    3000	    469214 ns/op
BenchmarkPureGoBGRA-8	     500	   3267103 ns/op

When swizzling a 1920x1080 RGBA pixel buffer, there's a 7x difference
between 3.27ms and 0.47ms, and that 3-ish millisecond difference is a
noticeable fraction of the 16.67ms that a 60Hz refresh rate gives you.

Thanks to Aaron Jacobs for his help with SIMD assembly.

Change-Id: I8c1a50cc3f038824e07442492f8f0f6b22c83728
Reviewed-on: https://go-review.googlesource.com/13003
Reviewed-by: David Crawshaw <crawshaw@golang.org>
diff --git a/shiny/driver/internal/swizzle/swizzle_amd64.go b/shiny/driver/internal/swizzle/swizzle_amd64.go
new file mode 100644
index 0000000..dac0246
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_amd64.go
@@ -0,0 +1,11 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package swizzle
+
+const (
+	haveSIMD16 = true
+)
+
+func bgra16(p []byte)
diff --git a/shiny/driver/internal/swizzle/swizzle_amd64.s b/shiny/driver/internal/swizzle/swizzle_amd64.s
new file mode 100644
index 0000000..b1bb67a
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_amd64.s
@@ -0,0 +1,42 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT ·bgra16(SB),NOSPLIT,$0-24
+	MOVQ	p+0(FP), SI
+	MOVQ	len+8(FP), DI
+
+	// Sanity check that len is a multiple of 16.
+	MOVQ	DI, AX
+	ANDQ	$15, AX
+	CMPQ	AX, $0
+	JNE	done
+
+	// Make the shuffle control mask (16-byte register X0) look like this,
+	// where the low order byte comes first:
+	//
+	// 02 01 00 03  06 05 04 07  0a 09 08 0b  0e 0d 0c 0f
+	//
+	// Load the bottom 8 bytes into X0, the top into X1, then interleave them
+	// into X0.
+	MOVQ	$0x0704050603000102, AX
+	MOVQ	AX, X0
+	MOVQ	$0x0f0c0d0e0b08090a, AX
+	MOVQ	AX, X1
+	PUNPCKLQDQ	X1, X0
+
+	ADDQ	SI, DI
+loop:
+	CMPQ	SI, DI
+	JEQ	done
+
+	MOVOU	(SI), X1
+	PSHUFB	X0, X1
+	MOVOU	X1, (SI)
+
+	ADDQ	$16, SI
+	JMP	loop
+done:
+	RET
diff --git a/shiny/driver/internal/swizzle/swizzle_common.go b/shiny/driver/internal/swizzle/swizzle_common.go
new file mode 100644
index 0000000..be52fd0
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_common.go
@@ -0,0 +1,28 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package swizzle provides functions for converting between RGBA pixel
+// formats.
+package swizzle
+
+// BGRA converts a pixel buffer between Go's RGBA and other systems' BGRA byte
+// orders.
+//
+// It panics if the input slice length is not a multiple of 4.
+func BGRA(p []byte) {
+	if len(p)%4 != 0 {
+		panic("input slice length is not a multiple of 4")
+	}
+
+	// Use SIMD code for 16-byte chunks, if supported.
+	if haveSIMD16 {
+		n := len(p) &^ (16 - 1)
+		bgra16(p[:n])
+		p = p[n:]
+	}
+
+	for i := 0; i < len(p); i += 4 {
+		p[i+0], p[i+2] = p[i+2], p[i+0]
+	}
+}
diff --git a/shiny/driver/internal/swizzle/swizzle_other.go b/shiny/driver/internal/swizzle/swizzle_other.go
new file mode 100644
index 0000000..ff342d7
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_other.go
@@ -0,0 +1,13 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64
+
+package swizzle
+
+const (
+	haveSIMD16 = false
+)
+
+func bgra16(p []byte) { panic("unreachable") }
diff --git a/shiny/driver/internal/swizzle/swizzle_test.go b/shiny/driver/internal/swizzle/swizzle_test.go
new file mode 100644
index 0000000..83a6d0c
--- /dev/null
+++ b/shiny/driver/internal/swizzle/swizzle_test.go
@@ -0,0 +1,86 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package swizzle
+
+import (
+	"bytes"
+	"math/rand"
+	"testing"
+)
+
+func TestBGRAShortInput(t *testing.T) {
+	const s = "012.456.89A.CDE.GHI.KLM.O"
+	testCases := []string{
+		0: "012.456.89A.CDE.GHI.KLM.O",
+		1: "210.456.89A.CDE.GHI.KLM.O",
+		2: "210.654.89A.CDE.GHI.KLM.O",
+		3: "210.654.A98.CDE.GHI.KLM.O",
+		4: "210.654.A98.EDC.GHI.KLM.O",
+		5: "210.654.A98.EDC.IHG.KLM.O",
+		6: "210.654.A98.EDC.IHG.MLK.O",
+	}
+	for i, want := range testCases {
+		b := []byte(s)
+		BGRA(b[:4*i])
+		got := string(b)
+		if got != want {
+			t.Errorf("i=%d: got %q, want %q", i, got, want)
+		}
+		changed := got != s
+		wantChanged := i != 0
+		if changed != wantChanged {
+			t.Errorf("i=%d: changed=%t, want %t", i, changed, wantChanged)
+		}
+	}
+}
+
+func TestBGRARandomInput(t *testing.T) {
+	r := rand.New(rand.NewSource(1))
+	fastBuf := make([]byte, 1024)
+	slowBuf := make([]byte, 1024)
+	for i := range fastBuf {
+		fastBuf[i] = uint8(r.Intn(256))
+	}
+	copy(slowBuf, fastBuf)
+
+	for i := 0; i < 100000; i++ {
+		o := r.Intn(len(fastBuf))
+		n := r.Intn(len(fastBuf)-o) &^ 0x03
+		BGRA(fastBuf[o : o+n])
+		pureGoBGRA(slowBuf[o : o+n])
+		if bytes.Equal(fastBuf, slowBuf) {
+			continue
+		}
+		for j := range fastBuf {
+			x := fastBuf[j]
+			y := slowBuf[j]
+			if x != y {
+				t.Fatalf("iter %d: swizzling [%d:%d+%d]: bytes differ at offset %d (aka %d+%d): %#02x vs %#02x",
+					i, o, o, n, j, o, j-o, x, y)
+			}
+		}
+	}
+}
+
+func pureGoBGRA(p []byte) {
+	if len(p)%4 != 0 {
+		return
+	}
+	for i := 0; i < len(p); i += 4 {
+		p[i+0], p[i+2] = p[i+2], p[i+0]
+	}
+}
+
+func benchmarkBGRA(b *testing.B, f func([]byte)) {
+	const w, h = 1920, 1080 // 1080p RGBA.
+	buf := make([]byte, 4*w*h)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		f(buf)
+	}
+}
+
+func BenchmarkBGRA(b *testing.B)       { benchmarkBGRA(b, BGRA) }
+func BenchmarkPureGoBGRA(b *testing.B) { benchmarkBGRA(b, pureGoBGRA) }
diff --git a/shiny/driver/x11driver/buffer.go b/shiny/driver/x11driver/buffer.go
index 9d32500..15dc8f0 100644
--- a/shiny/driver/x11driver/buffer.go
+++ b/shiny/driver/x11driver/buffer.go
@@ -11,6 +11,8 @@
 	"unsafe"
 
 	"github.com/BurntSushi/xgb/shm"
+
+	"golang.org/x/exp/shiny/driver/internal/swizzle"
 )
 
 type bufferImpl struct {
@@ -42,20 +44,7 @@
 	b.mu.Unlock()
 
 	if needsSwizzle {
-		swizzle(b.buf)
-	}
-}
-
-// swizzle converts a pixel buffer between Go's RGBA byte order and X11's BGRA
-// byte order.
-//
-// TODO: optimize this.
-func swizzle(p []byte) {
-	if len(p)%4 != 0 {
-		return
-	}
-	for i := 0; i < len(p); i += 4 {
-		p[i+0], p[i+2] = p[i+2], p[i+0]
+		swizzle.BGRA(b.buf)
 	}
 }
 
@@ -72,7 +61,7 @@
 	if released {
 		b.cleanUp()
 	} else {
-		swizzle(b.buf)
+		swizzle.BGRA(b.buf)
 	}
 }