crypto/md5: speed up aligned writes and test/bench unaligned writes
Write() can safely use uint32 loads when input is aligned.
Also add test and benchmarks for unaligned writes.

Benchmark result obtained by Dave Cheney on ARMv5TE @ 1.2GHz:
benchmark                       old ns/op    new ns/op    delta
BenchmarkHash8Bytes                  4104         3417  -16.74%
BenchmarkHash1K                     22061        11208  -49.20%
BenchmarkHash8K                    146630        65148  -55.57%
BenchmarkHash8BytesUnaligned         4128         3436  -16.76%
BenchmarkHash1KUnaligned            22054        21473   -2.63%
BenchmarkHash8KUnaligned           146658       146909   +0.17%

benchmark                        old MB/s     new MB/s  speedup
BenchmarkHash8Bytes                  1.95         2.34    1.20x
BenchmarkHash1K                     46.42        91.36    1.97x
BenchmarkHash8K                     55.87       125.74    2.25x
BenchmarkHash8BytesUnaligned         1.94         2.33    1.20x
BenchmarkHash1KUnaligned            46.43        47.69    1.03x
BenchmarkHash8KUnaligned            55.86        55.76    1.00x

R=golang-dev, dave, bradfitz
CC=golang-dev
https://golang.org/cl/6782072
diff --git a/src/pkg/crypto/md5/gen.go b/src/pkg/crypto/md5/gen.go
index 1a9c4ab..966bdae 100644
--- a/src/pkg/crypto/md5/gen.go
+++ b/src/pkg/crypto/md5/gen.go
@@ -203,6 +203,8 @@
 			// less code and run 1.3x faster if we take advantage of that.
 			// My apologies.
 			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
+		} else if uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
+			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
 		} else {
 			X = &xbuf
 			j := 0
diff --git a/src/pkg/crypto/md5/md5_test.go b/src/pkg/crypto/md5/md5_test.go
index c810251..cac39ad 100644
--- a/src/pkg/crypto/md5/md5_test.go
+++ b/src/pkg/crypto/md5/md5_test.go
@@ -9,6 +9,7 @@
 	"fmt"
 	"io"
 	"testing"
+	"unsafe"
 )
 
 type md5Test struct {
@@ -54,13 +55,19 @@
 	for i := 0; i < len(golden); i++ {
 		g := golden[i]
 		c := md5.New()
-		for j := 0; j < 3; j++ {
+		buf := make([]byte, len(g.in)+4)
+		for j := 0; j < 3+4; j++ {
 			if j < 2 {
 				io.WriteString(c, g.in)
-			} else {
+			} else if j == 2 {
 				io.WriteString(c, g.in[0:len(g.in)/2])
 				c.Sum(nil)
 				io.WriteString(c, g.in[len(g.in)/2:])
+			} else if j > 2 {
+				// test unaligned write
+				buf = buf[1:]
+				copy(buf, g.in)
+				c.Write(buf[:len(g.in)])
 			}
 			s := fmt.Sprintf("%x", c.Sum(nil))
 			if s != g.out {
@@ -80,11 +87,18 @@
 }
 
 var bench = md5.New()
-var buf = make([]byte, 8192)
+var buf = make([]byte, 8192+1)
+var sum = make([]byte, bench.Size())
 
-func benchmarkSize(b *testing.B, size int) {
+func benchmarkSize(b *testing.B, size int, unaligned bool) {
 	b.SetBytes(int64(size))
-	sum := make([]byte, bench.Size())
+	buf := buf
+	if unaligned {
+		if uintptr(unsafe.Pointer(&buf[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
+			buf = buf[1:]
+		}
+	}
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		bench.Reset()
 		bench.Write(buf[:size])
@@ -93,13 +107,25 @@
 }
 
 func BenchmarkHash8Bytes(b *testing.B) {
-	benchmarkSize(b, 8)
+	benchmarkSize(b, 8, false)
 }
 
 func BenchmarkHash1K(b *testing.B) {
-	benchmarkSize(b, 1024)
+	benchmarkSize(b, 1024, false)
 }
 
 func BenchmarkHash8K(b *testing.B) {
-	benchmarkSize(b, 8192)
+	benchmarkSize(b, 8192, false)
+}
+
+func BenchmarkHash8BytesUnaligned(b *testing.B) {
+	benchmarkSize(b, 8, true)
+}
+
+func BenchmarkHash1KUnaligned(b *testing.B) {
+	benchmarkSize(b, 1024, true)
+}
+
+func BenchmarkHash8KUnaligned(b *testing.B) {
+	benchmarkSize(b, 8192, true)
 }
diff --git a/src/pkg/crypto/md5/md5block.go b/src/pkg/crypto/md5/md5block.go
index 5dbdf56..59f8f6f 100644
--- a/src/pkg/crypto/md5/md5block.go
+++ b/src/pkg/crypto/md5/md5block.go
@@ -22,6 +22,8 @@
 			// less code and run 1.3x faster if we take advantage of that.
 			// My apologies.
 			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
+		} else if uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
+			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
 		} else {
 			X = &xbuf
 			j := 0