x/crypto/poly1305: fix memory alignment fault in ARM

  The current ARM implementation assumes that the input message
  is aligned in memory, so it can cause an alignment fault when
  unaligned access is not enabled. It may also generate incorrect
  output on ARMv5.

  This change fixes the issue by temporarily copying unaligned
  input to an aligned local buffer. Although there may be a better
  way to handle unaligned access, this approach is safe on all
  ARM versions.

  This change also adds a test and benchmarks with unaligned
  data. The benchmark results on a Raspberry Pi 2 are:

  Benchmark64  2000000         812 ns/op    78.81 MB/s
  Benchmark1K   200000        7809 ns/op   131.12 MB/s
  Benchmark64Unaligned   2000000         967 ns/op    66.13 MB/s
  Benchmark1KUnaligned    200000       10316 ns/op    99.26 MB/s

Change-Id: I189cc1b7bb6c67a04c9877271fb27326f2896e82
Reviewed-on: https://go-review.googlesource.com/12797
Reviewed-by: Adam Langley <agl@golang.org>
diff --git a/poly1305/poly1305_arm.s b/poly1305/poly1305_arm.s
index c9ceaeb..c153867 100644
--- a/poly1305/poly1305_arm.s
+++ b/poly1305/poly1305_arm.s
@@ -47,6 +47,16 @@
   MOVM.IA.W (R13), [R4-R11]
   RET
 
+#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
+  MOVBU (offset+0)(Rsrc), Rtmp; \
+  MOVBU Rtmp, (offset+0)(Rdst); \
+  MOVBU (offset+1)(Rsrc), Rtmp; \
+  MOVBU Rtmp, (offset+1)(Rdst); \
+  MOVBU (offset+2)(Rsrc), Rtmp; \
+  MOVBU Rtmp, (offset+2)(Rdst); \
+  MOVBU (offset+3)(Rsrc), Rtmp; \
+  MOVBU Rtmp, (offset+3)(Rdst)
+
 TEXT poly1305_blocks_armv6<>(SB),4,$-4
   MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13)
   SUB $128, R13
@@ -66,7 +76,19 @@
   CMP $16, R12
   BLO poly1305_blocks_armv6_done
 poly1305_blocks_armv6_mainloop:
+  WORD $0xe31e0003 // TST R14, #3 not working see issue 5921
+  BEQ poly1305_blocks_armv6_mainloop_aligned
+  ADD $48, R13, g
+  MOVW_UNALIGNED(R14, g, R0, 0)
+  MOVW_UNALIGNED(R14, g, R0, 4)
+  MOVW_UNALIGNED(R14, g, R0, 8)
+  MOVW_UNALIGNED(R14, g, R0, 12)
+  MOVM.IA (g), [R0-R3]
+  ADD $16, R14
+  B poly1305_blocks_armv6_mainloop_loaded
+poly1305_blocks_armv6_mainloop_aligned:
   MOVM.IA.W (R14), [R0-R3]
+poly1305_blocks_armv6_mainloop_loaded:
   MOVW R0>>26, g
   MOVW R1>>20, R11
   MOVW R2>>14, R12
@@ -174,6 +196,16 @@
   MOVM.IA.W (R13), [R4, R5, R6, R7, R8, R9, g, R11, R14]
   RET
 
+#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
+  MOVBU.P 1(Rsrc), Rtmp; \
+  MOVBU.P Rtmp, 1(Rdst); \
+  MOVBU.P 1(Rsrc), Rtmp; \
+  MOVBU.P Rtmp, 1(Rdst)
+
+#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
+  MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
+  MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
+
 TEXT poly1305_finish_ext_armv6<>(SB),4,$-4
   MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13)
   SUB $16, R13, R13
@@ -189,16 +221,32 @@
   MOVW R0, 4(R13)
   MOVW R0, 8(R13)
   MOVW R0, 12(R13)
+  WORD $0xe3110003 // TST R1, #3 not working see issue 5921
+  BEQ poly1305_finish_ext_armv6_aligned
   WORD $0xe3120008 // TST R2, #8 not working see issue 5921
   BEQ poly1305_finish_ext_armv6_skip8
-  MOVM.IA.W (R1), [g-R11]
-  MOVM.IA.W [g-R11], (R9)
+  MOVWP_UNALIGNED(R1, R9, g)
+  MOVWP_UNALIGNED(R1, R9, g)
 poly1305_finish_ext_armv6_skip8:
   WORD $0xe3120004 // TST $4, R2 not working see issue 5921
   BEQ poly1305_finish_ext_armv6_skip4
+  MOVWP_UNALIGNED(R1, R9, g)
+poly1305_finish_ext_armv6_skip4:
+  WORD $0xe3120002 // TST $2, R2 not working see issue 5921
+  BEQ poly1305_finish_ext_armv6_skip2
+  MOVHUP_UNALIGNED(R1, R9, g)
+  B poly1305_finish_ext_armv6_skip2
+poly1305_finish_ext_armv6_aligned:
+  WORD $0xe3120008 // TST R2, #8 not working see issue 5921
+  BEQ poly1305_finish_ext_armv6_skip8_aligned
+  MOVM.IA.W (R1), [g-R11]
+  MOVM.IA.W [g-R11], (R9)
+poly1305_finish_ext_armv6_skip8_aligned:
+  WORD $0xe3120004 // TST $4, R2 not working see issue 5921
+  BEQ poly1305_finish_ext_armv6_skip4_aligned
   MOVW.P 4(R1), g
   MOVW.P g, 4(R9)
-poly1305_finish_ext_armv6_skip4:
+poly1305_finish_ext_armv6_skip4_aligned:
   WORD $0xe3120002 // TST $2, R2 not working see issue 5921
   BEQ poly1305_finish_ext_armv6_skip2
   MOVHU.P 2(R1), g
diff --git a/poly1305/poly1305_test.go b/poly1305/poly1305_test.go
index 2c6d1bc..b3e9231 100644
--- a/poly1305/poly1305_test.go
+++ b/poly1305/poly1305_test.go
@@ -7,6 +7,7 @@
 import (
 	"bytes"
 	"testing"
+	"unsafe"
 )
 
 var testData = []struct {
@@ -34,41 +35,52 @@
 	},
 }
 
-func TestSum(t *testing.T) {
+func testSum(t *testing.T, unaligned bool) {
 	var out [16]byte
 	var key [32]byte
 
 	for i, v := range testData {
+		in := v.in
+		if unaligned {
+			in = unalignBytes(in)
+		}
 		copy(key[:], v.k)
-		Sum(&out, v.in, &key)
+		Sum(&out, in, &key)
 		if !bytes.Equal(out[:], v.correct) {
 			t.Errorf("%d: expected %x, got %x", i, v.correct, out[:])
 		}
 	}
 }
 
-func Benchmark1K(b *testing.B) {
-	b.StopTimer()
+func TestSum(t *testing.T)          { testSum(t, false) }
+func TestSumUnaligned(t *testing.T) { testSum(t, true) }
+
+func benchmark(b *testing.B, size int, unaligned bool) {
 	var out [16]byte
 	var key [32]byte
-	in := make([]byte, 1024)
+	in := make([]byte, size)
+	if unaligned {
+		in = unalignBytes(in)
+	}
 	b.SetBytes(int64(len(in)))
-	b.StartTimer()
-
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		Sum(&out, in, &key)
 	}
 }
 
-func Benchmark64(b *testing.B) {
-	b.StopTimer()
-	var out [16]byte
-	var key [32]byte
-	in := make([]byte, 64)
-	b.SetBytes(int64(len(in)))
-	b.StartTimer()
+func Benchmark64(b *testing.B)          { benchmark(b, 64, false) }
+func Benchmark1K(b *testing.B)          { benchmark(b, 1024, false) }
+func Benchmark64Unaligned(b *testing.B) { benchmark(b, 64, true) }
+func Benchmark1KUnaligned(b *testing.B) { benchmark(b, 1024, true) }
 
-	for i := 0; i < b.N; i++ {
-		Sum(&out, in, &key)
+func unalignBytes(in []byte) []byte {
+	out := make([]byte, len(in)+1)
+	if uintptr(unsafe.Pointer(&out[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
+		out = out[1:]
+	} else {
+		out = out[:len(in)]
 	}
+	copy(out, in)
+	return out
 }