x/crypto/poly1305: fix memory alignment fault in ARM
The current ARM implementation assumes that the input message
is aligned in memory, so it can cause an alignment fault when
unaligned access is not enabled. It may also generate incorrect
output on ARMv5.
This change fixes the issue by temporarily copying unaligned
input to a local aligned buffer. Although there may be a better
way to handle unaligned access, this approach is safe on all
ARM versions.
This change also adds a test and benchmarks with unaligned
data. The benchmark results on a Raspberry Pi 2 are:
Benchmark64 2000000 812 ns/op 78.81 MB/s
Benchmark1K 200000 7809 ns/op 131.12 MB/s
Benchmark64Unaligned 2000000 967 ns/op 66.13 MB/s
Benchmark1KUnaligned 200000 10316 ns/op 99.26 MB/s
Change-Id: I189cc1b7bb6c67a04c9877271fb27326f2896e82
Reviewed-on: https://go-review.googlesource.com/12797
Reviewed-by: Adam Langley <agl@golang.org>
diff --git a/poly1305/poly1305_arm.s b/poly1305/poly1305_arm.s
index c9ceaeb..c153867 100644
--- a/poly1305/poly1305_arm.s
+++ b/poly1305/poly1305_arm.s
@@ -47,6 +47,16 @@
MOVM.IA.W (R13), [R4-R11]
RET
+#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
+ MOVBU (offset+0)(Rsrc), Rtmp; \
+ MOVBU Rtmp, (offset+0)(Rdst); \
+ MOVBU (offset+1)(Rsrc), Rtmp; \
+ MOVBU Rtmp, (offset+1)(Rdst); \
+ MOVBU (offset+2)(Rsrc), Rtmp; \
+ MOVBU Rtmp, (offset+2)(Rdst); \
+ MOVBU (offset+3)(Rsrc), Rtmp; \
+ MOVBU Rtmp, (offset+3)(Rdst)
+
TEXT poly1305_blocks_armv6<>(SB),4,$-4
MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13)
SUB $128, R13
@@ -66,7 +76,19 @@
CMP $16, R12
BLO poly1305_blocks_armv6_done
poly1305_blocks_armv6_mainloop:
+ WORD $0xe31e0003 // TST R14, #3 not working see issue 5921
+ BEQ poly1305_blocks_armv6_mainloop_aligned
+ ADD $48, R13, g
+ MOVW_UNALIGNED(R14, g, R0, 0)
+ MOVW_UNALIGNED(R14, g, R0, 4)
+ MOVW_UNALIGNED(R14, g, R0, 8)
+ MOVW_UNALIGNED(R14, g, R0, 12)
+ MOVM.IA (g), [R0-R3]
+ ADD $16, R14
+ B poly1305_blocks_armv6_mainloop_loaded
+poly1305_blocks_armv6_mainloop_aligned:
MOVM.IA.W (R14), [R0-R3]
+poly1305_blocks_armv6_mainloop_loaded:
MOVW R0>>26, g
MOVW R1>>20, R11
MOVW R2>>14, R12
@@ -174,6 +196,16 @@
MOVM.IA.W (R13), [R4, R5, R6, R7, R8, R9, g, R11, R14]
RET
+#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
+ MOVBU.P 1(Rsrc), Rtmp; \
+ MOVBU.P Rtmp, 1(Rdst); \
+ MOVBU.P 1(Rsrc), Rtmp; \
+ MOVBU.P Rtmp, 1(Rdst)
+
+#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
+ MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
+ MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
+
TEXT poly1305_finish_ext_armv6<>(SB),4,$-4
MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13)
SUB $16, R13, R13
@@ -189,16 +221,32 @@
MOVW R0, 4(R13)
MOVW R0, 8(R13)
MOVW R0, 12(R13)
+ WORD $0xe3110003 // TST R1, #3 not working see issue 5921
+ BEQ poly1305_finish_ext_armv6_aligned
WORD $0xe3120008 // TST R2, #8 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip8
- MOVM.IA.W (R1), [g-R11]
- MOVM.IA.W [g-R11], (R9)
+ MOVWP_UNALIGNED(R1, R9, g)
+ MOVWP_UNALIGNED(R1, R9, g)
poly1305_finish_ext_armv6_skip8:
WORD $0xe3120004 // TST $4, R2 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip4
+ MOVWP_UNALIGNED(R1, R9, g)
+poly1305_finish_ext_armv6_skip4:
+ WORD $0xe3120002 // TST $2, R2 not working see issue 5921
+ BEQ poly1305_finish_ext_armv6_skip2
+ MOVHUP_UNALIGNED(R1, R9, g)
+ B poly1305_finish_ext_armv6_skip2
+poly1305_finish_ext_armv6_aligned:
+ WORD $0xe3120008 // TST R2, #8 not working see issue 5921
+ BEQ poly1305_finish_ext_armv6_skip8_aligned
+ MOVM.IA.W (R1), [g-R11]
+ MOVM.IA.W [g-R11], (R9)
+poly1305_finish_ext_armv6_skip8_aligned:
+ WORD $0xe3120004 // TST $4, R2 not working see issue 5921
+ BEQ poly1305_finish_ext_armv6_skip4_aligned
MOVW.P 4(R1), g
MOVW.P g, 4(R9)
-poly1305_finish_ext_armv6_skip4:
+poly1305_finish_ext_armv6_skip4_aligned:
WORD $0xe3120002 // TST $2, R2 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip2
MOVHU.P 2(R1), g
diff --git a/poly1305/poly1305_test.go b/poly1305/poly1305_test.go
index 2c6d1bc..b3e9231 100644
--- a/poly1305/poly1305_test.go
+++ b/poly1305/poly1305_test.go
@@ -7,6 +7,7 @@
import (
"bytes"
"testing"
+ "unsafe"
)
var testData = []struct {
@@ -34,41 +35,52 @@
},
}
-func TestSum(t *testing.T) {
+func testSum(t *testing.T, unaligned bool) {
var out [16]byte
var key [32]byte
for i, v := range testData {
+ in := v.in
+ if unaligned {
+ in = unalignBytes(in)
+ }
copy(key[:], v.k)
- Sum(&out, v.in, &key)
+ Sum(&out, in, &key)
if !bytes.Equal(out[:], v.correct) {
t.Errorf("%d: expected %x, got %x", i, v.correct, out[:])
}
}
}
-func Benchmark1K(b *testing.B) {
- b.StopTimer()
+func TestSum(t *testing.T) { testSum(t, false) }
+func TestSumUnaligned(t *testing.T) { testSum(t, true) }
+
+func benchmark(b *testing.B, size int, unaligned bool) {
var out [16]byte
var key [32]byte
- in := make([]byte, 1024)
+ in := make([]byte, size)
+ if unaligned {
+ in = unalignBytes(in)
+ }
b.SetBytes(int64(len(in)))
- b.StartTimer()
-
+ b.ResetTimer()
for i := 0; i < b.N; i++ {
Sum(&out, in, &key)
}
}
-func Benchmark64(b *testing.B) {
- b.StopTimer()
- var out [16]byte
- var key [32]byte
- in := make([]byte, 64)
- b.SetBytes(int64(len(in)))
- b.StartTimer()
+func Benchmark64(b *testing.B) { benchmark(b, 64, false) }
+func Benchmark1K(b *testing.B) { benchmark(b, 1024, false) }
+func Benchmark64Unaligned(b *testing.B) { benchmark(b, 64, true) }
+func Benchmark1KUnaligned(b *testing.B) { benchmark(b, 1024, true) }
- for i := 0; i < b.N; i++ {
- Sum(&out, in, &key)
+func unalignBytes(in []byte) []byte {
+ out := make([]byte, len(in)+1)
+ if uintptr(unsafe.Pointer(&out[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
+ out = out[1:]
+ } else {
+ out = out[:len(in)]
}
+ copy(out, in)
+ return out
}