salsa20/salsa: fix keystream loop in amd64 assembly when overflowing 32-bit counter

Fixes golang/go#30965

Change-Id: I83a804d555c048e0124c35f95c9e611b2c5bdb01
Reviewed-on: https://team-review.git.corp.google.com/c/golang/go-private/+/436856
Reviewed-by: Adam Langley <agl@google.com>
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/168406
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Run-TryBot: Filippo Valsorda <filippo@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
diff --git a/salsa20/salsa/salsa20_amd64.go b/salsa20/salsa/salsa20_amd64.go
index f9269c3..656e8df 100644
--- a/salsa20/salsa/salsa20_amd64.go
+++ b/salsa20/salsa/salsa20_amd64.go
@@ -6,10 +6,9 @@
 
 package salsa
 
-// This function is implemented in salsa2020_amd64.s.
-
 //go:noescape
 
+// salsa2020XORKeyStream is implemented in salsa20_amd64.s.
 func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
 
 // XORKeyStream crypts bytes from in to out using the given key and counters.
diff --git a/salsa20/salsa/salsa2020_amd64.s b/salsa20/salsa/salsa20_amd64.s
similarity index 99%
rename from salsa20/salsa/salsa2020_amd64.s
rename to salsa20/salsa/salsa20_amd64.s
index 22afbdc..18085d2 100644
--- a/salsa20/salsa/salsa2020_amd64.s
+++ b/salsa20/salsa/salsa20_amd64.s
@@ -99,30 +99,24 @@
 	MOVL  36 (SP),CX
 	MOVL DX,288(SP)
 	MOVL CX,304(SP)
-	ADDQ $1,DX
 	SHLQ $32,CX
 	ADDQ CX,DX
+	ADDQ $1,DX
 	MOVQ DX,CX
 	SHRQ $32,CX
 	MOVL DX, 292 (SP)
 	MOVL CX, 308 (SP)
 	ADDQ $1,DX
-	SHLQ $32,CX
-	ADDQ CX,DX
 	MOVQ DX,CX
 	SHRQ $32,CX
 	MOVL DX, 296 (SP)
 	MOVL CX, 312 (SP)
 	ADDQ $1,DX
-	SHLQ $32,CX
-	ADDQ CX,DX
 	MOVQ DX,CX
 	SHRQ $32,CX
 	MOVL DX, 300 (SP)
 	MOVL CX, 316 (SP)
 	ADDQ $1,DX
-	SHLQ $32,CX
-	ADDQ CX,DX
 	MOVQ DX,CX
 	SHRQ $32,CX
 	MOVL DX,16(SP)
diff --git a/salsa20/salsa/salsa20_amd64_test.go b/salsa20/salsa/salsa20_amd64_test.go
new file mode 100644
index 0000000..d4e779c
--- /dev/null
+++ b/salsa20/salsa/salsa20_amd64_test.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64,!appengine,!gccgo
+
+package salsa
+
+import (
+	"bytes"
+	"testing"
+)
+
+func TestCounterOverflow(t *testing.T) {
+	in := make([]byte, 4096)
+	key := &[32]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
+		6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2}
+	for n, counter := range []*[16]byte{
+		&[16]byte{0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0},             // zero counter
+		&[16]byte{0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff}, // counter about to overflow 32 bits
+		&[16]byte{0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 0xff, 0xff, 0xff, 0xff}, // counter above 32 bits
+	} {
+		out := make([]byte, 4096)
+		XORKeyStream(out, in, counter, key)
+		outGeneric := make([]byte, 4096)
+		genericXORKeyStream(outGeneric, in, counter, key)
+		if !bytes.Equal(out, outGeneric) {
+			t.Errorf("%d: assembly and go implementations disagree", n)
+		}
+	}
+}
diff --git a/salsa20/salsa/salsa20_noasm.go b/salsa20/salsa/salsa20_noasm.go
new file mode 100644
index 0000000..8a46bd2
--- /dev/null
+++ b/salsa20/salsa/salsa20_noasm.go
@@ -0,0 +1,14 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine gccgo
+
+package salsa
+
+// XORKeyStream crypts bytes from in to out using the given key and counters.
+// In and out must overlap entirely or not at all. Counter
+// contains the raw salsa20 counter bytes (both nonce and block counter).
+func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
+	genericXORKeyStream(out, in, counter, key)
+}
diff --git a/salsa20/salsa/salsa20_ref.go b/salsa20/salsa/salsa20_ref.go
index 22126d1..68169c6 100644
--- a/salsa20/salsa/salsa20_ref.go
+++ b/salsa20/salsa/salsa20_ref.go
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64 appengine gccgo
-
 package salsa
 
 const rounds = 20
@@ -202,10 +200,9 @@
 	out[63] = byte(x15 >> 24)
 }
 
-// XORKeyStream crypts bytes from in to out using the given key and counters.
-// In and out must overlap entirely or not at all. Counter
-// contains the raw salsa20 counter bytes (both nonce and block counter).
-func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
+// genericXORKeyStream is the generic implementation of XORKeyStream, used when
+// no assembly implementation is available and as the reference in tests.
+func genericXORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
 	var block [64]byte
 	var counterCopy [16]byte
 	copy(counterCopy[:], counter[:])