chacha20poly1305: fix detection of BMI2 on amd64

This change adds a check for BMI2 support as an additional condition
for selecting the AVX2 version of the algorithm. This fixes
a crash on platforms that support AVX2 but do not support BMI2.

Change-Id: I5438d4ec84265c79a51c1439265a33b1be04878a
Reviewed-on: https://go-review.googlesource.com/34852
Reviewed-by: Adam Langley <agl@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
diff --git a/chacha20poly1305/chacha20poly1305_amd64.s b/chacha20poly1305/chacha20poly1305_amd64.s
index 1c895d8..39c58b4 100644
--- a/chacha20poly1305/chacha20poly1305_amd64.s
+++ b/chacha20poly1305/chacha20poly1305_amd64.s
@@ -278,8 +278,15 @@
 	MOVQ ad+72(FP), adp
 
 	// Check for AVX2 support
-	CMPB runtime·support_avx2(SB), $1
-	JE   chacha20Poly1305Open_AVX2
+	CMPB runtime·support_avx2(SB), $0
+	JE   noavx2bmi2Open
+
+	// Check BMI2 bit for MULXQ.
+	// runtime·cpuid_ebx7 is always available here
+	// because it passed avx2 check
+	TESTL $(1<<8), runtime·cpuid_ebx7(SB)
+	JNE   chacha20Poly1305Open_AVX2
+noavx2bmi2Open:
 
 	// Special optimization, for very short buffers
 	CMPQ inl, $128
@@ -1485,8 +1492,15 @@
 	MOVQ ad+72(FP), adp
 
 	// Check for AVX2 support
-	CMPB runtime·support_avx2(SB), $1
-	JE   chacha20Poly1305Seal_AVX2
+	CMPB runtime·support_avx2(SB), $0
+	JE   noavx2bmi2Seal
+
+	// Check BMI2 bit for MULXQ.
+	// runtime·cpuid_ebx7 is always available here
+	// because it passed avx2 check
+	TESTL $(1<<8), runtime·cpuid_ebx7(SB)
+	JNE   chacha20Poly1305Seal_AVX2
+noavx2bmi2Seal:
 
 	// Special optimization, for very short buffers
 	CMPQ inl, $128