math/big: add fast path for amd64 addVW for large z

This matches the pure Go fast path added in the previous commit.

I will leave other architectures to those with ready access to hardware.

name            old time/op    new time/op    delta
AddVW/1-8         3.60ns ± 3%    3.59ns ± 1%      ~     (p=0.147 n=91+86)
AddVW/2-8         3.92ns ± 1%    3.91ns ± 2%    -0.36%  (p=0.000 n=86+92)
AddVW/3-8         4.33ns ± 5%    4.46ns ± 5%    +2.94%  (p=0.000 n=96+97)
AddVW/4-8         4.76ns ± 5%    4.82ns ± 5%    +1.28%  (p=0.000 n=95+92)
AddVW/5-8         5.40ns ± 1%    5.42ns ± 0%    +0.47%  (p=0.000 n=76+71)
AddVW/10-8        8.03ns ± 1%    7.80ns ± 5%    -2.90%  (p=0.000 n=73+96)
AddVW/100-8       43.8ns ± 5%    17.9ns ± 1%   -59.12%  (p=0.000 n=94+81)
AddVW/1000-8       428ns ± 4%      85ns ± 6%   -80.20%  (p=0.000 n=96+99)
AddVW/10000-8     4.22µs ± 2%    1.80µs ± 3%   -57.32%  (p=0.000 n=69+92)
AddVW/100000-8    44.8µs ± 8%    31.5µs ± 3%   -29.76%  (p=0.000 n=99+90)

name            old time/op    new time/op    delta
SubVW/1-8         3.53ns ± 2%    3.63ns ± 5%    +2.97%  (p=0.000 n=94+93)
SubVW/2-8         4.33ns ± 5%    4.01ns ± 2%    -7.36%  (p=0.000 n=90+85)
SubVW/3-8         4.32ns ± 2%    4.32ns ± 5%      ~     (p=0.084 n=87+97)
SubVW/4-8         4.70ns ± 2%    4.83ns ± 6%    +2.77%  (p=0.000 n=85+96)
SubVW/5-8         5.84ns ± 1%    5.35ns ± 1%    -8.35%  (p=0.000 n=87+87)
SubVW/10-8        8.01ns ± 4%    7.54ns ± 4%    -5.84%  (p=0.000 n=98+97)
SubVW/100-8       43.9ns ± 5%    17.9ns ± 1%   -59.20%  (p=0.000 n=98+76)
SubVW/1000-8       426ns ± 2%      85ns ± 3%   -80.13%  (p=0.000 n=90+98)
SubVW/10000-8     4.24µs ± 2%    1.81µs ± 3%   -57.28%  (p=0.000 n=74+91)
SubVW/100000-8    44.5µs ± 4%    31.5µs ± 2%   -29.33%  (p=0.000 n=84+91)

Change-Id: I10dd361cbaca22197c27e7734c0f50065292afbb
Reviewed-on: https://go-review.googlesource.com/c/go/+/164969
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Robert Griesemer <gri@golang.org>
diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s
index e9c8887..a0d1660 100644
--- a/src/math/big/arith_amd64.s
+++ b/src/math/big/arith_amd64.s
@@ -143,6 +143,8 @@
 // func addVW(z, x []Word, y Word) (c Word)
 TEXT ·addVW(SB),NOSPLIT,$0
 	MOVQ z_len+8(FP), DI
+	CMPQ DI, $32
+	JG large
 	MOVQ x+24(FP), R8
 	MOVQ y+48(FP), CX	// c = y
 	MOVQ z+0(FP), R10
@@ -189,12 +191,16 @@
 
 E3:	MOVQ CX, c+56(FP)	// return c
 	RET
+large:
+	JMP ·addVWlarge(SB)
 
 
 // func subVW(z, x []Word, y Word) (c Word)
 // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
 TEXT ·subVW(SB),NOSPLIT,$0
 	MOVQ z_len+8(FP), DI
+	CMPQ DI, $32
+	JG large
 	MOVQ x+24(FP), R8
 	MOVQ y+48(FP), CX	// c = y
 	MOVQ z+0(FP), R10
@@ -242,6 +248,8 @@
 
 E4:	MOVQ CX, c+56(FP)	// return c
 	RET
+large:
+	JMP ·subVWlarge(SB)
 
 
 // func shlVU(z, x []Word, s uint) (c Word)