math/big: add fast path for amd64 addVW for large z
This matches the pure Go fast path added in the previous commit.
I will leave other architectures to those with ready access to hardware.
name old time/op new time/op delta
AddVW/1-8 3.60ns ± 3% 3.59ns ± 1% ~ (p=0.147 n=91+86)
AddVW/2-8 3.92ns ± 1% 3.91ns ± 2% -0.36% (p=0.000 n=86+92)
AddVW/3-8 4.33ns ± 5% 4.46ns ± 5% +2.94% (p=0.000 n=96+97)
AddVW/4-8 4.76ns ± 5% 4.82ns ± 5% +1.28% (p=0.000 n=95+92)
AddVW/5-8 5.40ns ± 1% 5.42ns ± 0% +0.47% (p=0.000 n=76+71)
AddVW/10-8 8.03ns ± 1% 7.80ns ± 5% -2.90% (p=0.000 n=73+96)
AddVW/100-8 43.8ns ± 5% 17.9ns ± 1% -59.12% (p=0.000 n=94+81)
AddVW/1000-8 428ns ± 4% 85ns ± 6% -80.20% (p=0.000 n=96+99)
AddVW/10000-8 4.22µs ± 2% 1.80µs ± 3% -57.32% (p=0.000 n=69+92)
AddVW/100000-8 44.8µs ± 8% 31.5µs ± 3% -29.76% (p=0.000 n=99+90)
name old time/op new time/op delta
SubVW/1-8 3.53ns ± 2% 3.63ns ± 5% +2.97% (p=0.000 n=94+93)
SubVW/2-8 4.33ns ± 5% 4.01ns ± 2% -7.36% (p=0.000 n=90+85)
SubVW/3-8 4.32ns ± 2% 4.32ns ± 5% ~ (p=0.084 n=87+97)
SubVW/4-8 4.70ns ± 2% 4.83ns ± 6% +2.77% (p=0.000 n=85+96)
SubVW/5-8 5.84ns ± 1% 5.35ns ± 1% -8.35% (p=0.000 n=87+87)
SubVW/10-8 8.01ns ± 4% 7.54ns ± 4% -5.84% (p=0.000 n=98+97)
SubVW/100-8 43.9ns ± 5% 17.9ns ± 1% -59.20% (p=0.000 n=98+76)
SubVW/1000-8 426ns ± 2% 85ns ± 3% -80.13% (p=0.000 n=90+98)
SubVW/10000-8 4.24µs ± 2% 1.81µs ± 3% -57.28% (p=0.000 n=74+91)
SubVW/100000-8 44.5µs ± 4% 31.5µs ± 2% -29.33% (p=0.000 n=84+91)
Change-Id: I10dd361cbaca22197c27e7734c0f50065292afbb
Reviewed-on: https://go-review.googlesource.com/c/go/+/164969
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Robert Griesemer <gri@golang.org>
diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s
index e9c8887..a0d1660 100644
--- a/src/math/big/arith_amd64.s
+++ b/src/math/big/arith_amd64.s
@@ -143,6 +143,8 @@
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),NOSPLIT,$0
MOVQ z_len+8(FP), DI
+ CMPQ DI, $32
+ JG large
MOVQ x+24(FP), R8
MOVQ y+48(FP), CX // c = y
MOVQ z+0(FP), R10
@@ -189,12 +191,16 @@
E3: MOVQ CX, c+56(FP) // return c
RET
+large:
+ JMP ·addVWlarge(SB)
// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
TEXT ·subVW(SB),NOSPLIT,$0
MOVQ z_len+8(FP), DI
+ CMPQ DI, $32
+ JG large
MOVQ x+24(FP), R8
MOVQ y+48(FP), CX // c = y
MOVQ z+0(FP), R10
@@ -242,6 +248,8 @@
E4: MOVQ CX, c+56(FP) // return c
RET
+large:
+ JMP ·subVWlarge(SB)
// func shlVU(z, x []Word, s uint) (c Word)