crypto,internal/bytealg: fix assembly that clobbers BP
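BP should be callee-saved. It will be saved automatically if a function
declares a nonzero frame size, so p256MulInternal, p256SqrInternal, and
md5's ·block now declare an 8-byte frame instead of $0. Otherwise, the
assembly needs to avoid this register: internal/bytealg's Index code now
uses R8 in place of BP, and src/runtime/sys_linux_amd64.s now keeps the
old SP in R12 in place of BP.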
Change-Id: If3f551efa42d830c8793d9f0183cb8daad7a2ab5
Reviewed-on: https://go-review.googlesource.com/c/go/+/248260
Run-TryBot: Keith Randall <khr@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
diff --git a/src/crypto/elliptic/p256_asm_amd64.s b/src/crypto/elliptic/p256_asm_amd64.s
index 7afa54a..c77b11b 100644
--- a/src/crypto/elliptic/p256_asm_amd64.s
+++ b/src/crypto/elliptic/p256_asm_amd64.s
@@ -1336,7 +1336,7 @@
RET
/* ---------------------------------------*/
-TEXT p256MulInternal(SB),NOSPLIT,$0
+TEXT p256MulInternal(SB),NOSPLIT,$8
MOVQ acc4, mul0
MULQ t0
MOVQ mul0, acc0
@@ -1519,7 +1519,7 @@
RET
/* ---------------------------------------*/
-TEXT p256SqrInternal(SB),NOSPLIT,$0
+TEXT p256SqrInternal(SB),NOSPLIT,$8
MOVQ acc4, mul0
MULQ acc5
@@ -2345,4 +2345,3 @@
RET
/* ---------------------------------------*/
-
diff --git a/src/crypto/md5/md5block_amd64.s b/src/crypto/md5/md5block_amd64.s
index 90d932b..7c7d92d 100644
--- a/src/crypto/md5/md5block_amd64.s
+++ b/src/crypto/md5/md5block_amd64.s
@@ -13,7 +13,7 @@
// Licence: I hereby disclaim the copyright on this code and place it
// in the public domain.
-TEXT ·block(SB),NOSPLIT,$0-32
+TEXT ·block(SB),NOSPLIT,$8-32
MOVQ dig+0(FP), BP
MOVQ p+8(FP), SI
MOVQ p_len+16(FP), DX
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
index 4459820..6193b57 100644
--- a/src/internal/bytealg/index_amd64.s
+++ b/src/internal/bytealg/index_amd64.s
@@ -8,7 +8,7 @@
TEXT ·Index(SB),NOSPLIT,$0-56
MOVQ a_base+0(FP), DI
MOVQ a_len+8(FP), DX
- MOVQ b_base+24(FP), BP
+ MOVQ b_base+24(FP), R8
MOVQ b_len+32(FP), AX
MOVQ DI, R10
LEAQ ret+48(FP), R11
@@ -17,7 +17,7 @@
TEXT ·IndexString(SB),NOSPLIT,$0-40
MOVQ a_base+0(FP), DI
MOVQ a_len+8(FP), DX
- MOVQ b_base+16(FP), BP
+ MOVQ b_base+16(FP), R8
MOVQ b_len+24(FP), AX
MOVQ DI, R10
LEAQ ret+32(FP), R11
@@ -26,7 +26,7 @@
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
-// BP: pointer to string, that we are searching for
+// R8: pointer to string, that we are searching for
// R11: address, where to put return value
// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
TEXT indexbody<>(SB),NOSPLIT,$0
@@ -37,11 +37,11 @@
no_sse42:
CMPQ AX, $2
JA _3_or_more
- MOVW (BP), BP
+ MOVW (R8), R8
LEAQ -1(DI)(DX*1), DX
loop2:
MOVW (DI), SI
- CMPW SI,BP
+ CMPW SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
@@ -50,12 +50,12 @@
_3_or_more:
CMPQ AX, $3
JA _4_or_more
- MOVW 1(BP), BX
- MOVW (BP), BP
+ MOVW 1(R8), BX
+ MOVW (R8), R8
LEAQ -2(DI)(DX*1), DX
loop3:
MOVW (DI), SI
- CMPW SI,BP
+ CMPW SI,R8
JZ partial_success3
ADDQ $1,DI
CMPQ DI,DX
@@ -72,11 +72,11 @@
_4_or_more:
CMPQ AX, $4
JA _5_or_more
- MOVL (BP), BP
+ MOVL (R8), R8
LEAQ -3(DI)(DX*1), DX
loop4:
MOVL (DI), SI
- CMPL SI,BP
+ CMPL SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
@@ -87,11 +87,11 @@
JA _8_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
- MOVL -4(BP)(AX*1), BX
- MOVL (BP), BP
+ MOVL -4(R8)(AX*1), BX
+ MOVL (R8), R8
loop5to7:
MOVL (DI), SI
- CMPL SI,BP
+ CMPL SI,R8
JZ partial_success5to7
ADDQ $1,DI
CMPQ DI,DX
@@ -108,11 +108,11 @@
_8_or_more:
CMPQ AX, $8
JA _9_or_more
- MOVQ (BP), BP
+ MOVQ (R8), R8
LEAQ -7(DI)(DX*1), DX
loop8:
MOVQ (DI), SI
- CMPQ SI,BP
+ CMPQ SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
@@ -123,11 +123,11 @@
JA _16_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
- MOVQ -8(BP)(AX*1), BX
- MOVQ (BP), BP
+ MOVQ -8(R8)(AX*1), BX
+ MOVQ (R8), R8
loop9to15:
MOVQ (DI), SI
- CMPQ SI,BP
+ CMPQ SI,R8
JZ partial_success9to15
ADDQ $1,DI
CMPQ DI,DX
@@ -144,7 +144,7 @@
_16_or_more:
CMPQ AX, $16
JA _17_or_more
- MOVOU (BP), X1
+ MOVOU (R8), X1
LEAQ -15(DI)(DX*1), DX
loop16:
MOVOU (DI), X2
@@ -161,8 +161,8 @@
JA _32_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
- MOVOU -16(BP)(AX*1), X0
- MOVOU (BP), X1
+ MOVOU -16(R8)(AX*1), X0
+ MOVOU (R8), X1
loop17to31:
MOVOU (DI), X2
PCMPEQB X1,X2
@@ -188,7 +188,7 @@
_32_or_more:
CMPQ AX, $32
JA _33_to_63
- VMOVDQU (BP), Y1
+ VMOVDQU (R8), Y1
LEAQ -31(DI)(DX*1), DX
loop32:
VMOVDQU (DI), Y2
@@ -203,8 +203,8 @@
_33_to_63:
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
- VMOVDQU -32(BP)(AX*1), Y0
- VMOVDQU (BP), Y1
+ VMOVDQU -32(R8)(AX*1), Y0
+ VMOVDQU (R8), Y1
loop33to63:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
@@ -241,10 +241,10 @@
// This value was determined experimentally and is the ~same
// on Nehalem (first with SSE42) and Haswell.
JAE _9_or_more
- LEAQ 16(BP), SI
+ LEAQ 16(R8), SI
TESTW $0xff0, SI
JEQ no_sse42
- MOVOU (BP), X1
+ MOVOU (R8), X1
LEAQ -15(DI)(DX*1), SI
MOVQ $16, R9
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index b60057c..621c01b 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -212,7 +212,7 @@
// due to stack probes inserted to avoid stack/heap collisions.
// See issue #20427.
- MOVQ SP, BP // Save old SP; BP unchanged by C code.
+ MOVQ SP, R12 // Save old SP; R12 unchanged by C code.
get_tls(CX)
MOVQ g(CX), AX
@@ -250,7 +250,7 @@
MOVQ 0(SP), AX // sec
MOVQ 8(SP), DX // nsec
ret:
- MOVQ BP, SP // Restore real SP
+ MOVQ R12, SP // Restore real SP
// Restore vdsoPC, vdsoSP
// We don't worry about being signaled between the two stores.
// If we are not in a signal handler, we'll restore vdsoSP to 0,
@@ -277,7 +277,7 @@
TEXT runtime·nanotime1(SB),NOSPLIT,$16-8
// Switch to g0 stack. See comment above in runtime·walltime.
- MOVQ SP, BP // Save old SP; BP unchanged by C code.
+ MOVQ SP, R12 // Save old SP; R12 unchanged by C code.
get_tls(CX)
MOVQ g(CX), AX
@@ -315,7 +315,7 @@
MOVQ 0(SP), AX // sec
MOVQ 8(SP), DX // nsec
ret:
- MOVQ BP, SP // Restore real SP
+ MOVQ R12, SP // Restore real SP
// Restore vdsoPC, vdsoSP
// We don't worry about being signaled between the two stores.
// If we are not in a signal handler, we'll restore vdsoSP to 0,