internal/bytealg: port bytes.Index and bytes.Count to reg ABI on ppc64x

This change adds support for the reg ABI to the Index and Count
functions for ppc64/ppc64le.

Most Index and Count benchmarks show improved performance on
POWER9 with this change. Similar numbers were observed on POWER8 and POWER10.

name                             old time/op    new time/op    delta
Index/32                         71.0ns ± 0%    67.9ns ± 0%   -4.42% (p=0.001 n=7+6)
IndexEasy/10                     17.5ns ± 0%    17.2ns ± 0%   -1.30% (p=0.001 n=7+7)

name             old time/op    new time/op    delta
Count/10           26.6ns ± 0%    25.0ns ± 1%   -6.02%  (p=0.001 n=7+7)
Count/32           78.6ns ± 0%    74.7ns ± 0%   -4.97%  (p=0.001 n=7+7)
Count/4K           5.03µs ± 0%    5.03µs ± 0%   -0.07%  (p=0.000 n=6+7)
CountEasy/10       26.9ns ± 0%    25.2ns ± 1%   -6.31%  (p=0.001 n=7+7)
CountSingle/32     11.8ns ± 0%     9.9ns ± 0%  -15.70%  (p=0.002 n=6+6)

Change-Id: Ibd146c04f8107291c55f9e6100b8264dfccc41ae
Reviewed-on: https://go-review.googlesource.com/c/go/+/355509
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Cherry Mui <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s
index 94163cb..dbafd06 100644
--- a/src/internal/bytealg/count_ppc64x.s
+++ b/src/internal/bytealg/count_ppc64x.s
@@ -8,24 +8,37 @@
 #include "go_asm.h"
 #include "textflag.h"
 
-TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
+TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+#ifdef GOEXPERIMENT_regabiargs
+// R3 = byte array pointer (used as received, no move needed)
+// R4 = length (used as received, no move needed)
+        MOVBZ R6,R5               // R5 = byte to count, zero-extended from R6 (presumably R5 held the slice cap)
+#else
+
 	MOVD  b_base+0(FP), R3    // R3 = byte array pointer
 	MOVD  b_len+8(FP), R4     // R4 = length
 	MOVBZ c+24(FP), R5        // R5 = byte to count, zero-extended
 	MOVD  $ret+32(FP), R14    // R14 = &ret, only needed on the stack-argument path
+#endif
 	BR    countbytebody<>(SB) // branch (no link) to the shared body, which expects R3/R4/R5 as set above
 
-TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
+TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
+#ifdef GOEXPERIMENT_regabiargs
+// R3 = string data pointer (used as received, no move needed)
+// R4 = length (used as received, no move needed)
+        MOVBZ R5,R5               // R5 = byte to count; zero-extend it in place
+#else
 	MOVD  s_base+0(FP), R3    // R3 = string data pointer
 	MOVD  s_len+8(FP), R4     // R4 = length
 	MOVBZ c+16(FP), R5        // R5 = byte to count, zero-extended
 	MOVD  $ret+24(FP), R14    // R14 = &ret, only needed on the stack-argument path
+#endif
 	BR    countbytebody<>(SB) // branch (no link) to the shared body, which expects R3/R4/R5 as set above
 
 // R3: addr of string
 // R4: len of string
 // R5: byte to count
-// R14: addr for return value
+// R14: addr for return value when not regabi
 // endianness shouldn't matter since we are just counting and order
 // is irrelevant
 TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
@@ -94,5 +107,10 @@
 	BR  small
 
 done:
+#ifdef GOEXPERIMENT_regabiargs
+        MOVD R18, R3    // return count
+#else
 	MOVD R18, (R14) // return count
+#endif
+
 	RET
diff --git a/src/internal/bytealg/index_ppc64x.s b/src/internal/bytealg/index_ppc64x.s
index 3ed9442..f587a8a 100644
--- a/src/internal/bytealg/index_ppc64x.s
+++ b/src/internal/bytealg/index_ppc64x.s
@@ -46,12 +46,20 @@
 
 GLOBL byteswap<>+0(SB), RODATA, $16
 
-TEXT ·Index(SB), NOSPLIT|NOFRAME, $0-56
+TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+#ifdef GOEXPERIMENT_regabiargs 
+// R3 = byte array pointer (used as received, no move needed)
+// R4 = length (used as received, no move needed)
+        MOVD R6,R5             // R5 = separator pointer; must be copied out of R6 before R6 is overwritten below
+        MOVD R7,R6             // R6 = separator length, copied from R7
+#else
 	MOVD a_base+0(FP), R3  // R3 = byte array pointer
 	MOVD a_len+8(FP), R4   // R4 = length
 	MOVD b_base+24(FP), R5 // R5 = separator pointer
 	MOVD b_len+32(FP), R6  // R6 = separator length
 	MOVD $ret+48(FP), R14  // R14 = &ret, only needed on the stack-argument path
+#endif
+
 
 #ifdef GOARCH_ppc64le
 	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
@@ -63,12 +71,15 @@
 power8:
 	BR indexbody<>(SB)
 
-TEXT ·IndexString(SB), NOSPLIT|NOFRAME, $0-40
+TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+#ifndef GOEXPERIMENT_regabiargs
 	MOVD a_base+0(FP), R3  // R3 = string data pointer (stack ABI only; nothing is loaded here under regabiargs)
 	MOVD a_len+8(FP), R4   // R4 = length
 	MOVD b_base+16(FP), R5 // R5 = separator pointer
 	MOVD b_len+24(FP), R6  // R6 = separator length
 	MOVD $ret+32(FP), R14  // R14 = &ret, only needed on the stack-argument path
+#endif
+
 
 #ifdef GOARCH_ppc64le
 	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
@@ -420,8 +431,12 @@
 	BR         index17to32loop // Continue
 
 notfound:
+#ifdef GOEXPERIMENT_regabiargs
+        MOVD $-1, R3   // Return -1 if not found
+#else
 	MOVD $-1, R8   // Return -1 if not found
 	MOVD R8, (R14)
+#endif
 	RET
 
 index33plus:
@@ -432,12 +447,20 @@
 	SRD  $3, R25   // Convert from bits to bytes
 	ADD  R25, R7   // Add to current string address
 	SUB  R3, R7    // Subtract from start of string
+#ifdef GOEXPERIMENT_regabiargs
+        MOVD R7, R3    // Return byte where found
+#else
 	MOVD R7, (R14) // Return byte where found
+#endif
 	RET
 
 found:
 	SUB  R3, R7    // Return byte where found
+#ifdef GOEXPERIMENT_regabiargs
+        MOVD R7, R3
+#else
 	MOVD R7, (R14)
+#endif
 	RET
 
 TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
@@ -746,8 +769,12 @@
 	BR         index17to32loop // Continue
 
 notfound:
+#ifdef GOEXPERIMENT_regabiargs
+        MOVD $-1, R3   // Return -1 if not found
+#else
 	MOVD $-1, R8   // Return -1 if not found
 	MOVD R8, (R14)
+#endif
 	RET
 
 index33plus:
@@ -758,11 +785,19 @@
 	SRD  $3, R25   // Convert from bits to bytes
 	ADD  R25, R7   // Add to current string address
 	SUB  R3, R7    // Subtract from start of string
+#ifdef GOEXPERIMENT_regabiargs
+        MOVD R7, R3    // Return byte where found
+#else
 	MOVD R7, (R14) // Return byte where found
+#endif
 	RET
 
 found:
 	SUB  R3, R7    // Return byte where found
+#ifdef GOEXPERIMENT_regabiargs
+        MOVD R7, R3
+#else
 	MOVD R7, (R14)
+#endif
 	RET