internal/bytealg: optimize cmpbody for ppc64le/ppc64

Vectorize the cmpbody loop for compares of 32 bytes or more on both
POWER8 (LE and BE) and POWER9 (LE and BE), and improve the
performance of smaller compares.

Performance improves for most sizes with this change on POWER8, POWER9
and POWER10. For very small sizes (up to 8 bytes), the overhead of
calling the function starts to impact performance.

POWER9:
name               old time/op  new time/op  delta
BytesCompare/1     4.60ns ± 0%  5.49ns ± 0%  +19.27%
BytesCompare/2     4.68ns ± 0%  5.46ns ± 0%  +16.71%
BytesCompare/4     6.58ns ± 0%  5.49ns ± 0%  -16.58%
BytesCompare/8     4.89ns ± 0%  5.46ns ± 0%  +11.64%
BytesCompare/16    5.21ns ± 0%  4.96ns ± 0%   -4.70%
BytesCompare/32    5.09ns ± 0%  4.98ns ± 0%   -2.14%
BytesCompare/64    6.40ns ± 0%  5.96ns ± 0%   -6.84%
BytesCompare/128   11.3ns ± 0%   8.1ns ± 0%  -28.09%
BytesCompare/256   15.1ns ± 0%  12.8ns ± 0%  -15.16%
BytesCompare/512   26.5ns ± 0%  23.3ns ± 5%  -12.03%
BytesCompare/1024  50.2ns ± 0%  41.6ns ± 2%  -17.01%
BytesCompare/2048  99.3ns ± 0%  86.5ns ± 0%  -12.88%

Change-Id: I24f93b2910591e6829ddd8509aa6eeaa6355c609
Reviewed-on: https://go-review.googlesource.com/c/go/+/362797
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
index fc6f170c..cbe0525 100644
--- a/src/internal/bytealg/compare_ppc64x.s
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -21,11 +21,12 @@
 	CMP     R5,R6,CR7
 	CMP	R3,R4,CR6
 	BEQ	CR7,equal
-#ifdef	GOARCH_ppc64le
-	BR	cmpbodyLE<>(SB)
-#else
-	BR      cmpbodyBE<>(SB)
-#endif
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
 equal:
 	BEQ	CR6,done
 	MOVD	$1, R8
@@ -52,11 +53,12 @@
 	CMP     R5,R6,CR7
 	CMP	R3,R4,CR6
 	BEQ	CR7,equal
-#ifdef	GOARCH_ppc64le
-	BR	cmpbodyLE<>(SB)
-#else
-	BR      cmpbodyBE<>(SB)
-#endif
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
 equal:
 	BEQ	CR6,done
 	MOVD	$1, R8
@@ -70,209 +72,431 @@
 	MOVD	$0, R3
 	RET
 
-// Do an efficient memcmp for ppc64le
+#ifdef GOARCH_ppc64le
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+GLOBL byteswap<>+0(SB), RODATA, $16
+#define SWAP V21
+#endif
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER8
 // R3 = a len
 // R4 = b len
 // R5 = a addr
 // R6 = b addr
 // On exit:
 // R3 = return value
-TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	R3,R8		// set up length
 	CMP	R3,R4,CR2	// unequal?
-	BC	12,8,setuplen	// BLT CR2
+	BLT	CR2,setuplen	// BLT CR2
 	MOVD	R4,R8		// use R4 for comparison len
 setuplen:
-	MOVD	R8,CTR		// set up loop counter
-	CMP	R8,$8		// only optimize >=8
-	BLT	simplecheck
-	DCBT	(R5)		// cache hint
-	DCBT	(R6)
 	CMP	R8,$32		// optimize >= 32
 	MOVD	R8,R9
-	BLT	setup8a		// 8 byte moves only
-setup32a:
-	SRADCC	$5,R8,R9	// number of 32 byte chunks
-	MOVD	R9,CTR
+	BLT	setup8a		// optimize < 32
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$64
+	BLT	cmp32		// process size 32-63
 
-	// Special processing for 32 bytes or longer.
-	// Loading this way is faster and correct as long as the
-	// doublewords being compared are equal. Once they
-	// are found unequal, reload them in proper byte order
-	// to determine greater or less than.
-loop32a:
-	MOVD	0(R5),R9	// doublewords to compare
-	MOVD	0(R6),R10	// get 4 doublewords
-	MOVD	8(R5),R14
-	MOVD	8(R6),R15
-	CMPU	R9,R10		// bytes equal?
-	MOVD	$0,R16		// set up for cmpne
-	BNE	cmpne		// further compare for LT or GT
-	MOVD	16(R5),R9	// get next pair of doublewords
-	MOVD	16(R6),R10
-	CMPU	R14,R15		// bytes match?
-	MOVD	$8,R16		// set up for cmpne
-	BNE	cmpne		// further compare for LT or GT
-	MOVD	24(R5),R14	// get next pair of doublewords
-	MOVD    24(R6),R15
-	CMPU	R9,R10		// bytes match?
-	MOVD	$16,R16		// set up for cmpne
-	BNE	cmpne		// further compare for LT or GT
-	MOVD	$-8,R16		// for cmpne, R5,R6 already inc by 32
-	ADD	$32,R5		// bump up to next 32
-	ADD	$32,R6
-	CMPU    R14,R15		// bytes match?
-	BC	8,2,loop32a	// br ctr and cr
-	BNE	cmpne
+	DCBT	(R5)		// optimize >= 64
+	DCBT	(R6)		// cache hint
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:// process size 64 and greater
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different	// jump out if its different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
+	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
+	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// increment to next 64 bytes of A
+	ADD	$64,R6,R6	// increment to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+	
+	CMPU	R9,$32
+	BGE	cmp32		// branch to cmp32 if there are 32-63 bytes remaining
+	CMPU	R9,$0
+	BNE	rem		// loop to rem if the remainder is not 0
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$0
+	BNE	rem		// loop to rem if the remainder is not 0
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+rem:
+	MOVD	R9,R8
 	ANDCC	$24,R8,R9	// Any 8 byte chunks?
 	BEQ	leftover	// and result is 0
+	BR	setup8a
+
+different:
+#ifdef	GOARCH_ppc64le
+	MOVD	$byteswap<>+00(SB), R16
+	LXVD2X	(R16)(R0),SWAP	// Set up swap string
+
+	VPERM	V3,V3,SWAP,V3
+	VPERM	V4,V4,SWAP,V4
+#endif
+	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BEQ	lower
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
+	MFVSRD	VS35,R16
+	VSLDOI	$8,V4,V4,V4
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
 setup8a:
-	SRADCC	$3,R9,R9	// get the 8 byte count
+	SRADCC	$3,R8,R9	// get the 8 byte count
 	BEQ	leftover	// shifted value is 0
+	CMPU	R8,$8		// optimize 8byte move
+	BEQ	size8
+	CMPU	R8,$16
+	BEQ	size16
 	MOVD	R9,CTR		// loop count for doublewords
 loop8:
-	MOVDBR	(R5+R0),R9	// doublewords to compare
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// doublewords to compare
 	MOVDBR	(R6+R0),R10	// LE compare order
+#else
+	MOVD	(R5+R0),R16	// doublewords to compare
+	MOVD	(R6+R0),R10	// BE compare order
+#endif
 	ADD	$8,R5
 	ADD	$8,R6
-	CMPU	R9,R10		// match?
+	CMPU	R16,R10		// match?
 	BC	8,2,loop8	// bt ctr <> 0 && cr
 	BGT	greater
 	BLT	less
 leftover:
 	ANDCC	$7,R8,R9	// check for leftover bytes
-	MOVD	R9,CTR		// save the ctr
-	BNE	simple		// leftover bytes
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,8,less
-	BR	greater
+	BEQ	zeroremainder
 simplecheck:
-	CMP	R8,$0		// remaining compare length 0
-	BNE	simple		// do simple compare
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,8,less	// 1st len < 2nd len, result less
-	BR	greater		// 1st len > 2nd len must be greater
-simple:
-	MOVBZ	0(R5), R9	// get byte from 1st operand
-	ADD	$1,R5
-	MOVBZ	0(R6), R10	// get byte from 2nd operand
-	ADD	$1,R6
-	CMPU	R9, R10
-	BC	8,2,simple	// bc ctr <> 0 && cr
-	BGT	greater		// 1st > 2nd
-	BLT	less		// 1st < 2nd
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,9,greater	// 2nd len > 1st len
-	BR	less		// must be less
-cmpne:				// only here is not equal
-	MOVDBR	(R5+R16),R8	// reload in reverse order
-	MOVDBR	(R6+R16),R9
-	CMPU	R8,R9		// compare correct endianness
-	BGT	greater		// here only if NE
-less:
-	MOVD	$-1, R3		// return value if A < B
+	MOVD	R0,R14
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef  GOARCH_ppc64le
+	MOVWBR	(R5)(R14),R10
+	MOVWBR	(R6)(R14),R11
+#else
+	MOVWZ	(R5)(R14),R10
+	MOVWZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-4,R9
+	ADD	$4,R14
+	PCALIGN	$16
+
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef  GOARCH_ppc64le
+	MOVHBR	(R5)(R14),R10
+	MOVHBR	(R6)(R14),R11
+#else
+	MOVHZ	(R5)(R14),R10
+	MOVHZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-2,R9
+	ADD	$2,R14
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5)(R14),R10
+	MOVBZ	(R6)(R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
+
+less:	MOVD	$-1,R3		// return value if A < B
 	RET
+size16:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+zeroremainder:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+size8:
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+#else
+	MOVD	(R5+R0),R16	// doublewords to compare
+	MOVD	(R6+R0),R10	// BE compare order
+#endif
+	CMPU	R16,R10		// match?
+	BGT	greater
+	BLT	less
+	BGT	CR2,greater	// 1st len > 2nd len
+	BLT	CR2,less	// 1st len < 2nd len
 equal:
 	MOVD	$0, R3		// return value if A == B
 	RET
 greater:
-	MOVD	$1, R3		// return value if A > B
+	MOVD	$1,R3		// return value if A > B
 	RET
 
-// Do an efficient memcmp for ppc64 (BE)
+// Do an efficient memcmp for ppc64le/ppc64/POWER9
 // R3 = a len
 // R4 = b len
 // R5 = a addr
 // R6 = b addr
 // On exit:
 // R3 = return value
-TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	R3,R8		// set up length
 	CMP	R3,R4,CR2	// unequal?
-	BC	12,8,setuplen	// BLT CR2
+	BLT	CR2,setuplen	// BLT CR2
 	MOVD	R4,R8		// use R4 for comparison len
 setuplen:
-	MOVD	R8,CTR		// set up loop counter
-	CMP	R8,$8		// only optimize >=8
-	BLT	simplecheck
-	DCBT	(R5)		// cache hint
-	DCBT	(R6)
-	CMP	R8,$32		// optimize >= 32
+	CMP	R8,$16		// optimize for size<16
 	MOVD	R8,R9
-	BLT	setup8a		// 8 byte moves only
+	BLT	simplecheck
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$32		// optimize for size 16-31
+	BLT	cmp16
+	CMP	R8,$64
+	BLT	cmp32		// optimize for size 32-63
+	DCBT	(R5)		// optimize for size>=64
+	DCBT	(R6)		// cache hint
 
-setup32a:
-	SRADCC	$5,R8,R9	// number of 32 byte chunks
-	MOVD	R9,CTR
-loop32a:
-	MOVD	0(R5),R9	// doublewords to compare
-	MOVD	0(R6),R10	// get 4 doublewords
-	MOVD	8(R5),R14
-	MOVD	8(R6),R15
-	CMPU	R9,R10		// bytes equal?
-	BLT	less		// found to be less
-	BGT	greater		// found to be greater
-	MOVD	16(R5),R9	// get next pair of doublewords
-	MOVD	16(R6),R10
-	CMPU	R14,R15		// bytes match?
-	BLT	less		// found less
-	BGT	greater		// found greater
-	MOVD	24(R5),R14	// get next pair of doublewords
-	MOVD	24(R6),R15
-	CMPU	R9,R10		// bytes match?
-	BLT	less		// found to be less
-	BGT	greater		// found to be greater
-	ADD	$32,R5		// bump up to next 32
-	ADD	$32,R6
-	CMPU	R14,R15		// bytes match?
-	BC	8,2,loop32a	// br ctr and cr
-	BLT	less		// with BE, byte ordering is
-	BGT	greater		// good for compare
-	ANDCC	$24,R8,R9	// Any 8 byte chunks?
-	BEQ	leftover	// and result is 0
-setup8a:
-	SRADCC	$3,R9,R9	// get the 8 byte count
-	BEQ	leftover	// shifted value is 0
-	MOVD	R9,CTR		// loop count for doublewords
-loop8:
-	MOVD	(R5),R9
-	MOVD	(R6),R10
-	ADD	$8,R5
-	ADD	$8,R6
-	CMPU	R9,R10		// match?
-	BC	8,2,loop8	// bt ctr <> 0 && cr
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:// process size 64 and greater
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+	VCMPNEBCC	V3,V4,V1	// record comparison into V1
+	BNE	CR6,different	// jump out if its different
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R11)(R5),V3	// load bytes of A at offset 32 into vector
+	LXVB16X	(R11)(R6),V4	// load bytes of B at offset 32 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R12)(R5),V3	// load bytes of A at offset 48 into vector
+	LXVB16X	(R12)(R6),V4	// load bytes of B at offset 48 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// increment to next 64 bytes of A
+	ADD	$64,R6,R6	// increment to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+
+	CMPU	R9,$32
+	BGE	cmp32		// branch to cmp32 if there are 32-63 bytes remaining
+	CMPU	R9,$16
+	BGE	cmp16		// loop to cmp16 if there are 16-31 bytes left
+	CMPU	R9,$0
+	BNE	simplecheck	// loop to simplecheck for remaining bytes
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+
+	VCMPNEBCC	V3,V4,V1	// record comparison into V1
+	BNE	CR6,different	// jump out if its different
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$16		// loop to cmp16 if there are 16-31 bytes left
+	BGE	cmp16
+	CMPU	R9,$0
+	BNE	simplecheck	// loop to simplecheck for remainder bytes
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+different:
+
+	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BEQ	lower
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	MFVSRLD	VS35,R16	// next move lower doublewords of A and B into GPR for comparison
+	MFVSRLD	VS36,R10
+
+	CMPU	R16,R10
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+
+greater:
+	MOVD	$1,R3		// return value if A > B
+	RET
+cmp16:
+	ANDCC	$16,R9,R31
+	BEQ	tail
+
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$16,R5
+	ADD	$16,R6
+tail:
+	ANDCC	$15,R9		// R9 = leftover (<16); if nonzero, reload the last 16 bytes (length is at least 16)
+	BEQ	end
+
+	ADD	R9,R5
+	ADD	R9,R6
+	MOVD	$-16,R10
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+end:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if BLT CR2 that is, len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+simplecheck:
+	MOVD	$0,R14		// process 8 bytes
+	CMP	R9,$8
+	BLT	word
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R14),R10
+	MOVDBR	(R6+R14),R11
+#else
+	MOVD	(R5+R14),R10
+	MOVD	(R6+R14),R11
+#endif
+	CMPU	R10,R11
 	BGT	greater
 	BLT	less
-leftover:
-	ANDCC	$7,R8,R9	// check for leftover bytes
-	MOVD	R9,CTR		// save the ctr
-	BNE	simple		// leftover bytes
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,8,less
-	BR	greater
-simplecheck:
-	CMP	R8,$0		// remaining compare length 0
-	BNE	simple		// do simple compare
-	BC	12,10,equal	// test CR2 for length comparison
-	BC 	12,8,less	// 1st len < 2nd len, result less
-	BR	greater		// same len, must be equal
-simple:
-	MOVBZ	0(R5),R9	// get byte from 1st operand
-	ADD	$1,R5
-	MOVBZ	0(R6),R10	// get byte from 2nd operand
-	ADD	$1,R6
-	CMPU	R9,R10
-	BC	8,2,simple	// bc ctr <> 0 && cr
-	BGT	greater		// 1st > 2nd
-	BLT	less		// 1st < 2nd
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,9,greater	// 2nd len > 1st len
+	ADD	$8,R14
+	ADD	$-8,R9
+	PCALIGN	$16
+word:
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef  GOARCH_ppc64le
+	MOVWBR	(R5+R14),R10
+	MOVWBR	(R6+R14),R11
+#else
+	MOVWZ	(R5+R14),R10
+	MOVWZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$4,R14
+	ADD	$-4,R9
+	PCALIGN	$16
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef  GOARCH_ppc64le
+	MOVHBR	(R5+R14),R10
+	MOVHBR	(R6+R14),R11
+#else
+	MOVHZ	(R5+R14),R10
+	MOVHZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$2,R14
+	ADD	$-2,R9
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5+R14),R10
+	MOVBZ	(R6+R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
 less:
-	MOVD	$-1, R3		// return value if A < B
+	MOVD	$-1,R3		// return value if A < B
 	RET
 equal:
 	MOVD	$0, R3		// return value if A == B
 	RET
-greater:
-	MOVD	$1, R3		// return value if A > B
-	RET