|  | // Copyright 2018 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | //go:build ppc64 || ppc64le | 
|  |  | 
|  | #include "go_asm.h" | 
|  | #include "textflag.h" | 
|  |  | 
// IndexByte is the []byte entry point. It moves the byte to search for into
// R5 and branches to indexbytebody, which leaves the result in R3.
|  | TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 | 
|  | // R3 = byte array pointer | 
|  | // R4 = length | 
// NOTE(review): the byte arrives in R6, presumably because R5 carries the
// slice capacity under ABIInternal - confirm against the Go declaration.
|  | MOVD	R6, R5		// R5 = byte | 
|  | BR	indexbytebody<>(SB) | 
|  |  | 
// IndexByteString is the string entry point. Its incoming registers already
// match what indexbytebody expects (R3 = address, R4 = length, R5 = byte), so
// it branches there without any setup.
|  | TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32 | 
|  | // R3 = string | 
|  | // R4 = length | 
|  | // R5 = byte | 
|  | BR	indexbytebody<>(SB) | 
|  |  | 
// indexbytevbperm is the 16-byte, read-only control vector loaded into V0 and
// passed to VBPERMQ in indexbytebody. It is only defined when GOPPC64_power9
// is not set, matching the only code path that uses it.
// Its bytes are 0x00, 0x08, ..., 0x78 - presumably the index of the first bit
// of each byte lane, so that VBPERMQ gathers one bit per compared byte.
// The ppc64le and ppc64 literals spell the same sixteen values; they differ
// only in byte order within each 8-byte word.
|  | #ifndef GOPPC64_power9 | 
|  | #ifdef GOARCH_ppc64le | 
|  | DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800 | 
|  | DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840 | 
|  | #else | 
|  | DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038 | 
|  | DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078 | 
|  | #endif | 
|  | GLOBL indexbytevbperm<>+0(SB), RODATA, $16 | 
|  | #endif | 
|  |  | 
|  | // Some operations are endian specific, choose the correct opcode based on GOARCH. | 
|  | // Note, _VCZBEBB is only available on power9 and newer. | 
// _LDBEX, _LWBEX and _LHBEX are the 8-, 4- and 2-byte loads used by the scalar
// short-string paths (cmp8, cmp4, cmp2): the byte-reversing forms on ppc64le,
// the plain loads on ppc64. Those paths count leading zeros on the compare
// result, so the first byte in memory has to end up most significant.
// _VCZBEBB is the vector count used at vfound: the trailing-zero form on
// ppc64le and the leading-zero form on ppc64.
|  | #ifdef GOARCH_ppc64le | 
|  | #define _LDBEX	MOVDBR | 
|  | #define _LWBEX	MOVWBR | 
|  | #define _LHBEX	MOVHBR | 
|  | #define _VCZBEBB VCTZLSBB | 
|  | #else | 
|  | #define _LDBEX	MOVD | 
|  | #define _LWBEX	MOVW | 
|  | #define _LHBEX	MOVH | 
|  | #define _VCZBEBB VCLZLSBB | 
|  | #endif | 
|  |  | 
// indexbytebody is the shared search routine behind IndexByte and
// IndexByteString. It returns the index of the first byte equal to the byte
// in R5, or -1 when no byte matches.
//
// Strategy by length:
//   >= 64    loop64 scans 64 bytes per iteration, then handles a 0 - 63 byte tail
//   32 - 63  cmp32
//   16 - 31  cmp16
//   <  16    cmp8 and below (one LXVLL on power10, scalar loads otherwise)
// The vector paths finish by re-reading the last 16 bytes of the string, so
// the final load may overlap bytes that were already checked.
//
// Registers written: R3-R6, R8-R12, R31, V1, V2, V6 (plus R16 and V0 when
// GOPPC64_power9 is not set), and condition fields CR0, CR1 and CR6.
|  | // R3 = addr of string | 
|  | // R4 = len of string | 
|  | // R5 = byte to find | 
|  | // On exit: | 
|  | // R3 = return value | 
|  | TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 | 
// CR0 = len vs 32. It is consumed by BLT cmp16 below and, for lengths
// 32 - 63, again by the BEQ inside cmp32.
|  | CMPU	R4,$32 | 
|  |  | 
|  | #ifndef GOPPC64_power9 | 
|  | // Load VBPERMQ constant to reduce compare into an ordered bit mask. | 
|  | MOVD	$indexbytevbperm<>+00(SB),R16 | 
|  | LXVD2X	(R16),V0	// Set up swap string | 
|  | #endif | 
|  |  | 
|  | MTVRD	R5,V1 | 
|  | VSPLTB	$7,V1,V1	// Replicate byte across V1 | 
|  |  | 
|  | BLT	cmp16		// Jump to the small string case if it's <32 bytes. | 
|  |  | 
// R8 is the scan pointer. R11, R12 and R6 hold the byte offsets 16, 32 and 48
// that loop64 uses as index registers; cmp32 only needs R11.
|  | CMP	R4,$64,CR1 | 
|  | MOVD	$16,R11 | 
|  | MOVD	R3,R8 | 
|  | BLT	CR1,cmp32	// Special case for length 32 - 63 | 
|  | MOVD	$32,R12 | 
|  | MOVD	$48,R6 | 
|  |  | 
// R9 becomes the loop end address. From here on R4 holds only the tail length.
|  | RLDICR  $0,R4,$63-6,R9	// R9 = len &^ 63 | 
|  | ADD	R3,R9,R9	// R9 = &s[len &^ 63] | 
|  | ANDCC	$63,R4		// (len &= 63) cmp 0. | 
|  |  | 
|  | PCALIGN	$16 | 
|  | loop64: | 
|  | LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0] | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat0	// Match found at R8, jump out | 
|  |  | 
|  | LXVD2X	(R11)(R8),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out | 
|  |  | 
|  | LXVD2X	(R12)(R8),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out | 
|  |  | 
|  | LXVD2X	(R6)(R8),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out | 
|  |  | 
// The loop uses CR1 and CR6 only, so CR0 still holds the tail test from ANDCC.
|  | ADD	$64,R8 | 
|  | CMPU	R8,R9,CR1 | 
|  | BNE	CR1,loop64	// R8 != &s[len &^ 63]? | 
|  |  | 
|  | PCALIGN	$32 | 
|  | BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64. | 
|  |  | 
// CR0 = tail vs 32, which is also what cmp32 expects on entry. CR1 = tail vs 16.
|  | CMP	R4,$32		// Tail length >= 32, use cmp32 path. | 
|  | CMP	R4,$16,CR1 | 
|  | BGE	cmp32 | 
|  |  | 
// R9 = &s[len(s)-16], the last 16 bytes of the whole string.
|  | ADD	R8,R4,R9 | 
|  | ADD	$-16,R9 | 
|  | BLE	CR1,cmp64_tail_gt0 | 
|  |  | 
|  | cmp64_tail_gt16:	// Tail length 17 - 31 | 
|  | LXVD2X	(R0)(R8),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat0 | 
|  |  | 
|  | cmp64_tail_gt0:	// Tail length 1 - 16 | 
// Check the final 16 bytes. Unless the tail is exactly 16 this overlaps bytes
// that were already compared. R8 is repointed so foundat0 uses the right base.
|  | MOVD	R9,R8 | 
|  | LXVD2X	(R0)(R9),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat0 | 
|  |  | 
|  | BR	notfound | 
|  |  | 
|  | cmp32:	// Length 32 - 63 | 
// On entry: R8 = first byte to scan, R4 = number of bytes to scan (32 - 63),
// R11 = 16, CR0 = R4 vs 32. Reached from the top for strings of 32 - 63 bytes
// and from the loop64 tail for a remainder of 32 - 63 bytes.
|  |  | 
|  | // Bytes 0 - 15 | 
|  | LXVD2X	(R0)(R8),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat0 | 
|  |  | 
|  | // Bytes 16 - 31 | 
|  | LXVD2X	(R8)(R11),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat1		// Match found at R8+16 bytes, jump out | 
|  |  | 
|  | BEQ	notfound		// Is length <= 32? (CR0 holds this comparison on entry to cmp32) | 
// CR0 = length vs 48. It is used by the ISEL and by BLE notfound further down.
|  | CMP	R4,$48 | 
|  |  | 
|  | ADD	R4,R8,R9		// Compute &s[len(s)-16] | 
|  | ADD	$32,R8,R8 | 
|  | ADD	$-16,R9,R9 | 
|  | ISEL	CR0GT,R8,R9,R8		// R8 = len(s) <= 48 ? R9 : R8 | 
|  |  | 
|  | // Bytes 32 - 47 | 
// When the length is <= 48, R8 was replaced by R9 above, so this load is
// already the last 16 bytes.
|  | LXVD2X	(R0)(R8),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat0		// Match found in the 16 bytes at R8, jump out | 
|  |  | 
// Length <= 48: the load above covered the end of the data.
|  | BLE	notfound | 
|  |  | 
|  | // Bytes 48 - 62 | 
|  | MOVD	R9,R8			// R9 holds the final check. | 
|  | LXVD2X	(R0)(R9),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat0		// Match found in the last 16 bytes, jump out | 
|  |  | 
|  | BR	notfound | 
|  |  | 
|  | // If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW. | 
|  | #ifndef GOPPC64_power9 | 
|  | #define ADJUST_FOR_CNTLZW -16 | 
|  | #else | 
|  | #define ADJUST_FOR_CNTLZW 0 | 
|  | #endif | 
|  |  | 
|  | // Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used | 
|  | // to determine the offset into the 16B vector, it will overcount by 16. Account for it here. | 
// Each foundatN computes R3 = (R8 - R3) + 16*N (+ the CNTLZW adjustment), i.e.
// the index of the first byte of the matching 16-byte chunk. vfound then adds
// the position of the match inside that chunk. V6 holds the compare result.
|  | foundat3: | 
|  | SUB	R3,R8,R3 | 
|  | ADD	$48+ADJUST_FOR_CNTLZW,R3 | 
|  | BR	vfound | 
|  | foundat2: | 
|  | SUB	R3,R8,R3 | 
|  | ADD	$32+ADJUST_FOR_CNTLZW,R3 | 
|  | BR	vfound | 
|  | foundat1: | 
|  | SUB	R3,R8,R3 | 
|  | ADD	$16+ADJUST_FOR_CNTLZW,R3 | 
|  | BR	vfound | 
|  | foundat0: | 
|  | SUB	R3,R8,R3 | 
|  | ADD	$0+ADJUST_FOR_CNTLZW,R3 | 
|  | vfound: | 
|  | // Map equal values into a 16 bit value with earlier matches setting higher bits. | 
|  | #ifndef GOPPC64_power9 | 
|  | VBPERMQ	V6,V0,V6 | 
|  | MFVRD	V6,R4 | 
|  | CNTLZW	R4,R4 | 
|  | #else | 
|  | #ifdef GOARCH_ppc64le | 
|  | // Put the value back into LE ordering by swapping doublewords. | 
|  | XXPERMDI	V6,V6,$2,V6 | 
|  | #endif | 
|  | _VCZBEBB	V6,R4 | 
|  | #endif | 
|  | ADD	R3,R4,R3 | 
|  | RET | 
|  |  | 
|  | cmp16:	// Length 16 - 31 | 
// Entered for every length below 32. CR0 = len vs 16 selects cmp8 for shorter
// strings and is reused by BEQ notfound below. R9 = &s[len(s)] here; the
// scalar paths under cmp8 load relative to it.
|  | CMPU	R4,$16 | 
|  | ADD	R4,R3,R9 | 
|  | BLT	cmp8 | 
|  |  | 
|  | ADD	$-16,R9,R9		// &s[len(s)-16] | 
|  |  | 
|  | // Bytes 0 - 15 | 
|  | LXVD2X	(R0)(R3),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | MOVD	R3,R8 | 
|  | BNE	CR6,foundat0		// Match found in bytes 0 - 15, jump out | 
|  |  | 
// Length == 16: nothing is left to check.
|  | BEQ	notfound | 
|  |  | 
|  | // Bytes 16 - 30 | 
// Read as the last 16 bytes of the string, so it overlaps the first load.
|  | MOVD	R9,R8			// R9 holds the final check. | 
|  | LXVD2X	(R0)(R9),V2 | 
|  | VCMPEQUBCC	V2,V1,V6 | 
|  | BNE	CR6,foundat0		// Match found in the last 16 bytes, jump out | 
|  |  | 
|  | BR	notfound | 
|  |  | 
|  |  | 
|  | cmp8:	// Length 8 - 15 | 
// Every length below 16 arrives here. The power10 build handles all of them
// with one length-controlled load. Otherwise this label handles 8 - 15 and
// shorter lengths continue through cmp4, cmp2 and cmp1.
|  | #ifdef GOPPC64_power10 | 
|  | // Load all the bytes into a single VSR in BE order. | 
// NOTE(review): LXVLL presumably reads its byte count from the top byte of the
// length register, hence the shift by 56 - confirm against the ISA.
|  | SLD	$56,R4,R5 | 
|  | LXVLL	R3,R5,V2 | 
|  | // Compare and count the number which don't match. | 
|  | VCMPEQUB	V2,V1,V6 | 
|  | VCLZLSBB	V6,R3 | 
|  | // If count is the number of bytes, or more. No matches are found. | 
|  | CMPU	R3,R4 | 
|  | MOVD	$-1,R5 | 
|  | // Otherwise, the count is the index of the first match. | 
|  | ISEL	CR0LT,R3,R5,R3 | 
|  | RET | 
|  | #else | 
|  | RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register. | 
|  | RLDIMI	$16,R5,$32,R5 | 
|  | RLDIMI	$32,R5,$0,R5 | 
|  | CMPU	R4,$8 | 
|  | BLT	cmp4 | 
// Check the first 8 and the last 8 bytes (the two loads overlap when the
// length is below 16). R4 becomes len-8, the index of the first byte of the
// second load.
|  | MOVD	$-8,R11 | 
|  | ADD	$-8,R4,R4 | 
|  |  | 
|  | _LDBEX	(R0)(R3),R10 | 
|  | _LDBEX	(R11)(R9),R11 | 
// CMPB marks matching bytes. CR0/CR1 record whether the first/second load had
// any match, and leading-zero count / 8 is the index of the first match.
|  | CMPB	R10,R5,R10 | 
|  | CMPB	R11,R5,R11 | 
|  | CMPU	R10,$0 | 
|  | CMPU	R11,$0,CR1 | 
|  | CNTLZD	R10,R10 | 
|  | CNTLZD	R11,R11 | 
|  | SRD	$3,R10,R3 | 
|  | SRD	$3,R11,R11 | 
|  | BNE	found | 
|  |  | 
// No match in the first load: return (len-8)+index from the second load, or -1
// if that load had no match either.
|  | ADD	R4,R11,R4 | 
|  | MOVD	$-1,R3 | 
|  | ISEL	CR1EQ,R3,R4,R3 | 
|  | RET | 
|  |  | 
|  | cmp4:	// Length 4 - 7 | 
// Same scheme as above with two 4-byte loads. CNTLZW yields 32 when a load has
// no match, which is what the compares against 32 test.
|  | CMPU	R4,$4 | 
|  | BLT	cmp2 | 
|  | MOVD	$-4,R11 | 
|  | ADD	$-4,R4,R4 | 
|  |  | 
|  | _LWBEX	(R0)(R3),R10 | 
|  | _LWBEX	(R11)(R9),R11 | 
|  | CMPB	R10,R5,R10 | 
|  | CMPB	R11,R5,R11 | 
|  | CNTLZW	R10,R10 | 
|  | CNTLZW	R11,R11 | 
|  | CMPU	R10,$32 | 
|  | CMPU	R11,$32,CR1 | 
|  | SRD	$3,R10,R3 | 
|  | SRD	$3,R11,R11 | 
|  | BNE	found | 
|  |  | 
|  | ADD	R4,R11,R4 | 
|  | MOVD	$-1,R3 | 
|  | ISEL	CR1EQ,R3,R4,R3 | 
|  | RET | 
|  |  | 
|  | cmp2:	// Length 2 - 3 | 
// Check the first two bytes. SLDCC moves their compare result to the top of
// the register, discarding the upper 48 bits, and sets CR0 for BNE found.
|  | CMPU	R4,$2 | 
|  | BLT	cmp1 | 
|  |  | 
|  | _LHBEX	(R0)(R3),R10 | 
|  | CMPB	R10,R5,R10 | 
|  | SLDCC	$48,R10,R10 | 
|  | CNTLZD	R10,R10 | 
|  | SRD	$3,R10,R3 | 
|  | BNE	found | 
|  |  | 
|  | cmp1:	// Length 1 | 
// Also reached for length 0, and from cmp2 for lengths 2 and 3 when the first
// two bytes did not match. Only an odd length leaves a final byte to test;
// for an even length -1 is returned.
|  | MOVD	$-1,R3 | 
|  | ANDCC	$1,R4,R31 | 
|  | BEQ	found | 
|  |  | 
// Test the last byte, s[len(s)-1], and return its index or -1.
|  | MOVBZ	-1(R9),R10 | 
|  | CMPB	R10,R5,R10 | 
|  | ANDCC	$1,R10 | 
|  | ADD	$-1,R4 | 
|  | ISEL	CR0EQ,R3,R4,R3 | 
|  |  | 
|  | found: | 
|  | RET | 
|  | #endif | 
|  |  | 
|  | notfound: | 
|  | MOVD $-1,R3 | 
|  | RET | 
|  |  |