|  | // Copyright 2018 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | // +build ppc64 ppc64le | 
|  |  | 
|  | #include "go_asm.h" | 
|  | #include "textflag.h" | 
|  |  | 
// func IndexByte(b []byte, c byte) int
// Thin trampoline: load the arguments into the registers expected by
// the shared kernel, then tail-call it. The kernel stores the result
// (index or -1) through R14.
TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
MOVD	$ret+32(FP), R14	// R14 = address of the result slot
MOVBZ	c+24(FP), R5		// R5 = byte to search for (zero-extended)
MOVD	b_len+8(FP), R4		// R4 = number of bytes to scan
MOVD	b_base+0(FP), R3	// R3 = start of the byte slice
BR	indexbytebody<>(SB)	// tail call; shared body writes *R14 and returns
|  |  | 
// func IndexByteString(s string, c byte) int
// Thin trampoline: marshal the string arguments into the kernel's
// register convention and tail-call the shared body, which stores the
// result (index or -1) through R14.
TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
MOVD	$ret+24(FP), R14  // R14 = address of the result slot
MOVBZ	c+16(FP), R5	  // R5 = byte to search for (zero-extended)
MOVD	s_len+8(FP), R4	  // R4 = string length
MOVD	s_base+0(FP), R3  // R3 = string data pointer
BR	indexbytebody<>(SB)  // tail call; shared body writes *R14 and returns
|  |  | 
// Forwarding stub for the bytes package (linknamed): identical
// contract to ·IndexByte above. Loads args and tail-calls the shared
// kernel, which stores the result through R14.
TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
MOVD	$ret+32(FP), R14	// R14 = address of the result slot
MOVBZ	c+24(FP), R5		// R5 = byte to search for (zero-extended)
MOVD	b_len+8(FP), R4		// R4 = number of bytes to scan
MOVD	b_base+0(FP), R3	// R3 = start of the byte slice
BR	indexbytebody<>(SB)	// tail call; shared body writes *R14 and returns
|  |  | 
// Forwarding stub for the strings package (linknamed): identical
// contract to ·IndexByteString above. Loads args and tail-calls the
// shared kernel, which stores the result through R14.
TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
MOVD	$ret+24(FP), R14  // R14 = address of the result slot
MOVBZ	c+16(FP), R5	  // R5 = byte to search for (zero-extended)
MOVD	s_len+8(FP), R4	  // R4 = string length
MOVD	s_base+0(FP), R3  // R3 = string data pointer
BR	indexbytebody<>(SB)  // tail call; shared body writes *R14 and returns
|  |  | 
// indexbytebody is the shared search kernel for the four IndexByte
// entry points above.
//
// Inputs:
//	R3  = pointer to the byte sequence
//	R4  = length in bytes
//	R5  = byte to look for
//	R14 = address where the result (index found, or -1) is stored
//
// Strategy: lengths < 32 take the scalar small_string path, scanning
// one doubleword at a time with CMPB. Longer inputs align up to a
// quadword (then 64-byte) boundary and use VMX compares, processing
// 64 bytes per main-loop iteration.
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
DCBT	(R3)		// Prepare cache line.
MOVD	R3,R17		// Save base address for calculating the index later.
RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
ADD	R4,R3,R7	// Last acceptable address in R7.

RLDIMI	$16,R5,$32,R5	// Continue replicating: target byte now in 4 lanes of R5.
CMPU	R4,$32		// Check if it's a small string (<32 bytes). Those will be processed differently.
MOVD	$-1,R9		// All-ones; becomes the mask for bytes before s_base.
WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28): (R3&7)*8, the bit offset of base within its doubleword.
RLDIMI	$32,R5,$0,R5	// R5 now holds the target byte in all 8 byte lanes.
MOVD	R7,R10		// Save last acceptable address in R10 for later.
ADD	$-1,R7,R7	// R7 = address of the last valid byte (inclusive).
#ifdef GOARCH_ppc64le
SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
SRD	R6,R9,R9	// Same for Big Endian
#endif
BLE	small_string	// Jump to the small string case if it's <32 bytes.

// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
// in V0, V1 and V10, then branch to the preloop.
ANDCC	$63,R3,R11
BEQ	CR0,qw_align
RLDICL	$0,R3,$61,R11	// R11 = R3 & 7: bytes of doubleword misalignment.

MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
CMPB	R12,R5,R3	// Check for a match (0xFF in each matching byte lane).
AND	R9,R3,R3	// Mask bytes below s_base
RLDICL	$0,R7,$61,R6	// R6 = offset of the last valid byte within its doubleword.
RLDICR	$0,R7,$60,R7	// Last doubleword in R7
CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
BNE	CR7,done
ADD	$8,R8,R8
ADD	$-8,R4,R4
ADD	R4,R11,R4	// Only 8-R11 bytes were consumed; add the misalignment back.

// Check for quadword alignment
ANDCC	$15,R8,R11
BEQ	CR0,qw_align

// Not aligned, so handle the next doubleword
MOVD	0(R8),R12
CMPB	R12,R5,R3
CMPU	R3,$0,CR7
BNE	CR7,done
ADD	$8,R8,R8
ADD	$-8,R4,R4

// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:

// Set up auxiliary data for the vectorized algorithm.
VSPLTISB  $0,V0		// Replicate 0 across V0
VSPLTISB  $3,V10	// Use V10 as control for VBPERMQ
MTVRD	  R5,V1		// Move the replicated byte into a vector register.
LVSL	  (R0+R0),V11	// V11 = identity permute vector {0,1,...,15}.
VSLB	  V11,V10,V10	// Build the per-byte bit-select control for VBPERMQ.
VSPLTB	  $7,V1,V1	// Replicate byte across V1
CMPU	  R4, $64	// If len <= 64, don't use the vectorized loop
BLE	  tail

// We will load 4 quadwords per iteration in the loop, so check for
// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
ANDCC	  $63,R8,R11
BEQ	  CR0,preloop

// Not 64-byte aligned. Load one quadword at a time until aligned
// (at most three iterations, unrolled below).
LVX	    (R8+R0),V4
VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
BNE	    CR6,found_qw_align
ADD	    $16,R8,R8
ADD	    $-16,R4,R4

ANDCC	    $63,R8,R11
BEQ	    CR0,preloop
LVX	    (R8+R0),V4
VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
BNE	    CR6,found_qw_align
ADD	    $16,R8,R8
ADD	    $-16,R4,R4

ANDCC	    $63,R8,R11
BEQ	    CR0,preloop
LVX	    (R8+R0),V4
VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
BNE	    CR6,found_qw_align
ADD	    $-16,R4,R4
ADD	    $16,R8,R8

// 64-byte aligned. Prepare for the main loop.
preloop:
CMPU	R4,$64
BLE	tail	      // If len <= 64, don't use the vectorized loop

// We are now aligned to a 64-byte boundary. We will load 4 quadwords
// per loop iteration. The last doubleword is in R10, so our loop counter
// starts at (R10-R8)/64.
SUB	R8,R10,R6     // R6 = bytes between current address and the end.
SRD	$6,R6,R9      // Loop counter in R9
MOVD	R9,CTR

MOVD	$16,R11      // Load offsets for the vector loads
MOVD	$32,R9
MOVD	$48,R7

// Main loop we will load 64 bytes per iteration
loop:
LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
LVX	    (R11+R8),V3
LVX	    (R9+R8),V4
LVX	    (R7+R8),V5
VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
VCMPEQUB    V1,V3,V7
VCMPEQUB    V1,V4,V8
VCMPEQUB    V1,V5,V9
VOR	    V6,V7,V11	      // Compress the result in a single vector
VOR	    V8,V9,V12
VOR	    V11,V12,V11
VCMPEQUBCC  V0,V11,V11	      // Check for byte (any nonzero lane means a match somewhere)
BGE	    CR6,found
ADD	    $64,R8,R8
BC	    16,0,loop	      // bdnz loop

// Handle the trailing bytes or R4 <= 64
RLDICL	$0,R6,$58,R4	// R4 = R6 & 63: bytes remaining after the 64-byte loop.
tail:
// Up to four more quadword compares (unrolled) cover the <=64-byte tail.
CMPU	    R4,$0
BEQ	    notfound
LVX	    (R8+R0),V4
VCMPEQUBCC  V1,V4,V6
BNE	    CR6,found_qw_align
ADD	    $16,R8,R8
CMPU	    R4,$16,CR6
BLE	    CR6,notfound
ADD	    $-16,R4,R4

LVX	    (R8+R0),V4
VCMPEQUBCC  V1,V4,V6
BNE	    CR6,found_qw_align
ADD	    $16,R8,R8
CMPU	    R4,$16,CR6
BLE	    CR6,notfound
ADD	    $-16,R4,R4

LVX	    (R8+R0),V4
VCMPEQUBCC  V1,V4,V6
BNE	    CR6,found_qw_align
ADD	    $16,R8,R8
CMPU	    R4,$16,CR6
BLE	    CR6,notfound
ADD	    $-16,R4,R4

LVX	    (R8+R0),V4
VCMPEQUBCC  V1,V4,V6
BNE	    CR6,found_qw_align

notfound:
MOVD	$-1,R3		// Byte not present: store index -1.
MOVD	R3,(R14)
RET

found:
// We will now compress the results into a single doubleword,
// so it can be moved to a GPR for the final index calculation.

// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
// first bit of each byte into bits 48-63.
VBPERMQ	  V6,V10,V6
VBPERMQ	  V7,V10,V7
VBPERMQ	  V8,V10,V8
VBPERMQ	  V9,V10,V9

// Shift each 16-bit component into its correct position for
// merging into a single doubleword.
#ifdef GOARCH_ppc64le
VSLDOI	  $2,V7,V7,V7
VSLDOI	  $4,V8,V8,V8
VSLDOI	  $6,V9,V9,V9
#else
VSLDOI	  $6,V6,V6,V6
VSLDOI	  $4,V7,V7,V7
VSLDOI	  $2,V8,V8,V8
#endif

// Merge V6-V9 into a single doubleword and move to a GPR.
VOR	V6,V7,V11
VOR	V8,V9,V4
VOR	V4,V11,V4
MFVRD	V4,R3

#ifdef GOARCH_ppc64le
ADD	  $-1,R3,R11	// Isolate the lowest set bit...
ANDN	  R3,R11,R11
POPCNTD	  R11,R11	// Count trailing zeros (Little Endian).
#else
CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
ADD	R8,R11,R3	// Calculate byte address

return:
SUB	R17,R3		// Subtract the saved base address to get the index.
MOVD	R3,(R14)
RET

found_qw_align:
// Use the same algorithm as above. Compress the result into
// a single doubleword and move it to a GPR for the final
// calculation. With one quadword, each VBPERMQ output bit
// maps directly to a byte offset within the quadword.
VBPERMQ	  V6,V10,V6

#ifdef GOARCH_ppc64le
MFVRD	  V6,R3
ADD	  $-1,R3,R11	// Isolate the lowest set bit...
ANDN	  R3,R11,R11
POPCNTD	  R11,R11	// ...and count trailing zeros to get the offset.
#else
VSLDOI	  $6,V6,V6,V6
MFVRD	  V6,R3
CNTLZD	  R3,R11	// Count leading zeros (Big Endian).
#endif
ADD	  R8,R11,R3	// R3 = address of the matching byte.
CMPU	  R11,R4	// Make sure the match lies within the valid length...
BLT	  return
BR	  notfound	// ...otherwise it was in padding past the end.

done:
// At this point, R3 has 0xFF in the same position as the byte we are
// looking for in the doubleword. Use that to calculate the exact index
// of the byte.
#ifdef GOARCH_ppc64le
ADD	$-1,R3,R11	// Isolate the lowest set bit...
ANDN	R3,R11,R11
POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
CMPU	R8,R7		// Check if we are at the last doubleword.
SRD	$3,R11		// Convert the zero count (bits) to a byte offset.
ADD	R11,R8,R3	// R3 = address of the matching byte.
CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
BNE	return		// Not the last doubleword: the match is always valid.
BLE	CR7,return	// Last doubleword: valid only if offset <= last valid byte.
BR	notfound	// Match fell past the end of the data.

small_string:
// Scalar path for lengths < 32: compare one doubleword per step.
// We unroll this loop for better performance.
CMPU	R4,$0		// Check for length=0
BEQ	notfound

MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
CMPB	R12,R5,R3	// Check for a match.
AND	R9,R3,R3	// Mask bytes below s_base.
CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
RLDICL	$0,R7,$61,R6	// R6 = offset of the last valid byte within its doubleword.
RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
CMPU	R8,R7
BNE	CR7,done
BEQ	notfound	// Hit length.

MOVDU	8(R8),R12	// Load the next doubleword (R8 is updated).
CMPB	R12,R5,R3
CMPU	R3,$0,CR6
CMPU	R8,R7
BNE	CR6,done
BEQ	notfound

MOVDU	8(R8),R12
CMPB	R12,R5,R3
CMPU	R3,$0,CR6
CMPU	R8,R7
BNE	CR6,done
BEQ	notfound

MOVDU	8(R8),R12
CMPB	R12,R5,R3
CMPU	R3,$0,CR6
CMPU	R8,R7
BNE	CR6,done
BEQ	notfound

MOVDU	8(R8),R12	// Final doubleword of a <32-byte string.
CMPB	R12,R5,R3
CMPU	R3,$0,CR6
BNE	CR6,done
BR	notfound
|  |  |