// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	// R6 = byte to find (the slice cap occupies R5)
	MOVD R6, R5 // R5 = byte
	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	BR indexbytebody<>(SB)

TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string
	// R4 = length
	// R5 = byte
	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	BR indexbytebody<>(SB)

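// For reference, indexbytebody below is a hand-vectorized equivalent of
// (roughly) this Go loop, scanning 8-64 bytes per step instead of one:
//
//	func indexByte(s []byte, c byte) int {
//		for i, b := range s {
//			if b == c {
//				return i
//			}
//		}
//		return -1
//	}
//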
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// R16 = 1 if running on a POWER9 system, 0 otherwise
// On exit:
// R3 = return value
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD R3,R17 // Save base address for calculating the index later.
	RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
	RLDIMI $8,R5,$48,R5 // Replicate the byte across R5 (with the two RLDIMIs below).
	ADD R4,R3,R7 // R7 = end address (one past the last byte).

	RLDIMI $16,R5,$32,R5
	CMPU R4,$32 // Check if it's a small string (<32 bytes); those are processed differently.
	MOVD $-1,R9
	RLWNM $3,R3,$26,$28,R6 // shift amount for mask (r3&0x7)*8
	RLDIMI $32,R5,$0,R5
	MOVD R7,R10 // Save the end address in R10 for later.
	ADD $-1,R7,R7 // R7 = address of the last byte.
#ifdef GOARCH_ppc64le
	SLD R6,R9,R9 // Prepare mask for Little Endian
#else
	SRD R6,R9,R9 // Same for Big Endian
#endif
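	// R5 now holds the target byte replicated into all 8 lanes (e.g. 0x2f
	// becomes 0x2f2f2f2f2f2f2f2f), and R9 masks off the bytes of the first
	// aligned doubleword that precede the start of the string, so CMPB can
	// safely test 8 bytes per comparison.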
	BLT small_string // Jump to the small string case if it's <32 bytes.
	CMP R16,$1 // Select the POWER9 path (R16=1) or the POWER8 path below.
	BNE power8
	VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
	MTVRD R5,V1
	LVSL (R0+R0),V11 // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
	VSLB V11,V10,V10 // to extract the first bit of each match-result byte into a GPR
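	// (LVSL with a zero offset loads {0,1,...,15}; shifting each element
	// left by 3 yields {0x00,0x08,...,0x78}, so VBPERMQ selects bit 8*i of
	// the 0x00/0xFF compare result: one bit per byte, 16 bits in total.)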
	VSPLTB $7,V1,V1 // Replicate byte across V1
	CMP R4,$64
	MOVD $16,R11
	MOVD R3,R8
	BLT cmp32
	MOVD $32,R12
	MOVD $48,R6

loop64:
	LXVB16X (R0)(R8),V2 // scan 64 bytes at a time
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // match found at R8, jump out

	LXVB16X (R8)(R11),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat1 // match found at R8+16 bytes, jump out

	LXVB16X (R8)(R12),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat2 // match found at R8+32 bytes, jump out

	LXVB16X (R8)(R6),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat3 // match found at R8+48 bytes, jump out
	ADD $64,R8
	ADD $-64,R4
	CMP R4,$64 // >=64 bytes left to scan?
	BGE loop64
	CMP R4,$32
	BLT rem // jump to rem if there are < 32 bytes left
cmp32:
	LXVB16X (R0)(R8),V2 // 32-63 bytes left
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // match found at R8

	LXVB16X (R11)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat1 // match found at R8+16

	ADD $32,R8
	ADD $-32,R4
rem:
	RLDICR $0,R8,$60,R8 // align address to reuse code for tail end processing
	BR small_string

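// The foundat labels below fall through: each adds 16 to R8, so that by
// foundat0 R8 points at the 16-byte chunk containing the match recorded
// in V6.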
foundat3:
	ADD $16,R8
foundat2:
	ADD $16,R8
foundat1:
	ADD $16,R8
foundat0:
	// Compress the result into a single doubleword and
	// move it to a GPR for the final calculation.
	VBPERMQ V6,V10,V6
	MFVRD V6,R3
	// Count the leading zeros up to the match; the 16-bit match mask ends
	// up in the low 16 bits of R3 in both endian modes, so subtracting 16
	// gives the byte index within the chunk.
	CNTLZW R3,R11
	ADD $-16,R11
	ADD R8,R11,R3 // Calculate byte address
	SUB R17,R3
	RET
power8:
	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC $63,R3,R11
	BEQ CR0,qw_align
	RLDICL $0,R3,$61,R11

	MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
	CMPB R12,R5,R3 // Check for a match.
	AND R9,R3,R3 // Mask bytes below s_base
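	// (CMPB sets each byte of R3 to 0xFF where the corresponding bytes of
	// R12 and R5 are equal and to 0x00 elsewhere; the AND above discards
	// apparent matches that fall before the actual start of the string.)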
	RLDICR $0,R7,$60,R7 // Last doubleword in R7
	CMPU R3,$0,CR7 // If we have a match, jump to the final computation
	BNE CR7,done
	ADD $8,R8,R8
	ADD $-8,R4,R4
	ADD R4,R11,R4

	// Check for quadword alignment
	ANDCC $15,R8,R11
	BEQ CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD 0(R8),R12
	CMPB R12,R5,R3
	CMPU R3,$0,CR7
	BNE CR7,done
	ADD $8,R8,R8
	ADD $-8,R4,R4

	// Either quadword or 64-byte aligned at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB $0,V0 // Replicate 0 across V0
	VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
	MTVRD R5,V1
	LVSL (R0+R0),V11 // V11 = {0,1,...,15}, as in the POWER9 setup
	VSLB V11,V10,V10 // V10 = VBPERMQ control (see above)
	VSPLTB $7,V1,V1 // Replicate byte across V1
	CMPU R4, $64 // If len ≤ 64, don't use the vectorized loop
	BLE tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC $63,R8,R11
	BEQ CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	LVX (R8+R0),V4
	VCMPEQUBCC V1,V4,V6 // Check for byte in V4
	BNE CR6,found_qw_align
	ADD $16,R8,R8
	ADD $-16,R4,R4

	ANDCC $63,R8,R11
	BEQ CR0,preloop
	LVX (R8+R0),V4
	VCMPEQUBCC V1,V4,V6 // Check for byte in V4
	BNE CR6,found_qw_align
	ADD $16,R8,R8
	ADD $-16,R4,R4

	ANDCC $63,R8,R11
	BEQ CR0,preloop
	LVX (R8+R0),V4
	VCMPEQUBCC V1,V4,V6 // Check for byte in V4
	BNE CR6,found_qw_align
	ADD $-16,R4,R4
	ADD $16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU R4,$64
	BLE tail // If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The end address is in R10, so the loop counter
	// starts at (R10-R8)/64.
	SUB R8,R10,R6
	SRD $6,R6,R9 // Loop counter in R9
	MOVD R9,CTR
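	// For example (illustrative only): with 200 bytes between R8 and R10,
	// CTR = 200>>6 = 3 iterations scan 192 bytes, and the remaining
	// 200&63 = 8 bytes are handled by the tail code.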

	ADD $-64,R8,R8 // Adjust index for loop entry
	MOVD $16,R11 // Load offsets for the vector loads
	MOVD $32,R9
	MOVD $48,R7

	// Main loop: load 64 bytes per iteration.
loop:
	ADD $64,R8,R8 // Fuse addi+lvx for performance
	LVX (R8+R0),V2 // Load 4 16-byte vectors
	LVX (R8+R11),V3
	VCMPEQUB V1,V2,V6 // Look for byte in each vector
	VCMPEQUB V1,V3,V7

	LVX (R8+R9),V4
	LVX (R8+R7),V5
	VCMPEQUB V1,V4,V8
	VCMPEQUB V1,V5,V9

	VOR V6,V7,V11 // Compress the result in a single vector
	VOR V8,V9,V12
	VOR V11,V12,V13
	VCMPEQUBCC V0,V13,V14 // Compare against zero: any nonzero byte means a match
	BGE CR6,found
	BC 16,0,loop // bdnz: decrement CTR and loop while it is nonzero

	// Handle the trailing bytes, or the case R4 ≤ 64
	RLDICL $0,R6,$58,R4 // Remaining length = R6 & 63
	ADD $64,R8,R8
tail:
	CMPU R4,$0
	BEQ notfound
	LVX (R8+R0),V4
	VCMPEQUBCC V1,V4,V6
	BNE CR6,found_qw_align
	ADD $16,R8,R8
	CMPU R4,$16,CR6
	BLE CR6,notfound
	ADD $-16,R4,R4

	LVX (R8+R0),V4
	VCMPEQUBCC V1,V4,V6
	BNE CR6,found_qw_align
	ADD $16,R8,R8
	CMPU R4,$16,CR6
	BLE CR6,notfound
	ADD $-16,R4,R4

	LVX (R8+R0),V4
	VCMPEQUBCC V1,V4,V6
	BNE CR6,found_qw_align
	ADD $16,R8,R8
	CMPU R4,$16,CR6
	BLE CR6,notfound
	ADD $-16,R4,R4

	LVX (R8+R0),V4
	VCMPEQUBCC V1,V4,V6
	BNE CR6,found_qw_align
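	// (LVX performs a 16-byte-aligned load and so never crosses a page
	// boundary; reading a little past the end of the string is therefore
	// safe, and found_qw_align rejects matches beyond the remaining length.)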

notfound:
	MOVD $-1, R3
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ V6,V10,V6
	VBPERMQ V7,V10,V7
	VBPERMQ V8,V10,V8
	VBPERMQ V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI $2,V7,V7,V7
	VSLDOI $4,V8,V8,V8
	VSLDOI $6,V9,V9,V9
#else
	VSLDOI $6,V6,V6,V6
	VSLDOI $4,V7,V7,V7
	VSLDOI $2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR V6,V7,V11
	VOR V8,V9,V4
	VOR V4,V11,V4
	MFVRD V4,R3

#ifdef GOARCH_ppc64le
	ADD $-1,R3,R11
	ANDN R3,R11,R11
	POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
	CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
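	// ((R3-1) &^ R3 sets exactly the bits below the lowest set bit, so its
	// popcount equals the number of trailing zeros; e.g. R3 = 0b1000 gives
	// 0b0111 and a count of 3.)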
	ADD R8,R11,R3 // Calculate byte address

return:
	SUB R17, R3
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD V6,R3
	ADD $-1,R3,R11
	ANDN R3,R11,R11
	POPCNTD R11,R11
#else
	VSLDOI $6,V6,V6,V6
	MFVRD V6,R3
	CNTLZD R3,R11
#endif
	ADD R8,R11,R3
	CMPU R11,R4 // Is the match within the remaining length?
	BLT return
	BR notfound
	PCALIGN $16

done:
	ADD $-1,R10,R6
	// Offset of the last valid index within the final
	// doubleword, for the bounds check below.
	RLDICL $0,R6,$61,R6
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD $-1,R3,R11
	ANDN R3,R11,R11
	POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
	CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
	CMPU R8,R7 // Check if we are at the last doubleword.
	SRD $3,R11 // Convert the bit count to a byte index.
	ADD R11,R8,R3
	CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
	BNE return
	BLE CR7,return
	BR notfound

small_string:
	// Process a string of fewer than 32 bytes, one doubleword at a time.
	// The compare sequence is unrolled for better performance.
	CMPU R4,$0 // Check for length=0
	BEQ notfound

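	// Each block below compares one doubleword (8 bytes) at a time with
	// CMPB, checking both for a match and for having reached the last
	// doubleword in R7; MOVDU advances R8 by 8 before each later load.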
	MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
	CMPB R12,R5,R3 // Check for a match.
	AND R9,R3,R3 // Mask bytes below s_base.
	CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
	RLDICR $0,R7,$60,R7 // Last doubleword in R7.
	CMPU R8,R7
	BNE CR7,done
	BEQ notfound // Hit length.

	MOVDU 8(R8),R12
	CMPB R12,R5,R3
	CMPU R3,$0,CR6
	CMPU R8,R7
	BNE CR6,done
	BEQ notfound

	MOVDU 8(R8),R12
	CMPB R12,R5,R3
	CMPU R3,$0,CR6
	CMPU R8,R7
	BNE CR6,done
	BEQ notfound

	MOVDU 8(R8),R12
	CMPB R12,R5,R3
	CMPU R3,$0,CR6
	CMPU R8,R7
	BNE CR6,done
	BEQ notfound

	MOVDU 8(R8),R12
	CMPB R12,R5,R3
	CMPU R3,$0,CR6
	BNE CR6,done
	BR notfound