|  | // Copyright 2018 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | #include "textflag.h" | 
|  |  | 
|  | TEXT ·IndexByte(SB),NOSPLIT,$0-40 | 
|  | MOVD	b_base+0(FP), R0 | 
|  | MOVD	b_len+8(FP), R2 | 
|  | MOVBU	c+24(FP), R1 | 
|  | MOVD	$ret+32(FP), R8 | 
|  | B	indexbytebody<>(SB) | 
|  |  | 
|  | TEXT ·IndexByteString(SB),NOSPLIT,$0-32 | 
|  | MOVD	s_base+0(FP), R0 | 
|  | MOVD	s_len+8(FP), R2 | 
|  | MOVBU	c+16(FP), R1 | 
|  | MOVD	$ret+24(FP), R8 | 
|  | B	indexbytebody<>(SB) | 
|  |  | 
|  | // input: | 
|  | //   R0: data | 
|  | //   R1: byte to search | 
|  | //   R2: data len | 
|  | //   R8: address to put result | 
|  | TEXT indexbytebody<>(SB),NOSPLIT,$0 | 
|  | // Core algorithm: | 
|  | // For each 32-byte chunk we calculate a 64-bit syndrome value, | 
|  | // with two bits per byte. For each tuple, bit 0 is set if the | 
|  | // relevant byte matched the requested character and bit 1 is | 
|  | // not used (faster than using a 32bit syndrome). Since the bits | 
|  | // in the syndrome reflect exactly the order in which things occur | 
|  | // in the original string, counting trailing zeros allows to | 
|  | // identify exactly which byte has matched. | 
|  |  | 
|  | CBZ	R2, fail | 
|  | MOVD	R0, R11 | 
|  | // Magic constant 0x40100401 allows us to identify | 
|  | // which lane matches the requested byte. | 
|  | // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24)) | 
|  | // Different bytes have different bit masks (i.e: 1, 4, 16, 64) | 
|  | MOVD	$0x40100401, R5 | 
|  | VMOV	R1, V0.B16 | 
|  | // Work with aligned 32-byte chunks | 
|  | BIC	$0x1f, R0, R3 | 
|  | VMOV	R5, V5.S4 | 
|  | ANDS	$0x1f, R0, R9 | 
|  | AND	$0x1f, R2, R10 | 
|  | BEQ	loop | 
|  |  | 
|  | // Input string is not 32-byte aligned. We calculate the | 
|  | // syndrome value for the aligned 32 bytes block containing | 
|  | // the first bytes and mask off the irrelevant part. | 
|  | VLD1.P	(R3), [V1.B16, V2.B16] | 
|  | SUB	$0x20, R9, R4 | 
|  | ADDS	R4, R2, R2 | 
|  | VCMEQ	V0.B16, V1.B16, V3.B16 | 
|  | VCMEQ	V0.B16, V2.B16, V4.B16 | 
|  | VAND	V5.B16, V3.B16, V3.B16 | 
|  | VAND	V5.B16, V4.B16, V4.B16 | 
|  | VADDP	V4.B16, V3.B16, V6.B16 // 256->128 | 
|  | VADDP	V6.B16, V6.B16, V6.B16 // 128->64 | 
|  | VMOV	V6.D[0], R6 | 
|  | // Clear the irrelevant lower bits | 
|  | LSL	$1, R9, R4 | 
|  | LSR	R4, R6, R6 | 
|  | LSL	R4, R6, R6 | 
|  | // The first block can also be the last | 
|  | BLS	masklast | 
|  | // Have we found something already? | 
|  | CBNZ	R6, tail | 
|  |  | 
|  | loop: | 
|  | VLD1.P	(R3), [V1.B16, V2.B16] | 
|  | SUBS	$0x20, R2, R2 | 
|  | VCMEQ	V0.B16, V1.B16, V3.B16 | 
|  | VCMEQ	V0.B16, V2.B16, V4.B16 | 
|  | // If we're out of data we finish regardless of the result | 
|  | BLS	end | 
|  | // Use a fast check for the termination condition | 
|  | VORR	V4.B16, V3.B16, V6.B16 | 
|  | VADDP	V6.D2, V6.D2, V6.D2 | 
|  | VMOV	V6.D[0], R6 | 
|  | // We're not out of data, loop if we haven't found the character | 
|  | CBZ	R6, loop | 
|  |  | 
|  | end: | 
|  | // Termination condition found, let's calculate the syndrome value | 
|  | VAND	V5.B16, V3.B16, V3.B16 | 
|  | VAND	V5.B16, V4.B16, V4.B16 | 
|  | VADDP	V4.B16, V3.B16, V6.B16 | 
|  | VADDP	V6.B16, V6.B16, V6.B16 | 
|  | VMOV	V6.D[0], R6 | 
|  | // Only do the clear for the last possible block with less than 32 bytes | 
|  | // Condition flags come from SUBS in the loop | 
|  | BHS	tail | 
|  |  | 
|  | masklast: | 
|  | // Clear the irrelevant upper bits | 
|  | ADD	R9, R10, R4 | 
|  | AND	$0x1f, R4, R4 | 
|  | SUB	$0x20, R4, R4 | 
|  | NEG	R4<<1, R4 | 
|  | LSL	R4, R6, R6 | 
|  | LSR	R4, R6, R6 | 
|  |  | 
|  | tail: | 
|  | // Check that we have found a character | 
|  | CBZ	R6, fail | 
|  | // Count the trailing zeros using bit reversing | 
|  | RBIT	R6, R6 | 
|  | // Compensate the last post-increment | 
|  | SUB	$0x20, R3, R3 | 
|  | // And count the leading zeros | 
|  | CLZ	R6, R6 | 
|  | // R6 is twice the offset into the fragment | 
|  | ADD	R6>>1, R3, R0 | 
|  | // Compute the offset result | 
|  | SUB	R11, R0, R0 | 
|  | MOVD	R0, (R8) | 
|  | RET | 
|  |  | 
|  | fail: | 
|  | MOVD	$-1, R0 | 
|  | MOVD	R0, (R8) | 
|  | RET |