| // Copyright 2018 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| #include "go_asm.h" |
| #include "textflag.h" |
| |
// ·IndexByte is the entry point for searching a byte slice.
//
// Frame layout used below (40 bytes of arguments + result, no locals):
//   b_base+0(FP)  pointer to the slice data
//   b_len+8(FP)   slice length
//   c+24(FP)      byte to search for
//   ret+32(FP)    result slot
//
// It only marshals the arguments into the registers indexbytebody<> expects
// (SI, BX, AL, R8) and then jumps there; indexbytebody<> writes the result
// through R8 and returns directly to our caller.
TEXT ·IndexByte(SB), NOSPLIT, $0-40
	LEAQ ret+32(FP), R8	// R8 = address of the result slot
	MOVB c+24(FP), AL	// AL = byte sought
	MOVQ b_len+8(FP), BX	// BX = data length
	MOVQ b_base+0(FP), SI	// SI = start of data
	JMP indexbytebody<>(SB)
| |
// ·IndexByteString is the entry point for searching a string.
//
// Frame layout used below (32 bytes of arguments + result, no locals):
//   s_base+0(FP)  pointer to the string data
//   s_len+8(FP)   string length
//   c+16(FP)      byte to search for
//   ret+24(FP)    result slot
//
// Like ·IndexByte, it just fills SI, BX, AL and R8 and jumps to the shared
// indexbytebody<> routine, which stores the result through R8 and returns
// directly to our caller.
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
	LEAQ ret+24(FP), R8	// R8 = address of the result slot
	MOVB c+16(FP), AL	// AL = byte sought
	MOVQ s_len+8(FP), BX	// BX = data length
	MOVQ s_base+0(FP), SI	// SI = start of data
	JMP indexbytebody<>(SB)
| |
// indexbytebody<> searches the BX bytes starting at SI for the byte in AL.
// It stores the index of the first match, or -1 if there is no match, at the
// address held in R8, and then returns.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// scratch registers written: AX, CX, DX, DI, R11, X0, X1, Y1, Y2, Y3
//
// Strategy by length:
//   len < 16   one 16-byte load (small / endofpage)
//   len >= 16  16-byte SSE chunks (sse); also used for len > 32 when the
//              AVX2 feature byte checked below is not set
//   len > 32   32-byte AVX2 chunks (avx2)
TEXT indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	// Only the low byte of AX (AL) survives the two byte-unpacks and the
	// dword broadcast, so the upper bits of AX do not matter.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	// DI is the cursor (address of the chunk being examined) for both
	// the SSE and the AVX2 loops.
	MOVQ SI, DI

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
	JMP sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU (DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	// BSFL sets ZF when its source is zero, so JNZ is taken only on a match.
	BSFL DX, DX
	JNZ ssesuccess
	// Advance to next block.
	ADDQ $16, DI
sseloopentry:
	// Keep looping while the cursor is strictly before the last chunk.
	CMPQ DI, AX
	JB sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ AX, DI
	MOVOU (AX), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, DX
	BSFL DX, DX
	JNZ ssesuccess

	// Fall through: no match in any chunk.
failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI // Compute offset of chunk within data.
	ADDQ DX, DI // Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// Handle lengths < 16.
small:
	// Empty input never matches.
	TESTQ BX, BX
	JEQ failure

	// Check if we'll load across a page boundary.
	// The test is zero exactly when bits 4-11 of SI+16 are all clear,
	// i.e. when SI lies in the last 16 bytes of a 4096-byte page.
	LEAQ 16(SI), AX
	TESTW $0xff0, AX
	JEQ endofpage

	// A full 16-byte load from SI reads past the end of the data (len < 16),
	// so a hit at an index >= len must be rejected below.
	MOVOU (SI), X1 // Load data
	PCMPEQB X0, X1 // Compare target byte with each byte in data.
	PMOVMSKB X1, DX // Move result bits to integer register.
	BSFL DX, DX // Find first set bit.
	JZ failure // No set bit, failure.
	CMPL DX, BX
	JAE failure // Match is past end of data.
	MOVQ DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that END at the end of the data instead, so the
	// load does not extend past the data. Data byte i then sits at mask
	// bit i+(16-len); shifting left by len and right by 16 moves it to
	// bit i and discards the bits for the bytes that precede the data.
	MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
	PCMPEQB X0, X1 // Compare target byte with each byte in data.
	PMOVMSKB X1, DX // Move result bits to integer register.
	MOVL BX, CX
	SHLL CX, DX
	SHRL $16, DX // Shift desired bits down to bottom of register.
	BSFL DX, DX // Find first set bit.
	JZ failure // No set bit, failure.
	MOVQ DX, (R8)
	RET

// Lengths > 32: use 32-byte AVX2 chunks if the CPU feature byte is set,
// otherwise go back to the SSE loop.
avx2:
	CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE sse
	// AL still holds the sought byte here: AX is only overwritten on the
	// sse path, which has not run. VPBROADCASTB replicates the low byte
	// of X0 into every byte of Y1.
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11 // R11 = address of last 32 bytes
	VPBROADCASTB X0, Y1
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	// VPTEST of Y3 with itself clears ZF iff any compare byte is non-zero.
	VPTEST Y3, Y3
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	// NOTE(review): this is a signed comparison of two addresses, while the
	// SSE loop uses the unsigned JB for the same test - confirm intended.
	JLT avx2_loop
	// Search the last 32-byte chunk; it may overlap chunks already searched.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	// No match. Clear the upper halves of the Y registers before returning.
	VZEROUPPER
	MOVQ $-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the compare result.
avx2success:
	VPMOVMSKB Y3, DX // One mask bit per byte of the chunk.
	BSFL DX, DX // DX = index of first match within the chunk.
	SUBQ SI, DI // DI = offset of chunk within data.
	ADDQ DI, DX // DX = index of match within data.
	MOVQ DX, (R8)
	VZEROUPPER
	RET