| // Copyright 2018 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| #include "go_asm.h" |
| #include "asm_amd64.h" |
| #include "textflag.h" |
| |
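| // Count returns the number of bytes in b that are equal to c. |
| // A minimal Go sketch of the semantics (the generic fallback |
| // ·countGeneric is essentially this loop): |
| // |
| //	func Count(b []byte, c byte) int { |
| //		n := 0 |
| //		for _, x := range b { |
| //			if x == c { |
| //				n++ |
| //			} |
| //		} |
| //		return n |
| //	} |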
| TEXT ·Count(SB),NOSPLIT,$0-40 |
| #ifndef hasPOPCNT |
| CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 |
| JEQ 2(PC) |
| JMP ·countGeneric(SB) |
| #endif |
| MOVQ b_base+0(FP), SI |
| MOVQ b_len+8(FP), BX |
| MOVB c+24(FP), AL |
| LEAQ ret+32(FP), R8 |
| JMP countbody<>(SB) |
| |
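| // CountString returns the number of bytes in s that are equal to |
| // c; it is the string counterpart of Count: |
| // |
| //	func CountString(s string, c byte) int |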
| TEXT ·CountString(SB),NOSPLIT,$0-32 |
| #ifndef hasPOPCNT |
| CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 |
| JEQ 2(PC) |
| JMP ·countGenericString(SB) |
| #endif |
| MOVQ s_base+0(FP), SI |
| MOVQ s_len+8(FP), BX |
| MOVB c+16(FP), AL |
| LEAQ ret+24(FP), R8 |
| JMP countbody<>(SB) |
| |
| // input: |
| // SI: data |
| // BX: data length |
| // AL: the byte to count |
| // R8: address at which to store the result |
| // This function requires the POPCNT instruction. |
| TEXT countbody<>(SB),NOSPLIT,$0 |
| // Broadcast the byte we're looking for into |
| // every byte of X0. |
| MOVD AX, X0 |
| PUNPCKLBW X0, X0 |
| PUNPCKLBW X0, X0 |
| PSHUFL $0, X0, X0 |
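| // For example, with AL = 0xcc: |
| //   MOVD:      X0 = 00 ... 00 cc |
| //   PUNPCKLBW: X0 = 00 ... cc cc |
| //   PUNPCKLBW: X0 = 00 ... cc cc cc cc |
| //   PSHUFL:    X0 = cc cc ... cc cc (all 16 bytes) |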
| |
| // Handle inputs shorter than 16 bytes on the small path. |
| CMPQ BX, $16 |
| JLT small |
| |
| MOVQ $0, R12 // Accumulator |
| |
| MOVQ SI, DI // DI = current scan position |
| |
| // Use the AVX2 path for inputs of at least 64 bytes; it falls |
| // back to the SSE path if AVX2 is unavailable. |
| CMPQ BX, $64 |
| JAE avx2 |
| sse: |
| LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes |
| JMP sseloopentry |
| |
| PCALIGN $16 |
| sseloop: |
| // Move the next 16-byte chunk of the data into X1. |
| MOVOU (DI), X1 |
| // Compare bytes in X0 to X1. |
| PCMPEQB X0, X1 |
| // Take the top bit of each byte in X1 and put the result in DX. |
| PMOVMSKB X1, DX |
| // Count the number of matching bytes. |
| POPCNTL DX, DX |
| // Accumulate the count in R12. |
| ADDQ DX, R12 |
| // Advance to the next block. |
| ADDQ $16, DI |
| sseloopentry: |
| CMPQ DI, AX |
| JBE sseloop |
| |
| // Get the number of bytes to consider in the last 16 bytes. |
| ANDQ $15, BX |
| JZ end |
| |
| // Create a mask to ignore the overlap between the previous |
| // 16-byte block and the last one. |
| MOVQ $16, CX |
| SUBQ BX, CX |
| MOVQ $0xFFFF, R10 |
| SARQ CL, R10 |
| SALQ CL, R10 |
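| // For example, with BX = 3 tail bytes, CL = 13: |
| //   0xFFFF >> 13 = 0x0007 |
| //   0x0007 << 13 = 0xE000 |
| // so only the match bits of the last three bytes survive. |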
| |
| // Process the last 16-byte chunk. This chunk may overlap with the |
| // chunks we've already searched so we need to mask part of it. |
| MOVOU (AX), X1 |
| PCMPEQB X0, X1 |
| PMOVMSKB X1, DX |
| // Apply mask |
| ANDQ R10, DX |
| POPCNTL DX, DX |
| ADDQ DX, R12 |
| end: |
| MOVQ R12, (R8) |
| RET |
| |
| // Handle lengths < 16. |
| small: |
| TESTQ BX, BX |
| JEQ endzero // zero-length input: the count is 0 |
| |
| // Check if we'll load across a page boundary. |
| LEAQ 16(SI), AX |
| TESTW $0xff0, AX |
| JEQ endofpage |
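| // (SI+16) & 0xff0 == 0 means SI lies within the last 16 bytes |
| // of a 4096-byte page, so a 16-byte load from SI could fault on |
| // the following page. The endofpage path instead loads the 16 |
| // bytes ending at SI+BX, which stay within the current page. |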
| |
| // We must ignore the high bytes, as they aren't part of our slice. |
| // Create the mask. |
| MOVB BX, CX |
| MOVQ $1, R10 |
| SALQ CL, R10 |
| SUBQ $1, R10 |
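| // For example, with BX = 5: R10 = (1 << 5) - 1 = 0x001F, |
| // keeping only the match bits of the five bytes in the slice. |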
| |
| // Load data |
| MOVOU (SI), X1 |
| // Compare target byte with each byte in data. |
| PCMPEQB X0, X1 |
| // Move result bits to integer register. |
| PMOVMSKB X1, DX |
| // Apply mask |
| ANDQ R10, DX |
| POPCNTL DX, DX |
| // Return DX directly; there is no need to accumulate |
| // since we have < 16 bytes. |
| MOVQ DX, (R8) |
| RET |
| endzero: |
| MOVQ $0, (R8) |
| RET |
| |
| endofpage: |
| // We must ignore the low bytes, as they aren't part of our slice. |
| MOVQ $16, CX |
| SUBQ BX, CX |
| MOVQ $0xFFFF, R10 |
| SARQ CL, R10 |
| SALQ CL, R10 |
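| // As with the tail mask above, only the high BX match bits |
| // survive; here the low bits correspond to bytes before the |
| // start of the slice rather than bytes already counted. |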
| |
| // Load data into the high end of X1. |
| MOVOU -16(SI)(BX*1), X1 |
| // Compare target byte with each byte in data. |
| PCMPEQB X0, X1 |
| // Move result bits to integer register. |
| PMOVMSKB X1, DX |
| // Apply mask |
| ANDQ R10, DX |
| // Return DX directly; there is no need to accumulate |
| // since we have < 16 bytes. |
| POPCNTL DX, DX |
| MOVQ DX, (R8) |
| RET |
| |
| avx2: |
| #ifndef hasAVX2 |
| CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 |
| JNE sse |
| #endif |
| MOVD AX, X0 // AL holds the byte to count |
| LEAQ -64(SI)(BX*1), R11 // R11 = address of the last 64 bytes |
| LEAQ (SI)(BX*1), R13 // R13 = one past the end of the data |
| VPBROADCASTB X0, Y1 // broadcast the byte to all 32 bytes of Y1 |
| PCALIGN $32 |
| avx2_loop: |
| // Compare two 32-byte chunks against the broadcast byte in Y1. |
| VMOVDQU (DI), Y2 |
| VMOVDQU 32(DI), Y4 |
| VPCMPEQB Y1, Y2, Y3 |
| VPCMPEQB Y1, Y4, Y5 |
| // Extract one match bit per byte and count them. |
| VPMOVMSKB Y3, DX |
| VPMOVMSKB Y5, CX |
| POPCNTL DX, DX |
| POPCNTL CX, CX |
| // Accumulate into R12 and advance to the next 64-byte block. |
| ADDQ DX, R12 |
| ADDQ CX, R12 |
| ADDQ $64, DI |
| CMPQ DI, R11 |
| JLE avx2_loop |
| |
| // If the last block has already been processed, |
| // skip to the end. |
| // |
| // This check is NOT an optimization: if the input length is a |
| // multiple of 64, we must not go through the final leg of the |
| // function, because the shift count passed to SALQ below would |
| // be 64, which is outside the 0-63 range the instruction |
| // supports. |
| // |
| // Tests in the bytes and strings packages with input lengths that |
| // are multiples of 64 would break if this check were removed. |
| CMPQ DI, R13 |
| JEQ endavx |
| |
| // Load the address of the last 64 bytes. |
| // This block overlaps the ones already processed. |
| MOVQ R11, DI |
| VMOVDQU (DI), Y2 |
| VMOVDQU 32(DI), Y4 |
| VPCMPEQB Y1, Y2, Y3 |
| VPCMPEQB Y1, Y4, Y5 |
| VPMOVMSKB Y3, DX |
| VPMOVMSKB Y5, CX |
| // Exit AVX mode. |
| VZEROUPPER |
| SALQ $32, CX |
| ORQ CX, DX |
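| // DX now holds one match bit for each of the 64 bytes of the |
| // final chunk. |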
| |
| // Create a mask to ignore the overlap between the previous |
| // 64-byte block and the last one. |
| ANDQ $63, BX |
| MOVQ $64, CX |
| SUBQ BX, CX |
| MOVQ $0xFFFFFFFFFFFFFFFF, R10 |
| SALQ CL, R10 |
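| // For example, with 70 input bytes: BX & 63 = 6, CL = 58, and |
| // the mask keeps only the top six bits, i.e. the six bytes the |
| // main loop has not yet counted. |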
| // Apply mask |
| ANDQ R10, DX |
| POPCNTQ DX, DX |
| ADDQ DX, R12 |
| MOVQ R12, (R8) |
| RET |
| endavx: |
| // Exit AVX mode. |
| VZEROUPPER |
| MOVQ R12, (R8) |
| RET |