// Copyright 2018 The Go Authors. All rights reserved. | |

// Use of this source code is governed by a BSD-style | |

// license that can be found in the LICENSE file. | |

#include "go_asm.h" | |

#include "textflag.h" | |

TEXT ·Count(SB),NOSPLIT,$0-40 | |

CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 | |

JEQ 2(PC) | |

JMP ·countGeneric(SB) | |

MOVQ b_base+0(FP), SI | |

MOVQ b_len+8(FP), BX | |

MOVB c+24(FP), AL | |

LEAQ ret+32(FP), R8 | |

JMP countbody<>(SB) | |

TEXT ·CountString(SB),NOSPLIT,$0-32 | |

CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 | |

JEQ 2(PC) | |

JMP ·countGenericString(SB) | |

MOVQ s_base+0(FP), SI | |

MOVQ s_len+8(FP), BX | |

MOVB c+16(FP), AL | |

LEAQ ret+24(FP), R8 | |

JMP countbody<>(SB) | |

// input: | |

// SI: data | |

// BX: data len | |

// AL: byte sought | |

// R8: address to put result | |

// This function requires the POPCNT instruction. | |

TEXT countbody<>(SB),NOSPLIT,$0 | |

// Shuffle X0 around so that each byte contains | |

// the character we're looking for. | |

MOVD AX, X0 | |

PUNPCKLBW X0, X0 | |

PUNPCKLBW X0, X0 | |

PSHUFL $0, X0, X0 | |

CMPQ BX, $16 | |

JLT small | |

MOVQ $0, R12 // Accumulator | |

MOVQ SI, DI | |

CMPQ BX, $32 | |

JA avx2 | |

sse: | |

LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes | |

JMP sseloopentry | |

sseloop: | |

// Move the next 16-byte chunk of the data into X1. | |

MOVOU (DI), X1 | |

// Compare bytes in X0 to X1. | |

PCMPEQB X0, X1 | |

// Take the top bit of each byte in X1 and put the result in DX. | |

PMOVMSKB X1, DX | |

// Count number of matching bytes | |

POPCNTL DX, DX | |

// Accumulate into R12 | |

ADDQ DX, R12 | |

// Advance to next block. | |

ADDQ $16, DI | |

sseloopentry: | |

CMPQ DI, AX | |

JBE sseloop | |

// Get the number of bytes to consider in the last 16 bytes | |

ANDQ $15, BX | |

JZ end | |

// Create mask to ignore overlap between previous 16 byte block | |

// and the next. | |

MOVQ $16,CX | |

SUBQ BX, CX | |

MOVQ $0xFFFF, R10 | |

SARQ CL, R10 | |

SALQ CL, R10 | |

// Process the last 16-byte chunk. This chunk may overlap with the | |

// chunks we've already searched so we need to mask part of it. | |

MOVOU (AX), X1 | |

PCMPEQB X0, X1 | |

PMOVMSKB X1, DX | |

// Apply mask | |

ANDQ R10, DX | |

POPCNTL DX, DX | |

ADDQ DX, R12 | |

end: | |

MOVQ R12, (R8) | |

RET | |

// handle for lengths < 16 | |

small: | |

TESTQ BX, BX | |

JEQ endzero | |

// Check if we'll load across a page boundary. | |

LEAQ 16(SI), AX | |

TESTW $0xff0, AX | |

JEQ endofpage | |

// We must ignore high bytes as they aren't part of our slice. | |

// Create mask. | |

MOVB BX, CX | |

MOVQ $1, R10 | |

SALQ CL, R10 | |

SUBQ $1, R10 | |

// Load data | |

MOVOU (SI), X1 | |

// Compare target byte with each byte in data. | |

PCMPEQB X0, X1 | |

// Move result bits to integer register. | |

PMOVMSKB X1, DX | |

// Apply mask | |

ANDQ R10, DX | |

POPCNTL DX, DX | |

// Directly return DX, we don't need to accumulate | |

// since we have <16 bytes. | |

MOVQ DX, (R8) | |

RET | |

endzero: | |

MOVQ $0, (R8) | |

RET | |

endofpage: | |

// We must ignore low bytes as they aren't part of our slice. | |

MOVQ $16,CX | |

SUBQ BX, CX | |

MOVQ $0xFFFF, R10 | |

SARQ CL, R10 | |

SALQ CL, R10 | |

// Load data into the high end of X1. | |

MOVOU -16(SI)(BX*1), X1 | |

// Compare target byte with each byte in data. | |

PCMPEQB X0, X1 | |

// Move result bits to integer register. | |

PMOVMSKB X1, DX | |

// Apply mask | |

ANDQ R10, DX | |

// Directly return DX, we don't need to accumulate | |

// since we have <16 bytes. | |

POPCNTL DX, DX | |

MOVQ DX, (R8) | |

RET | |

avx2: | |

CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 | |

JNE sse | |

MOVD AX, X0 | |

LEAQ -32(SI)(BX*1), R11 | |

VPBROADCASTB X0, Y1 | |

avx2_loop: | |

VMOVDQU (DI), Y2 | |

VPCMPEQB Y1, Y2, Y3 | |

VPMOVMSKB Y3, DX | |

POPCNTL DX, DX | |

ADDQ DX, R12 | |

ADDQ $32, DI | |

CMPQ DI, R11 | |

JLE avx2_loop | |

// If last block is already processed, | |

// skip to the end. | |

CMPQ DI, R11 | |

JEQ endavx | |

// Load address of the last 32 bytes. | |

// There is an overlap with the previous block. | |

MOVQ R11, DI | |

VMOVDQU (DI), Y2 | |

VPCMPEQB Y1, Y2, Y3 | |

VPMOVMSKB Y3, DX | |

// Exit AVX mode. | |

VZEROUPPER | |

// Create mask to ignore overlap between previous 32 byte block | |

// and the next. | |

ANDQ $31, BX | |

MOVQ $32,CX | |

SUBQ BX, CX | |

MOVQ $0xFFFFFFFF, R10 | |

SARQ CL, R10 | |

SALQ CL, R10 | |

// Apply mask | |

ANDQ R10, DX | |

POPCNTL DX, DX | |

ADDQ DX, R12 | |

MOVQ R12, (R8) | |

RET | |

endavx: | |

// Exit AVX mode. | |

VZEROUPPER | |

MOVQ R12, (R8) | |

RET |