| // Copyright 2018 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build ppc64le || ppc64 |
| |
| #include "go_asm.h" |
| #include "textflag.h" |
| |
| TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 |
| // Count entry point: splats the byte to count into V1, copies it into R5, then branches to countbytebody, which produces the result in R3. Inputs: R3 = byte array pointer |
| // R4 = length (number of bytes to examine) |
| // R6 = byte to count; copied into R5 below because countbytebody documents R5 as the register holding the byte |
| MTVRD R6, V1 // move compare byte from the GPR into V1 so it can be splatted |
| MOVD R6, R5 |
| VSPLTB $7, V1, V1 // replicate byte across V1 so one VCMPEQUB tests every byte of a vector against it |
| BR countbytebody<>(SB) |
| |
| TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32 |
| // CountString entry point: splats the byte to count into V1, then branches to countbytebody, which produces the result in R3. Inputs: R3 = byte array pointer |
| // R4 = length (number of bytes to examine) |
| // R5 = byte to count; already in the register countbytebody expects, so no copy is needed here |
| MTVRD R5, V1 // move compare byte from the GPR into V1 so it can be splatted |
| VSPLTB $7, V1, V1 // replicate byte across V1 so one VCMPEQUB tests every byte of a vector against it |
| BR countbytebody<>(SB) |
| |
| // countbytebody counts the bytes equal to a given value; Count and CountString both branch here after setting up V1. Inputs: R3: addr of string |
| // R4: len of string, in bytes |
| // R5: byte to count (on the non-POWER10 build it is replicated into every byte of R5 below) |
| // V1: byte to count, splatted. |
| // On exit: |
| // R3: return value (the match count). Registers written on some path: R3, R4, R14, R18, R20, R21, CTR, V0, V2, V4, V5, plus R5 and R12 (non-POWER10) or R6 (POWER10). |
| TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0 |
| MOVD $0, R18 // byte count accumulator; starts at 0 so inputs shorter than 32 bytes can go straight to the tail |
| |
| #ifndef GOPPC64_power10 |
| RLDIMI $8, R5, $48, R5 |
| RLDIMI $16, R5, $32, R5 |
| RLDIMI $32, R5, $0, R5 // fill reg with the byte to count: the three inserts widen the copy 8->16->32->64 bits for the CMPB-based scalar tail |
| #endif |
| |
| CMPU R4, $32 // Check if it's a small string (<32 bytes); unsigned compare because R4 is a length |
| BLT tail // Jump to the small string case. Otherwise set up the loop: CTR = R4/32 iterations, R21 = 16 (offset of the second vector), V4 and V5 = zeroed accumulators. |
| SRD $5, R4, R20 |
| MOVD R20, CTR |
| MOVD $16, R21 |
| XXLXOR V4, V4, V4 |
| XXLXOR V5, V5, V5 |
| |
| PCALIGN $16 |
| cmploop: |
| LXVD2X (R0)(R3), V0 // Count 32B per loop with two vector accumulators. V0 = bytes at R3+0, V2 (next load) = bytes at R3+16. |
| LXVD2X (R21)(R3), V2 |
| VCMPEQUB V2, V1, V2 |
| VCMPEQUB V0, V1, V0 |
| VPOPCNTD V2, V2 // A match is 0xFF or 0. Count the bits into doubleword buckets. |
| VPOPCNTD V0, V0 |
| VADDUDM V0, V4, V4 // Accumulate the popcounts. They are 8x the count. |
| VADDUDM V2, V5, V5 // The count will be fixed up afterwards. R3 then advances by 32 and BDNZ repeats while CTR is nonzero. |
| ADD $32, R3 |
| BDNZ cmploop |
| |
| VADDUDM V4, V5, V5 |
| MFVSRD V5, R18 |
| VSLDOI $8, V5, V5, V5 |
| MFVSRD V5, R21 |
| ADD R21, R18, R18 |
| ANDCC $31, R4, R4 |
| // Above: the two accumulators are summed, one doubleword is moved to R18, VSLDOI rotates the vector by 8 bytes so the other can be moved to R21, and both are added (R18 is still 8x the count); R4 becomes len mod 32. Skip the tail processing if no bytes remaining. |
| BEQ tail_0 |
| |
| #ifdef GOPPC64_power10 |
| SRD $3, R18, R18 // Fix the vector loop count before counting the tail on P10: this tail adds true (1x) counts and returns without passing through the 8x fixup at tail_0. |
| |
| tail: // Count the last 0 - 31 bytes. Here R3 = first uncounted byte, R4 = bytes left, R18 = true count so far. With more than 16 left, one full 16-byte vector is counted first. |
| CMP R4, $16 |
| BLE small_tail_p10 |
| LXV 0(R3), V0 |
| VCMPEQUB V0, V1, V0 |
| VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14. |
| SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it. It is then added to R18, R3 advances by 16 and R4 is reduced to R4 & 15. |
| ADD R14, R18, R18 |
| ADD $16, R3, R3 |
| ANDCC $15, R4, R4 |
| |
| small_tail_p10: |
| SLD $56, R4, R6 |
| LXVLL R3, R6, V0 |
| VCMPEQUB V0, V1, V0 |
| VCLRRB V0, R4, V0 // If <16B being compared, clear matches of the 16-R4 bytes. The load above was given the length R4 in the top byte of R6; presumably the bytes it did not load read as 0 and would otherwise match when counting byte 0. |
| VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14. |
| SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it. Adding it to R18 gives the final count, written directly to R3 as the return value. |
| ADD R14, R18, R3 |
| RET |
| |
| #else |
| tail: // Count the last 0 - 31 bytes. Scalar path: CMPB against R5 gives 0xFF per matching byte and POPCNTD turns that into 8 per match, so R18 stays 8x the count until tail_0. With 16 or more left, two doublewords are counted first. |
| CMP R4, $16 |
| BLT tail_8 |
| MOVD (R3), R12 |
| MOVD 8(R3), R14 |
| CMPB R12, R5, R12 |
| CMPB R14, R5, R14 |
| POPCNTD R12, R12 |
| POPCNTD R14, R14 |
| ADD R12, R18, R18 |
| ADD R14, R18, R18 |
| ADD $16, R3, R3 |
| ADD $-16, R4, R4 |
| |
| tail_8: // Count the remaining 0 - 15 bytes. With 8 or more left, one full doubleword is counted. |
| CMP R4, $8 |
| BLT tail_4 |
| MOVD (R3), R12 |
| CMPB R12, R5, R12 |
| POPCNTD R12, R12 |
| ADD R12, R18, R18 |
| ADD $8, R3, R3 |
| ADD $-8, R4, R4 |
| |
| tail_4: // Count the remaining 0 - 7 bytes. With 4 or more left, one zero-extended word is counted. |
| CMP R4, $4 |
| BLT tail_2 |
| MOVWZ (R3), R12 |
| CMPB R12, R5, R12 |
| SLD $32, R12, R12 // Remove non-participating matches: the 4 zero-extended upper bytes compare equal when the byte to count is 0, so shift them out before the popcount. |
| POPCNTD R12, R12 |
| ADD R12, R18, R18 |
| ADD $4, R3, R3 |
| ADD $-4, R4, R4 |
| |
| tail_2: // Count the remaining 0 - 3 bytes. With 2 or more left, one zero-extended halfword is counted. |
| CMP R4, $2 |
| BLT tail_1 |
| MOVHZ (R3), R12 |
| CMPB R12, R5, R12 |
| SLD $48, R12, R12 // Remove non-participating matches: the 6 zero-extended upper bytes compare equal when the byte to count is 0, so shift them out before the popcount. |
| POPCNTD R12, R12 |
| ADD R12, R18, R18 |
| ADD $2, R3, R3 |
| ADD $-2, R4, R4 |
| |
| tail_1: // Count the remaining 0 - 1 bytes. A match makes the low byte of the CMPB result 0xFF; masking with 0x8 keeps exactly 8 (one match at 8x) and discards the zero-extended upper bytes. |
| CMP R4, $1 |
| BLT tail_0 |
| MOVBZ (R3), R12 |
| CMPB R12, R5, R12 |
| ANDCC $0x8, R12, R12 |
| ADD R12, R18, R18 |
| #endif |
| |
| tail_0: // No remaining tail to count. Reached from the vector loop when len is a multiple of 32 (either build) and from the end of the scalar tail (non-POWER10 build). |
| SRD $3, R18, R3 // Fixup count, it is off by 8x. The corrected count is the return value in R3. |
| RET |