| // Copyright 2018 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build ppc64 || ppc64le |
| |
| #include "go_asm.h" |
| #include "textflag.h" |
| |
| TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 |
| // R3 = byte array pointer |
| // R4 = length |
| MOVD R6, R5 // R5 = byte |
| BR indexbytebody<>(SB) |
| |
| TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32 |
| // R3 = string |
| // R4 = length |
| // R5 = byte |
| BR indexbytebody<>(SB) |
| |
| #ifndef GOPPC64_power9 |
| #ifdef GOARCH_ppc64le |
| DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800 |
| DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840 |
| #else |
| DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038 |
| DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078 |
| #endif |
| GLOBL indexbytevbperm<>+0(SB), RODATA, $16 |
| #endif |
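// Each selector byte in the table above picks bit 0 (the MSB) of one
// input byte, so VBPERMQ packs the 16 per-byte results of VCMPEQUB
// into a contiguous 16-bit mask. The LE and BE layouts mirror each
// other to account for how LXVD2X loads doublewords on each byte order.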
| |
// Some operations are endian-specific; choose the correct opcode based on GOARCH.
// Note, _VCZBEBB is only available on power9 and newer.
| #ifdef GOARCH_ppc64le |
| #define _LDBEX MOVDBR |
| #define _LWBEX MOVWBR |
| #define _LHBEX MOVHBR |
| #define _VCZBEBB VCTZLSBB |
| #else |
| #define _LDBEX MOVD |
| #define _LWBEX MOVW |
| #define _LHBEX MOVH |
| #define _VCZBEBB VCLZLSBB |
| #endif |
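
// The _L*BEX loads fetch memory in big-endian order on both byte
// orders, so after CMPB the first matching byte in memory is the most
// significant result byte, and a leading-zero count divided by 8
// yields its index directly.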
| |
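// As a hedged, plain-Go sketch, the contract implemented below is
// (indexByte is an illustrative name, not one used by this package):
//
//	func indexByte(s []byte, c byte) int {
//		for i := range s {
//			if s[i] == c {
//				return i
//			}
//		}
//		return -1
//	}
//
// The assembly tiers the search by length: a 64-byte vector loop, then
// 32- and 16-byte vector tails, then scalar 8/4/2/1-byte checks.
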
| // R3 = addr of string |
| // R4 = len of string |
| // R5 = byte to find |
| // On exit: |
| // R3 = return value |
| TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 |
| CMPU R4,$32 |
| |
| #ifndef GOPPC64_power9 |
| // Load VBPERMQ constant to reduce compare into an ordered bit mask. |
| MOVD $indexbytevbperm<>+00(SB),R16 |
LXVD2X (R16),V0 // V0 = VBPERMQ bit-selection mask
| #endif |
| |
| MTVRD R5,V1 |
| VSPLTB $7,V1,V1 // Replicate byte across V1 |
| |
| BLT cmp16 // Jump to the small string case if it's <32 bytes. |
| |
| CMP R4,$64,CR1 |
| MOVD $16,R11 |
| MOVD R3,R8 |
| BLT CR1,cmp32 // Special case for length 32 - 63 |
| MOVD $32,R12 |
| MOVD $48,R6 |
| |
| RLDICR $0,R4,$63-6,R9 // R9 = len &^ 63 |
| ADD R3,R9,R9 // R9 = &s[len &^ 63] |
| ANDCC $63,R4 // (len &= 63) cmp 0. |
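// Each loop64 iteration checks four independent 16-byte chunks. CR0,
// set by the ANDCC above, is consulted by the BEQ after the loop.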
| |
| PCALIGN $16 |
| loop64: |
| LXVD2X (R0)(R8),V2 // Scan 64 bytes at a time, starting at &s[0] |
| VCMPEQUBCC V2,V1,V6 |
| BNE CR6,foundat0 // Match found at R8, jump out |
| |
| LXVD2X (R11)(R8),V2 |
| VCMPEQUBCC V2,V1,V6 |
| BNE CR6,foundat1 // Match found at R8+16 bytes, jump out |
| |
| LXVD2X (R12)(R8),V2 |
| VCMPEQUBCC V2,V1,V6 |
| BNE CR6,foundat2 // Match found at R8+32 bytes, jump out |
| |
| LXVD2X (R6)(R8),V2 |
| VCMPEQUBCC V2,V1,V6 |
| BNE CR6,foundat3 // Match found at R8+48 bytes, jump out |
| |
| ADD $64,R8 |
| CMPU R8,R9,CR1 |
| BNE CR1,loop64 // R8 != &s[len &^ 63]? |
| |
| PCALIGN $32 |
| BEQ notfound // Is tail length 0? CR0 is set before entering loop64. |
| |
| CMP R4,$32 // Tail length >= 32, use cmp32 path. |
| CMP R4,$16,CR1 |
| BGE cmp32 |
| |
ADD R8,R4,R9 // R9 = &s[len] (end of the tail)
ADD $-16,R9 // R9 = &s[len-16], the final 16-byte chunk
| BLE CR1,cmp64_tail_gt0 |
| |
cmp64_tail_gt16: // Tail length 17 - 31
| LXVD2X (R0)(R8),V2 |
| VCMPEQUBCC V2,V1,V6 |
| BNE CR6,foundat0 |
| |
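// This final load overlaps bytes that were already checked; that is
// safe because the overlapping bytes are known mismatches, so the
// first set bit in the compare mask still marks the first real match.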
| cmp64_tail_gt0: // Tail length 1 - 16 |
| MOVD R9,R8 |
| LXVD2X (R0)(R9),V2 |
| VCMPEQUBCC V2,V1,V6 |
| BNE CR6,foundat0 |
| |
| BR notfound |
| |
| cmp32: // Length 32 - 63 |
| |
| // Bytes 0 - 15 |
| LXVD2X (R0)(R8),V2 |
| VCMPEQUBCC V2,V1,V6 |
| BNE CR6,foundat0 |
| |
| // Bytes 16 - 31 |
| LXVD2X (R8)(R11),V2 |
| VCMPEQUBCC V2,V1,V6 |
| BNE CR6,foundat1 // Match found at R8+16 bytes, jump out |
| |
BEQ notfound // Is length exactly 32? (CR0 still holds the compare with 32)
| CMP R4,$48 |
| |
| ADD R4,R8,R9 // Compute &s[len(s)-16] |
| ADD $32,R8,R8 |
| ADD $-16,R9,R9 |
| ISEL CR0GT,R8,R9,R8 // R8 = len(s) <= 48 ? R9 : R8 |
| |
// Bytes 32 - 47 (or the final 16 bytes when len(s) <= 48)
| LXVD2X (R0)(R8),V2 |
| VCMPEQUBCC V2,V1,V6 |
BNE CR6,foundat0 // Match found at R8, jump out
| |
| BLE notfound |
| |
| // Bytes 48 - 63 |
| MOVD R9,R8 // R9 holds the final check. |
| LXVD2X (R0)(R9),V2 |
| VCMPEQUBCC V2,V1,V6 |
BNE CR6,foundat0 // Match found at R8, jump out
| |
| BR notfound |
| |
| // If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW. |
| #ifndef GOPPC64_power9 |
| #define ADJUST_FOR_CNTLZW -16 |
| #else |
| #define ADJUST_FOR_CNTLZW 0 |
| #endif |
| |
// Now compute the offset of the 16B vector in which the match was found. If CNTLZW is
// used to determine the offset into the 16B vector, it overcounts by 16; account for it here.
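// For example, on the power8 path a first match at byte 5 of a vector
// produces the 16-bit mask 0x0400; CNTLZW of the 32-bit register sees
// the 16 zero bits of the empty upper halfword plus 5 more, and
// 21 + ADJUST_FOR_CNTLZW recovers the in-vector offset 5.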
| foundat3: |
| SUB R3,R8,R3 |
| ADD $48+ADJUST_FOR_CNTLZW,R3 |
| BR vfound |
| foundat2: |
| SUB R3,R8,R3 |
| ADD $32+ADJUST_FOR_CNTLZW,R3 |
| BR vfound |
| foundat1: |
| SUB R3,R8,R3 |
| ADD $16+ADJUST_FOR_CNTLZW,R3 |
| BR vfound |
| foundat0: |
| SUB R3,R8,R3 |
| ADD $0+ADJUST_FOR_CNTLZW,R3 |
| vfound: |
// Map the compare result into a 16-bit mask with earlier matches setting higher bits.
| #ifndef GOPPC64_power9 |
| VBPERMQ V6,V0,V6 |
| MFVRD V6,R4 |
| CNTLZW R4,R4 |
| #else |
| #ifdef GOARCH_ppc64le |
| // Put the value back into LE ordering by swapping doublewords. |
| XXPERMDI V6,V6,$2,V6 |
| #endif |
| _VCZBEBB V6,R4 |
| #endif |
| ADD R3,R4,R3 |
| RET |
| |
| cmp16: // Length 16 - 31 |
| CMPU R4,$16 |
ADD R4,R3,R9 // R9 = &s[len]
| BLT cmp8 |
| |
| ADD $-16,R9,R9 // &s[len(s)-16] |
| |
| // Bytes 0 - 15 |
| LXVD2X (R0)(R3),V2 |
| VCMPEQUBCC V2,V1,V6 |
| MOVD R3,R8 |
BNE CR6,foundat0 // Match found at R8, jump out
| |
BEQ notfound // Is length exactly 16? (CR0 from the compare above)
| |
// Bytes 16 - 30 (an overlapping load of the final 16 bytes)
| MOVD R9,R8 // R9 holds the final check. |
| LXVD2X (R0)(R9),V2 |
| VCMPEQUBCC V2,V1,V6 |
BNE CR6,foundat0 // Match found at R8, jump out
| |
| BR notfound |
| |
| cmp8: // Length 8 - 15 |
| #ifdef GOPPC64_power10 |
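// LXVLL performs a length-limited, left-justified load (it does not
// read past s[len-1]) and zero-fills the rest of the vector; the
// CMPU/ISEL pair below rejects any apparent match in the zero padding.
// For example, with R4 = 8, s = "abcdefgh" and c = 'c', V6 gets 0xFF
// at byte 2, VCLZLSBB returns 2, and 2 < 8 selects index 2.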
// Load all the bytes into a single VSR, left-justified in BE order.
SLD $56,R4,R5 // LXVLL expects the length in the high-order byte of RB.
LXVLL R3,R5,V2
| // Compare and count the number which don't match. |
| VCMPEQUB V2,V1,V6 |
| VCLZLSBB V6,R3 |
// If the count is greater than or equal to the number of bytes, no match was found.
| CMPU R3,R4 |
| MOVD $-1,R5 |
| // Otherwise, the count is the index of the first match. |
| ISEL CR0LT,R3,R5,R3 |
| RET |
| #else |
RLDIMI $8,R5,$48,R5 // Replicate the byte across R5,
RLDIMI $16,R5,$32,R5 // one doubling at a time:
RLDIMI $32,R5,$0,R5 // b -> bb -> bbbb -> bbbbbbbb.
| CMPU R4,$8 |
| BLT cmp4 |
MOVD $-8,R11 // Byte offset of the final load from R9 = &s[len].
ADD $-8,R4,R4 // R4 = len-8, base index of the final load.
| |
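// The two overlapping big-endian loads below cover bytes [0,8) and
// [len-8,len); a match in the overlap is reported by the first load,
// preserving first-match semantics. As a hedged Go sketch of the
// CMPB + CNTLZD trick on one doubleword (firstMatch8 is illustrative,
// not a helper that exists):
//
//	import "math/bits"
//
//	// x holds 8 bytes loaded big-endian; splat holds the target
//	// byte replicated into every byte position.
//	func firstMatch8(x, splat uint64) int {
//		var eq uint64
//		for i := uint(0); i < 64; i += 8 { // CMPB: 0xFF per equal byte
//			if byte(x>>(56-i)) == byte(splat>>(56-i)) {
//				eq |= 0xFF << (56 - i)
//			}
//		}
//		if eq == 0 {
//			return -1 // no match in these 8 bytes
//		}
//		return bits.LeadingZeros64(eq) / 8 // CNTLZD, then divide by 8
//	}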
_LDBEX (R0)(R3),R10 // Load the first 8 bytes, big-endian.
_LDBEX (R11)(R9),R11 // Load the final 8 bytes, big-endian.
CMPB R10,R5,R10 // 0xFF in every byte which matches R5.
CMPB R11,R5,R11
CMPU R10,$0 // Any match in the first 8 bytes? (CR0)
CMPU R11,$0,CR1 // Any match in the final 8 bytes? (CR1)
CNTLZD R10,R10 // 8 * index of the first match, or 64.
CNTLZD R11,R11
SRD $3,R10,R3 // Convert bit count to byte index.
SRD $3,R11,R11
BNE found // Match in the first 8 bytes; R3 is set.
| |
ADD R4,R11,R4 // Index of a match in the final 8 bytes: (len-8) + count.
MOVD $-1,R3
ISEL CR1EQ,R3,R4,R3 // No match in the final 8 bytes either: return -1.
| RET |
| |
cmp4: // Length 4 - 7: the same overlapping-load trick with words.
| CMPU R4,$4 |
| BLT cmp2 |
MOVD $-4,R11 // Byte offset of the final load from R9.
ADD $-4,R4,R4 // R4 = len-4.
| |
| _LWBEX (R0)(R3),R10 |
| _LWBEX (R11)(R9),R11 |
| CMPB R10,R5,R10 |
| CMPB R11,R5,R11 |
| CNTLZW R10,R10 |
| CNTLZW R11,R11 |
CMPU R10,$32 // A count of 32 means no match in the first 4 bytes.
CMPU R11,$32,CR1 // Likewise for the final 4 bytes (CR1).
| SRD $3,R10,R3 |
| SRD $3,R11,R11 |
| BNE found |
| |
| ADD R4,R11,R4 |
| MOVD $-1,R3 |
| ISEL CR1EQ,R3,R4,R3 |
| RET |
| |
| cmp2: // Length 2 - 3 |
| CMPU R4,$2 |
| BLT cmp1 |
| |
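// CMPB compares all eight bytes of the registers, but only the two
// low-order bytes hold loaded data; SLDCC keeps just those two result
// bytes and sets CR0, so a zero mask falls through to cmp1.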
_LHBEX (R0)(R3),R10 // Load two bytes, big-endian.
CMPB R10,R5,R10
SLDCC $48,R10,R10 // Keep the two result bytes; CR0 = any match?
CNTLZD R10,R10
SRD $3,R10,R3 // Index 0 or 1.
BNE found
| |
cmp1: // Length 0 - 1, or the odd trailing byte from cmp2.
MOVD $-1,R3
ANDCC $1,R4,R31 // Is there one final byte left to check?
BEQ found
MOVBZ -1(R9),R10 // Load s[len-1].
CMPB R10,R5,R10
ANDCC $1,R10 // Did it match?
ADD $-1,R4 // Candidate index len-1.
ISEL CR0EQ,R3,R4,R3 // Return -1 if it did not match.
| |
| found: |
| RET |
| #endif |
| |
| notfound: |
| MOVD $-1,R3 |
| RET |
| |