| // Copyright 2021 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // This is an implementation based on the s390x |
| // implementation. |
| |
| // Find a separator with 2 <= len <= 32 within a string. |
| // Separators with lengths of 2, 3 or 4 are handled |
| // specially. |
| |
| // This works on power8 and above. The loads and |
| // compares are done in big endian order |
| // since that allows the use of VCLZD, and allows |
| // the same implementation to work on big and little |
| // endian platforms with minimal conditional changes. |
| |
| // NOTE: There is a power9 implementation that |
| // improves performance by 10-15% on little |
| // endian for some of the benchmarks. |
| // It unrolls the index2to16 loop by 4 on ppc64le/power9. |
| // Work is still needed for a big endian |
| // implementation on power9. |
| |
| //go:build ppc64 || ppc64le |
| |
| #include "go_asm.h" |
| #include "textflag.h" |
| |
| // byteswap is the permute control that VLOADSWAP |
| // applies after an LXVD2X load so the bytes end up |
| // in big endian order on POWER8. The pattern |
| // differs between little and big endian targets. |
| |
| #ifdef GOARCH_ppc64le |
| DATA byteswap<>+0(SB)/8, $0x0706050403020100 |
| DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 |
| #else |
| DATA byteswap<>+0(SB)/8, $0x0001020304050607 |
| DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f |
| #endif |
| GLOBL byteswap<>+0(SB), RODATA, $16 |
| |
| // VLOADSWAP loads the 16 bytes at base+index in |
| // big endian order. The address needs no alignment |
| // check. The raw load targets vsreg and the permute |
| // is applied to vreg, so call sites pass the same |
| // register for both. SWAP must already hold the |
| // byteswap permute control. |
| #define VLOADSWAP(base, index, vreg, vsreg) \ |
| LXVD2X (base)(index), vsreg; \ |
| VPERM vreg, vreg, SWAP, vreg |
| |
| TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56 |
| // On entry: |
| // R3 = byte array pointer |
| // R4 = length |
| // R6 = separator pointer |
| // R7 = separator length |
| // The search bodies take the separator in R5/R6, |
| // so move it down before dispatching. |
| MOVD R6, R5 // R5 = separator pointer |
| MOVD R7, R6 // R6 = separator length |
| |
| #ifdef GOARCH_ppc64le |
| // Little endian only: use the POWER9 body when |
| // the HasPOWER9 feature byte reads as 1. |
| MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 |
| CMP R7, $1 |
| BEQ power9 |
| #endif |
| // All other cases run the POWER8 body. |
| BR indexbody<>(SB) |
| |
| #ifdef GOARCH_ppc64le |
| power9: |
| BR indexbodyp9<>(SB) |
| #endif |
| |
| TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 |
| // On entry the arguments are already where the |
| // search bodies expect them: |
| // R3 = string |
| // R4 = length |
| // R5 = separator pointer |
| // R6 = separator length |
| |
| #ifdef GOARCH_ppc64le |
| // Little endian only: use the POWER9 body when |
| // the HasPOWER9 feature byte reads as 1. |
| MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 |
| CMP R7, $1 |
| BEQ power9 |
| #endif |
| // All other cases run the POWER8 body. |
| BR indexbody<>(SB) |
| |
| #ifdef GOARCH_ppc64le |
| power9: |
| BR indexbodyp9<>(SB) |
| #endif |
| |
| // indexbody is the POWER8 search body that Index and IndexString branch to. |
| // It looks for sep (at most 32 bytes) inside s. |
| // In: R3=&s[0], R4=len(s) |
| // In: R5=&sep[0], R6=len(sep) |
| // Out: R3=byte index of the first match, or -1 if there is none |
| // R7=address in s currently being examined |
| // R16=constant 16 |
| // R17=constant 17 (set but not referenced again in this body) |
| // R18=constant 18 (set but not referenced again in this body) |
| // R19=constant 1, index register for loads at R7+1 |
| // R26=LASTBYTE, &s[len(s)], one past the final byte of s |
| // R27=LASTSTR, &s[len(s)-len(sep)], last address where a match can start |
| // R8, R9, R10, R11, R21, R25 scratch |
| // V0=first 16 bytes of sep, left justified, zeroed beyond len(sep) |
| // CR4=len(sep) compared with 16 |
| |
| #define SEPMASK V17 |
| #define LASTBYTE R26 |
| #define LASTSTR R27 |
| #define ONES V20 |
| #define SWAP V21 |
| #define SWAP_ VS53 |
| TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0 |
| CMP R6, R4 // Compare lengths |
| BGT notfound // If sep len is > string, notfound |
| ADD R4, R3, LASTBYTE // LASTBYTE=&s[len(s)], one past the last byte |
| SUB R6, LASTBYTE, LASTSTR // LASTSTR=&s[len(s)-len(sep)] (last valid start address) |
| CMP R6, $0 // Check sep len |
| BEQ notfound // sep len 0 -- not found |
| MOVD R3, R7 // Copy of string addr |
| MOVD $16, R16 // Index value 16 |
| MOVD $17, R17 // Index value 17 |
| MOVD $18, R18 // Index value 18 |
| MOVD $1, R19 // Index value 1 |
| MOVD $byteswap<>+00(SB), R8 |
| VSPLTISB $0xFF, ONES // splat all 1s |
| LXVD2X (R8)(R0), SWAP_ // Load the byteswap permute control used by VLOADSWAP |
| |
| CMP R6, $16, CR4 // CR4 = len(sep) compared with 16 |
| VOR ONES, ONES, SEPMASK // Set up full SEPMASK |
| BGE CR4, loadge16 // Load for len(sep) >= 16 |
| SUB R6, R16, R9 // 16-len of sep |
| SLD $3, R9 // Byte count to bit count for VSLO |
| MTVSRD R9, V9 // Move shift count to a vector register |
| VSLDOI $8, V9, V9, V9 // Swap doublewords so the count sits in the low half |
| VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16 |
| |
| loadge16: |
| ANDCC $15, R5, R9 // Find byte offset of sep |
| ADD R9, R6, R10 // Add sep len |
| CMP R10, $16 // Check if sep len+offset > 16 |
| BGT sepcross16 // Sep crosses 16 byte boundary |
| |
| RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container |
| VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0 |
| SLD $3, R9 // Set up shift count for VSLO |
| MTVSRD R9, V8 // Set up shift count for VSLO |
| VSLDOI $8, V8, V8, V8 |
| VSLO V0, V8, V0 // Shift by start byte |
| |
| VAND V0, SEPMASK, V0 // Mask separator (< 16) |
| BR index2plus |
| |
| sepcross16: |
| VLOADSWAP(R5, R0, V0, V0) // Load 16 bytes @R5 into V0 |
| |
| VAND V0, SEPMASK, V0 // mask out separator |
| BLE CR4, index2to16 |
| BR index17plus // Handle sep > 16 |
| |
| index2plus: |
| CMP R6, $2 // Check length of sep |
| BNE index3plus // If not 2, check for 3 |
| ADD $16, R7, R9 // Check if next 16 bytes past last |
| CMP R9, LASTBYTE // compare with last |
| BGE index2to16 // len(string) <= 16, use the generic loop |
| MOVD $0xff00, R21 // Mask for later |
| MTVSRD R21, V25 // Move to Vreg |
| VSPLTH $3, V25, V31 // Splat mask |
| VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep |
| VSPLTISB $0, V10 // Clear V10 |
| |
| // First case: 2 byte separator |
| // V1: 2 byte separator splatted |
| // V2: 16 bytes at addr |
| // V3: 16 bytes at addr+1 |
| // Compare 2 byte separator at start |
| // and at start+1. Use VSEL to combine |
| // those results to find the first |
| // matching start byte, returning |
| // that value when found. Loop as |
| // long as len(string) > 16 |
| index2loop2: |
| VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3 |
| |
| index2loop: |
| VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2 |
| VCMPEQUH V1, V2, V5 // Search for sep |
| VCMPEQUH V1, V3, V6 // Search for sep offset by 1 |
| VSEL V6, V5, V31, V7 // merge even and odd indices |
| VCLZD V7, V18 // find index of first match |
| MFVSRD V18, R25 // get first value |
| CMP R25, $64 // Found if < 64 |
| BLT foundR25 // Return byte index where found |
| VSLDOI $8, V18, V18, V18 // Adjust 2nd value |
| MFVSRD V18, R25 // get second value |
| CMP R25, $64 // Found if < 64 |
| ADD $64, R25 // Update byte offset |
| BLT foundR25 // Return value |
| ADD $16, R7 // R7+=16 Update string pointer |
| ADD $17, R7, R9 // R9=R7+17 since loop unrolled |
| CMP R9, LASTBYTE // Compare addr+17 against last byte |
| BLT index2loop2 // If < last, continue loop |
| CMP R7, LASTBYTE // Compare the advanced addr against end of string |
| BLT index2to16 // Bytes remain: finish in the generic loop |
| VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3. NOTE(review): R7 < LASTBYTE seems to always hold above, so this path looks unreachable - confirm |
| VSLDOI $1, V3, V10, V3 // Shift left by 1 byte |
| BR index2loop |
| |
| index3plus: |
| CMP R6, $3 // Check if sep == 3 |
| BNE index4plus // If not check larger |
| ADD $19, R7, R9 // Find bytes for use in this loop |
| CMP R9, LASTBYTE // Compare against last byte |
| BGE index2to16 // Fewer than 20 bytes left, use the generic loop |
| MOVD $0xff00, R21 // Set up mask for upcoming loop |
| MTVSRD R21, V25 // Move mask to Vreg |
| VSPLTH $3, V25, V31 // Splat mask |
| VSPLTH $0, V0, V1 // Splat 1st two bytes of sep |
| VSPLTB $2, V0, V8 // Splat 3rd byte of sep |
| |
| // Loop to process 3 byte separator. |
| // string[0:16] is in V2 |
| // string[16:18] is left justified in V3 |
| // sep[0:2] splatted in V1 |
| // sep[2] splatted in V8 |
| // Form vectors at string, string+1 |
| // and string+2. Compare string, string+1 |
| // against first 2 bytes of separator |
| // splatted, and string+2 against 3rd |
| // byte splatted. Merge the results with |
| // VSEL to find the first byte of a match. |
| |
| // Special handling for last 16 bytes if the |
| // string fits in 16 byte multiple. |
| index3loop2: |
| MOVD $2, R21 // Set up index for 2 |
| VSPLTISB $0, V10 // Clear V10 |
| VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3 |
| VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes |
| |
| index3loop: |
| VLOADSWAP(R7, R0, V2, V2) // Load with correct order |
| VSLDOI $1, V2, V3, V4 // string[1:17] |
| VSLDOI $2, V2, V3, V9 // string[2:18] |
| VCMPEQUH V1, V2, V5 // compare hw even indices |
| VCMPEQUH V1, V4, V6 // compare hw odd indices |
| VCMPEQUB V8, V9, V10 // compare 3rd to last byte |
| VSEL V6, V5, V31, V7 // Find 1st matching byte using mask |
| VAND V7, V10, V7 // AND matched bytes with matched 3rd byte |
| VCLZD V7, V18 // Find first nonzero indexes |
| MFVSRD V18, R25 // Move 1st doubleword |
| CMP R25, $64 // If < 64 found |
| BLT foundR25 // Return matching index |
| VSLDOI $8, V18, V18, V18 // Move value |
| MFVSRD V18, R25 // Move 2nd doubleword |
| CMP R25, $64 // If < 64 found |
| ADD $64, R25 // Update byte index |
| BLT foundR25 // Return matching index |
| ADD $16, R7 // R7+=16 string ptr |
| ADD $19, R7, R9 // Number of string bytes for loop |
| CMP R9, LASTBYTE // Compare against last byte of string |
| BLT index3loop2 // If within, continue this loop |
| CMP R7, LASTSTR // Compare against last start byte |
| BLT index2to16 // Process remainder |
| VSPLTISB $0, V3 // Special case for last 16 bytes |
| BR index3loop // Continue this loop |
| |
| // Loop to process 4 byte separator |
| // string[0:16] in V2 |
| // string[16:19] left justified in V3 |
| // sep[0:4] splatted in V1 |
| // Set up vectors with strings at offsets |
| // 0, 1, 2, 3 and compare against the 4 byte |
| // separator also splatted. Use VSEL with the |
| // compare results to find the first byte where |
| // a separator match is found. |
| index4plus: |
| CMP R6, $4 // Check if 4 byte separator |
| BNE index5plus // If not next higher |
| ADD $20, R7, R9 // Check string size to load |
| CMP R9, LASTBYTE // Verify string length |
| BGE index2to16 // If not large enough, process remaining |
| MOVD $2, R15 // R15=2 (not referenced again in this body) |
| |
| // Set up masks for use with VSEL |
| MOVD $0xff, R21 // Set up mask 0xff000000ff000000... |
| SLD $24, R21 |
| MTVSRD R21, V10 |
| VSPLTW $1, V10, V29 |
| VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00... |
| MOVD $0xffff, R21 |
| SLD $16, R21 |
| MTVSRD R21, V10 |
| VSPLTW $1, V10, V31 // Mask 0xffff0000ffff0000... |
| VSPLTW $0, V0, V1 // Splat 1st word of separator |
| |
| index4loop: |
| VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2 |
| |
| next4: |
| VSPLTISB $0, V10 // Clear |
| MOVD $3, R9 // Number of bytes beyond 16 |
| VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+3 into V3 |
| VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes |
| VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1 |
| VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2 |
| VSLDOI $3, V2, V3, V10 // V10=(V2:V3)<<3 |
| VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep |
| VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep |
| VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep |
| VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep |
| VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask |
| VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask |
| VSEL V14, V13, V31, V7 // final merge |
| VCLZD V7, V18 // Find first index for each half |
| MFVSRD V18, R25 // Isolate value |
| CMP R25, $64 // If < 64, found |
| BLT foundR25 // Return found index |
| VSLDOI $8, V18, V18, V18 // Move for MFVSRD |
| MFVSRD V18, R25 // Isolate other value |
| CMP R25, $64 // If < 64, found |
| ADD $64, R25 // Update index for high doubleword |
| BLT foundR25 // Return found index |
| ADD $16, R7 // R7+=16 for next string |
| ADD $20, R7, R9 // R7+20 for all bytes to load |
| CMP R9, LASTBYTE // Are 20 more bytes available? |
| BLT index4loop // If so, continue loop |
| CMP R7, LASTSTR // Check remainder |
| BLE index2to16 // Process remainder |
| BR notfound // Not found. NOTE(review): R7 < LASTSTR seems to always hold here, so this looks unreachable - confirm |
| |
| index5plus: |
| CMP R6, $16 // Check for sep > 16 |
| BGT index17plus // Handle large sep |
| |
| // Assumption is that the separator is no longer than the string at this point |
| index2to16: |
| CMP R7, LASTSTR // Compare last start byte |
| BGT notfound // last takes len(sep) into account |
| |
| ADD $16, R7, R9 // Check for last byte of string |
| CMP R9, LASTBYTE |
| BGT index2to16tail |
| |
| // At least 16 bytes of string left |
| // Mask the number of bytes in sep |
| index2to16loop: |
| VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1 |
| |
| compare: |
| VAND V1, SEPMASK, V2 // Mask out sep size |
| VCMPEQUBCC V0, V2, V3 // Compare masked string |
| BLT CR6, found // All equal |
| ADD $1, R7 // Update ptr to next byte |
| CMP R7, LASTSTR // Still less than last start byte |
| BGT notfound // Not found |
| ADD $16, R7, R9 // Verify remaining bytes |
| CMP R9, LASTBYTE // At least 16 |
| BLT index2to16loop // Try again |
| |
| // 16 bytes or fewer remaining in string |
| // Separator >= 2 |
| index2to16tail: |
| ADD R3, R4, R9 // End of string |
| SUB R7, R9, R9 // Number of bytes left |
| ANDCC $15, R7, R10 // 16 byte offset |
| ADD R10, R9, R11 // offset + len |
| CMP R11, $16 // > 16? |
| BLE short // Does not cross 16 bytes |
| VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1 |
| BR index2to16next // Continue on |
| |
| short: |
| RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container |
| VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1 |
| SLD $3, R10 // Set up shift |
| MTVSRD R10, V8 // Set up shift |
| VSLDOI $8, V8, V8, V8 |
| VSLO V1, V8, V1 // Shift by start byte |
| VSPLTISB $0, V25 // Clear for later use |
| |
| index2to16next: |
| VAND V1, SEPMASK, V2 // Just compare size of sep |
| VCMPEQUBCC V0, V2, V3 // Compare sep and partial string |
| BLT CR6, found // Found |
| ADD $1, R7 // Not found, try next partial string |
| CMP R7, LASTSTR // Check for end of string |
| BGT notfound // If at end, then not found |
| VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte |
| BR index2to16next // Check the next partial string |
| |
| index17plus: |
| CMP R6, $32 // Check if 17 <= len(sep) <= 32 |
| BGT index33plus |
| SUB $16, R6, R9 // R9=len(sep)-16, the extra beyond 16 |
| SLD $56, R9, R10 // Shift to use in VSLO |
| MTVSRD R10, V9 // Set up for VSLO |
| VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1 |
| VSLO V1, V9, V1 // Shift left |
| VSPLTISB $0xff, V7 // Splat 1s |
| VSPLTISB $0, V27 // Splat 0 |
| |
| index17to32loop: |
| VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2 |
| |
| next17: |
| VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+R9 into V3 |
| VSLO V3, V9, V3 // Shift left |
| VCMPEQUB V0, V2, V4 // Compare first 16 bytes |
| VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes |
| VAND V4, V5, V6 // Check if both equal |
| VCMPEQUBCC V6, V7, V8 // All equal? |
| BLT CR6, found // Yes |
| ADD $1, R7 // On to next byte |
| CMP R7, LASTSTR // Check if last start byte |
| BGT notfound // If too high, not found |
| BR index17to32loop // Continue |
| |
| notfound: |
| MOVD $-1, R3 // Return -1 if not found |
| RET |
| |
| index33plus: |
| MOVD $0, (R0) // len(sep) > 32 is not implemented, store through R0 to crash |
| RET // Crash before return |
| |
| foundR25: |
| SRD $3, R25 // Convert from bits to bytes |
| ADD R25, R7 // Add to current string address |
| SUB R3, R7 // Subtract start of string to get the index |
| MOVD R7, R3 // Return byte where found |
| RET |
| |
| found: |
| SUB R3, R7 // Return byte where found |
| MOVD R7, R3 |
| RET |
| |
| TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0 |
| CMP R6, R4 // POWER9 and later body. Entry: R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep). Compare lengths |
| BGT notfound // If sep len is > string, notfound |
| ADD R4, R3, LASTBYTE // LASTBYTE=&s[len(s)], one past the last byte |
| SUB R6, LASTBYTE, LASTSTR // LASTSTR=&s[len(s)-len(sep)] (last valid start address) |
| CMP R6, $0 // Check sep len |
| BEQ notfound // sep len 0 -- not found |
| MOVD R3, R7 // Copy of string addr |
| #ifndef GOPPC64_power10 |
| MOVD $16, R16 // Index value 16 |
| MOVD $17, R17 // Index value 17 |
| MOVD $18, R18 // Index value 18 |
| VSPLTISB $0xFF, ONES // splat all 1s |
| VOR ONES, ONES, SEPMASK // Set up full SEPMASK |
| #else |
| SLD $56, R6, R14 // Set up separator length for LXVLL |
| #endif |
| MOVD $1, R19 // Index value 1 |
| CMP R6, $16, CR4 // CR4 = len(sep) compared with 16 |
| BGE CR4, loadge16 // Load for len(sep) >= 16 |
| #ifndef GOPPC64_power10 |
| SUB R6, R16, R9 // 16-len of sep |
| SLD $3, R9 // Byte count to bit count for VSLO |
| MTVSRD R9, V9 // Move shift count to a vector register |
| VSLDOI $8, V9, V9, V9 // Swap doublewords so the count sits in the low half |
| VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16 |
| #endif |
| loadge16: |
| ANDCC $15, R5, R9 // Find byte offset of sep |
| ADD R9, R6, R10 // Add sep len |
| CMP R10, $16 // Check if sep len+offset > 16 |
| BGT sepcross16 // Sep crosses 16 byte boundary |
| #ifdef GOPPC64_power10 |
| LXVLL R5, R14, V0 // Load separator |
| #else |
| RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container |
| LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0 |
| SLD $3, R9 // Set up shift count for VSLO |
| MTVSRD R9, V8 // Set up shift count for VSLO |
| VSLDOI $8, V8, V8, V8 |
| VSLO V0, V8, V0 // Shift by start byte |
| VAND V0, SEPMASK, V0 // Mask separator (< 16) |
| #endif |
| BR index2plus |
| sepcross16: |
| #ifdef GOPPC64_power10 |
| LXVLL R5, R14, V0 // Load separator |
| #else |
| LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0 |
| VAND V0, SEPMASK, V0 // mask out separator |
| #endif |
| BLE CR4, index2to16 |
| BR index17plus // Handle sep > 16 |
| |
| index2plus: |
| CMP R6, $2 // Check length of sep |
| BNE index3plus // If not 2, check for 3 |
| ADD $16, R7, R9 // Check if next 16 bytes past last |
| CMP R9, LASTBYTE // compare with last |
| BGE index2to16 // len(string) <= 16, use the generic loop |
| MOVD $0xff00, R21 // Mask for later |
| MTVSRD R21, V25 // Move to Vreg |
| VSPLTH $3, V25, V31 // Splat mask |
| VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep |
| VSPLTISB $0, V10 // Clear V10 |
| |
| // First case: 2 byte separator |
| // V1: 2 byte separator splatted |
| // V2: 16 bytes at addr |
| // V3: 16 bytes at addr+1 |
| // Compare 2 byte separator at start |
| // and at start+1. Use VSEL to combine |
| // those results to find the first |
| // matching start byte, returning |
| // that value when found. Loop as |
| // long as len(string) > 16 |
| index2loop2: |
| LXVB16X (R7)(R19), V3 // Load 16 bytes @R7+1 into V3 |
| |
| index2loop: |
| LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2 |
| VCMPEQUH V1, V2, V5 // Search for sep |
| VCMPEQUH V1, V3, V6 // Search for sep offset by 1 |
| VSEL V6, V5, V31, V7 // merge even and odd indices |
| VCLZD V7, V18 // find index of first match |
| MFVSRD V18, R25 // get first value |
| CMP R25, $64 // Found if < 64 |
| BLT foundR25 // Return byte index where found |
| |
| MFVSRLD V18, R25 // get second value |
| CMP R25, $64 // Found if < 64 |
| ADD $64, R25 // Update byte offset |
| BLT foundR25 // Return value |
| ADD $16, R7 // R7+=16 Update string pointer |
| ADD $17, R7, R9 // R9=R7+17 since loop unrolled |
| CMP R9, LASTBYTE // Compare addr+17 against last byte |
| BLT index2loop2 // If < last, continue loop |
| CMP R7, LASTBYTE // Compare the advanced addr against end of string |
| BLT index2to16 // Bytes remain: finish in the generic loop |
| LXVB16X (R7)(R0), V3 // Load 16 bytes @R7 into V3. NOTE(review): R7 < LASTBYTE seems to always hold above, so this path looks unreachable - confirm |
| VSLDOI $1, V3, V10, V3 // Shift left by 1 byte |
| BR index2loop |
| |
| index3plus: |
| CMP R6, $3 // Check if sep == 3 |
| BNE index4plus // If not check larger |
| ADD $19, R7, R9 // Find bytes for use in this loop |
| CMP R9, LASTBYTE // Compare against last byte |
| BGE index2to16 // Fewer than 20 bytes left, use the generic loop |
| MOVD $0xff00, R21 // Set up mask for upcoming loop |
| MTVSRD R21, V25 // Move mask to Vreg |
| VSPLTH $3, V25, V31 // Splat mask |
| VSPLTH $0, V0, V1 // Splat 1st two bytes of sep |
| VSPLTB $2, V0, V8 // Splat 3rd byte of sep |
| |
| // Loop to process 3 byte separator. |
| // string[0:16] is in V2 |
| // string[16:18] is left justified in V3 |
| // sep[0:2] splatted in V1 |
| // sep[2] splatted in V8 |
| // Form vectors at string, string+1 |
| // and string+2. Compare string, string+1 |
| // against first 2 bytes of separator |
| // splatted, and string+2 against 3rd |
| // byte splatted. Merge the results with |
| // VSEL to find the first byte of a match. |
| |
| // Special handling for last 16 bytes if the |
| // string fits in 16 byte multiple. |
| index3loop2: |
| MOVD $2, R21 // Set up index for 2 |
| VSPLTISB $0, V10 // Clear V10 |
| LXVB16X (R7)(R21), V3 // Load 16 bytes @R7+2 into V3 |
| VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes |
| |
| index3loop: |
| LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 |
| VSLDOI $1, V2, V3, V4 // string[1:17] |
| VSLDOI $2, V2, V3, V9 // string[2:18] |
| VCMPEQUH V1, V2, V5 // compare hw even indices |
| VCMPEQUH V1, V4, V6 // compare hw odd indices |
| VCMPEQUB V8, V9, V10 // compare 3rd to last byte |
| VSEL V6, V5, V31, V7 // Find 1st matching byte using mask |
| VAND V7, V10, V7 // AND matched bytes with matched 3rd byte |
| VCLZD V7, V18 // Find first nonzero indexes |
| MFVSRD V18, R25 // Move 1st doubleword |
| CMP R25, $64 // If < 64 found |
| BLT foundR25 // Return matching index |
| |
| MFVSRLD V18, R25 // Move 2nd doubleword |
| CMP R25, $64 // If < 64 found |
| ADD $64, R25 // Update byte index |
| BLT foundR25 // Return matching index |
| ADD $16, R7 // R7+=16 string ptr |
| ADD $19, R7, R9 // Number of string bytes for loop |
| CMP R9, LASTBYTE // Compare against last byte of string |
| BLT index3loop2 // If within, continue this loop |
| CMP R7, LASTSTR // Compare against last start byte |
| BLT index2to16 // Process remainder |
| VSPLTISB $0, V3 // Special case for last 16 bytes |
| BR index3loop // Continue this loop |
| |
| // Loop to process 4 byte separator |
| // string[0:16] in V2 |
| // string[16:19] left justified in V3 |
| // sep[0:4] splatted in V1 |
| // Set up vectors with strings at offsets |
| // 0, 1, 2, 3 and compare against the 4 byte |
| // separator also splatted. Use VSEL with the |
| // compare results to find the first byte where |
| // a separator match is found. |
| index4plus: |
| CMP R6, $4 // Check if 4 byte separator |
| BNE index5plus // If not next higher |
| ADD $20, R7, R9 // Check string size to load |
| CMP R9, LASTBYTE // Verify string length |
| BGE index2to16 // If not large enough, process remaining |
| |
| // Set up masks for use with VSEL |
| MOVD $0xff, R21 // Set up mask 0xff000000ff000000... |
| SLD $24, R21 |
| MTVSRWS R21, V29 |
| |
| VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00... |
| MOVD $0xffff, R21 |
| SLD $16, R21 |
| MTVSRWS R21, V31 |
| |
| VSPLTW $0, V0, V1 // Splat 1st word of separator |
| |
| index4loop: |
| LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2 |
| |
| next4: |
| VSPLTISB $0, V10 // Clear |
| MOVD $3, R9 // Number of bytes beyond 16 |
| LXVB16X (R7)(R9), V3 // Load 16 bytes @R7+3 into V3 |
| VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes |
| VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1 |
| VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2 |
| VSLDOI $3, V2, V3, V10 // V10=(V2:V3)<<3 |
| VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep |
| VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep |
| VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep |
| VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep |
| VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask |
| VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask |
| VSEL V14, V13, V31, V7 // final merge |
| VCLZD V7, V18 // Find first index for each half |
| MFVSRD V18, R25 // Isolate value |
| CMP R25, $64 // If < 64, found |
| BLT foundR25 // Return found index |
| |
| MFVSRLD V18, R25 // Isolate other value |
| CMP R25, $64 // If < 64, found |
| ADD $64, R25 // Update index for high doubleword |
| BLT foundR25 // Return found index |
| ADD $16, R7 // R7+=16 for next string |
| ADD $20, R7, R9 // R7+20 for all bytes to load |
| CMP R9, LASTBYTE // Are 20 more bytes available? |
| BLT index4loop // If so, continue loop |
| CMP R7, LASTSTR // Check remainder |
| BLE index2to16 // Process remainder |
| BR notfound // Not found. NOTE(review): R7 < LASTSTR seems to always hold here, so this looks unreachable - confirm |
| |
| index5plus: |
| CMP R6, $16 // Check for sep > 16 |
| BGT index17plus // Handle large sep |
| |
| // Assumption is that the separator is no longer than the string at this point |
| index2to16: |
| CMP R7, LASTSTR // Compare last start byte |
| BGT notfound // last takes len(sep) into account |
| |
| ADD $19, R7, R9 // To check 4 indices per iteration, need at least 16+3 bytes |
| CMP R9, LASTBYTE |
| // Fewer than 19 bytes left takes the tail path below. |
| // Otherwise the unrolled loop tests 4 start positions per pass. |
| VSPLTISB $0, V10 // Clear |
| BGT index2to16tail |
| |
| #ifdef GOPPC64_power10 |
| ADD $3,R7, R17 // Base+3 |
| ADD $2,R7, R8 // Base+2 |
| ADD $1,R7, R10 // Base+1 |
| #else |
| MOVD $3, R17 // Number of bytes beyond 16 |
| #endif |
| PCALIGN $16 |
| |
| index2to16loop: |
| |
| #ifdef GOPPC64_power10 |
| LXVLL R7, R14, V8 // Load len(sep) bytes of string from Base (length in R14) |
| LXVLL R10, R14, V9 // Load len(sep) bytes of string from Base+1 |
| LXVLL R8, R14, V11 // Load len(sep) bytes of string from Base+2 |
| LXVLL R17,R14, V12 // Load len(sep) bytes of string from Base+3 |
| #else |
| LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7 |
| LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3 |
| |
| VSLDOI $13, V5, V10, V2 // Shift left last 3 bytes |
| VSLDOI $1, V1, V2, V3 // V3=(V1:V2)<<1 |
| VSLDOI $2, V1, V2, V4 // V4=(V1:V2)<<2 |
| VAND V1, SEPMASK, V8 // Mask out sep size 0th index |
| VAND V3, SEPMASK, V9 // Mask out sep size 1st index |
| VAND V4, SEPMASK, V11 // Mask out sep size 2nd index |
| VAND V5, SEPMASK, V12 // Mask out sep size 3rd index |
| #endif |
| VCMPEQUBCC V0, V8, V8 // compare masked string |
| BLT CR6, found // All equal while comparing 0th index |
| VCMPEQUBCC V0, V9, V9 // compare masked string |
| BLT CR6, found2 // All equal while comparing 1st index |
| VCMPEQUBCC V0, V11, V11 // compare masked string |
| BLT CR6, found3 // All equal while comparing 2nd index |
| VCMPEQUBCC V0, V12, V12 // compare masked string |
| BLT CR6, found4 // All equal while comparing 3rd index |
| |
| ADD $4, R7 // Update ptr to next 4 bytes |
| #ifdef GOPPC64_power10 |
| ADD $4, R17 // Update ptr to next 4 bytes |
| ADD $4, R8 // Update ptr to next 4 bytes |
| ADD $4, R10 // Update ptr to next 4 bytes |
| #endif |
| CMP R7, LASTSTR // Still less than last start byte |
| BGT notfound // Not found |
| ADD $19, R7, R9 // Verify remaining bytes |
| CMP R9, LASTBYTE // length of string at least 19 |
| BLE index2to16loop // Try again, else fall through to the tail processing |
| PCALIGN $32 |
| // <19 bytes left, post process the remaining string |
| index2to16tail: |
| #ifdef GOPPC64_power10 |
| index2to16next_p10: |
| LXVLL R7,R14, V1 // Load len(sep) bytes @R7 into V1 (length in R14) |
| VCMPEQUBCC V1, V0, V3 // Compare sep and partial string |
| BLT CR6, found // Found |
| ADD $1, R7 // Not found, try next partial string |
| CMP R7, LASTSTR // Check for end of string |
| BLE index2to16next_p10 // Start positions remain, keep scanning |
| BR notfound // Past the last start position, not found |
| #else |
| ADD R3, R4, R9 // End of string |
| SUB R7, R9, R9 // Number of bytes left |
| ANDCC $15, R7, R10 // 16 byte offset |
| ADD R10, R9, R11 // offset + len |
| CMP R11, $16 // > 16? |
| BLE short // Does not cross 16 bytes |
| LXVB16X (R7)(R0), V1 // Load 16 bytes @R7 into V1 |
| CMP R9, $16 // Post-processing of unrolled loop |
| BLE index2to16next // continue to index2to16next if <= 16 bytes |
| SUB R16, R9, R10 // R9 should be 17 or 18 hence R10 is 1 or 2 |
| LXVB16X (R7)(R10), V9 |
| CMP R10, $1 // string length is 17, compare 1 more byte |
| BNE extra2 // string length is 18, compare 2 more bytes |
| VSLDOI $15, V9, V10, V25 |
| VAND V1, SEPMASK, V2 // Just compare size of sep |
| VCMPEQUBCC V0, V2, V3 // Compare sep and partial string |
| BLT CR6, found // Found |
| ADD $1, R7 // Not found, try next partial string |
| CMP R7, LASTSTR // Check for end of string |
| BGT notfound // If at end, then not found |
| VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte, bringing in the 17th byte |
| BR index2to16next // go to remainder loop |
| extra2: |
| VSLDOI $14, V9, V10, V25 |
| VAND V1, SEPMASK, V2 // Just compare size of sep |
| VCMPEQUBCC V0, V2, V3 // Compare sep and partial string |
| BLT CR6, found // Found |
| ADD $1, R7 // Not found, try next partial string |
| CMP R7, LASTSTR // Check for end of string |
| BGT notfound // If at end, then not found |
| VOR V1, V1, V4 // save remaining string |
| VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte for 17th byte |
| VAND V1, SEPMASK, V2 // Just compare size of sep |
| VCMPEQUBCC V0, V2, V3 // Compare sep and partial string |
| BLT CR6, found // Found |
| ADD $1, R7 // Not found, try next partial string |
| CMP R7, LASTSTR // Check for end of string |
| BGT notfound // If at end, then not found |
| VSLDOI $2, V4, V25, V1 // Shift saved string left by 2 bytes for 18th byte |
| BR index2to16next // Check the remaining partial string in index2to16next |
| |
| short: |
| RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container |
| LXVB16X (R9)(R0), V1 // Load 16 bytes @R9 into V1 |
| SLD $3, R10 // Set up shift |
| MTVSRD R10, V8 // Set up shift |
| VSLDOI $8, V8, V8, V8 |
| VSLO V1, V8, V1 // Shift by start byte |
| PCALIGN $16 |
| index2to16next: |
| VAND V1, SEPMASK, V2 // Just compare size of sep |
| VCMPEQUBCC V0, V2, V3 // Compare sep and partial string |
| BLT CR6, found // Found |
| ADD $1, R7 // Not found, try next partial string |
| CMP R7, LASTSTR // Check for end of string |
| BGT notfound // If at end, then not found |
| VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte |
| BR index2to16next // Check the next partial string |
| #endif // Tail processing if GOPPC64!=power10 |
| |
| index17plus: |
| CMP R6, $32 // Check if 17 <= len(sep) <= 32 |
| BGT index33plus |
| SUB $16, R6, R9 // R9=len(sep)-16, the extra beyond 16 |
| SLD $56, R9, R10 // Shift to use in VSLO |
| MTVSRD R10, V9 // Set up for VSLO |
| LXVB16X (R5)(R9), V1 // Load 16 bytes @R5+R9 into V1 |
| VSLO V1, V9, V1 // Shift left |
| VSPLTISB $0xff, V7 // Splat 1s |
| VSPLTISB $0, V27 // Splat 0 |
| |
| index17to32loop: |
| LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2 |
| |
| next17: |
| LXVB16X (R7)(R9), V3 // Load 16 bytes @R7+R9 into V3 |
| VSLO V3, V9, V3 // Shift left |
| VCMPEQUB V0, V2, V4 // Compare first 16 bytes |
| VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes |
| VAND V4, V5, V6 // Check if both equal |
| VCMPEQUBCC V6, V7, V8 // All equal? |
| BLT CR6, found // Yes |
| ADD $1, R7 // On to next byte |
| CMP R7, LASTSTR // Check if last start byte |
| BGT notfound // If too high, not found |
| BR index17to32loop // Continue |
| |
| notfound: |
| MOVD $-1, R3 // Return -1 if not found |
| RET |
| |
| index33plus: |
| MOVD $0, (R0) // len(sep) > 32 is not implemented, store through R0 to crash |
| RET // Crash before return |
| |
| foundR25: |
| SRD $3, R25 // Convert from bits to bytes |
| ADD R25, R7 // Add to current string address |
| SUB R3, R7 // Subtract start of string to get the index |
| MOVD R7, R3 // Return byte where found |
| RET |
| found4: |
| ADD $1, R7 // found from unrolled loop at index 3 |
| found3: |
| ADD $1, R7 // found from unrolled loop at index 2 |
| found2: |
| ADD $1, R7 // found from unrolled loop at index 1 |
| found: // found at index 0 |
| SUB R3, R7 // Return byte where found |
| MOVD R7, R3 |
| RET |