| // Copyright 2018 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build ppc64 || ppc64le |
| |
| #include "go_asm.h" |
| #include "textflag.h" |
| |
| TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56 |
| // incoming: |
| // R3 a addr -> R5 |
| // R4 a len -> R3 |
| // R5 a cap unused |
| // R6 b addr -> R6 |
| // R7 b len -> R4 |
| // R8 b cap unused |
| MOVD R3, R5 |
| MOVD R4, R3 |
| MOVD R7, R4 |
| CMP R5,R6,CR7 |
| CMP R3,R4,CR6 |
| BEQ CR7,equal |
| MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 |
| CMP R16,$1 |
| BNE power8 |
| BR cmpbodyp9<>(SB) |
| power8: |
| BR cmpbody<>(SB) |
| equal: |
| BEQ CR6,done |
| MOVD $1, R8 |
| BGT CR6,greater |
| NEG R8 |
| greater: |
| MOVD R8, R3 |
| RET |
| done: |
| MOVD $0, R3 |
| RET |
| |
| TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 |
| // incoming: |
| // R3 a addr -> R5 |
| // R4 a len -> R3 |
| // R5 b addr -> R6 |
| // R6 b len -> R4 |
| MOVD R6, R7 |
| MOVD R5, R6 |
| MOVD R3, R5 |
| MOVD R4, R3 |
| MOVD R7, R4 |
| CMP R5,R6,CR7 |
| CMP R3,R4,CR6 |
| BEQ CR7,equal |
| MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 |
| CMP R16,$1 |
| BNE power8 |
| BR cmpbodyp9<>(SB) |
| power8: |
| BR cmpbody<>(SB) |
| equal: |
| BEQ CR6,done |
| MOVD $1, R8 |
| BGT CR6,greater |
| NEG R8 |
| greater: |
| MOVD R8, R3 |
| RET |
| |
| done: |
| MOVD $0, R3 |
| RET |
| |
| #ifdef GOARCH_ppc64le |
| DATA byteswap<>+0(SB)/8, $0x0706050403020100 |
| DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 |
| GLOBL byteswap<>+0(SB), RODATA, $16 |
| #define SWAP V21 |
| #endif |
| |
| // Do an efficient memcmp for ppc64le/ppc64/POWER8 |
| // R3 = a len |
| // R4 = b len |
| // R5 = a addr |
| // R6 = b addr |
| // On exit: |
| // R3 = return value |
| TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 |
| MOVD R3,R8 // set up length |
| CMP R3,R4,CR2 // unequal? |
| BLT CR2,setuplen // BLT CR2 |
| MOVD R4,R8 // use R4 for comparison len |
| setuplen: |
| CMP R8,$32 // optimize >= 32 |
| MOVD R8,R9 |
| BLT setup8a // optimize < 32 |
| MOVD $16,R10 // set offsets to load into vectors |
| CMP R8,$64 |
| BLT cmp32 // process size 32-63 |
| |
| DCBT (R5) // optimize >= 64 |
| DCBT (R6) // cache hint |
| MOVD $32,R11 // set offsets to load into vector |
| MOVD $48,R12 // set offsets to load into vector |
| |
| loop64a:// process size 64 and greater |
| LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector |
| LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector |
| VCMPEQUDCC V3,V4,V1 |
| BGE CR6,different // jump out if its different |
| |
| LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector |
| LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector |
| |
| VCMPEQUDCC V3,V4,V1 |
| BGE CR6,different |
| |
| LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector |
| LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector |
| |
| VCMPEQUDCC V3,V4,V1 |
| BGE CR6,different |
| |
| LXVD2X (R5)(R12),V3 // load bytes of A at offset 64 into vector |
| LXVD2X (R6)(R12),V4 // load bytes of B at offset 64 into vector |
| |
| VCMPEQUDCC V3,V4,V1 |
| BGE CR6,different |
| |
| ADD $-64,R9,R9 // reduce remaining size by 64 |
| ADD $64,R5,R5 // increment to next 64 bytes of A |
| ADD $64,R6,R6 // increment to next 64 bytes of B |
| CMPU R9,$64 |
| BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining |
| |
| CMPU R9,$32 |
| BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining |
| CMPU R9,$0 |
| BNE rem // loop to rem if the remainder is not 0 |
| |
| BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) |
| BLT CR2,less // jump to less if len(A)<len(B) |
| BR greater // jump to greater otherwise |
| cmp32: |
| LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector |
| LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector |
| |
| VCMPEQUDCC V3,V4,V1 |
| BGE CR6,different |
| |
| LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector |
| LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector |
| |
| VCMPEQUDCC V3,V4,V1 |
| BGE CR6,different |
| |
| ADD $-32,R9,R9 // reduce remaining size by 32 |
| ADD $32,R5,R5 // increment to next 32 bytes of A |
| ADD $32,R6,R6 // increment to next 32 bytes of B |
| CMPU R9,$0 |
| BNE rem // loop to rem if the remainder is not 0 |
| BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) |
| BLT CR2,less // jump to less if len(A)<len(B) |
| BR greater // jump to greater otherwise |
| rem: |
| MOVD R9,R8 |
| ANDCC $24,R8,R9 // Any 8 byte chunks? |
| BEQ leftover // and result is 0 |
| BR setup8a |
| |
| different: |
| #ifdef GOARCH_ppc64le |
| MOVD $byteswap<>+00(SB), R16 |
| LXVD2X (R16)(R0),SWAP // Set up swap string |
| |
| VPERM V3,V3,SWAP,V3 |
| VPERM V4,V4,SWAP,V4 |
| #endif |
| MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison |
| MFVSRD VS36,R10 |
| |
| CMPU R16,R10 |
| BEQ lower |
| BGT greater |
| MOVD $-1,R3 // return value if A < B |
| RET |
| lower: |
| VSLDOI $8,V3,V3,V3 // move lower doublwords of A and B into GPR for comparison |
| MFVSRD VS35,R16 |
| VSLDOI $8,V4,V4,V4 |
| MFVSRD VS36,R10 |
| |
| CMPU R16,R10 |
| BGT greater |
| MOVD $-1,R3 // return value if A < B |
| RET |
| setup8a: |
| SRADCC $3,R8,R9 // get the 8 byte count |
| BEQ leftover // shifted value is 0 |
| CMPU R8,$8 // optimize 8byte move |
| BEQ size8 |
| CMPU R8,$16 |
| BEQ size16 |
| MOVD R9,CTR // loop count for doublewords |
| loop8: |
| #ifdef GOARCH_ppc64le |
| MOVDBR (R5+R0),R16 // doublewords to compare |
| MOVDBR (R6+R0),R10 // LE compare order |
| #else |
| MOVD (R5+R0),R16 // doublewords to compare |
| MOVD (R6+R0),R10 // BE compare order |
| #endif |
| ADD $8,R5 |
| ADD $8,R6 |
| CMPU R16,R10 // match? |
| BC 8,2,loop8 // bt ctr <> 0 && cr |
| BGT greater |
| BLT less |
| leftover: |
| ANDCC $7,R8,R9 // check for leftover bytes |
| BEQ zeroremainder |
| simplecheck: |
| MOVD R0,R14 |
| CMP R9,$4 // process 4 bytes |
| BLT halfword |
| #ifdef GOARCH_ppc64le |
| MOVWBR (R5)(R14),R10 |
| MOVWBR (R6)(R14),R11 |
| #else |
| MOVWZ (R5)(R14),R10 |
| MOVWZ (R6)(R14),R11 |
| #endif |
| CMPU R10,R11 |
| BGT greater |
| BLT less |
| ADD $-4,R9 |
| ADD $4,R14 |
| PCALIGN $16 |
| |
| halfword: |
| CMP R9,$2 // process 2 bytes |
| BLT byte |
| #ifdef GOARCH_ppc64le |
| MOVHBR (R5)(R14),R10 |
| MOVHBR (R6)(R14),R11 |
| #else |
| MOVHZ (R5)(R14),R10 |
| MOVHZ (R6)(R14),R11 |
| #endif |
| CMPU R10,R11 |
| BGT greater |
| BLT less |
| ADD $-2,R9 |
| ADD $2,R14 |
| PCALIGN $16 |
| byte: |
| CMP R9,$0 // process 1 byte |
| BEQ skip |
| MOVBZ (R5)(R14),R10 |
| MOVBZ (R6)(R14),R11 |
| CMPU R10,R11 |
| BGT greater |
| BLT less |
| PCALIGN $16 |
| skip: |
| BEQ CR2,equal |
| BGT CR2,greater |
| |
| less: MOVD $-1,R3 // return value if A < B |
| RET |
| size16: |
| LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector |
| LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector |
| VCMPEQUDCC V3,V4,V1 |
| BGE CR6,different |
| zeroremainder: |
| BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) |
| BLT CR2,less // jump to less if len(A)<len(B) |
| BR greater // jump to greater otherwise |
| size8: |
| #ifdef GOARCH_ppc64le |
| MOVDBR (R5+R0),R16 // doublewords to compare |
| MOVDBR (R6+R0),R10 // LE compare order |
| #else |
| MOVD (R5+R0),R16 // doublewords to compare |
| MOVD (R6+R0),R10 // BE compare order |
| #endif |
| CMPU R16,R10 // match? |
| BGT greater |
| BLT less |
| BGT CR2,greater // 2nd len > 1st len |
| BLT CR2,less // 2nd len < 1st len |
| equal: |
| MOVD $0, R3 // return value if A == B |
| RET |
| greater: |
| MOVD $1,R3 // return value if A > B |
| RET |
| |
| // Do an efficient memcmp for ppc64le/ppc64/POWER9 |
| // R3 = a len |
| // R4 = b len |
| // R5 = a addr |
| // R6 = b addr |
| // On exit: |
| // R3 = return value |
| TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0 |
| MOVD R3,R8 // set up length |
| CMP R3,R4,CR2 // unequal? |
| BLT CR2,setuplen // BLT CR2 |
| MOVD R4,R8 // use R4 for comparison len |
| setuplen: |
| CMP R8,$16 // optimize for size<16 |
| MOVD R8,R9 |
| BLT simplecheck |
| MOVD $16,R10 // set offsets to load into vectors |
| CMP R8,$32 // optimize for size 16-31 |
| BLT cmp16 |
| CMP R8,$64 |
| BLT cmp32 // optimize for size 32-63 |
| DCBT (R5) // optimize for size>=64 |
| DCBT (R6) // cache hint |
| |
| MOVD $32,R11 // set offsets to load into vector |
| MOVD $48,R12 // set offsets to load into vector |
| |
| loop64a:// process size 64 and greater |
| LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector |
| LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector |
| VCMPNEBCC V3,V4,V1 // record comparison into V1 |
| BNE CR6,different // jump out if its different |
| |
| LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector |
| LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector |
| VCMPNEBCC V3,V4,V1 |
| BNE CR6,different |
| |
| LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector |
| LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector |
| VCMPNEBCC V3,V4,V1 |
| BNE CR6,different |
| |
| LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector |
| LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector |
| VCMPNEBCC V3,V4,V1 |
| BNE CR6,different |
| |
| ADD $-64,R9,R9 // reduce remaining size by 64 |
| ADD $64,R5,R5 // increment to next 64 bytes of A |
| ADD $64,R6,R6 // increment to next 64 bytes of B |
| CMPU R9,$64 |
| BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining |
| |
| CMPU R9,$32 |
| BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining |
| CMPU R9,$16 |
| BGE cmp16 // loop to cmp16 if there are 16-31 bytes left |
| CMPU R9,$0 |
| BNE simplecheck // loop to simplecheck for remaining bytes |
| |
| BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) |
| BLT CR2,less // jump to less if len(A)<len(B) |
| BR greater // jump to greater otherwise |
| cmp32: |
| LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector |
| LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector |
| |
| VCMPNEBCC V3,V4,V1 // record comparison into V1 |
| BNE CR6,different // jump out if its different |
| |
| LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector |
| LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector |
| VCMPNEBCC V3,V4,V1 |
| BNE CR6,different |
| |
| ADD $-32,R9,R9 // reduce remaining size by 32 |
| ADD $32,R5,R5 // increment to next 32 bytes of A |
| ADD $32,R6,R6 // increment to next 32 bytes of B |
| CMPU R9,$16 // loop to cmp16 if there are 16-31 bytes left |
| BGE cmp16 |
| CMPU R9,$0 |
| BNE simplecheck // loop to simplecheck for remainder bytes |
| BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) |
| BLT CR2,less // jump to less if len(A)<len(B) |
| BR greater // jump to greater otherwise |
| different: |
| |
| MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison |
| MFVSRD VS36,R10 |
| |
| CMPU R16,R10 |
| BEQ lower |
| BGT greater |
| MOVD $-1,R3 // return value if A < B |
| RET |
| lower: |
| MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison |
| MFVSRLD VS36,R10 |
| |
| CMPU R16,R10 |
| BGT greater |
| MOVD $-1,R3 // return value if A < B |
| RET |
| |
| greater: |
| MOVD $1,R3 // return value if A > B |
| RET |
| cmp16: |
| ANDCC $16,R9,R31 |
| BEQ tail |
| |
| LXVB16X (R0)(R5),V3 // load bytes of A at offset 16 into vector |
| LXVB16X (R0)(R6),V4 // load bytes of B at offset 16 into vector |
| VCMPEQUDCC V3,V4,V1 |
| BGE CR6,different |
| |
| ADD $16,R5 |
| ADD $16,R6 |
| tail: |
| ANDCC $15,R9 // Load the last 16 bytes (we know there are at least 32b) |
| BEQ end |
| |
| ADD R9,R5 |
| ADD R9,R6 |
| MOVD $-16,R10 |
| |
| LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector |
| LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector |
| VCMPEQUDCC V3,V4,V1 |
| BGE CR6,different |
| end: |
| BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) |
| BLT CR2,less // jump to less if BLT CR2 that is, len(A)<len(B) |
| BR greater // jump to greater otherwise |
| simplecheck: |
| MOVD $0,R14 // process 8 bytes |
| CMP R9,$8 |
| BLT word |
| #ifdef GOARCH_ppc64le |
| MOVDBR (R5+R14),R10 |
| MOVDBR (R6+R14),R11 |
| #else |
| MOVD (R5+R14),R10 |
| MOVD (R6+R14),R11 |
| #endif |
| CMPU R10,R11 |
| BGT greater |
| BLT less |
| ADD $8,R14 |
| ADD $-8,R9 |
| PCALIGN $16 |
| word: |
| CMP R9,$4 // process 4 bytes |
| BLT halfword |
| #ifdef GOARCH_ppc64le |
| MOVWBR (R5+R14),R10 |
| MOVWBR (R6+R14),R11 |
| #else |
| MOVWZ (R5+R14),R10 |
| MOVWZ (R6+R14),R11 |
| #endif |
| CMPU R10,R11 |
| BGT greater |
| BLT less |
| ADD $4,R14 |
| ADD $-4,R9 |
| PCALIGN $16 |
| halfword: |
| CMP R9,$2 // process 2 bytes |
| BLT byte |
| #ifdef GOARCH_ppc64le |
| MOVHBR (R5+R14),R10 |
| MOVHBR (R6+R14),R11 |
| #else |
| MOVHZ (R5+R14),R10 |
| MOVHZ (R6+R14),R11 |
| #endif |
| CMPU R10,R11 |
| BGT greater |
| BLT less |
| ADD $2,R14 |
| ADD $-2,R9 |
| PCALIGN $16 |
| byte: |
| CMP R9,$0 // process 1 byte |
| BEQ skip |
| MOVBZ (R5+R14),R10 |
| MOVBZ (R6+R14),R11 |
| CMPU R10,R11 |
| BGT greater |
| BLT less |
| PCALIGN $16 |
| skip: |
| BEQ CR2,equal |
| BGT CR2,greater |
| less: |
| MOVD $-1,R3 // return value if A < B |
| RET |
| equal: |
| MOVD $0, R3 // return value if A == B |
| RET |