// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

// func xorBytes(dst, a, b *byte, n int)
//
// XORs n bytes of a with b and writes the result to dst:
// dst[i] = a[i] ^ b[i] for 0 <= i < n.
//
// Register roles:
//   R3 = dst pointer, R4 = a pointer, R5 = b pointer
//   R6 = bytes remaining, R8 = current byte index
//   R10/R14/R15 = index+16/+32/+48 offsets for the 64-byte loop
TEXT ·xorBytes(SB), NOSPLIT, $0
	MOVD	dst+0(FP), R3	// R3 = dst
	MOVD	a+8(FP), R4	// R4 = a
	MOVD	b+16(FP), R5	// R5 = b
	MOVD	n+24(FP), R6	// R6 = n

	CMPU	R6, $64, CR7	// Check if n >= 64 bytes (result kept in CR7)
	MOVD	R0, R8		// R8 = index (R0 reads as zero here)
	CMPU	R6, $8, CR6	// Check if 8 <= n < 64 bytes (result in CR6)
	BLE	CR6, small	// n <= 8: scalar tail only
	BLT	CR7, xor32	// 8 < n < 64: skip the 64-byte loop

	// Case for n >= 64 bytes
preloop64:
	SRD	$6, R6, R7	// R7 = n / 64 = loop iteration count
	MOVD	R7, CTR		// Set up loop counter
	MOVD	$16, R10
	MOVD	$32, R14
	MOVD	$48, R15
	ANDCC	$63, R6, R9	// R9 = n % 64 = trailing bytes; CR0 records if zero
	PCALIGN	$16
	// Process 64 bytes per iteration:
	// load 4 vectors each of a and b, XOR the
	// corresponding vectors and store the result.
loop64:
	LXVD2X	(R4)(R8), VS32
	LXVD2X	(R4)(R10), VS34
	LXVD2X	(R4)(R14), VS36
	LXVD2X	(R4)(R15), VS38
	LXVD2X	(R5)(R8), VS33
	LXVD2X	(R5)(R10), VS35
	LXVD2X	(R5)(R14), VS37
	LXVD2X	(R5)(R15), VS39
	XXLXOR	VS32, VS33, VS32
	XXLXOR	VS34, VS35, VS34
	XXLXOR	VS36, VS37, VS36
	XXLXOR	VS38, VS39, VS38
	STXVD2X	VS32, (R3)(R8)
	STXVD2X	VS34, (R3)(R10)
	STXVD2X	VS36, (R3)(R14)
	STXVD2X	VS38, (R3)(R15)
	ADD	$64, R8
	ADD	$64, R10
	ADD	$64, R14
	ADD	$64, R15
	BDNZ	loop64
	BC	12, 2, LR	// BEQLR: return if ANDCC found no trailing bytes
	MOVD	R9, R6		// R6 = remaining bytes (< 64)
	CMP	R6, $8
	BLE	small
	// Case for 8 <= n < 64 bytes:
	// process 32 bytes if available.
xor32:
	CMP	R6, $32
	BLT	xor16
	ADD	$16, R8, R9	// R9 = index of the second 16-byte chunk
	LXVD2X	(R4)(R8), VS32
	LXVD2X	(R4)(R9), VS33
	LXVD2X	(R5)(R8), VS34
	LXVD2X	(R5)(R9), VS35
	XXLXOR	VS32, VS34, VS32
	XXLXOR	VS33, VS35, VS33
	STXVD2X	VS32, (R3)(R8)
	STXVD2X	VS33, (R3)(R9)
	ADD	$32, R8
	ADD	$-32, R6
	CMP	R6, $8
	BLE	small
	// Case for 8 <= n < 32 bytes:
	// process 16 bytes if available.
xor16:
	CMP	R6, $16
	BLT	xor8
	LXVD2X	(R4)(R8), VS32
	LXVD2X	(R5)(R8), VS33
	XXLXOR	VS32, VS33, VS32
	STXVD2X	VS32, (R3)(R8)
	ADD	$16, R8
	ADD	$-16, R6
small:
	CMP	R6, R0		// Done if no bytes remain
	BC	12, 2, LR	// BEQLR
xor8:
#ifdef GOPPC64_power10
	// POWER10: length-controlled vector load/store handles the
	// remaining 1-15 bytes in one shot. LXVL/STXVL expect the
	// byte count in the high-order byte of the length register.
	SLD	$56, R6, R17	// R17 = n << 56 (length in bits 0:7)
	ADD	R4, R8, R18	// R18 = &a[index]
	ADD	R5, R8, R19	// R19 = &b[index]
	ADD	R3, R8, R20	// R20 = &dst[index]
	LXVL	R18, R17, V0
	LXVL	R19, R17, V1
	VXOR	V0, V1, V1
	STXVL	V1, R20, R17
	RET
#else
	CMP	R6, $8
	BLT	xor4
	// Case for 8 <= n < 16 bytes
	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
	XOR	R14, R15, R16	// R16 = a[] ^ b[]
	SUB	$8, R6		// n = n - 8
	MOVD	R16, (R3)(R8)	// Store to dst
	ADD	$8, R8
xor4:
	CMP	R6, $4
	BLT	xor2
	MOVWZ	(R4)(R8), R14	// 4-byte chunk
	MOVWZ	(R5)(R8), R15
	XOR	R14, R15, R16
	MOVW	R16, (R3)(R8)
	ADD	$4, R8
	ADD	$-4, R6
xor2:
	CMP	R6, $2
	BLT	xor1
	MOVHZ	(R4)(R8), R14	// 2-byte chunk
	MOVHZ	(R5)(R8), R15
	XOR	R14, R15, R16
	MOVH	R16, (R3)(R8)
	ADD	$2, R8
	ADD	$-2, R6
xor1:
	CMP	R6, R0
	BC	12, 2, LR	// BEQLR: done if no final byte
	MOVBZ	(R4)(R8), R14	// R14 = a[i]
	MOVBZ	(R5)(R8), R15	// R15 = b[i]
	XOR	R14, R15, R16	// R16 = a[i] ^ b[i]
	MOVB	R16, (R3)(R8)	// Store to dst
#endif
done:
	RET