|  | // Copyright 2019 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | #include "go_asm.h" | 
|  | #include "textflag.h" | 
|  |  | 
|  | // condition code masks | 
|  | // | 
|  | // These are passed as the first operand of the compare-and-jump (CGIJ, CIJ) | 
|  | // and load-on-condition (LOCGR) instructions in this file to select which | 
|  | // comparison outcome triggers the branch or load: EQ selects "equal" and | 
|  | // NE selects every other outcome. | 
|  | #define EQ 8 | 
|  | #define NE 7 | 
|  |  | 
|  | // register assignments | 
|  | // | 
|  | // R_ZERO: constant 0 used by the scalar loop's conditional load | 
|  | // R_VAL:  input byte being compared in the scalar loop | 
|  | // R_TMP:  address, then value, of the vector-facility flag; later the | 
|  | //         per-byte 0/1 increment in the scalar loop | 
|  | // R_PTR:  address of the next input byte(s) to read | 
|  | // R_LEN:  number of input bytes; the vector path reduces it to | 
|  | //         (length mod 16) - 1 | 
|  | // R_CHAR: byte value being counted | 
|  | // R_RET:  address the result is stored to | 
|  | // R_ITER: number of 16-byte chunks left to process | 
|  | // R_CNT:  running count in the scalar loop | 
|  | // R_MPTR: address of countbytemask<> | 
|  | #define R_ZERO R0 | 
|  | #define R_VAL  R1 | 
|  | #define R_TMP  R2 | 
|  | #define R_PTR  R3 | 
|  | #define R_LEN  R4 | 
|  | #define R_CHAR R5 | 
|  | #define R_RET  R6 | 
|  | #define R_ITER R7 | 
|  | #define R_CNT  R8 | 
|  | #define R_MPTR R9 | 
|  |  | 
|  | // vector register assignments | 
|  | // | 
|  | // V_ZERO: all-zero vector, the second operand of the VSUMB/VSUMQF sums | 
|  | // V_CHAR: byte being counted, replicated into all 16 byte elements | 
|  | // V_MASK: 0x01 in each byte element that may contribute to the count | 
|  | // V_VAL:  input chunk, then the intermediate match and sum values | 
|  | // V_CNT:  128-bit running count | 
|  | #define V_ZERO V0 | 
|  | #define V_CHAR V1 | 
|  | #define V_MASK V2 | 
|  | #define V_VAL  V3 | 
|  | #define V_CNT  V4 | 
|  |  | 
|  | // mask for trailing bytes in vector implementation | 
|  | // | 
|  | // Sixteen 0x01 bytes. The vector code loads this with VLL using the same | 
|  | // length index as the trailing input bytes, giving a V_MASK that keeps the | 
|  | // compare result only for the bytes that were actually loaded (this relies | 
|  | // on VLL clearing the byte elements it does not load). | 
|  | GLOBL countbytemask<>(SB), RODATA, $16 | 
|  | DATA countbytemask<>+0(SB)/8, $0x0101010101010101 | 
|  | DATA countbytemask<>+8(SB)/8, $0x0101010101010101 | 
|  |  | 
|  | // func Count(b []byte, c byte) int | 
|  | // | 
|  | // Count is a thin entry point: it places the address of the result slot, | 
|  | // the zero-extended byte c, and the data pointer and length of b into the | 
|  | // registers that countbytebody<> takes as input, then branches there. | 
|  | TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40 | 
|  | MOVD  $ret+32(FP), R_RET    // R_RET = address of the int result | 
|  | MOVBZ c+24(FP), R_CHAR      // R_CHAR = c, zero-extended | 
|  | LMG   b+0(FP), R_PTR, R_LEN // R_PTR, R_LEN = data pointer and length of b | 
|  | BR    countbytebody<>(SB)   // branch, not a call: the body's RET returns to our caller | 
|  |  | 
|  | // func CountString(s string, c byte) int | 
|  | // | 
|  | // CountString is the string counterpart of Count: it places the address of | 
|  | // the result slot, the zero-extended byte c, and the data pointer and length | 
|  | // of s into the registers that countbytebody<> takes as input, then branches | 
|  | // there. | 
|  | TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32 | 
|  | MOVD  $ret+24(FP), R_RET    // R_RET = address of the int result | 
|  | MOVBZ c+16(FP), R_CHAR      // R_CHAR = c, zero-extended | 
|  | LMG   s+0(FP), R_PTR, R_LEN // R_PTR, R_LEN = data pointer and length of s | 
|  | BR    countbytebody<>(SB)   // branch, not a call: the body's RET returns to our caller | 
|  |  | 
|  | // countbytebody<> counts the bytes equal to R_CHAR in the R_LEN bytes at | 
|  | // R_PTR and stores the count as a 64-bit value at the address in R_RET. | 
|  | // Count and CountString reach it with BR (a branch, not a call), so the | 
|  | // RETs below return to their caller. | 
|  | // | 
|  | // input: | 
|  | // R_PTR  = address of array of bytes | 
|  | // R_LEN  = number of bytes in array | 
|  | // R_CHAR = byte value to count (zero-extended to register width) | 
|  | // R_RET  = address of return value | 
|  | // | 
|  | // Besides storing the result, it may overwrite R_TMP, R_MPTR, R_ITER, R_LEN, | 
|  | // R_PTR, R_ZERO, R_VAL, R_CNT and the vector registers V_ZERO through V_CNT. | 
|  | TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0 | 
|  | MOVD  $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP | 
|  | MOVD  $countbytemask<>(SB), R_MPTR | 
|  | CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0. | 
|  | SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks | 
|  | MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility | 
|  | CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available | 
|  |  | 
|  | // Start of vector code (have vector facility). | 
|  | // | 
|  | // Set R_LEN to be the length mod 16 minus 1 to use as an index for | 
|  | // vector 'load with length' (VLL). It will be in the range [-1,14], | 
|  | // where -1 means the length is a multiple of 16 (no trailing bytes). | 
|  | // Also replicate c across a 16-byte vector and initialize V_ZERO. | 
|  | ANDW  $0xf, R_LEN | 
|  | VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0} | 
|  | VZERO V_ZERO             // V_ZERO = [1]uint128{0} | 
|  | ADDW  $-1, R_LEN | 
|  | VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c} | 
|  |  | 
|  | // Jump to loop if we have more than 15 bytes to process. | 
|  | CGIJ $NE, R_ITER, $0, vxchunks | 
|  |  | 
|  | // Fewer than 16 bytes in total: load 1-15 bytes and corresponding mask. | 
|  | // R_LEN is in [0,14] here because the length is non-zero. | 
|  | // Note: only the low 32-bits of R_LEN are used for the index. | 
|  | VLL R_LEN, (R_PTR), V_VAL | 
|  | VLL R_LEN, (R_MPTR), V_MASK | 
|  |  | 
|  | // Compare each byte in input chunk against byte to be counted. | 
|  | // Each byte element will be set to either 0 (no match) or 1 (match). | 
|  | VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00 | 
|  | VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits | 
|  |  | 
|  | // Sum the matches into a 128-bit integer value; this is the whole count, | 
|  | // so it is written straight to V_CNT. | 
|  | VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15} | 
|  | VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3} | 
|  |  | 
|  | // Return rightmost (lowest) 64-bit part of accumulator. | 
|  | VSTEG $1, V_CNT, (R_RET) | 
|  | RET | 
|  |  | 
|  | vxchunks: | 
|  | // Load 0x01 into every byte element in the 16-byte mask vector. | 
|  | VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1} | 
|  | VZERO  V_CNT      // initial uint128 count of 0 | 
|  |  | 
|  | vxloop: | 
|  | // Load input bytes in 16-byte chunks. | 
|  | VL (R_PTR), V_VAL | 
|  |  | 
|  | // Compare each byte in input chunk against byte to be counted. | 
|  | // Each byte element will be set to either 0 (no match) or 1 (match). | 
|  | VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00 | 
|  | VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits | 
|  |  | 
|  | // Increment input string address. | 
|  | MOVD $16(R_PTR), R_PTR | 
|  |  | 
|  | // Accumulate matched byte count in 128-bit integer value. | 
|  | VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15} | 
|  | VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3} | 
|  | VAQ    V_VAL, V_CNT, V_CNT  // accumulate | 
|  |  | 
|  | // Repeat until all 16-byte chunks are done (R_ITER counts down to 0). | 
|  | BRCTG R_ITER, vxloop | 
|  |  | 
|  | // Skip to end if there are no trailing bytes (R_LEN is -1 in that case). | 
|  | CIJ $EQ, R_LEN, $-1, vxret | 
|  |  | 
|  | // Load 1-15 bytes and corresponding mask. This replaces the all-ones | 
|  | // V_MASK used by the loop with one that covers only the trailing bytes. | 
|  | // Note: only the low 32-bits of R_LEN are used for the index. | 
|  | VLL R_LEN, (R_PTR), V_VAL | 
|  | VLL R_LEN, (R_MPTR), V_MASK | 
|  |  | 
|  | // Compare each byte in input chunk against byte to be counted. | 
|  | // The compare yields 0xff or 0x00 per byte; the AND with V_MASK then | 
|  | // leaves each byte element as either 0 (no match) or 1 (match). | 
|  | VCEQB V_CHAR, V_VAL, V_VAL | 
|  | VN    V_MASK, V_VAL, V_VAL | 
|  |  | 
|  | // Accumulate matched byte count in 128-bit integer value. | 
|  | VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15} | 
|  | VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3} | 
|  | VAQ    V_VAL, V_CNT, V_CNT  // accumulate | 
|  |  | 
|  | vxret: | 
|  | // Return rightmost (lowest) 64-bit part of accumulator. | 
|  | VSTEG $1, V_CNT, (R_RET) | 
|  | RET | 
|  |  | 
|  | novx: | 
|  | // Start of non-vector code (the vector facility not available). | 
|  | // R_LEN still holds the full, non-zero byte count here: the branch to | 
|  | // novx is taken before the vector path modifies it. | 
|  | // | 
|  | // Initialise counter and constant zero. | 
|  | MOVD $0, R_CNT | 
|  | MOVD $0, R_ZERO | 
|  |  | 
|  | loop: | 
|  | // Read 1-byte from input and compare. | 
|  | // Note: avoid putting LOCGR in critical path. | 
|  | MOVBZ (R_PTR), R_VAL | 
|  | MOVD  $1, R_TMP | 
|  | MOVD  $1(R_PTR), R_PTR | 
|  | CMPW  R_VAL, R_CHAR | 
|  | LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match) | 
|  | ADD   R_TMP, R_CNT       // accumulate 64-bit result | 
|  |  | 
|  | // Repeat until all bytes have been checked (R_LEN counts down to 0). | 
|  | BRCTG R_LEN, loop | 
|  |  | 
|  | ret: | 
|  | // Store the scalar count; reached by falling through from the loop. | 
|  | MOVD R_CNT, (R_RET) | 
|  | RET | 
|  |  | 
|  | ret0: | 
|  | // Empty input: the count is 0. | 
|  | MOVD $0, (R_RET) | 
|  | RET | 