// src/internal/bytealg/count_s390x.s - go - Git at Google

 // Copyright 2019 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 #include "go_asm.h"
 #include "textflag.h"

 // condition code masks
 //
 // Used as the mask operand of the compare-and-branch instructions (CGIJ,
 // CIJ) and of the load-on-condition instruction (LOCGR) in countbytebody<>.
 // EQ selects the "equal" outcome; NE selects every other outcome.
 #define EQ 8
 #define NE 7

 // register assignments
 //
 // R_ZERO  constant 0, source operand of LOCGR in the scalar loop
 // R_VAL   byte most recently loaded from the input (scalar loop)
 // R_TMP   scratch: address/value of the HasVX flag, then the 0/1 match
 //         result in the scalar loop
 // R_PTR   address of the next input byte(s) to examine
 // R_LEN   number of input bytes; the vector path rewrites it to
 //         (length mod 16) - 1 for use as a VLL index
 // R_CHAR  byte value being counted, zero extended
 // R_RET   address the result is stored to
 // R_ITER  number of 16-byte chunks still to process (vector path)
 // R_CNT   running match count (scalar path)
 // R_MPTR  address of countbytemask<>
 //
 // R_PTR and R_LEN are adjacent registers (R3, R4): the entry points fill
 // both with a single LMG, which loads a consecutive range of registers.
 #define R_ZERO R0
 #define R_VAL  R1
 #define R_TMP  R2
 #define R_PTR  R3
 #define R_LEN  R4
 #define R_CHAR R5
 #define R_RET  R6
 #define R_ITER R7
 #define R_CNT  R8
 #define R_MPTR R9

 // vector register assignments
 //
 // V_ZERO  all-zero vector, second operand of VSUMB/VSUMQF
 // V_CHAR  the byte being counted, replicated into all 16 byte elements
 // V_MASK  0x01 in every byte element that holds valid input
 // V_VAL   working vector: input bytes, then per-byte match flags, then sums
 // V_CNT   128-bit accumulator for the match count
 #define V_ZERO V0
 #define V_CHAR V1
 #define V_MASK V2
 #define V_VAL  V3
 #define V_CNT  V4

 // mask for trailing bytes in vector implementation
 //
 // Sixteen 0x01 bytes in read-only data. countbytebody<> loads a prefix of
 // this table with VLL, using the same length index it uses for the trailing
 // input bytes, so the resulting V_MASK holds 0x01 exactly in the positions
 // that contain real input. ANDing the compare result with it both reduces
 // 0xff to 1 and discards any match in the unloaded positions (VLL zero-fills
 // those, so they would otherwise compare equal when the byte being counted
 // is 0).
 GLOBL countbytemask<>(SB), RODATA, $16
 DATA countbytemask<>+0(SB)/8, $0x0101010101010101
 DATA countbytemask<>+8(SB)/8, $0x0101010101010101

 // func Count(b []byte, c byte) int
 //
 // Entry point for byte slices. It only marshals the arguments into the
 // input registers of countbytebody<> and branches there; countbytebody<>
 // stores the result through R_RET and returns to Count's caller.
 TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
 	MOVD  $ret+32(FP), R_RET  // R_RET  = address of the result slot
 	MOVBZ c+24(FP), R_CHAR    // R_CHAR = c, zero extended
 	MOVD  b_base+0(FP), R_PTR // R_PTR  = data pointer of b
 	MOVD  b_len+8(FP), R_LEN  // R_LEN  = len(b)
 	BR    countbytebody<>(SB)

 // func CountString(s string, c byte) int
 //
 // Entry point for strings. It only marshals the arguments into the input
 // registers of countbytebody<> and branches there; countbytebody<> stores
 // the result through R_RET and returns to CountString's caller.
 TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
 	MOVD  $ret+24(FP), R_RET  // R_RET  = address of the result slot
 	MOVBZ c+16(FP), R_CHAR    // R_CHAR = c, zero extended
 	MOVD  s_base+0(FP), R_PTR // R_PTR  = data pointer of s
 	MOVD  s_len+8(FP), R_LEN  // R_LEN  = len(s)
 	BR    countbytebody<>(SB)

 // countbytebody<> counts the bytes equal to R_CHAR among the R_LEN bytes
 // starting at R_PTR and stores the 64-bit count to the address in R_RET.
 // ·Count and ·CountString branch here after loading their arguments, so
 // the RET instructions below return to their caller.
 //
 // There are three paths:
 //   - length 0: store 0 (ret0).
 //   - vector facility present: compare 16 bytes per iteration, then handle
 //     the 1-15 trailing bytes with a length-limited load.
 //   - vector facility absent: compare one byte per iteration (novx).
 //
 // input:
 // R_PTR  = address of array of bytes
 // R_LEN  = number of bytes in array
 // R_CHAR = byte value to count (zero extended to register width)
 // R_RET  = address of return value
 TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
 	MOVD  $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
 	MOVD  $countbytemask<>(SB), R_MPTR
 	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
 	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
 	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
 	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available

 	// Start of vector code (have vector facility).
 	//
 	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
 	// vector 'load with length' (VLL). It will be in the range [-1,14].
 	// Also replicate c across a 16-byte vector and initialize V_ZERO.
 	// Note: R_LEN is only rewritten here, after the branch to novx, so the
 	// scalar path still sees the full length.
 	ANDW  $0xf, R_LEN
 	VLVGB $0, R_CHAR, V_CHAR // byte element 0 of V_CHAR = c (other elements are overwritten by VREPB below)
 	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
 	ADDW  $-1, R_LEN
 	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}

 	// Jump to loop if we have more than 15 bytes to process.
 	CGIJ $NE, R_ITER, $0, vxchunks

 	// Fewer than 16 bytes in total: R_LEN is in [0,14] here because the
 	// zero-length case was handled above.
 	//
 	// Load 1-15 bytes and corresponding mask.
 	// Note: only the low 32-bits of R_LEN are used for the index.
 	VLL R_LEN, (R_PTR), V_VAL
 	VLL R_LEN, (R_MPTR), V_MASK

 	// Compare each byte in input chunk against byte to be counted.
 	// After the compare and the AND with the mask, each byte element is
 	// either 0 (no match) or 1 (match).
 	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
 	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

 	// Sum the per-byte flags into a 128-bit integer value. V_CNT has not
 	// been initialized on this path, so the final sum is written straight
 	// into it rather than accumulated with VAQ.
 	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
 	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}

 	// Return rightmost (lowest) 64-bit part of accumulator.
 	VSTEG $1, V_CNT, (R_RET)
 	RET

 vxchunks:
 	// At least one full 16-byte chunk: every byte of a chunk is valid
 	// input, so the mask is 0x01 in all positions and needs no memory load.
 	// Load 0x01 into every byte element in the 16-byte mask vector.
 	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
 	VZERO  V_CNT      // initial uint128 count of 0

 vxloop:
 	// Load input bytes in 16-byte chunks.
 	VL (R_PTR), V_VAL

 	// Compare each byte in input chunk against byte to be counted.
 	// After the compare and the AND with the mask, each byte element is
 	// either 0 (no match) or 1 (match).
 	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
 	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

 	// Increment input string address.
 	MOVD $16(R_PTR), R_PTR

 	// Accumulate matched byte count in 128-bit integer value.
 	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
 	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
 	VAQ    V_VAL, V_CNT, V_CNT  // accumulate

 	// Repeat until all 16-byte chunks are done.
 	// (BRCTG decrements R_ITER and branches while the result is non-zero.)
 	BRCTG R_ITER, vxloop

 	// Skip to end if there are no trailing bytes.
 	// R_LEN == -1 means the length was a multiple of 16. The compare is
 	// 32-bit (CIJ), matching the 32-bit ANDW/ADDW that produced R_LEN.
 	CIJ $EQ, R_LEN, $-1, vxret

 	// Load 1-15 bytes and corresponding mask.
 	// Note: only the low 32-bits of R_LEN are used for the index.
 	// This replaces the all-ones-per-byte V_MASK used by the loop with one
 	// that covers only the trailing bytes.
 	VLL R_LEN, (R_PTR), V_VAL
 	VLL R_LEN, (R_MPTR), V_MASK

 	// Compare each byte in input chunk against byte to be counted.
 	// After the compare and the AND with the mask, each byte element is
 	// either 0 (no match) or 1 (match).
 	VCEQB V_CHAR, V_VAL, V_VAL
 	VN    V_MASK, V_VAL, V_VAL

 	// Accumulate matched byte count in 128-bit integer value.
 	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
 	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
 	VAQ    V_VAL, V_CNT, V_CNT  // accumulate

 vxret:
 	// Return rightmost (lowest) 64-bit part of accumulator.
 	VSTEG $1, V_CNT, (R_RET)
 	RET

 novx:
 	// Start of non-vector code (the vector facility not available).
 	// R_LEN still holds the full, non-zero byte count here.
 	//
 	// Initialise counter and constant zero.
 	MOVD $0, R_CNT
 	MOVD $0, R_ZERO

 loop:
 	// Read 1-byte from input and compare.
 	// R_TMP is preset to 1 and conditionally replaced by 0, so the value
 	// added to the count is 1 on a match and 0 otherwise.
 	// Note: avoid putting LOCGR in critical path.
 	MOVBZ (R_PTR), R_VAL
 	MOVD  $1, R_TMP
 	MOVD  $1(R_PTR), R_PTR
 	CMPW  R_VAL, R_CHAR
 	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
 	ADD   R_TMP, R_CNT       // accumulate 64-bit result

 	// Repeat until all bytes have been checked.
 	// (BRCTG decrements R_LEN and branches while the result is non-zero.)
 	BRCTG R_LEN, loop

 ret:
 	// Reached by falling through from the scalar loop; no branch in this
 	// function targets this label.
 	MOVD R_CNT, (R_RET)
 	RET

 ret0:
 	// Zero-length input: the count is 0.
 	MOVD $0, (R_RET)
 	RET
	// Copyright 2019 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	#include "go_asm.h"
	#include "textflag.h"

	// condition code masks
	//
	// Used as the mask operand of the compare-and-branch instructions (CGIJ,
	// CIJ) and of the load-on-condition instruction (LOCGR) in countbytebody<>.
	// EQ selects the "equal" outcome; NE selects every other outcome.
	#define EQ 8
	#define NE 7

	// register assignments
	//
	// R_ZERO  constant 0, source operand of LOCGR in the scalar loop
	// R_VAL   byte most recently loaded from the input (scalar loop)
	// R_TMP   scratch: address/value of the HasVX flag, then the 0/1 match
	//         result in the scalar loop
	// R_PTR   address of the next input byte(s) to examine
	// R_LEN   number of input bytes; the vector path rewrites it to
	//         (length mod 16) - 1 for use as a VLL index
	// R_CHAR  byte value being counted, zero extended
	// R_RET   address the result is stored to
	// R_ITER  number of 16-byte chunks still to process (vector path)
	// R_CNT   running match count (scalar path)
	// R_MPTR  address of countbytemask<>
	//
	// R_PTR and R_LEN are adjacent registers (R3, R4): the entry points fill
	// both with a single LMG, which loads a consecutive range of registers.
	#define R_ZERO R0
	#define R_VAL R1
	#define R_TMP R2
	#define R_PTR R3
	#define R_LEN R4
	#define R_CHAR R5
	#define R_RET R6
	#define R_ITER R7
	#define R_CNT R8
	#define R_MPTR R9

	// vector register assignments
	//
	// V_ZERO  all-zero vector, second operand of VSUMB/VSUMQF
	// V_CHAR  the byte being counted, replicated into all 16 byte elements
	// V_MASK  0x01 in every byte element that holds valid input
	// V_VAL   working vector: input bytes, then per-byte match flags, then sums
	// V_CNT   128-bit accumulator for the match count
	#define V_ZERO V0
	#define V_CHAR V1
	#define V_MASK V2
	#define V_VAL V3
	#define V_CNT V4

	// mask for trailing bytes in vector implementation
	//
	// Sixteen 0x01 bytes in read-only data. countbytebody<> loads a prefix of
	// this table with VLL, using the same length index it uses for the trailing
	// input bytes, so the resulting V_MASK holds 0x01 exactly in the positions
	// that contain real input. ANDing the compare result with it both reduces
	// 0xff to 1 and discards any match in the unloaded positions (VLL zero-fills
	// those, so they would otherwise compare equal when the byte being counted
	// is 0).
	GLOBL countbytemask<>(SB), RODATA, $16
	DATA countbytemask<>+0(SB)/8, $0x0101010101010101
	DATA countbytemask<>+8(SB)/8, $0x0101010101010101

	// func Count(b []byte, c byte) int
	//
	// Entry point for byte slices. It loads the arguments into the input
	// registers of countbytebody<> and branches there; countbytebody<> stores
	// the result through R_RET and returns to Count's caller.
	//
	// The flag expression must be written NOSPLIT|NOFRAME: a backslash before
	// the '|' is not valid in an assembler expression.
	TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
	LMG b+0(FP), R_PTR, R_LEN // R_PTR = data pointer of b, R_LEN = len(b)
	MOVBZ c+24(FP), R_CHAR // R_CHAR = c, zero extended
	MOVD $ret+32(FP), R_RET // R_RET = address (not value) of the result slot
	BR countbytebody<>(SB) // branch, not call: control does not come back here

	// func CountString(s string, c byte) int
	//
	// Entry point for strings. It loads the arguments into the input
	// registers of countbytebody<> and branches there; countbytebody<> stores
	// the result through R_RET and returns to CountString's caller.
	//
	// The flag expression must be written NOSPLIT|NOFRAME: a backslash before
	// the '|' is not valid in an assembler expression.
	TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
	LMG s+0(FP), R_PTR, R_LEN // R_PTR = data pointer of s, R_LEN = len(s)
	MOVBZ c+16(FP), R_CHAR // R_CHAR = c, zero extended
	MOVD $ret+24(FP), R_RET // R_RET = address (not value) of the result slot
	BR countbytebody<>(SB) // branch, not call: control does not come back here

	// countbytebody<> counts the bytes equal to R_CHAR among the R_LEN bytes
	// starting at R_PTR and stores the 64-bit count to the address in R_RET.
	// ·Count and ·CountString branch here after loading their arguments, so
	// the RET instructions below return to their caller.
	//
	// There are three paths:
	//   - length 0: store 0 (ret0).
	//   - vector facility present: compare 16 bytes per iteration, then handle
	//     the 1-15 trailing bytes with a length-limited load.
	//   - vector facility absent: compare one byte per iteration (novx).
	//
	// input:
	// R_PTR = address of array of bytes
	// R_LEN = number of bytes in array
	// R_CHAR = byte value to count (zero extended to register width)
	// R_RET = address of return value
	TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
	MOVD $countbytemask<>(SB), R_MPTR
	CGIJ $EQ, R_LEN, $0, ret0 // return if length is 0.
	SRD $4, R_LEN, R_ITER // R_ITER is the number of 16-byte chunks
	MOVBZ (R_TMP), R_TMP // load bool indicating support for vector facility
	CGIJ $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available

	// Start of vector code (have vector facility).
	//
	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
	// vector 'load with length' (VLL). It will be in the range [-1,14].
	// Also replicate c across a 16-byte vector and initialize V_ZERO.
	// Note: R_LEN is only rewritten here, after the branch to novx, so the
	// scalar path still sees the full length.
	ANDW $0xf, R_LEN
	VLVGB $0, R_CHAR, V_CHAR // byte element 0 of V_CHAR = c (other elements are overwritten by VREPB below)
	VZERO V_ZERO // V_ZERO = [1]uint128{0}
	ADDW $-1, R_LEN
	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}

	// Jump to loop if we have more than 15 bytes to process.
	CGIJ $NE, R_ITER, $0, vxchunks

	// Fewer than 16 bytes in total: R_LEN is in [0,14] here because the
	// zero-length case was handled above.
	//
	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	VLL R_LEN, (R_PTR), V_VAL
	VLL R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// After the compare and the AND with the mask, each byte element is
	// either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Sum the per-byte flags into a 128-bit integer value. V_CNT has not
	// been initialized on this path, so the final sum is written straight
	// into it rather than accumulated with VAQ.
	VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}

	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG $1, V_CNT, (R_RET)
	RET

	vxchunks:
	// At least one full 16-byte chunk: every byte of a chunk is valid
	// input, so the mask is 0x01 in all positions and needs no memory load.
	// Load 0x01 into every byte element in the 16-byte mask vector.
	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
	VZERO V_CNT // initial uint128 count of 0

	vxloop:
	// Load input bytes in 16-byte chunks.
	VL (R_PTR), V_VAL

	// Compare each byte in input chunk against byte to be counted.
	// After the compare and the AND with the mask, each byte element is
	// either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Increment input string address.
	MOVD $16(R_PTR), R_PTR

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
	VAQ V_VAL, V_CNT, V_CNT // accumulate

	// Repeat until all 16-byte chunks are done.
	// (BRCTG decrements R_ITER and branches while the result is non-zero.)
	BRCTG R_ITER, vxloop

	// Skip to end if there are no trailing bytes.
	// R_LEN == -1 means the length was a multiple of 16. The compare is
	// 32-bit (CIJ), matching the 32-bit ANDW/ADDW that produced R_LEN.
	CIJ $EQ, R_LEN, $-1, vxret

	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	// This replaces the all-ones-per-byte V_MASK used by the loop with one
	// that covers only the trailing bytes.
	VLL R_LEN, (R_PTR), V_VAL
	VLL R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// After the compare and the AND with the mask, each byte element is
	// either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL
	VN V_MASK, V_VAL, V_VAL

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
	VAQ V_VAL, V_CNT, V_CNT // accumulate

	vxret:
	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG $1, V_CNT, (R_RET)
	RET

	novx:
	// Start of non-vector code (the vector facility not available).
	// R_LEN still holds the full, non-zero byte count here.
	//
	// Initialise counter and constant zero.
	MOVD $0, R_CNT
	MOVD $0, R_ZERO

	loop:
	// Read 1-byte from input and compare.
	// R_TMP is preset to 1 and conditionally replaced by 0, so the value
	// added to the count is 1 on a match and 0 otherwise.
	// Note: avoid putting LOCGR in critical path.
	MOVBZ (R_PTR), R_VAL
	MOVD $1, R_TMP
	MOVD $1(R_PTR), R_PTR
	CMPW R_VAL, R_CHAR
	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
	ADD R_TMP, R_CNT // accumulate 64-bit result

	// Repeat until all bytes have been checked.
	// (BRCTG decrements R_LEN and branches while the result is non-zero.)
	BRCTG R_LEN, loop

	ret:
	// Reached by falling through from the scalar loop; no branch in this
	// function targets this label.
	MOVD R_CNT, (R_RET)
	RET

	ret0:
	// Zero-length input: the count is 0.
	MOVD $0, (R_RET)
	RET