// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum. The original
// is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.
// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.
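
// This file implements two kernels: ppc64SlicingUpdateBy8, a scalar
// slicing-by-8 table-driven CRC update, and vectorCrc32, a vectorized
// implementation built on the POWER8 VPMSUMD (carry-less multiply)
// instruction.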
#include "textflag.h"
#define POWER8_OFFSET 132
#define off16 R16
#define off32 R17
#define off48 R18
#define off64 R19
#define off80 R20
#define off96 R21
#define off112 R22
#define const1 V24
#define const2 V25
#define byteswap V26
#define mask_32bit V27
#define mask_64bit V28
#define zeroes V29
#define MAX_SIZE 32*1024
#define REFLECT
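
// func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
// (declaration inferred from the argument offsets used below)
//
// As a rough, illustrative Go sketch of the slicing-by-8 update performed by
// the loop that follows (the assembly complements crc on entry and exit):
//
//	for len(p) >= 8 {
//		crc ^= le32(p[0:4])
//		crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
//			tab[4][crc>>24] ^ tab[5][crc>>16&0xff] ^
//			tab[6][crc>>8&0xff] ^ tab[7][crc&0xff]
//		p = p[8:]
//	}
//
// where tab is the [8][256]uint32 table passed in table8 and le32 is a
// little endian 32 bit load.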
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
MOVWZ crc+0(FP), R3 // incoming crc
MOVD table8+8(FP), R4 // *Table
MOVD p+16(FP), R5
MOVD p_len+24(FP), R6 // p len
CMP $0,R6 // len == 0?
BNE start
MOVW R3,ret+40(FP) // return crc
RET
start:
NOR R3,R3,R7 // ^crc
MOVWZ R7,R7 // 32 bits
	CMP	R6,$16		// lengths below 16 are done a byte at a time
	MOVD	R6,CTR
	BLT	short
SRAD $3,R6,R8 // 8 byte chunks
MOVD R8,CTR
loop:
	MOVWZ	0(R5),R8	// bytes 0-3 of p (little endian)
MOVWZ 4(R5),R9 // 4-7 bytes of p
MOVD R4,R10 // &tab[0]
XOR R7,R8,R7 // crc ^= byte[0:3]
RLDICL $40,R9,$56,R17 // p[7]
SLD $2,R17,R17 // p[7]*4
RLDICL $40,R7,$56,R8 // crc>>24
ADD R17,R10,R17 // &tab[0][p[7]]
SLD $2,R8,R8 // crc>>24*4
RLDICL $48,R9,$56,R18 // p[6]
SLD $2,R18,R18 // p[6]*4
ADD $1024,R10,R10 // tab[1]
MOVWZ 0(R17),R21 // tab[0][p[7]]
RLDICL $56,R9,$56,R19 // p[5]
ADD R10,R18,R18 // &tab[1][p[6]]
	SLD	$2,R19,R19	// p[5]*4
MOVWZ 0(R18),R22 // tab[1][p[6]]
ADD $1024,R10,R10 // tab[2]
XOR R21,R22,R21 // xor done R22
ADD R19,R10,R19 // &tab[2][p[5]]
	ANDCC	$255,R9,R20	// p[4]
SLD $2,R20,R20 // p[4]*4
MOVWZ 0(R19),R23 // tab[2][p[5]]
ADD $1024,R10,R10 // &tab[3]
	ADD	R20,R10,R20	// &tab[3][p[4]]
XOR R21,R23,R21 // xor done R23
ADD $1024,R10,R10 // &tab[4]
MOVWZ 0(R20),R24 // tab[3][p[4]]
ADD R10,R8,R23 // &tab[4][crc>>24]
XOR R21,R24,R21 // xor done R24
MOVWZ 0(R23),R25 // tab[4][crc>>24]
RLDICL $48,R7,$56,R24 // crc>>16&0xFF
XOR R21,R25,R21 // xor done R25
ADD $1024,R10,R10 // &tab[5]
SLD $2,R24,R24 // crc>>16&0xFF*4
ADD R24,R10,R24 // &tab[5][crc>>16&0xFF]
MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF]
XOR R21,R26,R21 // xor done R26
RLDICL $56,R7,$56,R25 // crc>>8
ADD $1024,R10,R10 // &tab[6]
	SLD	$2,R25,R25	// crc>>8&0xFF*4
ADD R25,R10,R25 // &tab[6][crc>>8&0xFF]
MOVBZ R7,R26 // crc&0xFF
ADD $1024,R10,R10 // &tab[7]
MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF]
	SLD	$2,R26,R26	// crc&0xFF*4
XOR R21,R27,R21 // xor done R27
ADD R26,R10,R26 // &tab[7][crc&0xFF]
ADD $8,R5 // p = p[8:]
MOVWZ 0(R26),R28 // tab[7][crc&0xFF]
XOR R21,R28,R21 // xor done R28
MOVWZ R21,R7 // crc for next round
BC 16,0,loop // next 8 bytes
ANDCC $7,R6,R8 // any leftover bytes
BEQ done // none --> done
MOVD R8,CTR // byte count
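	// Process the remaining bytes one at a time; each step is equivalent
	// to crc = tab[0][byte(crc)^p[0]] ^ (crc >> 8), with crc already
	// complemented.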
short:
MOVBZ 0(R5),R8 // get v
	MOVBZ	R7,R9		// byte(crc) -> R9
MOVWZ R7,R14
SRD $8,R14,R14 // crc>>8
XOR R8,R9,R8 // byte(crc)^v -> R8
ADD $1,R5 // ptr to next v
SLD $2,R8 // convert index-> bytes
ADD R8,R4,R9 // &tab[byte(crc)^v]
MOVWZ 0(R9),R10 // tab[byte(crc)^v]
XOR R10,R14,R7 // loop crc in R7
MOVWZ R7,R7 // 32 bits
	BC	16,0,short	// bdnz: decrement CTR, loop while not zero
done:
NOR R7,R7,R7 // ^crc
MOVW R7,ret+40(FP) // return crc
RET
#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908
GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif
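
// func vectorCrc32(crc uint32, ctab uint32, p []byte) uint32
// (declaration inferred from the argument offsets used below)
//
// Vectorized CRC following the crc32-vpmsum approach: fold the buffer 128
// bytes at a time into eight vector accumulators with VPMSUMD and
// per-distance constants, combine the accumulators, then reduce to 32 bits
// with a Barrett reduction. ctab selects the constant tables: 1 selects the
// IEEE polynomial constants, any other value the Castagnoli constants.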
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
MOVWZ crc+0(FP), R3 // incoming crc
MOVWZ ctab+4(FP), R14 // crc poly id
MOVD p+8(FP), R4
MOVD p_len+16(FP), R5 // p len
	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes
// defines for index loads
MOVD $16,off16
MOVD $32,off32
MOVD $48,off48
MOVD $64,off64
MOVD $80,off80
MOVD $96,off96
MOVD $112,off112
MOVD $0,R15
MOVD R3,R10 // save initial crc
NOR R3,R3,R3 // ^crc
MOVWZ R3,R3 // 32 bits
VXOR zeroes,zeroes,zeroes // clear the V reg
	VSPLTISW $-1,V0			// all 1s
	VSLDOI	$4,V29,V0,mask_32bit	// mask of the low 32 bits
	VSLDOI	$8,V29,V0,mask_64bit	// mask of the low 64 bits
VXOR V8,V8,V8
MTVSRD R3,VS40 // crc initial value VS40 = V8
#ifdef REFLECT
VSLDOI $8,zeroes,V8,V8 // or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
VSLDOI $4,V8,zeroes,V8
#endif
#ifdef BYTESWAP_DATA
MOVD $·byteswapcons(SB),R3
LVX (R3),byteswap
#endif
	CMPU	R5,$256		// lengths below 256 bytes take the short path
	BLT	short
	RLDICR	$0,R5,$56,R6	// R6 = len rounded down to a multiple of 128
// First step for larger sizes
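	// Large buffer path: the input is consumed in chunks of at most
	// MAX_SIZE (32KB). Each chunk is folded 128 bytes per iteration into
	// the accumulators V0-V7, with two warm up passes before the main
	// loop and two cool down passes after it to fill and drain the
	// software pipeline.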
l1:	MOVD	$32768,R7	// MAX_SIZE
	MOVD	R7,R9
	CMP	R6,R7		// compare remaining len (R6) with MAX_SIZE (R7)
	BGT	top		// more than MAX_SIZE remains, process a full chunk
	MOVD	R6,R7		// otherwise process only the remaining R6 bytes
top:
SUB R7,R6,R6
	// mainloop does 128 bytes at a time
	SRD	$7,R7		// R7 = number of 128 byte blocks in this chunk
	// Determine the offset into the constants table to start with:
	// each 16 byte constant is used against 128 bytes of data, so the
	// starting offset is (MAX_SIZE/8) minus 16 bytes per block.
	SLD	$4,R7,R8
	SRD	$3,R9,R9
	SUB	R8,R9,R8
// The last iteration is reduced in a separate step
ADD $-1,R7
MOVD R7,CTR
// Determine which constant table (depends on poly)
CMP R14,$1
BNE castTable
MOVD $·IEEEConst(SB),R3
BR startConst
castTable:
MOVD $·CastConst(SB),R3
startConst:
ADD R3,R8,R3 // starting point in constants table
VXOR V0,V0,V0 // clear the V regs
VXOR V1,V1,V1
VXOR V2,V2,V2
VXOR V3,V3,V3
VXOR V4,V4,V4
VXOR V5,V5,V5
VXOR V6,V6,V6
VXOR V7,V7,V7
LVX (R3),const1 // loading constant values
CMP R15,$1 // Identify warm up pass
BEQ next
// First warm up pass: load the bytes to process
LVX (R4),V16
LVX (R4+off16),V17
LVX (R4+off32),V18
LVX (R4+off48),V19
LVX (R4+off64),V20
LVX (R4+off80),V21
LVX (R4+off96),V22
LVX (R4+off112),V23
ADD $128,R4 // bump up to next 128 bytes in buffer
VXOR V16,V8,V16 // xor in initial CRC in V8
next:
	BC	18,0,first_warm_up_done	// bdz: branch once CTR reaches 0
ADD $16,R3 // bump up to next constants
LVX (R3),const2 // table values
VPMSUMD V16,const1,V8 // second warm up pass
LVX (R4),V16 // load from buffer
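	// The OR $0,R2,R2 instructions here and below are no-ops (R2 is
	// unchanged); they are carried over from the original crc32-vpmsum
	// code, where they appear to pad instruction dispatch groups for the
	// POWER8 pipeline.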
OR $0,R2,R2
VPMSUMD V17,const1,V9 // vpmsumd with constants
LVX (R4+off16),V17 // load next from buffer
OR $0,R2,R2
VPMSUMD V18,const1,V10 // vpmsumd with constants
LVX (R4+off32),V18 // load next from buffer
OR $0,R2,R2
VPMSUMD V19,const1,V11 // vpmsumd with constants
LVX (R4+off48),V19 // load next from buffer
OR $0,R2,R2
VPMSUMD V20,const1,V12 // vpmsumd with constants
LVX (R4+off64),V20 // load next from buffer
OR $0,R2,R2
VPMSUMD V21,const1,V13 // vpmsumd with constants
LVX (R4+off80),V21 // load next from buffer
OR $0,R2,R2
VPMSUMD V22,const1,V14 // vpmsumd with constants
LVX (R4+off96),V22 // load next from buffer
OR $0,R2,R2
VPMSUMD V23,const1,V15 // vpmsumd with constants
LVX (R4+off112),V23 // load next from buffer
ADD $128,R4 // bump up to next 128 bytes in buffer
BC 18,0,first_cool_down
cool_top:
LVX (R3),const1 // constants
ADD $16,R3 // inc to next constants
OR $0,R2,R2
VXOR V0,V8,V0 // xor in previous vpmsumd
VPMSUMD V16,const2,V8 // vpmsumd with constants
LVX (R4),V16 // buffer
OR $0,R2,R2
VXOR V1,V9,V1 // xor in previous
VPMSUMD V17,const2,V9 // vpmsumd with constants
LVX (R4+off16),V17 // next in buffer
OR $0,R2,R2
VXOR V2,V10,V2 // xor in previous
VPMSUMD V18,const2,V10 // vpmsumd with constants
LVX (R4+off32),V18 // next in buffer
OR $0,R2,R2
VXOR V3,V11,V3 // xor in previous
VPMSUMD V19,const2,V11 // vpmsumd with constants
LVX (R4+off48),V19 // next in buffer
LVX (R3),const2 // get next constant
OR $0,R2,R2
VXOR V4,V12,V4 // xor in previous
VPMSUMD V20,const1,V12 // vpmsumd with constants
LVX (R4+off64),V20 // next in buffer
OR $0,R2,R2
VXOR V5,V13,V5 // xor in previous
VPMSUMD V21,const1,V13 // vpmsumd with constants
LVX (R4+off80),V21 // next in buffer
OR $0,R2,R2
VXOR V6,V14,V6 // xor in previous
VPMSUMD V22,const1,V14 // vpmsumd with constants
LVX (R4+off96),V22 // next in buffer
OR $0,R2,R2
VXOR V7,V15,V7 // xor in previous
VPMSUMD V23,const1,V15 // vpmsumd with constants
LVX (R4+off112),V23 // next in buffer
ADD $128,R4 // bump up buffer pointer
BC 16,0,cool_top // are we done?
first_cool_down:
// load the constants
// xor in the previous value
// vpmsumd the result with constants
LVX (R3),const1
ADD $16,R3
VXOR V0,V8,V0
VPMSUMD V16,const1,V8
OR $0,R2,R2
VXOR V1,V9,V1
VPMSUMD V17,const1,V9
OR $0,R2,R2
VXOR V2,V10,V2
VPMSUMD V18,const1,V10
OR $0,R2,R2
VXOR V3,V11,V3
VPMSUMD V19,const1,V11
OR $0,R2,R2
VXOR V4,V12,V4
VPMSUMD V20,const1,V12
OR $0,R2,R2
VXOR V5,V13,V5
VPMSUMD V21,const1,V13
OR $0,R2,R2
VXOR V6,V14,V6
VPMSUMD V22,const1,V14
OR $0,R2,R2
VXOR V7,V15,V7
VPMSUMD V23,const1,V15
OR $0,R2,R2
second_cool_down:
VXOR V0,V8,V0
VXOR V1,V9,V1
VXOR V2,V10,V2
VXOR V3,V11,V3
VXOR V4,V12,V4
VXOR V5,V13,V5
VXOR V6,V14,V6
VXOR V7,V15,V7
#ifdef REFLECT
VSLDOI $4,V0,zeroes,V0
VSLDOI $4,V1,zeroes,V1
VSLDOI $4,V2,zeroes,V2
VSLDOI $4,V3,zeroes,V3
VSLDOI $4,V4,zeroes,V4
VSLDOI $4,V5,zeroes,V5
VSLDOI $4,V6,zeroes,V6
VSLDOI $4,V7,zeroes,V7
#endif
LVX (R4),V8
LVX (R4+off16),V9
LVX (R4+off32),V10
LVX (R4+off48),V11
LVX (R4+off64),V12
LVX (R4+off80),V13
LVX (R4+off96),V14
LVX (R4+off112),V15
ADD $128,R4
VXOR V0,V8,V16
VXOR V1,V9,V17
VXOR V2,V10,V18
VXOR V3,V11,V19
VXOR V4,V12,V20
VXOR V5,V13,V21
VXOR V6,V14,V22
VXOR V7,V15,V23
	MOVD	$1,R15		// mark the first warm up pass as done
	CMP	$0,R6		// more chunks to process?
ADD $128,R6
BNE l1
	ANDCC	$127,R5		// remaining bytes (len mod 128)
	SUBC	R5,$128,R6	// R6 = 128 - remainder
	ADD	R3,R6,R3	// adjust the constants pointer for the remainder
	SRD	$4,R5,R7	// number of 16 byte chunks in the remainder
	MOVD	R7,CTR
LVX (R3),V0
LVX (R3+off16),V1
LVX (R3+off32),V2
LVX (R3+off48),V3
LVX (R3+off64),V4
LVX (R3+off80),V5
LVX (R3+off96),V6
LVX (R3+off112),V7
ADD $128,R3
VPMSUMW V16,V0,V0
VPMSUMW V17,V1,V1
VPMSUMW V18,V2,V2
VPMSUMW V19,V3,V3
VPMSUMW V20,V4,V4
VPMSUMW V21,V5,V5
VPMSUMW V22,V6,V6
VPMSUMW V23,V7,V7
	// Now fold in the tail: multiply each remaining 16 byte chunk by its
	// constant and accumulate into V0.
CMP $0,R7
BEQ next1
LVX (R4),V16
LVX (R3),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off16),V16
LVX (R3+off16),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off32),V16
LVX (R3+off32),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off48),V16
LVX (R3+off48),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off64),V16
LVX (R3+off64),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off80),V16
LVX (R3+off80),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off96),V16
LVX (R3+off96),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
next1:
VXOR V0,V1,V0
VXOR V2,V3,V2
VXOR V4,V5,V4
VXOR V6,V7,V6
VXOR V0,V2,V0
VXOR V4,V6,V4
VXOR V0,V4,V0
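	// Barrett reduction: reduce the 64 bits left in V0 to the final 32
	// bit CRC. Roughly, const1 holds a precomputed quotient of x^64 by
	// the polynomial and const2 the polynomial itself; two VPMSUMD
	// multiplies plus an xor leave the 32 bit remainder, with the REFLECT
	// variant working on the low half of the register.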
barrett_reduction:
CMP R14,$1
BNE barcstTable
MOVD $·IEEEBarConst(SB),R3
BR startbarConst
barcstTable:
MOVD $·CastBarConst(SB),R3
startbarConst:
LVX (R3),const1
LVX (R3+off16),const2
VSLDOI $8,V0,V0,V1
VXOR V0,V1,V0
#ifdef REFLECT
VSPLTISB $1,V1
VSL V0,V1,V0
#endif
VAND V0,mask_64bit,V0
#ifndef REFLECT
VPMSUMD V0,const1,V1
VSLDOI $8,zeroes,V1,V1
VPMSUMD V1,const2,V1
VXOR V0,V1,V0
VSLDOI $8,V0,zeroes,V0
#else
VAND V0,mask_32bit,V1
VPMSUMD V1,const1,V1
VAND V1,mask_32bit,V1
VPMSUMD V1,const2,V1
VXOR V0,V1,V0
VSLDOI $4,V0,zeroes,V0
#endif
MFVSRD VS32,R3 // VS32 = V0
NOR R3,R3,R3 // return ^crc
MOVW R3,ret+32(FP)
RET
first_warm_up_done:
LVX (R3),const1
ADD $16,R3
VPMSUMD V16,const1,V8
VPMSUMD V17,const1,V9
VPMSUMD V18,const1,V10
VPMSUMD V19,const1,V11
VPMSUMD V20,const1,V12
VPMSUMD V21,const1,V13
VPMSUMD V22,const1,V14
VPMSUMD V23,const1,V15
BR second_cool_down
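	// Short length path (fewer than 256 bytes): multiply each 16 byte
	// chunk by a constant chosen by its distance from the end of the
	// buffer, accumulate the products into V19/V20, then finish with the
	// same Barrett reduction as the large buffer path.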
short:
CMP $0,R5
BEQ zero
// compute short constants
CMP R14,$1
BNE castshTable
MOVD $·IEEEConst(SB),R3
ADD $4080,R3
BR startshConst
castshTable:
MOVD $·CastConst(SB),R3
ADD $4080,R3
startshConst:
SUBC R5,$256,R6 // sub from 256
ADD R3,R6,R3
// calculate where to start
SRD $4,R5,R7
MOVD R7,CTR
VXOR V19,V19,V19
VXOR V20,V20,V20
LVX (R4),V0
LVX (R3),V16
VXOR V0,V8,V0
VPMSUMW V0,V16,V0
BC 18,0,v0
LVX (R4+off16),V1
LVX (R3+off16),V17
VPMSUMW V1,V17,V1
BC 18,0,v1
LVX (R4+off32),V2
LVX (R3+off32),V16
VPMSUMW V2,V16,V2
BC 18,0,v2
LVX (R4+off48),V3
LVX (R3+off48),V17
VPMSUMW V3,V17,V3
BC 18,0,v3
LVX (R4+off64),V4
LVX (R3+off64),V16
VPMSUMW V4,V16,V4
BC 18,0,v4
LVX (R4+off80),V5
LVX (R3+off80),V17
VPMSUMW V5,V17,V5
BC 18,0,v5
LVX (R4+off96),V6
LVX (R3+off96),V16
VPMSUMW V6,V16,V6
BC 18,0,v6
LVX (R4+off112),V7
LVX (R3+off112),V17
VPMSUMW V7,V17,V7
BC 18,0,v7
ADD $128,R3
ADD $128,R4
LVX (R4),V8
LVX (R3),V16
VPMSUMW V8,V16,V8
BC 18,0,v8
LVX (R4+off16),V9
LVX (R3+off16),V17
VPMSUMW V9,V17,V9
BC 18,0,v9
LVX (R4+off32),V10
LVX (R3+off32),V16
VPMSUMW V10,V16,V10
BC 18,0,v10
LVX (R4+off48),V11
LVX (R3+off48),V17
VPMSUMW V11,V17,V11
BC 18,0,v11
LVX (R4+off64),V12
LVX (R3+off64),V16
VPMSUMW V12,V16,V12
BC 18,0,v12
LVX (R4+off80),V13
LVX (R3+off80),V17
VPMSUMW V13,V17,V13
BC 18,0,v13
LVX (R4+off96),V14
LVX (R3+off96),V16
VPMSUMW V14,V16,V14
BC 18,0,v14
LVX (R4+off112),V15
LVX (R3+off112),V17
VPMSUMW V15,V17,V15
VXOR V19,V15,V19
v14: VXOR V20,V14,V20
v13: VXOR V19,V13,V19
v12: VXOR V20,V12,V20
v11: VXOR V19,V11,V19
v10: VXOR V20,V10,V20
v9: VXOR V19,V9,V19
v8: VXOR V20,V8,V20
v7: VXOR V19,V7,V19
v6: VXOR V20,V6,V20
v5: VXOR V19,V5,V19
v4: VXOR V20,V4,V20
v3: VXOR V19,V3,V19
v2: VXOR V20,V2,V20
v1: VXOR V19,V1,V19
v0: VXOR V20,V0,V20
VXOR V19,V20,V0
BR barrett_reduction
zero:
// This case is the original crc, so just return it
MOVW R10,ret+32(FP)
RET