| // Copyright 2023 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| // |
| // RISCV64 version of md5block.go |
| // derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go |
| |
| //go:build !purego |
| |
| #include "textflag.h" |
| |
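// LOAD32U loads a little endian 32 bit value from (offset)(base) one
// byte at a time, so base does not need to be 4 byte aligned. The
// bytes are assembled into dest using tmp as scratch:
//
//   dest = p[offset] | p[offset+1]<<8 | p[offset+2]<<16 | p[offset+3]<<24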
| #define LOAD32U(base, offset, tmp, dest) \ |
| MOVBU (offset+0*1)(base), dest; \ |
| MOVBU (offset+1*1)(base), tmp; \ |
| SLL $8, tmp; \ |
| OR tmp, dest; \ |
| MOVBU (offset+2*1)(base), tmp; \ |
| SLL $16, tmp; \ |
| OR tmp, dest; \ |
| MOVBU (offset+3*1)(base), tmp; \ |
| SLL $24, tmp; \ |
| OR tmp, dest |
| |
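// LOAD64U loads two consecutive unaligned 32 bit values into a single
// 64 bit register: the even numbered word ends up in the low 32 bits
// of dst and the odd numbered word in the high 32 bits.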
| #define LOAD64U(base, offset, tmp1, tmp2, dst) \ |
| LOAD32U(base, offset, tmp1, dst); \ |
| LOAD32U(base, offset+4, tmp1, tmp2); \ |
| SLL $32, tmp2; \ |
| OR tmp2, dst |
| |
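// ROUND1EVN and ROUND1ODD each perform one round 1 step of MD5, using
// X23 as scratch:
//
//   a = b + rotl32(a + F(b, c, d) + x + const, shift)
//
// where F(b, c, d) = (b & c) | (^b & d), computed here as
// d ^ (b & (c ^ d)). The EVN variant uses the even numbered word in
// the low 32 bits of x; the ODD variant first shifts x right by 32 to
// get at the odd numbered word.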
| #define ROUND1EVN(a, b, c, d, x, const, shift) \ |
| MOV $const, X23; \ |
| ADDW x, a; \ |
| ADDW X23, a; \ |
| XOR c, d, X23; \ |
| AND b, X23; \ |
| XOR d, X23; \ |
| ADDW X23, a; \ |
| RORIW $(32-shift), a; \ |
| ADDW b, a |
| |
| #define ROUND1ODD(a, b, c, d, x, const, shift) \ |
| MOV $const, X23; \ |
| ADDW X23, a; \ |
| SRL $32, x, X23; \ |
| ADDW X23, a; \ |
| XOR c, d, X23; \ |
| AND b, X23; \ |
| XOR d, X23; \ |
| ADDW X23, a; \ |
| RORIW $(32-shift), a; \ |
| ADDW b, a |
| |
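// ROUND2EVN and ROUND2ODD are the round 2 equivalents, using
// G(b, c, d) = (b & d) | (c & ^d), computed here as c ^ (d & (b ^ c)).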
| #define ROUND2EVN(a, b, c, d, x, const, shift) \ |
| MOV $const, X23; \ |
| ADDW x, a; \ |
| ADDW X23, a; \ |
| XOR b, c, X23; \ |
| AND d, X23; \ |
| XOR c, X23; \ |
| ADDW X23, a; \ |
| RORIW $(32-shift), a; \ |
| ADDW b, a |
| |
| #define ROUND2ODD(a, b, c, d, x, const, shift) \ |
| MOV $const, X23; \ |
| ADDW X23, a; \ |
| SRL $32, x, X23; \ |
| ADDW X23, a; \ |
| XOR b, c, X23; \ |
| AND d, X23; \ |
| XOR c, X23; \ |
| ADDW X23, a; \ |
| RORIW $(32-shift), a; \ |
| ADDW b, a |
| |
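// ROUND3EVN and ROUND3ODD are the round 3 equivalents, using
// H(b, c, d) = b ^ c ^ d.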
| #define ROUND3EVN(a, b, c, d, x, const, shift) \ |
| MOV $const, X23; \ |
| ADDW x, a; \ |
| ADDW X23, a; \ |
| XOR c, d, X23; \ |
| XOR b, X23; \ |
| ADDW X23, a; \ |
| RORIW $(32-shift), a; \ |
| ADDW b, a |
| |
| #define ROUND3ODD(a, b, c, d, x, const, shift) \ |
| MOV $const, X23; \ |
| ADDW X23, a; \ |
| SRL $32, x, X23; \ |
| ADDW X23, a; \ |
| XOR c, d, X23; \ |
| XOR b, X23; \ |
| ADDW X23, a; \ |
| RORIW $(32-shift), a; \ |
| ADDW b, a |
| |
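// ROUND4EVN and ROUND4ODD are the round 4 equivalents, using
// I(b, c, d) = c ^ (b | ^d). ORN computes b | ^d in a single
// instruction.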
| #define ROUND4EVN(a, b, c, d, x, const, shift) \ |
| MOV $const, X23; \ |
| ADDW x, a; \ |
| ADDW X23, a; \ |
| ORN d, b, X23; \ |
| XOR c, X23; \ |
| ADDW X23, a; \ |
| RORIW $(32-shift), a; \ |
| ADDW b, a |
| |
| #define ROUND4ODD(a, b, c, d, x, const, shift) \ |
| MOV $const, X23; \ |
| ADDW X23, a; \ |
| SRL $32, x, X23; \ |
| ADDW X23, a; \ |
| ORN d, b, X23; \ |
| XOR c, X23; \ |
| ADDW X23, a; \ |
| RORIW $(32-shift), a; \ |
| ADDW b, a |
| |
| // Register use for the block function |
| // |
// X5 - X12 : contain the sixteen 32 bit words of the block we're
// processing. Odd numbered words, e.g., x1 and x3, are stored in
// the upper 32 bits of their register.
| // X13 - X16 : a, b, c, d |
| // X17 - X20 : used to store the old values of a, b, c, d, i.e., aa, bb, cc, |
| // dd. X17 and X18 are also used as temporary registers when |
| // loading unaligned data. |
| // X22 : pointer to dig.s |
| // X23 : temporary register |
| // X28 : pointer to the first byte beyond the end of p |
| // X29 : pointer to current 64 byte block of data, initially set to |
| // &p[0] |
| // X30 : temporary register |
| |
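// func block(dig *digest, p []byte)
//
// block hashes the complete 64 byte blocks of p into the digest state
// at dig.s; any trailing bytes that do not form a full block are
// ignored.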
| TEXT ·block(SB),NOSPLIT,$0-32 |
| MOV p+8(FP), X29 |
| MOV p_len+16(FP), X30 |
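// Round the input length down to a multiple of 64. If there isn't at
// least one complete block to process, we're done.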
| SRL $6, X30 |
| SLL $6, X30 |
| BEQZ X30, zero |
| |
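// X28 points to the first byte after the last complete 64 byte block;
// the main loop runs until X29 reaches it.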
| ADD X29, X30, X28 |
| |
| MOV dig+0(FP), X22 |
| MOVWU (0*4)(X22), X13 // a = s[0] |
| MOVWU (1*4)(X22), X14 // b = s[1] |
| MOVWU (2*4)(X22), X15 // c = s[2] |
| MOVWU (3*4)(X22), X16 // d = s[3] |
| |
| loop: |
| |
// Load the 64 bytes of message data, x0-x15, into the eight 64 bit
// registers X5-X12. Different paths are taken to load the values
// depending on whether the buffer is 8 byte aligned or not. We load
// all the values up front here at the start of the loop to avoid
// multiple alignment checks and to reduce code size. It takes 10
// instructions to load an unaligned 32 bit value, and each such value
// is used 4 times in the main body of the loop below.
| |
| AND $7, X29, X30 |
| BEQZ X30, aligned |
| |
LOAD64U(X29, 0, X17, X18, X5)
LOAD64U(X29, 8, X17, X18, X6)
LOAD64U(X29, 16, X17, X18, X7)
LOAD64U(X29, 24, X17, X18, X8)
LOAD64U(X29, 32, X17, X18, X9)
LOAD64U(X29, 40, X17, X18, X10)
LOAD64U(X29, 48, X17, X18, X11)
LOAD64U(X29, 56, X17, X18, X12)
| JMP block_loaded |
| |
| aligned: |
| MOV (0*8)(X29), X5 |
| MOV (1*8)(X29), X6 |
| MOV (2*8)(X29), X7 |
| MOV (3*8)(X29), X8 |
| MOV (4*8)(X29), X9 |
| MOV (5*8)(X29), X10 |
| MOV (6*8)(X29), X11 |
| MOV (7*8)(X29), X12 |
| |
| block_loaded: |
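// Save the current values of a, b, c, d (aa, bb, cc, dd) so they can
// be added back in after the four rounds below.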
| MOV X13, X17 |
| MOV X14, X18 |
| MOV X15, X19 |
| MOV X16, X20 |
| |
// Some of the MD5 constants below have bit 31 set, so written as hex
// they are too large to fit into a signed 32 bit value and the
// assembler would handle them specially to ensure they are zero
// extended, at the cost of extra instructions. Our algorithm only
// cares about the bottom 32 bits and doesn't mind whether constants
// are sign or zero extended when moved into 64 bit registers. So when
// bit 31 is set we use the signed decimal equivalent instead of hex,
// allowing every constant to be loaded with a lui+addi pair.
| |
| ROUND1EVN(X13,X14,X15,X16,X5, -680876936, 7); // 0xd76aa478 |
| ROUND1ODD(X16,X13,X14,X15,X5, -389564586,12); // 0xe8c7b756 |
| ROUND1EVN(X15,X16,X13,X14,X6, 0x242070db,17); // 0x242070db |
| ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee |
| ROUND1EVN(X13,X14,X15,X16,X7, -176418897, 7); // 0xf57c0faf |
| ROUND1ODD(X16,X13,X14,X15,X7, 0x4787c62a,12); // 0x4787c62a |
| ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613 |
| ROUND1ODD(X14,X15,X16,X13,X8, -45705983,22); // 0xfd469501 |
| ROUND1EVN(X13,X14,X15,X16,X9, 0x698098d8, 7); // 0x698098d8 |
| ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af |
| ROUND1EVN(X15,X16,X13,X14,X10, -42063,17); // 0xffff5bb1 |
| ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be |
| ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122 |
| ROUND1ODD(X16,X13,X14,X15,X11, -40341101,12); // 0xfd987193 |
| ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e |
| ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821 |
| |
| ROUND2ODD(X13,X14,X15,X16,X5, -165796510, 5); // f61e2562 |
| ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // c040b340 |
| ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 265e5a51 |
| ROUND2EVN(X14,X15,X16,X13,X5, -373897302,20); // e9b6c7aa |
| ROUND2ODD(X13,X14,X15,X16,X7, -701558691, 5); // d62f105d |
| ROUND2EVN(X16,X13,X14,X15,X10, 0x2441453, 9); // 2441453 |
| ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // d8a1e681 |
| ROUND2EVN(X14,X15,X16,X13,X7, -405537848,20); // e7d3fbc8 |
| ROUND2ODD(X13,X14,X15,X16,X9, 0x21e1cde6, 5); // 21e1cde6 |
| ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // c33707d6 |
| ROUND2ODD(X15,X16,X13,X14,X6, -187363961,14); // f4d50d87 |
| ROUND2EVN(X14,X15,X16,X13,X9, 0x455a14ed,20); // 455a14ed |
| ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // a9e3e905 |
| ROUND2EVN(X16,X13,X14,X15,X6, -51403784, 9); // fcefa3f8 |
| ROUND2ODD(X15,X16,X13,X14,X8, 0x676f02d9,14); // 676f02d9 |
| ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 8d2a4c8a |
| |
| ROUND3ODD(X13,X14,X15,X16,X7, -378558, 4); // fffa3942 |
| ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 8771f681 |
| ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 6d9d6122 |
| ROUND3EVN(X14,X15,X16,X13,X12, -35309556,23); // fde5380c |
| ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // a4beea44 |
| ROUND3EVN(X16,X13,X14,X15,X7, 0x4bdecfa9,11); // 4bdecfa9 |
| ROUND3ODD(X15,X16,X13,X14,X8, -155497632,16); // f6bb4b60 |
| ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // bebfbc70 |
| ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 289b7ec6 |
| ROUND3EVN(X16,X13,X14,X15,X5, -358537222,11); // eaa127fa |
| ROUND3ODD(X15,X16,X13,X14,X6, -722521979,16); // d4ef3085 |
| ROUND3EVN(X14,X15,X16,X13,X8, 0x4881d05,23); // 4881d05 |
| ROUND3ODD(X13,X14,X15,X16,X9, -640364487, 4); // d9d4d039 |
| ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // e6db99e5 |
| ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 1fa27cf8 |
| ROUND3EVN(X14,X15,X16,X13,X6, -995338651,23); // c4ac5665 |
| |
| ROUND4EVN(X13,X14,X15,X16,X5, -198630844, 6); // f4292244 |
| ROUND4ODD(X16,X13,X14,X15,X8, 0x432aff97,10); // 432aff97 |
| ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // ab9423a7 |
| ROUND4ODD(X14,X15,X16,X13,X7, -57434055,21); // fc93a039 |
| ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 655b59c3 |
| ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 8f0ccc92 |
ROUND4EVN(X15,X16,X13,X14,X10, -1051523,15); // ffeff47d
| ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 85845dd1 |
| ROUND4EVN(X13,X14,X15,X16,X9, 0x6fa87e4f, 6); // 6fa87e4f |
| ROUND4ODD(X16,X13,X14,X15,X12, -30611744,10); // fe2ce6e0 |
| ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // a3014314 |
| ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 4e0811a1 |
| ROUND4EVN(X13,X14,X15,X16,X7, -145523070, 6); // f7537e82 |
| ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // bd3af235 |
| ROUND4EVN(X15,X16,X13,X14,X6, 0x2ad7d2bb,15); // 2ad7d2bb |
| ROUND4ODD(X14,X15,X16,X13,X9, -343485551,21); // eb86d391 |
| |
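// Add the saved values of a, b, c, d back into the new ones.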
| ADDW X17, X13 |
| ADDW X18, X14 |
| ADDW X19, X15 |
| ADDW X20, X16 |
| |
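// Advance to the next 64 byte block and loop until X29 reaches the
// end pointer in X28.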
| ADD $64, X29 |
| BNE X28, X29, loop |
| |
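// Write the updated state back to dig.s.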
| MOVW X13, (0*4)(X22) |
| MOVW X14, (1*4)(X22) |
| MOVW X15, (2*4)(X22) |
| MOVW X16, (3*4)(X22) |
| |
| zero: |
| RET |