// Original source:
// http://www.zorinaq.com/papers/rc4-amd64.html
// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2

#include "textflag.h"

// Local modifications:
//
// Transliterated from GNU to 6a assembly syntax by the Go authors.
// The comments and spacing are from the original.
//
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
//
// The original code accumulated 64 bits of key stream in an integer
// register and then XOR'ed the key stream into the data 8 bytes at a time.
// Modified to accumulate 128 bits of key stream into an XMM register
// and then XOR the key stream into the data 16 bytes at a time.
// Approximately doubles throughput.
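//
// The wide path below assembles the 16 key stream bytes in X0/X1 with
// PINSRW and applies them with a single MOVOU/PXOR/MOVOU sequence.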

// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
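//
// EXTEND zero-extends the low 8 bits of r into the full 32-bit register
// (MOVBLZX), breaking the partial-register dependency left by the preceding
// 8-bit INCB/ADDB before the value is used as an index.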
#define EXTEND(r) MOVBLZX r, r

/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand <bevand_m (at) epita.fr>
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The code has been designed to be easily integrated into openssl:
** the exported RC4() function can replace the actual implementations
** openssl already contains. Please note that when linking with openssl,
** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
** with -DRC4_INT='unsigned long'.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
*/

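// The frame offsets used below correspond to a Go declaration of the form
// (assumed from the argument layout; each argument is 8 bytes):
//
//	func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)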
TEXT ·xorKeyStream(SB),NOSPLIT,$0
	MOVQ n+16(FP), BX       // rbx = ARG(len)
	MOVQ src+8(FP), SI      // in = ARG(in)
	MOVQ dst+0(FP), DI      // out = ARG(out)
	MOVQ state+24(FP), BP   // d = ARG(data)
	MOVQ i+32(FP), AX
	MOVBQZX 0(AX), CX       // x = *xp
	MOVQ j+40(FP), AX
	MOVBQZX 0(AX), DX       // y = *yp

	LEAQ (SI)(BX*1), R9     // limit = in+len

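	// One pass through l1 is the standard RC4 PRGA step. In Go terms
	// (a sketch for explanation only; x, y, tx, ty are uint8 and d is
	// the 256-word state holding one byte per uint32):
	//
	//	x++
	//	tx := uint8(d[x])
	//	y += tx
	//	ty := uint8(d[y])
	//	d[x], d[y] = uint32(ty), uint32(tx)
	//	*out = *in ^ uint8(d[uint8(tx+ty)])
	//	// advance in and out by one byte
	//
	// except that once x reaches a multiple of 16 the code jumps to the
	// 16-byte wordloop below instead of handling that x here.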
l1:	CMPQ SI, R9             // cmp in with in+len
	JGE finished            // jump if (in >= in+len)

	INCB CX                 // x++
	EXTEND(CX)
	TESTL $15, CX
	JZ wordloop             // if x%16 == 0, take the 16-byte fast path

	MOVBLZX (BP)(CX*4), AX  // tx = d[x]

	ADDB AX, DX             // y += tx
	EXTEND(DX)
	MOVBLZX (BP)(DX*4), BX  // ty = d[y]
	MOVB BX, (BP)(CX*4)     // d[x] = ty
	ADDB AX, BX             // val = ty+tx
	EXTEND(BX)
	MOVB AX, (BP)(DX*4)     // d[y] = tx
	MOVBLZX (BP)(BX*4), R8  // val = d[val]
	XORB (SI), R8           // xor 1 byte
	MOVB R8, (DI)
	INCQ SI                 // in++
	INCQ DI                 // out++
	JMP l1

wordloop:
	SUBQ $16, R9            // limit -= 16, so the compare below checks for >= 16 bytes left
	CMPQ SI, R9
	JGT end                 // jump if fewer than 16 bytes remain

start:
	ADDQ $16, SI            // increment in
	ADDQ $16, DI            // increment out

	// Each KEYROUND generates one byte of key stream and
	// inserts it into an XMM register at the given 16-bit word index.
	// The key state array is uint32 words using only the bottom byte
	// of each word, so each 16-bit PINSRW load carries just 8 useful
	// bits (the high byte of the inserted word is always zero).
	// We accumulate alternating bytes into X0 and X1, and then at
	// the end we shift X1 left by 8 bits and XOR it into X0 (the byte
	// lanes are disjoint, so this acts as an OR) to produce the actual key.
	//
	// At the beginning of the loop, CX%16 == 0, so the 16 loads
	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
	// without fear of the byte computation CX+15 wrapping around.
	//
	// The first round needs R12[0], the second needs R12[1], and so on.
	// We can avoid memory stalls by starting the load for round n+1
	// before the end of round n, using the LOAD macro.
	LEAQ (BP)(CX*4), R12

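// In Go terms, KEYROUND(xmm, LOAD, off, r1, r2, index) performs one RC4 round
// for position x0+off, where x0 is the (multiple-of-16) value in CX at the top
// of the block, r1 holds this round's tx, and r2 receives the next round's tx.
// Roughly (a sketch for explanation only; txNext and k are not real names):
//
//	ty := uint8(d[y])            // MOVBLZX (BP)(DX*4), R8
//	d[y] = uint32(tx)            // MOVB r1, (BP)(DX*4)
//	txNext := uint8(d[x0+off+1]) // LOAD(off+1, r2): start the next load early
//	y += txNext                  //   and advance y for the next round
//	d[x0+off] = uint32(ty)       // MOVB R8, (off*4)(R12)
//	k := uint8(d[uint8(tx+ty)])  // ADDB, EXTEND; PINSRW then inserts the key
//	                             // byte k into word lane index of X0 or X1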
#define KEYROUND(xmm, load, off, r1, r2, index) \
	MOVBLZX (BP)(DX*4), R8; \
	MOVB r1, (BP)(DX*4); \
	load((off+1), r2); \
	MOVB R8, (off*4)(R12); \
	ADDB r1, R8; \
	EXTEND(R8); \
	PINSRW $index, (BP)(R8*4), xmm

#define LOAD(off, reg) \
	MOVBLZX (off*4)(R12), reg; \
	ADDB reg, DX; \
	EXTEND(DX)

#define SKIP(off, reg)

	LOAD(0, AX)
	KEYROUND(X0, LOAD, 0, AX, BX, 0)
	KEYROUND(X1, LOAD, 1, BX, AX, 0)
	KEYROUND(X0, LOAD, 2, AX, BX, 1)
	KEYROUND(X1, LOAD, 3, BX, AX, 1)
	KEYROUND(X0, LOAD, 4, AX, BX, 2)
	KEYROUND(X1, LOAD, 5, BX, AX, 2)
	KEYROUND(X0, LOAD, 6, AX, BX, 3)
	KEYROUND(X1, LOAD, 7, BX, AX, 3)
	KEYROUND(X0, LOAD, 8, AX, BX, 4)
	KEYROUND(X1, LOAD, 9, BX, AX, 4)
	KEYROUND(X0, LOAD, 10, AX, BX, 5)
	KEYROUND(X1, LOAD, 11, BX, AX, 5)
	KEYROUND(X0, LOAD, 12, AX, BX, 6)
	KEYROUND(X1, LOAD, 13, BX, AX, 6)
	KEYROUND(X0, LOAD, 14, AX, BX, 7)
	KEYROUND(X1, SKIP, 15, BX, AX, 7)

	ADDB $16, CX            // x += 16 (the 16 positions just consumed)

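	// X0 holds the key bytes for even offsets and X1 those for odd
	// offsets, each in the low byte of a 16-bit lane. Shifting X1 left
	// by 8 bits within each quadword (no byte crosses a quadword
	// boundary) moves its bytes to the odd positions, and the XOR merges
	// the two into 16 consecutive key bytes, which are then XORed into
	// the 16 data bytes at in-16 and stored at out-16 (SI and DI were
	// already advanced at the top of the loop).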
	PSLLQ $8, X1
	PXOR X1, X0
	MOVOU -16(SI), X2
	PXOR X0, X2
	MOVOU X2, -16(DI)

	CMPQ SI, R9             // cmp in with in+len-16
	JLE start               // jump if (in <= in+len-16)

end:
	DECB CX                 // x is one past the last index used; l2 pre-increments
	ADDQ $16, R9            // tmp = in+len

	// handle the last bytes, one by one
l2:	CMPQ SI, R9             // cmp in with in+len
	JGE finished            // jump if (in >= in+len)

	INCB CX                 // x++
	EXTEND(CX)
	MOVBLZX (BP)(CX*4), AX  // tx = d[x]

	ADDB AX, DX             // y += tx
	EXTEND(DX)
	MOVBLZX (BP)(DX*4), BX  // ty = d[y]
	MOVB BX, (BP)(CX*4)     // d[x] = ty
	ADDB AX, BX             // val = ty+tx
	EXTEND(BX)
	MOVB AX, (BP)(DX*4)     // d[y] = tx
	MOVBLZX (BP)(BX*4), R8  // val = d[val]
	XORB (SI), R8           // xor 1 byte
	MOVB R8, (DI)
	INCQ SI                 // in++
	INCQ DI                 // out++
	JMP l2

finished:
	MOVQ j+40(FP), BX
	MOVB DX, 0(BX)          // *j = y
	MOVQ i+32(FP), AX
	MOVB CX, 0(AX)          // *i = x
	RET