// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build amd64,!gccgo,!appengine

#include "textflag.h"
9DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
10DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
11GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
12
13DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
14DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
15GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
16
17DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
18DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
19GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
20
21DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
22DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
Andreas Auernhammer9a6f0a02016-12-10 15:54:14 +010023GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +020024
25DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
26DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
27GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
28
29DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
30DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
31GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
32
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +010033#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
34 MOVO v4, t1; \
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +020035 MOVO v5, v4; \
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +010036 MOVO t1, v5; \
37 MOVO v6, t1; \
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +020038 PUNPCKLQDQ v6, t2; \
39 PUNPCKHQDQ v7, v6; \
40 PUNPCKHQDQ t2, v6; \
41 PUNPCKLQDQ v7, t2; \
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +010042 MOVO t1, v7; \
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +020043 MOVO v2, t1; \
44 PUNPCKHQDQ t2, v7; \
45 PUNPCKLQDQ v3, t2; \
46 PUNPCKHQDQ t2, v2; \
47 PUNPCKLQDQ t1, t2; \
48 PUNPCKHQDQ t2, v3
49
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +010050#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
51 MOVO v4, t1; \
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +020052 MOVO v5, v4; \
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +010053 MOVO t1, v5; \
54 MOVO v2, t1; \
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +020055 PUNPCKLQDQ v2, t2; \
56 PUNPCKHQDQ v3, v2; \
57 PUNPCKHQDQ t2, v2; \
58 PUNPCKLQDQ v3, t2; \
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +010059 MOVO t1, v3; \
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +020060 MOVO v6, t1; \
61 PUNPCKHQDQ t2, v3; \
62 PUNPCKLQDQ v7, t2; \
63 PUNPCKHQDQ t2, v6; \
64 PUNPCKLQDQ t1, t2; \
65 PUNPCKHQDQ t2, v7
66
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +010067#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +020068 PADDQ m0, v0; \
69 PADDQ m1, v1; \
70 PADDQ v2, v0; \
71 PADDQ v3, v1; \
72 PXOR v0, v6; \
73 PXOR v1, v7; \
74 PSHUFD $0xB1, v6, v6; \
75 PSHUFD $0xB1, v7, v7; \
76 PADDQ v6, v4; \
77 PADDQ v7, v5; \
78 PXOR v4, v2; \
79 PXOR v5, v3; \
80 PSHUFB c40, v2; \
81 PSHUFB c40, v3; \
82 PADDQ m2, v0; \
83 PADDQ m3, v1; \
84 PADDQ v2, v0; \
85 PADDQ v3, v1; \
86 PXOR v0, v6; \
87 PXOR v1, v7; \
88 PSHUFB c48, v6; \
89 PSHUFB c48, v7; \
90 PADDQ v6, v4; \
91 PADDQ v7, v5; \
92 PXOR v4, v2; \
93 PXOR v5, v3; \
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +010094 MOVOU v2, t0; \
95 PADDQ v2, t0; \
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +020096 PSRLQ $63, v2; \
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +010097 PXOR t0, v2; \
98 MOVOU v3, t0; \
99 PADDQ v3, t0; \
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200100 PSRLQ $63, v3; \
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100101 PXOR t0, v3
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200102
103#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
104 MOVQ i0*8(src), m0; \
105 PINSRQ $1, i1*8(src), m0; \
106 MOVQ i2*8(src), m1; \
107 PINSRQ $1, i3*8(src), m1; \
108 MOVQ i4*8(src), m2; \
109 PINSRQ $1, i5*8(src), m2; \
110 MOVQ i6*8(src), m3; \
111 PINSRQ $1, i7*8(src), m3
112
113// func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100114TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200115 MOVQ h+0(FP), AX
116 MOVQ c+8(FP), BX
117 MOVQ flag+16(FP), CX
118 MOVQ blocks_base+24(FP), SI
119 MOVQ blocks_len+32(FP), DI
120
121 MOVQ SP, BP
Austin Clementse67f5ec2016-10-20 16:53:24 -0400122 MOVQ SP, R9
123 ADDQ $15, R9
124 ANDQ $~15, R9
125 MOVQ R9, SP
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200126
127 MOVOU ·iv3<>(SB), X0
128 MOVO X0, 0(SP)
129 XORQ CX, 0(SP) // 0(SP) = ·iv3 ^ (CX || 0)
130
131 MOVOU ·c40<>(SB), X13
132 MOVOU ·c48<>(SB), X14
133
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100134 MOVOU 0(AX), X12
135 MOVOU 16(AX), X15
136
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200137 MOVQ 0(BX), R8
138 MOVQ 8(BX), R9
139
140loop:
141 ADDQ $128, R8
142 CMPQ R8, $128
143 JGE noinc
144 INCQ R9
145
146noinc:
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100147 MOVQ R8, X8
148 PINSRQ $1, R9, X8
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200149
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100150 MOVO X12, X0
151 MOVO X15, X1
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200152 MOVOU 32(AX), X2
153 MOVOU 48(AX), X3
154 MOVOU ·iv0<>(SB), X4
155 MOVOU ·iv1<>(SB), X5
156 MOVOU ·iv2<>(SB), X6
157
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100158 PXOR X8, X6
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200159 MOVO 0(SP), X7
160
161 LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100162 MOVO X8, 16(SP)
163 MOVO X9, 32(SP)
164 MOVO X10, 48(SP)
165 MOVO X11, 64(SP)
166 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
167 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200168 LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100169 MOVO X8, 80(SP)
170 MOVO X9, 96(SP)
171 MOVO X10, 112(SP)
172 MOVO X11, 128(SP)
173 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
174 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200175
176 LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100177 MOVO X8, 144(SP)
178 MOVO X9, 160(SP)
179 MOVO X10, 176(SP)
180 MOVO X11, 192(SP)
181 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
182 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200183 LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100184 MOVO X8, 208(SP)
185 MOVO X9, 224(SP)
186 MOVO X10, 240(SP)
187 MOVO X11, 256(SP)
188 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
189 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200190
191 LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100192 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
193 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200194 LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100195 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
196 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200197
198 LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100199 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
200 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200201 LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100202 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
203 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200204
205 LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100206 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
207 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200208 LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100209 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
210 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200211
212 LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100213 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
214 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200215 LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100216 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
217 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200218
219 LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100220 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
221 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200222 LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100223 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
224 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200225
226 LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100227 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
228 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200229 LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100230 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
231 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200232
233 LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100234 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
235 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200236 LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100237 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
238 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200239
240 LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100241 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
242 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200243 LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100244 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
245 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200246
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100247 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X11, X13, X14)
248 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
249 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X11, X13, X14)
250 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200251
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100252 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X11, X13, X14)
253 SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
254 HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X11, X13, X14)
255 SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200256
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200257 MOVOU 32(AX), X10
258 MOVOU 48(AX), X11
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100259 PXOR X0, X12
260 PXOR X1, X15
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200261 PXOR X2, X10
262 PXOR X3, X11
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100263 PXOR X4, X12
264 PXOR X5, X15
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200265 PXOR X6, X10
266 PXOR X7, X11
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200267 MOVOU X10, 32(AX)
268 MOVOU X11, 48(AX)
269
270 LEAQ 128(SI), SI
271 SUBQ $128, DI
272 JNE loop
273
Andreas Auernhammerd8e61c62016-12-13 23:25:08 +0100274 MOVOU X12, 0(AX)
275 MOVOU X15, 16(AX)
276
277 MOVQ R8, 0(BX)
278 MOVQ R9, 8(BX)
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200279
280 MOVQ BP, SP
281 RET
282
Mikio Haraf6b343c2016-12-21 13:54:10 +0900283// func supportsSSE4() bool
284TEXT ·supportsSSE4(SB), 4, $0-1
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200285 MOVL $1, AX
286 CPUID
Adam Langley7efbae52016-12-01 10:23:41 -0800287 SHRL $19, CX // Bit 19 indicates SSE4 support
Andreas Auernhammer9e9c7d42016-10-13 19:39:11 +0200288 ANDL $1, CX // CX != 0 if support SSE4
289 MOVB CX, ret+0(FP)
290 RET