/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%ebp
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a circular window of 16 pre-computed w[i]+K values (64 bytes) on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16
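/*
 * example: WK(18) = ((18 & 15) * 4)(%rsp) = 8(%rsp) -- round 18 reads the
 * slot that held w[2]+K, already refilled with w[18]+K by the pre-calc
 * running W_PRECALC_AHEAD rounds in front of the scalar rounds
 */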

/*
 * This macro implements the SHA-1 function's body (one 64-byte block per
 * loop iteration)
 * param: the function's name
 */
.macro SHA1_VECTOR_ASM	name
ENTRY(\name)

	push	%rbx
	push	%rbp
	push	%r12

	mov	%rsp, %r12
	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack to 16 bytes (movdqa stores to WK() need it)

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace: zero the 64-byte w[i]+K buffer (8 quadwords)
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%rax, %rax
	rep	stosq

	mov	%r12, %rsp		# deallocate workspace

	pop	%r12
	pop	%rbp
	pop	%rbx
	ret

ENDPROC(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
  .set i, (i+1)
  .endr
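
	# the .rept loop above primes the first W_PRECALC_AHEAD w[i]+K values;
	# each RR below pre-computes two more, keeping the pre-calc 16 rounds
	# ahead of the scalar rounds that consume WK()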

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if this was the last block, switch to a
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid a buffer overrun
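	# (on the final block the look-ahead pre-calc still runs: it then loads
	# from K_XMM_AR instead -- harmless constant data -- and its results are
	# never consumed because the loop exits below)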

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# BUFFER_PTR == K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (C already lives in REG_C): no renamed value may
	# be overwritten before it has been copied out
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

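/*
 * SWAP_REG_NAMES exchanges two assembler-time symbols, not register
 * contents: after "mov \c, T1; SWAP_REG_NAMES \c, T1" the fresh copy is
 * *named* c while the register formerly known as c becomes the scratch T1.
 * The rename is resolved entirely by .set and costs no instructions.
 */

/* F1 = Ch(b, c, d) = (b & c) | (~b & d), rounds 0-19; computed in the
 * equivalent form d ^ (b & (c ^ d)) to save an instruction */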
.macro F1 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

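/* F2 = Parity(b, c, d) = b ^ c ^ d, rounds 20-39 (and rounds 60-79 via F4) */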
.macro F2 b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

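/* F3 = Maj(b, c, d) = (b & c) | (b & d) | (c & d), rounds 40-59; computed
 * here in the equivalent form ((b | c) & d) | (b & c) */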
.macro F3 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4 b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 * (<<< and >>> denote rotates):
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<<= 30;   d += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<<= 5;
 *   e += a;
 *   t1 = e;            a >>>= 7;
 *   t1 <<<= 5;
 *   d += t1;
 */
.macro RR F, a, b, c, d, e, round
	add	WK(\round), \e
	\F	\b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F	\a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a			# (a <<< 5) >>> 7 == a <<< 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)		# pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i < 32)
	W_PRECALC_16_31
  .elseif (i < 80)		// rounds 32-79
	W_PRECALC_32_79
  .endif
.endm
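/*
 * example: "W_PRECALC 25" selects K_XMM = 16 (the K2 row of K_XMM_AR) and,
 * since 16 <= 25 < 32, expands to step (25 & 3) == 1 of W_PRECALC_16_31;
 * for r >= 80 the index wraps (i = r % 80) so the look-ahead primes
 * w[0..15] of the next block while rounds 64-79 of the current one run
 */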

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W, W_minus_32
.endm
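/* like SWAP_REG_NAMES, W_PRECALC_ROTATE only renames assembler symbols: the
 * eight-register w[] window slides by one XMM register (four w values) with
 * no data movement */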

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm
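/* the four steps above (load, byte-swap, add K, store) are issued one per
 * round, interleaving the vector pre-calc with the scalar round code of RR */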

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * vectorizing rounds 16-31 takes some "heavy lifting" because of the
 * w[i] -> w[i-3] dependency; rounds 32-79 fare better (see below)
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we use the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i] -> w[i-3]
 * dependency is broken
 */
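/*
 * sketch of the equivalence (rol distributes over xor):
 *   w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * expanding each of the four terms once more by the same recurrence and
 * xor-ing, every term cancels except w[i-6], w[i-16], w[i-28] and w[i-32]:
 *   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2	(for i >= 32)
 * since nothing closer than w[i-6] is needed, four consecutive w values fit
 * in one XMM register
 */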
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
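
/* each FIPS 180-1 round constant is replicated across all four 32-bit lanes
 * so a single paddd adds K to four w[] values at once; the K_XMM offset
 * (0/16/32/48) selects the row for the current round group */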

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
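
/* pshufb control mask: reverses the byte order within each 32-bit lane,
 * converting SHA-1's big-endian message words to host (little-endian) order */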


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/* SSSE3 optimized implementation:
 * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
 *                                      unsigned int rounds);
 */
SHA1_VECTOR_ASM	sha1_transform_ssse3

#ifdef CONFIG_AS_AVX

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro W_PRECALC_00_15
	W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro W_PRECALC_16_31
	W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro W_PRECALC_32_79
	W_PRECALC_32_79_AVX
.endm

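/* the three-operand AVX forms below write to a separate destination register,
 * making the extra movdqa copies of the SSSE3 versions unnecessary */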
.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 * extern "C" void sha1_transform_avx(u32 *digest, const char *data,
 *                                    unsigned int rounds);
 */
SHA1_VECTOR_ASM	sha1_transform_avx

#endif