crypto: aesni - fix counter overflow handling in "by8" variant
arch/x86/crypto/aes_ctrby8_avx-x86_64.S
/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is an AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>
#include <asm/inst.h>

#define CONCAT(a,b)	a##b
#define VMOVDQ		vmovdqu
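
/*
 * VMOVDQ (vmovdqu) is used for the in/out data, which need not be
 * 16-byte aligned; the round keys are loaded with vmovdqa below, so
 * the key schedule is assumed to be 16-byte aligned.
 */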

#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey3		%xmm11
#define xkey6		%xmm12
#define xkey9		%xmm13
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15
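
/*
 * Note that xkey3/xkey4, xkey6/xkey8 and xkey9/xkey12 alias the same
 * registers: AES-128 keeps round keys 0/3/6/9 cached across main-loop
 * iterations while AES-192/256 cache round keys 0/4/8/12 (see
 * .Lmult_of_8_blks below), so only one name of each pair is live for
 * a given key length.
 */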
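/* the routine's arguments, per the x86_64 SysV ABI calling convention */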
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10
#define	DDQ(i)		CONCAT(ddq_add_,i)
#define	XMM(i)		CONCAT(%xmm, i)
#define	DDQ_DATA	0
#define	XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

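/*
 * byteswap_const swaps the counter between its big-endian on-the-wire
 * format and the little-endian form used for the vpaddq arithmetic.
 * ddq_add_1..8 are the per-block increments; ddq_low_msk and
 * ddq_high_add_1 serve to carry into the high 64 bits when the low
 * 64 bits of the counter wrap around to zero.
 */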
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/* generate a unique variable for ddq_add_x */

.macro setddq n
	var_ddq_add = DDQ(\n)
.endm

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = XMM(\n)
.endm

/* club the numeric 'id' to the symbol 'name' */

.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
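
/*
 * Under .altmacro, %\id expands to the numeric value of 'id', so club
 * maps a loop counter onto a concrete symbol: e.g. "club DDQ_DATA, 3"
 * sets var_ddq_add = ddq_add_3 and "club XDATA, 3" sets
 * var_xdata = %xmm3.
 */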

/*
 * do_aes num_in_par load_keys key_len
 * Generate and encrypt num_in_par counter blocks, XOR them with the
 * input, and store the result. This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	vpshufb	xbyteswap, xcounter, xdata0

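	/*
	 * Generate the remaining (by - 1) counter blocks: vptest sets
	 * ZF when the low 64 bits of the incremented counter are all
	 * zero, i.e. the low qword wrapped, in which case the carry is
	 * propagated into the high qword of both the block counter and
	 * xcounter (the counter overflow handling this commit fixes).
	 */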
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
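	/* advance xcounter past all 'by' blocks, with the same 64-bit carry handling */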
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

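	/*
	 * p_in is advanced up front; the ciphertext XOR loads further
	 * down therefore use negative offsets from p_in.
	 */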
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast	xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

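	/*
	 * XOR the keystream with the input, two blocks per iteration,
	 * plus an odd trailing block; xkeyA/xkeyB are dead by now and
	 * are reused as scratch registers for the input blocks.
	 */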
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

.macro do_aes_load val, key_len
	do_aes	\val, 1, \key_len
.endm

.macro do_aes_noload val, key_len
	do_aes	\val, 0, \key_len
.endm

/* main body of the AES CTR enc/dec routines */

.macro do_aes_ctrmain key_len

	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

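	/*
	 * Process the num_bytes mod (8*16) leftover blocks (if any)
	 * first, with the round keys loaded; the rest is handled eight
	 * blocks at a time by .Lmain_loop2 with cached keys.
	 */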
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp/16 <= 7, i.e. 1 to 7 leftover blocks */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
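	/*
	 * No leftover blocks: pre-cache the round keys that
	 * do_aes_noload expects in xkey0/xkey4/xkey8/xkey12
	 * (0/3/6/9 for AES-128, 0/4/8/12 otherwise).
	 */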
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
	.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain	KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain	KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain	KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)