2 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
7 * This work was inspired by the AES CTR mode optimization published
8 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
12 * This file is provided under a dual BSD/GPLv2 license. When using or
13 * redistributing this file, you may do so under either license.
17 * Copyright(c) 2014 Intel Corporation.
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 * General Public License for more details.
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
35 * Copyright(c) 2014 Intel Corporation.
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 #include <linux/linkage.h>
/* Token-pasting helper used to build symbol/register names from a numeric id. */
68 #define CONCAT(a,b) a##b
/* Unaligned vector move for the in/out text buffers (alignment unknown). */
69 #define VMOVDQ vmovdqu
/* xmm8: running CTR counter (kept byte-swapped while inside the routine). */
79 #define xcounter %xmm8
/* xmm9: byte-swap shuffle mask (loaded from byteswap_const). */
80 #define xbyteswap %xmm9
/* DDQ(i) -> ddq_add_i: name of the constant that adds i to the counter. */
98 #define DDQ(i) CONCAT(ddq_add_,i)
/* XMM(i) -> %xmmN: numbered xmm register name. */
99 #define XMM(i) CONCAT(%xmm, i)
/* Shuffle mask for vpshufb that reverses all 16 bytes (endianness swap).
 * NOTE(review): the labels for these constants (presumably byteswap_const,
 * ddq_low_msk, ddq_high_add_1, ddq_add_1..ddq_add_8, matching the names
 * referenced by the code below) are not visible in this excerpt. */
110 .octa 0x000102030405060708090A0B0C0D0E0F
/* Mask selecting the low 64 bits of the 128-bit counter (carry detection). */
112 .octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Adds 1 to the high qword (carry propagation when the low qword wraps). */
114 .octa 0x00000000000000010000000000000000
/* Counter increments 1..8, one per block position in the by-8 pipeline. */
116 .octa 0x00000000000000000000000000000001
118 .octa 0x00000000000000000000000000000002
120 .octa 0x00000000000000000000000000000003
122 .octa 0x00000000000000000000000000000004
124 .octa 0x00000000000000000000000000000005
126 .octa 0x00000000000000000000000000000006
128 .octa 0x00000000000000000000000000000007
130 .octa 0x00000000000000000000000000000008
/* NOTE(review): this excerpt shows only fragments of the helper macros that
 * expand a numeric id into the per-iteration symbols var_ddq_add / var_xdata;
 * their .macro/.endm framing and the branch bodies are not visible here. */
134 /* generate a unique variable for ddq_add_x */
137 var_ddq_add = DDQ(\n)
140 /* generate a unique variable for xmm register */
145 /* club the numeric 'id' to the symbol 'name' */
149 .if \name == DDQ_DATA
151 .elseif \name == XDATA
158 * do_aes num_in_par load_keys key_len
159 * This increments p_in, but not p_out
/*
 * NOTE(review): this excerpt is incomplete -- the .set lines binding the
 * macro arguments (by/load_keys/klen), the per-block setreg/.rept
 * expansions, the .else/.endif closers and the final .endm are not
 * visible, so only the visible instruction stream is annotated.
 */
161 .macro do_aes b, k, key_len
/* Round-0 key for AddRoundKey. */
167 vmovdqa 0*16(p_keys), xkey0
/* First block: byte-swap the counter into the data register. */
170 vpshufb xbyteswap, xcounter, xdata0
/* Per-block counter = xcounter + i; vptest against the low-qword mask
 * detects wraparound of the low 64 bits, and the ddq_high_add_1 adds
 * propagate the carry into the high qword (the conditional branch that
 * skips the fix-up is not visible in this excerpt). */
176 vpaddq var_ddq_add(%rip), xcounter, var_xdata
177 vptest ddq_low_msk(%rip), var_xdata
179 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
180 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
/* Swap each per-block counter back to big-endian byte order. */
182 vpshufb xbyteswap, var_xdata, var_xdata
186 vmovdqa 1*16(p_keys), xkeyA
/* Round 0 (AddRoundKey) for the first block. */
188 vpxor xkey0, xdata0, xdata0
/* Advance xcounter past this iteration's blocks, same carry fix-up. */
190 vpaddq var_ddq_add(%rip), xcounter, xcounter
191 vptest ddq_low_msk(%rip), xcounter
193 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
/* Round 0 for the remaining blocks. */
199 vpxor xkey0, var_xdata, var_xdata
203 vmovdqa 2*16(p_keys), xkeyB
208 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
/* Key loads below are interleaved with the encrypt rounds; several
 * .if (klen == KEY_128)/.else arms load the same key slot because the
 * differing scheduling lines between them are not part of this excerpt. */
212 .if (klen == KEY_128)
214 vmovdqa 3*16(p_keys), xkeyA
217 vmovdqa 3*16(p_keys), xkeyA
223 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
229 .if (klen == KEY_128)
230 vmovdqa 4*16(p_keys), xkey4
233 vmovdqa 4*16(p_keys), xkey4
240 vaesenc xkeyA, var_xdata, var_xdata /* key 3 */
244 vmovdqa 5*16(p_keys), xkeyA
249 vaesenc xkey4, var_xdata, var_xdata /* key 4 */
253 .if (klen == KEY_128)
255 vmovdqa 6*16(p_keys), xkeyB
258 vmovdqa 6*16(p_keys), xkeyB
264 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
268 vmovdqa 7*16(p_keys), xkeyA
273 vaesenc xkeyB, var_xdata, var_xdata /* key 6 */
277 .if (klen == KEY_128)
278 vmovdqa 8*16(p_keys), xkey8
281 vmovdqa 8*16(p_keys), xkey8
288 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
292 .if (klen == KEY_128)
294 vmovdqa 9*16(p_keys), xkeyA
297 vmovdqa 9*16(p_keys), xkeyA
303 vaesenc xkey8, var_xdata, var_xdata /* key 8 */
307 vmovdqa 10*16(p_keys), xkeyB
312 vaesenc xkeyA, var_xdata, var_xdata /* key 9 */
/* AES-128 finishes at round 10 with vaesenclast; 192/256 keep going. */
316 .if (klen != KEY_128)
317 vmovdqa 11*16(p_keys), xkeyA
324 .if (klen == KEY_128)
325 vaesenclast xkeyB, var_xdata, var_xdata
327 vaesenc xkeyB, var_xdata, var_xdata
332 .if (klen != KEY_128)
334 vmovdqa 12*16(p_keys), xkey12
340 vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
/* AES-192 ends with vaesenclast on key 12; AES-256 continues to key 14. */
344 .if (klen == KEY_256)
345 vmovdqa 13*16(p_keys), xkeyA
351 .if (klen == KEY_256)
353 vaesenc xkey12, var_xdata, var_xdata
355 vaesenclast xkey12, var_xdata, var_xdata
360 .if (klen == KEY_256)
361 vmovdqa 14*16(p_keys), xkeyB
367 vaesenc xkeyA, var_xdata, var_xdata
375 vaesenclast xkeyB, var_xdata, var_xdata
/* CTR combine: XOR the encrypted counter blocks with the input text.
 * xkeyA/xkeyB are reused here as plaintext scratch registers; loads are
 * paired (i, j) -- pairing/loop framing is outside this excerpt. */
384 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
385 VMOVDQ (j*16 - 16*by)(p_in), xkeyB
387 vpxor xkeyA, var_xdata, var_xdata
389 vpxor xkeyB, var_xdata, var_xdata
/* Odd trailing block when 'by' is odd. */
394 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
396 vpxor xkeyA, var_xdata, var_xdata
/* Store the result blocks to the output buffer. */
402 VMOVDQ var_xdata, i*16(p_out)
/* Wrapper: process \val blocks, loading round keys from memory each round
 * (load_keys = 1). NOTE(review): the .endm closers for both wrappers are
 * not visible in this excerpt. */
407 .macro do_aes_load val, key_len
408 do_aes \val, 1, \key_len
/* Wrapper: process \val blocks assuming resident keys xkey0/4/8/12 were
 * preloaded by the caller (load_keys = 0). */
411 .macro do_aes_noload val, key_len
412 do_aes \val, 0, \key_len
415 /* main body of aes ctr load */
417 .macro do_aes_ctrmain key_len
/* NOTE(review): the instruction setting flags for this jb (presumably a
 * cmp on num_bytes against one block) is not visible in this excerpt;
 * small inputs return without touching the buffers. */
420 jb .Ldo_return2\key_len
/* Load the byte-swap mask and the big-endian IV; keep the counter
 * byte-swapped in xcounter so vpaddq arithmetic works directly. */
422 vmovdqa byteswap_const(%rip), xbyteswap
423 vmovdqu (p_iv), xcounter
424 vpshufb xbyteswap, xcounter, xcounter
/* Fast path when the length is an exact multiple of 8 blocks (the test
 * that sets ZF here is not visible in this excerpt). */
428 jz .Lmult_of_8_blks\key_len
/* Remainder dispatch: encrypt 1..7 blocks once (keys loaded from memory),
 * round num_bytes down to a multiple of 8 blocks, then either return or
 * fall into the by-8 main loop. The dispatch branches/labels selecting
 * each arm are not visible in this excerpt. */
441 do_aes_load 1, \key_len
443 and $(~7*16), num_bytes
444 jz .Ldo_return2\key_len
445 jmp .Lmain_loop2\key_len
448 do_aes_load 2, \key_len
450 and $(~7*16), num_bytes
451 jz .Ldo_return2\key_len
452 jmp .Lmain_loop2\key_len
456 do_aes_load 3, \key_len
458 and $(~7*16), num_bytes
459 jz .Ldo_return2\key_len
460 jmp .Lmain_loop2\key_len
463 do_aes_load 4, \key_len
465 and $(~7*16), num_bytes
466 jz .Ldo_return2\key_len
467 jmp .Lmain_loop2\key_len
475 do_aes_load 5, \key_len
477 and $(~7*16), num_bytes
478 jz .Ldo_return2\key_len
479 jmp .Lmain_loop2\key_len
482 do_aes_load 6, \key_len
484 and $(~7*16), num_bytes
485 jz .Ldo_return2\key_len
486 jmp .Lmain_loop2\key_len
489 do_aes_load 7, \key_len
491 and $(~7*16), num_bytes
492 jz .Ldo_return2\key_len
493 jmp .Lmain_loop2\key_len
495 .Lmult_of_8_blks\key_len:
/* Preload the round keys that stay resident across the by-8 loop; the
 * slots kept in xkey4/xkey8/xkey12 differ between AES-128 (3/6/9) and
 * AES-192/256 (4/8/12). */
496 .if (\key_len != KEY_128)
497 vmovdqa 0*16(p_keys), xkey0
498 vmovdqa 4*16(p_keys), xkey4
499 vmovdqa 8*16(p_keys), xkey8
500 vmovdqa 12*16(p_keys), xkey12
502 vmovdqa 0*16(p_keys), xkey0
503 vmovdqa 3*16(p_keys), xkey4
504 vmovdqa 6*16(p_keys), xkey8
505 vmovdqa 9*16(p_keys), xkey12
508 .Lmain_loop2\key_len:
509 /* num_bytes is a multiple of 8 and >0 */
510 do_aes_noload 8, \key_len
512 sub $(8*16), num_bytes
513 jne .Lmain_loop2\key_len
515 .Ldo_return2\key_len:
516 /* return updated IV */
517 vpshufb xbyteswap, xcounter, xcounter
518 vmovdqu xcounter, (p_iv)
/* NOTE(review): the ret and .endm closing this macro are not part of this
 * excerpt. */
523 * routine to do AES128 CTR enc/decrypt "by8"
524 * XMM registers are clobbered.
525 * Saving/restoring must be done at a higher level
526 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
527 * unsigned int num_bytes)
/* SysV AMD64 per the C prototype above: in=%rdi, iv=%rsi, keys=%rdx,
 * out=%rcx, num_bytes=%r8d. The p_in/p_iv/p_keys/p_out/num_bytes aliases
 * are defined in a part of the file not shown here -- TODO confirm. */
529 ENTRY(aes_ctr_enc_128_avx_by8)
530 /* call the aes main loop */
531 do_aes_ctrmain KEY_128
533 ENDPROC(aes_ctr_enc_128_avx_by8)
536 * routine to do AES192 CTR enc/decrypt "by8"
537 * XMM registers are clobbered.
538 * Saving/restoring must be done at a higher level
539 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
540 * unsigned int num_bytes)
/* Same ABI as the 128-bit entry point; only the key schedule length
 * (KEY_192 -> 12 rounds) differs. */
542 ENTRY(aes_ctr_enc_192_avx_by8)
543 /* call the aes main loop */
544 do_aes_ctrmain KEY_192
546 ENDPROC(aes_ctr_enc_192_avx_by8)
549 * routine to do AES256 CTR enc/decrypt "by8"
550 * XMM registers are clobbered.
551 * Saving/restoring must be done at a higher level
552 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
553 * unsigned int num_bytes)
/* Same ABI as the 128-bit entry point; only the key schedule length
 * (KEY_256 -> 14 rounds) differs. */
555 ENTRY(aes_ctr_enc_256_avx_by8)
556 /* call the aes main loop */
557 do_aes_ctrmain KEY_256
559 ENDPROC(aes_ctr_enc_256_avx_by8)