crypto: aesni - fix "by8" variant for 128 bit keys
[deliverable/linux.git] / arch / x86 / crypto / aes_ctrby8_avx-x86_64.S
CommitLineData
22cddcc7 1/*
2 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3 *
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
6 *
7 * This work was inspired by the AES CTR mode optimization published
8 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11 *
12 * This file is provided under a dual BSD/GPLv2 license. When using or
13 * redistributing this file, you may do so under either license.
14 *
15 * GPL LICENSE SUMMARY
16 *
17 * Copyright(c) 2014 Intel Corporation.
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
22 *
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 * General Public License for more details.
27 *
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
32 *
33 * BSD LICENSE
34 *
35 * Copyright(c) 2014 Intel Corporation.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 *
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
46 * distribution.
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 *
63 */
64
#include <linux/linkage.h>
#include <asm/inst.h>

/* token pasting helper used to build symbol and register names */
#define CONCAT(a,b)	a##b
/* unaligned 128-bit move, used for the caller's input and output buffers */
#define VMOVDQ		vmovdqu

/* up to eight counter blocks processed in parallel */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* running counter, kept in byte-swapped form so vpaddq can increment it */
#define xcounter	%xmm8
/* vpshufb mask loaded from byteswap_const */
#define xbyteswap	%xmm9
/* round keys that stay resident across calls of do_aes */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/* scratch registers: round keys reloaded per call, later input data */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* function arguments, in the order the entry points receive them */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

/* general purpose scratch register */
#define tmp		%r10
/* DDQ(i) names the constant ddq_add_<i>, XMM(i) names register %xmm<i> */
#define	DDQ(i)		CONCAT(ddq_add_,i)
#define	XMM(i)		CONCAT(%xmm, i)
/* selectors understood by the club macro */
#define	DDQ_DATA	0
#define	XDATA		1
/* key length selectors passed as key_len to the do_aes* macros */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
102
.section .rodata
.align 16

/* vpshufb control mask that reverses the 16 bytes of an xmm register */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low quadword; used with vptest to see whether it is zero */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* adds one to the high quadword, i.e. the carry out of the low quadword */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/* ddq_add_<n> adds n to the low quadword of the counter */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
128
.text

/*
 * setddq n
 * Bind the assembler symbol var_ddq_add to the constant ddq_add_<n>.
 * n must already be a literal number, because DDQ() pastes it into the
 * symbol name.
 */
.macro setddq n
	var_ddq_add = DDQ(\n)
.endm
136
/*
 * setxdata n
 * Bind the assembler symbol var_xdata to register %xmm<n>.
 * n must already be a literal number, because XMM() pastes it into the
 * register name.
 */
.macro setxdata n
	var_xdata = XMM(\n)
.endm
141
/*
 * club name, id
 * Club the numeric 'id' to the symbol 'name':
 *   name == DDQ_DATA  ->  var_ddq_add = ddq_add_<id>
 *   name == XDATA     ->  var_xdata   = %xmm<id>
 * Any other name leaves both symbols unchanged.
 *
 * 'id' is usually an assembler variable such as i. Alternate macro mode is
 * switched on so that the % prefix evaluates it and hands the resulting
 * number to setddq/setxdata, which need a literal for token pasting.
 */
.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
153
/*
 * do_aes num_in_par load_keys key_len
 *
 * Encrypt num_in_par (1..8) consecutive counter values, XOR the result with
 * num_in_par 16-byte blocks read from p_in and store them at p_out.
 *
 *   b       - number of blocks handled in parallel (xdata0..xdata<b-1>)
 *   k       - non-zero: load the resident round keys xkey0, xkey4, xkey8 and
 *             xkey12 from p_keys; zero: they must already hold those keys
 *   key_len - KEY_128, KEY_192 or KEY_256
 *
 * The resident registers hold round keys 0/3/6/9 for KEY_128 and 0/4/8/12
 * otherwise; all remaining round keys are streamed through xkeyA/xkeyB.
 *
 * xcounter is advanced by num_in_par, with the carry propagated into its
 * high quadword.
 * This increments p_in, but not p_out.
 * Clobbers xdata0..xdata<b-1>, xkeyA, xkeyB and the flags.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 uses the current counter value */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * blocks 1..by-1 use counter + i. vpaddq only adds within the low
	 * quadword, so the carry is applied by hand: a low quadword of zero
	 * after adding i >= 1 means it wrapped. The carry is also added to
	 * xcounter so that the following blocks start from the right high half.
	 */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	/* advance the counter past the blocks of this call, with carry */
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* key 0 (whitening) for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* the input is read further down with negative offsets from p_in */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: last round for KEY_128 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			/* key 12: last round for KEY_192 */
			.if (klen == KEY_256)
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: last round for KEY_256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the encrypted counters with the input, two blocks at a time.
	 * xkeyA/xkeyB are free now and serve as data scratch registers.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd block count: one block is left over */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
428
/*
 * do_aes_load val, key_len
 * Process 'val' blocks with do_aes and (re)load the resident round keys
 * xkey0/xkey4/xkey8/xkey12 from p_keys while doing so.
 */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm
432
/*
 * do_aes_noload val, key_len
 * Process 'val' blocks with do_aes, relying on xkey0/xkey4/xkey8/xkey12
 * having been loaded beforehand.
 */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
436
/*
 * do_aes_ctrmain key_len
 *
 * Main body of the AES CTR routines; expands to a complete function body
 * including the final ret. key_len is KEY_128, KEY_192 or KEY_256 and is
 * also appended to every local label so the macro can be expanded once per
 * key size.
 *
 * Only whole 16-byte blocks are processed; trailing bytes of num_bytes
 * beyond the last whole block are ignored. The (num_bytes / 16) % 8
 * leftover blocks are handled first by one do_aes_load call, which also
 * loads the resident round keys, then the rest is processed eight blocks
 * per iteration. The updated counter is written back to (p_iv).
 *
 * If num_bytes is below one block nothing is processed and (p_iv) is left
 * untouched: xbyteswap and xcounter have not been loaded at that point, so
 * writing the counter back would store garbage into the caller's IV.
 *
 * Clobbers tmp, num_bytes, p_in, p_out, the flags and the xmm registers
 * used by do_aes.
 */
.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return_noiv\key_len

	/*
	 * Round down to whole blocks. The 8-block loop below terminates on
	 * num_bytes reaching exactly zero, so stray low bits must not survive.
	 */
	and	$(~15), num_bytes

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = 16 * (number of blocks modulo 8) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp / 16 <= 7 */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	/* keep only the part that is a multiple of 8 blocks */
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * No leftover blocks, so no do_aes_load ran: load the resident round
	 * keys here. The slots must match what do_aes expects per key size.
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and >0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Ldo_return_noiv\key_len:
	ret
.endm
542
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Arguments arrive as in = %rdi, iv = %rsi, keys = %rdx, out = %rcx,
 * num_bytes = %r8. The whole body, including the ret, is generated by
 * do_aes_ctrmain.
 * NOTE(review): num_bytes is declared as a 32-bit value but the code
 * operates on all 64 bits of %r8 - confirm callers pass it zero-extended.
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)
555
/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Arguments arrive as in = %rdi, iv = %rsi, keys = %rdx, out = %rcx,
 * num_bytes = %r8. The whole body, including the ret, is generated by
 * do_aes_ctrmain.
 * NOTE(review): num_bytes is declared as a 32-bit value but the code
 * operates on all 64 bits of %r8 - confirm callers pass it zero-extended.
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)
568
/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Arguments arrive as in = %rdi, iv = %rsi, keys = %rdx, out = %rcx,
 * num_bytes = %r8. The whole body, including the ret, is generated by
 * do_aes_ctrmain.
 * NOTE(review): num_bytes is declared as a 32-bit value but the code
 * operates on all 64 bits of %r8 - confirm callers pass it zero-extended.
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)
This page took 0.064818 seconds and 5 git commands to generate.