/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>

.data
POLY:        .octa 0xC2000000000000000000000000000001
TWOONE:      .octa 0x00000001000000000000000000000001

# Order of these constants should not change.
# More specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F.

SHUF_MASK:   .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:       .octa 0x0000000000000000ffffffffffffffff
MASK2:       .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK:  .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:       .octa 0xffffffffffffffffffffffffffffffff
ZERO:        .octa 0x00000000000000000000000000000000
ONE:         .octa 0x00000000000000000000000000000001
F_MIN_MASK:  .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:         .octa 0x1
enc:         .octa 0x2


.text


#define STACK_OFFSET    8*3
#define HashKey         16*0    // store HashKey <<1 mod poly here
#define HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
#define HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
#define HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
#define HashKey_k       16*4    // store XOR of High 64 bits and Low 64
                                // bits of HashKey <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^2 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^3 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^4 <<1 mod poly here
                                // (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8

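# Scratch-layout note (informal sketch): the GCM entry points below cache
# H, H^2, H^3 and H^4 (each already <<1 mod poly) plus their Karatsuba
# "high XOR low" halves in the 64-byte-aligned stack area reserved via
# VARIABLE_OFFSET, so the 4-blocks-at-a-time loop never recomputes key powers.
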
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)


#define STATE1  %xmm0
#define STATE2  %xmm4
#define STATE3  %xmm5
#define STATE4  %xmm6
#define STATE   STATE1
#define IN1     %xmm1
#define IN2     %xmm7
#define IN3     %xmm8
#define IN4     %xmm9
#define IN      IN1
#define KEY     %xmm2
#define IV      %xmm3

#define BSWAP_MASK %xmm10
#define CTR     %xmm11
#define INC     %xmm12

#ifdef __x86_64__
#define AREG    %rax
#define KEYP    %rdi
#define OUTP    %rsi
#define UKEYP   OUTP
#define INP     %rdx
#define LEN     %rcx
#define IVP     %r8
#define KLEN    %r9d
#define T1      %r10
#define TKEYP   T1
#define T2      %r11
#define TCTR_LOW T2
#else
#define AREG    %eax
#define KEYP    %edi
#define OUTP    AREG
#define UKEYP   OUTP
#define INP     %edx
#define LEN     %esi
#define IVP     %ebp
#define KLEN    %ebx
#define T1      %ecx
#define TKEYP   T1
#endif

/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1)
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 *
 */
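/*
 * Karatsuba refresher (informal sketch): writing A = a1:a0 and B = b1:b0
 * as 64-bit halves, the carry-less product is
 *   A*B = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
 * where "+" is XOR in GF(2), so three PCLMULQDQs replace four. The macro
 * below computes exactly these three products before the reduction.
 */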
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
        movdqa    \GH, \TMP1
        pshufd    $78, \GH, \TMP2
        pshufd    $78, \HK, \TMP3
        pxor      \GH, \TMP2           # TMP2 = a1+a0
        pxor      \HK, \TMP3           # TMP3 = b1+b0
        PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
        PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
        PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor      \GH, \TMP2
        pxor      \TMP1, \TMP2         # TMP2 = middle term a1*b0 + a0*b1
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3            # left shift TMP3 2 DWs
        psrldq    $8, \TMP2            # right shift TMP2 2 DWs
        pxor      \TMP3, \GH
        pxor      \TMP2, \TMP1         # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

        movdqa    \GH, \TMP2
        movdqa    \GH, \TMP3
        movdqa    \GH, \TMP4           # copy GH into TMP2, TMP3 and TMP4
                                       # in order to perform
                                       # independent shifts
        pslld     $31, \TMP2           # packed left shift <<31
        pslld     $30, \TMP3           # packed left shift <<30
        pslld     $25, \TMP4           # packed left shift <<25
        pxor      \TMP3, \TMP2         # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5            # right shift TMP5 1 DW
        pslldq    $12, \TMP2           # left shift TMP2 3 DWs
        pxor      \TMP2, \GH

        # second phase of the reduction

        movdqa    \GH,\TMP2            # copy GH into TMP2, TMP3 and TMP4
                                       # in order to perform
                                       # independent shifts
        movdqa    \GH,\TMP3
        movdqa    \GH,\TMP4
        psrld     $1,\TMP2             # packed right shift >>1
        psrld     $2,\TMP3             # packed right shift >>2
        psrld     $7,\TMP4             # packed right shift >>7
        pxor      \TMP3,\TMP2          # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \GH
        pxor      \TMP1, \GH           # reduced result is in GH
.endm

/*
 * if a = number of total plaintext bytes
 *    b = floor(a/16)
 *    num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 */

.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation

        mov       arg7, %r10           # %r10 = AAD
        mov       arg8, %r12           # %r12 = aadLen
        mov       %r12, %r11
        pxor      %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
        movd      (%r10), \TMP1
        pslldq    $12, \TMP1
        psrldq    $4, %xmm\i
        pxor      \TMP1, %xmm\i
        add       $4, %r10
        sub       $4, %r12
        jne       _get_AAD_loop\num_initial_blocks\operation
        cmp       $16, %r11
        je        _get_AAD_loop2_done\num_initial_blocks\operation
        mov       $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
        psrldq    $4, %xmm\i
        sub       $4, %r12
        cmp       %r11, %r12
        jne       _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
        pshufb    SHUF_MASK(%rip), %xmm\i  # byte-reflect the AAD data
        xor       %r11, %r11           # initialise the data pointer offset as zero

        # start AES for num_initial_blocks blocks

        mov       %arg5, %rax          # %rax = *Y0
        movdqu    (%rax), \XMM0        # XMM0 = Y0
        pshufb    SHUF_MASK(%rip), \XMM0
.if \i_seq != 0
.irpc index, \i_seq
        paddd     ONE(%rip), \XMM0     # INCR Y0
        movdqa    \XMM0, %xmm\index
        pshufb    SHUF_MASK(%rip), %xmm\index  # perform a 16 byte swap
.endr
.irpc index, \i_seq
        pxor      16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
        movaps    0x10(%rdi), \TMP1
        AESENC    \TMP1, %xmm\index    # Round 1
.endr
.irpc index, \i_seq
        movaps    0x20(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index    # Round 2
.endr
.irpc index, \i_seq
        movaps    0x30(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index    # Round 3
.endr
.irpc index, \i_seq
        movaps    0x40(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index    # Round 4
.endr
.irpc index, \i_seq
        movaps    0x50(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index    # Round 5
.endr
.irpc index, \i_seq
        movaps    0x60(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index    # Round 6
.endr
.irpc index, \i_seq
        movaps    0x70(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index    # Round 7
.endr
.irpc index, \i_seq
        movaps    0x80(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index    # Round 8
.endr
.irpc index, \i_seq
        movaps    0x90(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index    # Round 9
.endr
.irpc index, \i_seq
        movaps    0xa0(%arg1), \TMP1
        AESENCLAST \TMP1, %xmm\index   # Round 10
.endr
.irpc index, \i_seq
        movdqu    (%arg3 , %r11, 1), \TMP1
        pxor      \TMP1, %xmm\index
        movdqu    %xmm\index, (%arg2 , %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        add       $16, %r11
.if \operation == dec
        movdqa    \TMP1, %xmm\index
.endif
        pshufb    SHUF_MASK(%rip), %xmm\index
        # prepare plaintext/ciphertext for GHASH computation
.endr
.endif
        GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor      %xmm5, %xmm6
        GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor      %xmm6, %xmm7
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor      %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor      %xmm6, %xmm7
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor      %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor      %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
        cmp       $64, %r13
        jl        _initial_blocks_done\num_initial_blocks\operation
        # no need for precomputed values
/*
 *
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of HashKey_i.
 */
        paddd     ONE(%rip), \XMM0     # INCR Y0
        movdqa    \XMM0, \XMM1
        pshufb    SHUF_MASK(%rip), \XMM1   # perform a 16 byte swap
        paddd     ONE(%rip), \XMM0     # INCR Y0
        movdqa    \XMM0, \XMM2
        pshufb    SHUF_MASK(%rip), \XMM2   # perform a 16 byte swap
        paddd     ONE(%rip), \XMM0     # INCR Y0
        movdqa    \XMM0, \XMM3
        pshufb    SHUF_MASK(%rip), \XMM3   # perform a 16 byte swap
        paddd     ONE(%rip), \XMM0     # INCR Y0
        movdqa    \XMM0, \XMM4
        pshufb    SHUF_MASK(%rip), \XMM4   # perform a 16 byte swap
        pxor      16*0(%arg1), \XMM1
        pxor      16*0(%arg1), \XMM2
        pxor      16*0(%arg1), \XMM3
        pxor      16*0(%arg1), \XMM4
        movdqa    \TMP3, \TMP5
        pshufd    $78, \TMP3, \TMP1
        pxor      \TMP3, \TMP1
        movdqa    \TMP1, HashKey_k(%rsp)
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
        movdqa    \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd    $78, \TMP5, \TMP1
        pxor      \TMP5, \TMP1
        movdqa    \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
        movaps    0x10*\index(%arg1), \TMP1
        AESENC    \TMP1, \XMM1
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
        movdqa    \TMP5, HashKey_3(%rsp)
        pshufd    $78, \TMP5, \TMP1
        pxor      \TMP5, \TMP1
        movdqa    \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
        movaps    0x10*\index(%arg1), \TMP1
        AESENC    \TMP1, \XMM1
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
        movdqa    \TMP5, HashKey_4(%rsp)
        pshufd    $78, \TMP5, \TMP1
        pxor      \TMP5, \TMP1
        movdqa    \TMP1, HashKey_4_k(%rsp)
        movaps    0xa0(%arg1), \TMP2
        AESENCLAST \TMP2, \XMM1
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
        movdqu    16*0(%arg3 , %r11 , 1), \TMP1
        pxor      \TMP1, \XMM1
.if \operation == dec
        movdqu    \XMM1, 16*0(%arg2 , %r11 , 1)
        movdqa    \TMP1, \XMM1
.endif
        movdqu    16*1(%arg3 , %r11 , 1), \TMP1
        pxor      \TMP1, \XMM2
.if \operation == dec
        movdqu    \XMM2, 16*1(%arg2 , %r11 , 1)
        movdqa    \TMP1, \XMM2
.endif
        movdqu    16*2(%arg3 , %r11 , 1), \TMP1
        pxor      \TMP1, \XMM3
.if \operation == dec
        movdqu    \XMM3, 16*2(%arg2 , %r11 , 1)
        movdqa    \TMP1, \XMM3
.endif
        movdqu    16*3(%arg3 , %r11 , 1), \TMP1
        pxor      \TMP1, \XMM4
.if \operation == dec
        movdqu    \XMM4, 16*3(%arg2 , %r11 , 1)
        movdqa    \TMP1, \XMM4
.else
        movdqu    \XMM1, 16*0(%arg2 , %r11 , 1)
        movdqu    \XMM2, 16*1(%arg2 , %r11 , 1)
        movdqu    \XMM3, 16*2(%arg2 , %r11 , 1)
        movdqu    \XMM4, 16*3(%arg2 , %r11 , 1)
.endif
        add       $64, %r11
        pshufb    SHUF_MASK(%rip), \XMM1   # perform a 16 byte swap
        pxor      \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
        pshufb    SHUF_MASK(%rip), \XMM2   # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM3   # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM4   # perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
.endm

/*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa    \XMM1, \XMM5
        movdqa    \XMM2, \XMM6
        movdqa    \XMM3, \XMM7
        movdqa    \XMM4, \XMM8

        # multiply XMM5 * HashKey_4 using karatsuba

        movdqa    \XMM5, \TMP4
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0     # INCR CNT
        movdqa    HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4   # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0     # INCR CNT
        movdqa    \XMM0, \XMM2
        paddd     ONE(%rip), \XMM0     # INCR CNT
        movdqa    \XMM0, \XMM3
        paddd     ONE(%rip), \XMM0     # INCR CNT
        movdqa    \XMM0, \XMM4
        pshufb    SHUF_MASK(%rip), \XMM1   # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5   # XMM5 = a0*b0
        pshufb    SHUF_MASK(%rip), \XMM2   # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM3   # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM4   # perform a 16 byte swap
        pxor      (%arg1), \XMM1
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
        movdqa    HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6   # TMP6 = (a1+a0)*(b1+b0)
        movaps    0x10(%arg1), \TMP1
        AESENC    \TMP1, \XMM1         # Round 1
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movaps    0x20(%arg1), \TMP1
        AESENC    \TMP1, \XMM1         # Round 2
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
        movdqa    HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1 * b1
        movaps    0x30(%arg1), \TMP3
        AESENC    \TMP3, \XMM1         # Round 3
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM6   # XMM6 = a0*b0
        movaps    0x40(%arg1), \TMP3
        AESENC    \TMP3, \XMM1         # Round 4
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x50(%arg1), \TMP3
        AESENC    \TMP3, \XMM1         # Round 5
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM6, \XMM5
        pxor      \TMP2, \TMP6
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
        movdqa    HashKey_2(%rsp), \TMP5

        # multiply XMM7 * HashKey_2 using karatsuba

        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps    0x60(%arg1), \TMP3
        AESENC    \TMP3, \XMM1         # Round 6
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM7   # XMM7 = a0*b0
        movaps    0x70(%arg1), \TMP3
        AESENC    \TMP3, \XMM1         # Round 7
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x80(%arg1), \TMP3
        AESENC    \TMP3, \XMM1         # Round 8
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM7, \XMM5
        pxor      \TMP2, \TMP6

        # multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
        movdqa    HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps    0x90(%arg1), \TMP3
        AESENC    \TMP3, \XMM1         # Round 9
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM8   # XMM8 = a0*b0
        movaps    0xa0(%arg1), \TMP3
        AESENCLAST \TMP3, \XMM1        # Round 10
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa    HashKey_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM1         # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM1, (%arg2,%r11,1)    # write to plaintext buffer
        movdqa    \TMP3, \XMM1
.endif
        movdqu    16(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM2         # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM2, 16(%arg2,%r11,1)  # write to plaintext buffer
        movdqa    \TMP3, \XMM2
.endif
        movdqu    32(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM3         # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM3, 32(%arg2,%r11,1)  # write to plaintext buffer
        movdqa    \TMP3, \XMM3
.endif
        movdqu    48(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM4         # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM4, 48(%arg2,%r11,1)  # write to plaintext buffer
        movdqa    \TMP3, \XMM4
.else
        movdqu    \XMM1, (%arg2,%r11,1)    # write to the ciphertext buffer
        movdqu    \XMM2, 16(%arg2,%r11,1)  # write to the ciphertext buffer
        movdqu    \XMM3, 32(%arg2,%r11,1)  # write to the ciphertext buffer
        movdqu    \XMM4, 48(%arg2,%r11,1)  # write to the ciphertext buffer
.endif
        pshufb    SHUF_MASK(%rip), \XMM1   # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM2   # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM3   # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM4   # perform a 16 byte swap

        pxor      \TMP4, \TMP1
        pxor      \XMM8, \XMM5
        pxor      \TMP6, \TMP2
        pxor      \TMP1, \TMP2
        pxor      \XMM5, \TMP2
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3            # left shift TMP3 2 DWs
        psrldq    $8, \TMP2            # right shift TMP2 2 DWs
        pxor      \TMP3, \XMM5
        pxor      \TMP2, \TMP1         # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa    \XMM5, \TMP2
        movdqa    \XMM5, \TMP3
        movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld     $31, \TMP2           # packed left shift << 31
        pslld     $30, \TMP3           # packed left shift << 30
        pslld     $25, \TMP4           # packed left shift << 25
        pxor      \TMP3, \TMP2         # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5            # right shift TMP5 1 DW
        pslldq    $12, \TMP2           # left shift TMP2 3 DWs
        pxor      \TMP2, \XMM5

        # second phase of reduction

        movdqa    \XMM5,\TMP2          # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa    \XMM5,\TMP3
        movdqa    \XMM5,\TMP4
        psrld     $1, \TMP2            # packed right shift >>1
        psrld     $2, \TMP3            # packed right shift >>2
        psrld     $7, \TMP4            # packed right shift >>7
        pxor      \TMP3,\TMP2          # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \XMM5
        pxor      \TMP1, \XMM5         # reduced result is in XMM5

        pxor      \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply XMM1 * HashKey^4 (using Karatsuba)

        movdqa    \XMM1, \TMP6
        pshufd    $78, \XMM1, \TMP2
        pxor      \XMM1, \TMP2
        movdqa    HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP6   # TMP6 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM1   # XMM1 = a0*b0
        movdqa    HashKey_4_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqa    \XMM1, \XMMDst
        movdqa    \TMP2, \XMM1         # result in TMP6, XMMDst, XMM1

        # Multiply XMM2 * HashKey^3 (using Karatsuba)

        movdqa    \XMM2, \TMP1
        pshufd    $78, \XMM2, \TMP2
        pxor      \XMM2, \TMP2
        movdqa    HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM2   # XMM2 = a0*b0
        movdqa    HashKey_3_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM2, \XMMDst
        pxor      \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM3 * HashKey^2 (using Karatsuba)

        movdqa    \XMM3, \TMP1
        pshufd    $78, \XMM3, \TMP2
        pxor      \XMM3, \TMP2
        movdqa    HashKey_2(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM3   # XMM3 = a0*b0
        movdqa    HashKey_2_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM3, \XMMDst
        pxor      \TMP2, \XMM1         # results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM4 * HashKey (using Karatsuba)
        movdqa    \XMM4, \TMP1
        pshufd    $78, \XMM4, \TMP2
        pxor      \XMM4, \TMP2
        movdqa    HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM4   # XMM4 = a0*b0
        movdqa    HashKey_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM4, \XMMDst
        pxor      \XMM1, \TMP2
        pxor      \TMP6, \TMP2
        pxor      \XMMDst, \TMP2
        # middle section of the temp results combined as in karatsuba algorithm
        movdqa    \TMP2, \TMP4
        pslldq    $8, \TMP4            # left shift TMP4 2 DWs
        psrldq    $8, \TMP2            # right shift TMP2 2 DWs
        pxor      \TMP4, \XMMDst
        pxor      \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
        # first phase of the reduction
        movdqa    \XMMDst, \TMP2
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
        pslld     $31, \TMP2           # packed left shifting << 31
        pslld     $30, \TMP3           # packed left shifting << 30
        pslld     $25, \TMP4           # packed left shifting << 25
        pxor      \TMP3, \TMP2         # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP7
        psrldq    $4, \TMP7            # right shift TMP7 1 DW
        pslldq    $12, \TMP2           # left shift TMP2 3 DWs
        pxor      \TMP2, \XMMDst

        # second phase of the reduction
        movdqa    \XMMDst, \TMP2
        # make 3 copies of XMMDst for doing 3 shift operations
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
        psrld     $1, \TMP2            # packed right shift >> 1
        psrld     $2, \TMP3            # packed right shift >> 2
        psrld     $7, \TMP4            # packed right shift >> 7
        pxor      \TMP3, \TMP2         # xor the shifted versions
        pxor      \TMP4, \TMP2
        pxor      \TMP7, \TMP2
        pxor      \TMP2, \XMMDst
        pxor      \TMP6, \XMMDst       # reduced result is in XMMDst
.endm

/* Encryption of a single block */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

        pxor      (%arg1), \XMM0
        movaps    16(%arg1), \TMP1
        AESENC    \TMP1, \XMM0
        movaps    32(%arg1), \TMP1
        AESENC    \TMP1, \XMM0
        movaps    48(%arg1), \TMP1
        AESENC    \TMP1, \XMM0
        movaps    64(%arg1), \TMP1
        AESENC    \TMP1, \XMM0
        movaps    80(%arg1), \TMP1
        AESENC    \TMP1, \XMM0
        movaps    96(%arg1), \TMP1
        AESENC    \TMP1, \XMM0
        movaps    112(%arg1), \TMP1
        AESENC    \TMP1, \XMM0
        movaps    128(%arg1), \TMP1
        AESENC    \TMP1, \XMM0
        movaps    144(%arg1), \TMP1
        AESENC    \TMP1, \XMM0
        movaps    160(%arg1), \TMP1
        AESENCLAST \TMP1, \XMM0
.endm
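
# Note (informal): the ten AESENC rounds above hard-code the 11-round-key
# AES-128 schedule, matching the 128-bit-only RFC4106 GCM path in this file;
# 192/256-bit keys would need the longer schedules used by _aesni_enc1 below.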


/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,           // Plaintext output. Decrypt in-place is allowed.
*                    const u8 *in,      // Ciphertext input
*                    u64 plaintext_len, // Length of data in bytes for decryption.
*                    u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                       // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                       // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,     // Additional Authentication Data (AAD)
*                    u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
*                                       // given authentication tag and only return the plaintext if they match.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                       // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            Salt  (From the SA)                |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
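/*
 * Hypothetical C-side call (a sketch only; the real callers live in
 * aesni-intel_glue.c and derive these arguments from an RFC4106 AEAD
 * request -- the variable names here are illustrative):
 *
 *      aesni_gcm_dec(aes_ctx, out, in, ciphertext_len,
 *                    iv,          // salt || IV || 0x00000001, 16-byte aligned
 *                    hash_subkey, // H = E(K, 0^128)
 *                    aad, aad_len,
 *                    auth_tag, auth_tag_len);
 */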

ENTRY(aesni_gcm_dec)
        push      %r12
        push      %r13
        push      %r14
        mov       %rsp, %r14
/*
 * states of %xmm registers %xmm6:%xmm15 not saved
 * all %xmm registers are clobbered
 */
        sub       $VARIABLE_OFFSET, %rsp
        and       $~63, %rsp           # align rsp to 64 bytes
        mov       %arg6, %r12
        movdqu    (%r12), %xmm13       # %xmm13 = HashKey
        pshufb    SHUF_MASK(%rip), %xmm13

# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

        movdqa    %xmm13, %xmm2
        psllq     $1, %xmm13
        psrlq     $63, %xmm2
        movdqa    %xmm2, %xmm1
        pslldq    $8, %xmm2
        psrldq    $8, %xmm1
        por       %xmm2, %xmm13

        # Reduction

        pshufd    $0x24, %xmm1, %xmm2
        pcmpeqd   TWOONE(%rip), %xmm2
        pand      POLY(%rip), %xmm2
        pxor      %xmm2, %xmm13        # %xmm13 holds the HashKey<<1 (mod poly)


        # Decrypt first few blocks

        movdqa    %xmm13, HashKey(%rsp)    # store HashKey<<1 (mod poly)
        mov       %arg4, %r13          # save the number of bytes of plaintext/ciphertext
        and       $-16, %r13           # %r13 = %r13 - (%r13 mod 16)
        mov       %r13, %r12
        and       $(3<<4), %r12
        jz        _initial_num_blocks_is_0_decrypt
        cmp       $(2<<4), %r12
        jb        _initial_num_blocks_is_1_decrypt
        je        _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
        INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
        sub       $48, %r13
        jmp       _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
        INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
        sub       $32, %r13
        jmp       _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
        INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
        sub       $16, %r13
        jmp       _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
        INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
        cmp       $0, %r13
        je        _zero_cipher_left_decrypt
        sub       $64, %r13
        je        _four_cipher_left_decrypt
_decrypt_by_4:
        GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
        add       $64, %r11
        sub       $64, %r13
        jne       _decrypt_by_4
_four_cipher_left_decrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
        mov       %arg4, %r13
        and       $15, %r13            # %r13 = arg4 (mod 16)
        je        _multiple_of_16_bytes_decrypt

        # Handle the last <16 byte block separately

        paddd     ONE(%rip), %xmm0     # increment CNT to get Yn
        pshufb    SHUF_MASK(%rip), %xmm0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1  # E(K, Yn)
        sub       $16, %r11
        add       %r13, %r11
        movdqu    (%arg3,%r11,1), %xmm1    # receive the last <16 byte block
        lea       SHIFT_MASK+16(%rip), %r12
        sub       %r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
        movdqu    (%r12), %xmm2        # get the appropriate shuffle mask
        pshufb    %xmm2, %xmm1         # right shift 16-%r13 bytes
        movdqa    %xmm1, %xmm2
        pxor      %xmm1, %xmm0         # Ciphertext XOR E(K, Yn)
        movdqu    ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
        pand      %xmm1, %xmm0         # mask out top 16-%r13 bytes of %xmm0
        pand      %xmm1, %xmm2
        pshufb    SHUF_MASK(%rip),%xmm2
        pxor      %xmm2, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        sub       %r13, %r11
        add       $16, %r11

        # output %r13 bytes
        movq      %xmm0, %rax
        cmp       $8, %r13
        jle       _less_than_8_bytes_left_decrypt
        mov       %rax, (%arg2 , %r11, 1)
        add       $8, %r11
        psrldq    $8, %xmm0
        movq      %xmm0, %rax
        sub       $8, %r13
_less_than_8_bytes_left_decrypt:
        mov       %al, (%arg2, %r11, 1)
        add       $1, %r11
        shr       $8, %rax
        sub       $1, %r13
        jne       _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
        mov       arg8, %r12           # %r12 = aadLen (number of bytes)
        shl       $3, %r12             # convert into number of bits
        movd      %r12d, %xmm15        # len(A) in %xmm15
        shl       $3, %arg4            # len(C) in bits (*8)
        movq      %arg4, %xmm1
        pslldq    $8, %xmm15           # %xmm15 = len(A)||0x0000000000000000
        pxor      %xmm1, %xmm15        # %xmm15 = len(A)||len(C)
        pxor      %xmm15, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        pshufb    SHUF_MASK(%rip), %xmm8
        mov       %arg5, %rax          # %rax = *Y0
        movdqu    (%rax), %xmm0        # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1  # E(K, Y0)
        pxor      %xmm8, %xmm0
_return_T_decrypt:
        mov       arg9, %r10           # %r10 = authTag
        mov       arg10, %r11          # %r11 = auth_tag_len
        cmp       $16, %r11
        je        _T_16_decrypt
        cmp       $12, %r11
        je        _T_12_decrypt
_T_8_decrypt:
        movq      %xmm0, %rax
        mov       %rax, (%r10)
        jmp       _return_T_done_decrypt
_T_12_decrypt:
        movq      %xmm0, %rax
        mov       %rax, (%r10)
        psrldq    $8, %xmm0
        movd      %xmm0, %eax
        mov       %eax, 8(%r10)
        jmp       _return_T_done_decrypt
_T_16_decrypt:
        movdqu    %xmm0, (%r10)
_return_T_done_decrypt:
        mov       %r14, %rsp
        pop       %r14
        pop       %r13
        pop       %r12
        ret


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,           // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,      // Plaintext input
*                    u64 plaintext_len, // Length of data in bytes for encryption.
*                    u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                       // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                       // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,     // Additional Authentication Data (AAD)
*                    u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,      // Authenticated Tag output.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                       // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            Salt  (From the SA)                |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
        push      %r12
        push      %r13
        push      %r14
        mov       %rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
        sub       $VARIABLE_OFFSET, %rsp
        and       $~63, %rsp
        mov       %arg6, %r12
        movdqu    (%r12), %xmm13
        pshufb    SHUF_MASK(%rip), %xmm13

# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

        movdqa    %xmm13, %xmm2
        psllq     $1, %xmm13
        psrlq     $63, %xmm2
        movdqa    %xmm2, %xmm1
        pslldq    $8, %xmm2
        psrldq    $8, %xmm1
        por       %xmm2, %xmm13

        # reduce HashKey<<1

        pshufd    $0x24, %xmm1, %xmm2
        pcmpeqd   TWOONE(%rip), %xmm2
        pand      POLY(%rip), %xmm2
        pxor      %xmm2, %xmm13        # %xmm13 holds the HashKey<<1 (mod poly)
        movdqa    %xmm13, HashKey(%rsp)
        mov       %arg4, %r13          # save the number of bytes of plaintext/ciphertext
        and       $-16, %r13
        mov       %r13, %r12

        # Encrypt first few blocks

        and       $(3<<4), %r12
        jz        _initial_num_blocks_is_0_encrypt
        cmp       $(2<<4), %r12
        jb        _initial_num_blocks_is_1_encrypt
        je        _initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
        INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
        sub       $48, %r13
        jmp       _initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
        INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
        sub       $32, %r13
        jmp       _initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
        INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
        sub       $16, %r13
        jmp       _initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
        INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

        # Main loop - Encrypt remaining blocks

        cmp       $0, %r13
        je        _zero_cipher_left_encrypt
        sub       $64, %r13
        je        _four_cipher_left_encrypt
_encrypt_by_4_encrypt:
        GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
        add       $64, %r11
        sub       $64, %r13
        jne       _encrypt_by_4_encrypt
_four_cipher_left_encrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
        mov       %arg4, %r13
        and       $15, %r13            # %r13 = arg4 (mod 16)
        je        _multiple_of_16_bytes_encrypt

        # Handle the last <16 byte block separately
        paddd     ONE(%rip), %xmm0     # INCR CNT to get Yn
        pshufb    SHUF_MASK(%rip), %xmm0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1  # Encrypt(K, Yn)
        sub       $16, %r11
        add       %r13, %r11
        movdqu    (%arg3,%r11,1), %xmm1    # receive the last <16 byte block
        lea       SHIFT_MASK+16(%rip), %r12
        sub       %r13, %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (%r13 is the number of bytes in plaintext mod 16)
        movdqu    (%r12), %xmm2        # get the appropriate shuffle mask
        pshufb    %xmm2, %xmm1         # shift right 16-r13 bytes
        pxor      %xmm1, %xmm0         # Plaintext XOR Encrypt(K, Yn)
        movdqu    ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-r13 bytes of xmm0
        pand      %xmm1, %xmm0         # mask out top 16-r13 bytes of xmm0

        pshufb    SHUF_MASK(%rip),%xmm0
        pxor      %xmm0, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        sub       %r13, %r11
        add       $16, %r11
        pshufb    SHUF_MASK(%rip), %xmm0
        # shuffle xmm0 back to output as ciphertext

        # Output %r13 bytes
        movq      %xmm0, %rax
        cmp       $8, %r13
        jle       _less_than_8_bytes_left_encrypt
        mov       %rax, (%arg2 , %r11, 1)
        add       $8, %r11
        psrldq    $8, %xmm0
        movq      %xmm0, %rax
        sub       $8, %r13
_less_than_8_bytes_left_encrypt:
        mov       %al, (%arg2, %r11, 1)
        add       $1, %r11
        shr       $8, %rax
        sub       $1, %r13
        jne       _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
        mov       arg8, %r12           # %r12 = aadLen (number of bytes)
        shl       $3, %r12             # convert into number of bits
        movd      %r12d, %xmm15        # len(A) in %xmm15
        shl       $3, %arg4            # len(C) in bits (*8)
        movq      %arg4, %xmm1
        pslldq    $8, %xmm15           # %xmm15 = len(A)||0x0000000000000000
        pxor      %xmm1, %xmm15        # %xmm15 = len(A)||len(C)
        pxor      %xmm15, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation

        pshufb    SHUF_MASK(%rip), %xmm8   # perform a 16 byte swap
        mov       %arg5, %rax          # %rax = *Y0
        movdqu    (%rax), %xmm0        # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
        pxor      %xmm8, %xmm0
_return_T_encrypt:
        mov       arg9, %r10           # %r10 = authTag
        mov       arg10, %r11          # %r11 = auth_tag_len
        cmp       $16, %r11
        je        _T_16_encrypt
        cmp       $12, %r11
        je        _T_12_encrypt
_T_8_encrypt:
        movq      %xmm0, %rax
        mov       %rax, (%r10)
        jmp       _return_T_done_encrypt
_T_12_encrypt:
        movq      %xmm0, %rax
        mov       %rax, (%r10)
        psrldq    $8, %xmm0
        movd      %xmm0, %eax
        mov       %eax, 8(%r10)
        jmp       _return_T_done_encrypt
_T_16_encrypt:
        movdqu    %xmm0, (%r10)
_return_T_done_encrypt:
        mov       %r14, %rsp
        pop       %r14
        pop       %r13
        pop       %r12
        ret


_key_expansion_128:
_key_expansion_256a:
        pshufd    $0b11111111, %xmm1, %xmm1
        shufps    $0b00010000, %xmm0, %xmm4
        pxor      %xmm4, %xmm0
        shufps    $0b10001100, %xmm0, %xmm4
        pxor      %xmm4, %xmm0
        pxor      %xmm1, %xmm0
        movaps    %xmm0, (TKEYP)
        add       $0x10, TKEYP
        ret

.align 4
_key_expansion_192a:
        pshufd    $0b01010101, %xmm1, %xmm1
        shufps    $0b00010000, %xmm0, %xmm4
        pxor      %xmm4, %xmm0
        shufps    $0b10001100, %xmm0, %xmm4
        pxor      %xmm4, %xmm0
        pxor      %xmm1, %xmm0

        movaps    %xmm2, %xmm5
        movaps    %xmm2, %xmm6
        pslldq    $4, %xmm5
        pshufd    $0b11111111, %xmm0, %xmm3
        pxor      %xmm3, %xmm2
        pxor      %xmm5, %xmm2

        movaps    %xmm0, %xmm1
        shufps    $0b01000100, %xmm0, %xmm6
        movaps    %xmm6, (TKEYP)
        shufps    $0b01001110, %xmm2, %xmm1
        movaps    %xmm1, 0x10(TKEYP)
        add       $0x20, TKEYP
        ret

.align 4
_key_expansion_192b:
        pshufd    $0b01010101, %xmm1, %xmm1
        shufps    $0b00010000, %xmm0, %xmm4
        pxor      %xmm4, %xmm0
        shufps    $0b10001100, %xmm0, %xmm4
        pxor      %xmm4, %xmm0
        pxor      %xmm1, %xmm0

        movaps    %xmm2, %xmm5
        pslldq    $4, %xmm5
        pshufd    $0b11111111, %xmm0, %xmm3
        pxor      %xmm3, %xmm2
        pxor      %xmm5, %xmm2

        movaps    %xmm0, (TKEYP)
        add       $0x10, TKEYP
        ret

.align 4
_key_expansion_256b:
        pshufd    $0b10101010, %xmm1, %xmm1
        shufps    $0b00010000, %xmm2, %xmm4
        pxor      %xmm4, %xmm2
        shufps    $0b10001100, %xmm2, %xmm4
        pxor      %xmm4, %xmm2
        pxor      %xmm1, %xmm2
        movaps    %xmm2, (TKEYP)
        add       $0x10, TKEYP
        ret

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
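/*
 * Round-constant note (informal): each AESKEYGENASSIST immediate below is
 * the AES rcon value for that round -- 0x01, 0x02, ..., 0x80, 0x1b, 0x36 --
 * i.e. doubling in GF(2^8) with reduction by x^8+x^4+x^3+x+1 after 0x80.
 */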
ENTRY(aesni_set_key)
#ifndef __x86_64__
        pushl     KEYP
        movl      8(%esp), KEYP        # ctx
        movl      12(%esp), UKEYP      # in_key
        movl      16(%esp), %edx       # key_len
#endif
        movups    (UKEYP), %xmm0       # user key (first 16 bytes)
        movaps    %xmm0, (KEYP)
        lea       0x10(KEYP), TKEYP    # key addr
        movl      %edx, 480(KEYP)
        pxor      %xmm4, %xmm4         # xmm4 is assumed 0 in _key_expansion_x
        cmp       $24, %dl
        jb        .Lenc_key128
        je        .Lenc_key192
        movups    0x10(UKEYP), %xmm2   # other user key
        movaps    %xmm2, (TKEYP)
        add       $0x10, TKEYP
        AESKEYGENASSIST 0x1 %xmm2 %xmm1     # round 1
        call      _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1     # round 2
        call      _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1     # round 3
        call      _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1     # round 4
        call      _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1    # round 5
        call      _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1    # round 6
        call      _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1    # round 7
        call      _key_expansion_256a
        jmp       .Ldec_key
.Lenc_key192:
        movq      0x10(UKEYP), %xmm2   # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1     # round 1
        call      _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1     # round 2
        call      _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1     # round 3
        call      _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1     # round 4
        call      _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1    # round 5
        call      _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1    # round 6
        call      _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1    # round 7
        call      _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1    # round 8
        call      _key_expansion_192b
        jmp       .Ldec_key
.Lenc_key128:
        AESKEYGENASSIST 0x1 %xmm0 %xmm1     # round 1
        call      _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1     # round 2
        call      _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1     # round 3
        call      _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1     # round 4
        call      _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1    # round 5
        call      _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1    # round 6
        call      _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1    # round 7
        call      _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1    # round 8
        call      _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1    # round 9
        call      _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1    # round 10
        call      _key_expansion_128
.Ldec_key:
        sub       $0x10, TKEYP
        movaps    (KEYP), %xmm0
        movaps    (TKEYP), %xmm1
        movaps    %xmm0, 240(TKEYP)
        movaps    %xmm1, 240(KEYP)
        add       $0x10, KEYP
        lea       240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
        movaps    (KEYP), %xmm0
        AESIMC    %xmm0 %xmm1
        movaps    %xmm1, (UKEYP)
        add       $0x10, KEYP
        sub       $0x10, UKEYP
        cmp       TKEYP, KEYP
        jb        .Ldec_key_loop
        xor       AREG, AREG
#ifndef __x86_64__
        popl      KEYP
#endif
        ret

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
        pushl     KEYP
        pushl     KLEN
        movl      12(%esp), KEYP
        movl      16(%esp), OUTP
        movl      20(%esp), INP
#endif
        movl      480(KEYP), KLEN      # key length
        movups    (INP), STATE         # input
        call      _aesni_enc1
        movups    STATE, (OUTP)        # output
#ifndef __x86_64__
        popl      KLEN
        popl      KEYP
#endif
        ret

/*
 * _aesni_enc1:         internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
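/*
 * Key-pointer arithmetic (sketch): TKEYP is biased past the front of the
 * schedule so the 128/192/256-bit paths can share one tail -- the longer
 * schedules enter earlier, and the negative offsets below index their
 * extra leading round keys before all paths fall through to .Lenc128.
 */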
.align 4
_aesni_enc1:
        movaps    (KEYP), KEY          # key
        mov       KEYP, TKEYP
        pxor      KEY, STATE           # round 0
        add       $0x30, TKEYP
        cmp       $24, KLEN
        jb        .Lenc128
        lea       0x20(TKEYP), TKEYP
        je        .Lenc192
        add       $0x20, TKEYP
        movaps    -0x60(TKEYP), KEY
        AESENC    KEY STATE
        movaps    -0x50(TKEYP), KEY
        AESENC    KEY STATE
.align 4
.Lenc192:
        movaps    -0x40(TKEYP), KEY
        AESENC    KEY STATE
        movaps    -0x30(TKEYP), KEY
        AESENC    KEY STATE
.align 4
.Lenc128:
        movaps    -0x20(TKEYP), KEY
        AESENC    KEY STATE
        movaps    -0x10(TKEYP), KEY
        AESENC    KEY STATE
        movaps    (TKEYP), KEY
        AESENC    KEY STATE
        movaps    0x10(TKEYP), KEY
        AESENC    KEY STATE
        movaps    0x20(TKEYP), KEY
        AESENC    KEY STATE
        movaps    0x30(TKEYP), KEY
        AESENC    KEY STATE
        movaps    0x40(TKEYP), KEY
        AESENC    KEY STATE
        movaps    0x50(TKEYP), KEY
        AESENC    KEY STATE
        movaps    0x60(TKEYP), KEY
        AESENC    KEY STATE
        movaps    0x70(TKEYP), KEY
        AESENCLAST KEY STATE
        ret

/*
 * _aesni_enc4:         internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE1:         initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1:         final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_enc4:
        movaps    (KEYP), KEY          # key
        mov       KEYP, TKEYP
        pxor      KEY, STATE1          # round 0
        pxor      KEY, STATE2
        pxor      KEY, STATE3
        pxor      KEY, STATE4
        add       $0x30, TKEYP
        cmp       $24, KLEN
        jb        .L4enc128
        lea       0x20(TKEYP), TKEYP
        je        .L4enc192
        add       $0x20, TKEYP
        movaps    -0x60(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    -0x50(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
#.align 4
.L4enc192:
        movaps    -0x40(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    -0x30(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
#.align 4
.L4enc128:
        movaps    -0x20(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    -0x10(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    (TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    0x10(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    0x20(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    0x30(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    0x40(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    0x50(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    0x60(TKEYP), KEY
        AESENC    KEY STATE1
        AESENC    KEY STATE2
        AESENC    KEY STATE3
        AESENC    KEY STATE4
        movaps    0x70(TKEYP), KEY
        AESENCLAST KEY STATE1          # last round
        AESENCLAST KEY STATE2
        AESENCLAST KEY STATE3
        AESENCLAST KEY STATE4
        ret

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
        pushl     KEYP
        pushl     KLEN
        movl      12(%esp), KEYP
        movl      16(%esp), OUTP
        movl      20(%esp), INP
#endif
        mov       480(KEYP), KLEN      # key length
        add       $240, KEYP
        movups    (INP), STATE         # input
        call      _aesni_dec1
        movups    STATE, (OUTP)        # output
#ifndef __x86_64__
        popl      KLEN
        popl      KEYP
#endif
        ret

/*
 * _aesni_dec1:         internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_dec1:
        movaps    (KEYP), KEY          # key
        mov       KEYP, TKEYP
        pxor      KEY, STATE           # round 0
        add       $0x30, TKEYP
        cmp       $24, KLEN
        jb        .Ldec128
        lea       0x20(TKEYP), TKEYP
        je        .Ldec192
        add       $0x20, TKEYP
        movaps    -0x60(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    -0x50(TKEYP), KEY
        AESDEC    KEY STATE
.align 4
.Ldec192:
        movaps    -0x40(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    -0x30(TKEYP), KEY
        AESDEC    KEY STATE
.align 4
.Ldec128:
        movaps    -0x20(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    -0x10(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    (TKEYP), KEY
        AESDEC    KEY STATE
        movaps    0x10(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    0x20(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    0x30(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    0x40(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    0x50(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    0x60(TKEYP), KEY
        AESDEC    KEY STATE
        movaps    0x70(TKEYP), KEY
        AESDECLAST KEY STATE
        ret

/*
 * _aesni_dec4:         internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE1:         initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1:         final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_dec4:
        movaps    (KEYP), KEY          # key
        mov       KEYP, TKEYP
        pxor      KEY, STATE1          # round 0
        pxor      KEY, STATE2
        pxor      KEY, STATE3
        pxor      KEY, STATE4
        add       $0x30, TKEYP
        cmp       $24, KLEN
        jb        .L4dec128
        lea       0x20(TKEYP), TKEYP
        je        .L4dec192
        add       $0x20, TKEYP
        movaps    -0x60(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    -0x50(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
.align 4
.L4dec192:
        movaps    -0x40(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    -0x30(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
.align 4
.L4dec128:
        movaps    -0x20(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    -0x10(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    (TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    0x10(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    0x20(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    0x30(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    0x40(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    0x50(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    0x60(TKEYP), KEY
        AESDEC    KEY STATE1
        AESDEC    KEY STATE2
        AESDEC    KEY STATE3
        AESDEC    KEY STATE4
        movaps    0x70(TKEYP), KEY
        AESDECLAST KEY STATE1          # last round
        AESDECLAST KEY STATE2
        AESDECLAST KEY STATE3
        AESDECLAST KEY STATE4
        ret

1813/*
1814 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
1815 * size_t len)
1816 */
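/*
 * Note on the !__x86_64__ prologues below (inferred from the push/pop
 * pattern; the i386 register #defines are outside this excerpt): the
 * register aliases include callee-saved registers and i386 passes all
 * arguments on the stack, so each ENTRY spills the aliases and then
 * loads its arguments.  After the three pushes here the frame is:
 *
 *	 0(%esp)	saved KLEN
 *	 4(%esp)	saved KEYP
 *	 8(%esp)	saved LEN
 *	12(%esp)	return address
 *	16(%esp)	arg1: ctx  -> KEYP
 *	20(%esp)	arg2: dst  -> OUTP
 *	24(%esp)	arg3: src  -> INP
 *	28(%esp)	arg4: len  -> LEN
 *
 * The IVP-taking entry points push one register more, shifting the
 * argument offsets up by 4.
 */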
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
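/*
 * A sketch of the walk below (enc4/enc1 stand in for the internal
 * helpers): four blocks per iteration while at least 64 bytes remain,
 * then single blocks; movups tolerates unaligned buffers, and a
 * partial trailing block (len % 16) is left untouched:
 *
 *	while (len >= 64) {
 *		enc4(dst, src);			// _aesni_enc4
 *		src += 64; dst += 64; len -= 64;
 *	}
 *	while (len >= 16) {
 *		enc1(dst, src);			// _aesni_enc1
 *		src += 16; dst += 16; len -= 16;
 *	}
 */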
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
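/*
 * Decryption walks the inverse key schedule: struct crypto_aes_ctx
 * stores key_enc in bytes 0..239, key_dec in bytes 240..479 and
 * key_length at offset 480, which is why the code below loads KLEN
 * from 480(KEYP) and then advances KEYP by 240.
 */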
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
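/*
 * CBC chains each block into the next, C_i = E_K(P_i ^ C_{i-1}) with
 * the IV as C_0, so encryption is inherently serial and there is no
 * four-block fast path.  A sketch:
 *
 *	state = *iv;
 *	while (len >= 16) {
 *		state = aes_enc(key, state ^ *src++);
 *		*dst++ = state;
 *	}
 *	*iv = state;		// chaining value handed back
 */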
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
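/*
 * CBC decryption does parallelize: P_i = D_K(C_i) ^ C_{i-1} and all
 * ciphertext blocks are known up front, so four blocks go through
 * _aesni_dec4 per iteration.  On !__x86_64__ only %xmm0-%xmm7 exist,
 * leaving no IN3/IN4; that path reuses IN1/IN2 and re-reads the still
 * unmodified ciphertext at (INP) for the final XORs instead.
 */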
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor (INP), STATE2
	pxor 0x10(INP), STATE3
	pxor IN1, STATE4
	movaps IN2, IV
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret

#ifdef __x86_64__
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
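/*
 * A sketch of the setup: PSHUFB with .Lbswap_mask reverses all 16
 * bytes, turning the big-endian IV into a little-endian counter that
 * paddq can increment:
 *
 *	ctr = bswap128(iv);
 *	tctr_low = (u64)ctr;	// low qword shadowed in a GPR
 *	inc = 1;		// 128-bit little-endian constant
 */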
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret

/*
 * _aesni_inc: internal ABI
 * Increase IV by 1; IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
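/*
 * A sketch of the carry handling below: paddq adds INC to CTR's low
 * qword; the mirrored add on TCTR_LOW lets jnc detect wrap-around, in
 * which case INC is shifted up and added once more to propagate the
 * carry into the high qword:
 *
 *	if (++ctr.lo == 0)	// carry out of the low qword
 *		ctr.hi += 1;
 *	iv = bswap128(ctr);	// back to big endian
 */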
.align 4
_aesni_inc:
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
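/*
 * CTR turns the block cipher into a stream cipher, so encryption and
 * decryption are the same operation.  A sketch of the flow below:
 * each counter value is used and then incremented, and the updated
 * counter is stored back for the caller:
 *
 *	while (len >= 16) {
 *		*dst++ = *src++ ^ aes_enc(key, iv);
 *		iv = inc_be128(iv);
 *	}
 *	*ivp = iv;
 */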
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	ret
#endif