Commit | Line | Data |
---|---|---|
f3f935a7 JK |
1 | /* |
2 | * x86_64/AVX2/AES-NI assembler implementation of Camellia | |
3 | * | |
4 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | */ | |
12 | ||
13 | #include <linux/linkage.h> | |
8691ccd7 | 14 | #include <asm/frame.h> |
f3f935a7 JK |
15 | |
#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0			/* round-key table at offset 0 */
#define key_length CAMELLIA_TABLE_BYTE_LEN	/* key length field follows the table */

/* register macros */
#define CTX %rdi			/* 1st SysV arg: struct camellia_ctx * */
#define RIO %r8
25 | ||
/**********************************************************************
  helper macros
 **********************************************************************/

/*
 * Apply an 8-bit affine transform to every byte of x via two 4-bit
 * table lookups: split each byte into low/high nibbles (mask4bit must
 * hold 0x0f in every byte), look each nibble up with vpshufb, and XOR
 * the two halves back together.  Clobbers tmp0.
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
37 | ||
/*
 * Low-xmm-half aliases for the ymm registers; used via "reg##_x" token
 * pasting inside the macros below to address the lower 128 bits.
 */
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15
54 | ||
f3f935a7 JK |
/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	/* zero round key: vaesenclast then applies only SubBytes+ShiftRows */ \
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpxor t7, t7, t7; \
	\
	/* broadcast individual key bytes: byte i of key -> ti, all lanes */ \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;
224 | ||
/*
 * Size optimization... with inlined roundsm32 binary would be over 5 times
 * larger and would only be marginally faster.
 *
 * In:   %rcx: pointer to CD state in memory, (%r9): round-key pointer
 */
.align 8
roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	ret;
ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
236 | ||
/* Same as above with x/y register roles swapped.
 * In:   %rax: pointer to AB state in memory, (%r9): round-key pointer
 */
.align 8
roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	ret;
ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
244 | ||
/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 *
 * Runs two Camellia rounds: round (i) on CD then round (i)+(dir) on AB;
 * clobbers %r9 (round-key pointer passed to the shared round bodies).
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
269 | ||
/* store_ab callback used on the last round pair: nothing to write back */
#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);
282 | ||
/* Six encryption rounds (three round pairs), keys ascending from index i+2 */
#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
291 | ||
/* Six decryption rounds (three round pairs), keys descending from index i+7 */
#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
300 | ||
/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 *
 * Per byte-slice: vpcmpgtb against zero yields 0xff where the byte's MSB
 * is set (byte is "negative"); vpabsb turns that into 0/1 carry bits,
 * which are OR'd into the next slice after vpaddb doubles (shifts) it.
 * The carry out of v0 wraps around into v0 via the last vpor (rotate).
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;
329 | ||
/*
 * IN:
 *  r: byte-sliced AB state in memory
 *  l: byte-sliced CD state in memory
 * OUT:
 *  x0..x7: new byte-sliced CD state
 *
 * Camellia FL/FL^-1 layer.  Each subkey word (kll/klr/krl/krr) is
 * broadcast bytewise into t0..t3 via the repeated vpsrldq+vpshufb
 * sequences (tt0 is kept zero so vpshufb replicates byte 0 to all lanes).
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);
457 | ||
/* 4x4 32-bit-word transpose of x0..x3 (per 128-bit lane); clobbers t1, t2 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
470 | ||
/*
 * Byte-slice a 16x16-byte matrix held in 16 ymm registers, using the two
 * 32-byte memory slots st0/st1 as spill space (only 16 regs available).
 */
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */
519 | ||
/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;
542 | ||
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);
565 | ||
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; /* spill x0: reused below for the whitening key */ \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;
593 | ||
/* store 32 output blocks (16 ymm registers, 2 blocks each) to rio */
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);
612 | ||
/*
 * All of the tables and masks below are read-only lookup data (consumed
 * via vbroadcasti128/vpbroadcastd/vpshufb); place them in .rodata rather
 * than the writable .data section.
 */
.section	.rodata
.align 32

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f
757 | ||
.text

.align 8
__camellia_enc_blk32:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;	/* %rax = AB area, %rcx = CD area */

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	/* 128-bit keys: 18 rounds, post-whitening key index 24; else 32 */
	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	FRAME_END
	ret;

.align 8
.Lenc_max32:
	/* 192/256-bit keys: two extra FL/FL^-1 + six more rounds */
	movl $32, %r8d;

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	jmp .Lenc_done;
ENDPROC(__camellia_enc_blk32)
846 | ||
.align 8
__camellia_dec_blk32:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;	/* %rax = AB area, %rcx = CD area */

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	/* kll/klr and krl/krr pairs swapped relative to encryption */
	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	FRAME_END
	ret;

.align 8
.Ldec_max32:
	/* 192/256-bit keys: six extra rounds + FL/FL^-1 before common path */
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk32)
931 | ||
932 | ENTRY(camellia_ecb_enc_32way) | |
933 | /* input: | |
934 | * %rdi: ctx, CTX | |
935 | * %rsi: dst (32 blocks) | |
936 | * %rdx: src (32 blocks) | |
937 | */ | |
8691ccd7 | 938 | FRAME_BEGIN |
f3f935a7 JK |
939 | |
940 | vzeroupper; | |
941 | ||
942 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | |
943 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | |
944 | %ymm15, %rdx, (key_table)(CTX)); | |
945 | ||
946 | /* now dst can be used as temporary buffer (even in src == dst case) */ | |
947 | movq %rsi, %rax; | |
948 | ||
949 | call __camellia_enc_blk32; | |
950 | ||
951 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | |
952 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | |
953 | %ymm8, %rsi); | |
954 | ||
955 | vzeroupper; | |
956 | ||
8691ccd7 | 957 | FRAME_END |
f3f935a7 JK |
958 | ret; |
959 | ENDPROC(camellia_ecb_enc_32way) | |
960 | ||
961 | ENTRY(camellia_ecb_dec_32way) | |
962 | /* input: | |
963 | * %rdi: ctx, CTX | |
964 | * %rsi: dst (32 blocks) | |
965 | * %rdx: src (32 blocks) | |
966 | */ | |
8691ccd7 | 967 | FRAME_BEGIN |
f3f935a7 JK |
968 | |
969 | vzeroupper; | |
970 | ||
971 | cmpl $16, key_length(CTX); | |
972 | movl $32, %r8d; | |
973 | movl $24, %eax; | |
974 | cmovel %eax, %r8d; /* max */ | |
975 | ||
976 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | |
977 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | |
978 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); | |
979 | ||
980 | /* now dst can be used as temporary buffer (even in src == dst case) */ | |
981 | movq %rsi, %rax; | |
982 | ||
983 | call __camellia_dec_blk32; | |
984 | ||
985 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | |
986 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | |
987 | %ymm8, %rsi); | |
988 | ||
989 | vzeroupper; | |
990 | ||
8691ccd7 | 991 | FRAME_END |
f3f935a7 JK |
992 | ret; |
993 | ENDPROC(camellia_ecb_dec_32way) | |
994 | ||
995 | ENTRY(camellia_cbc_dec_32way) | |
996 | /* input: | |
997 | * %rdi: ctx, CTX | |
998 | * %rsi: dst (32 blocks) | |
999 | * %rdx: src (32 blocks) | |
1000 | */ | |
8691ccd7 | 1001 | FRAME_BEGIN |
f3f935a7 JK |
1002 | |
1003 | vzeroupper; | |
1004 | ||
1005 | cmpl $16, key_length(CTX); | |
1006 | movl $32, %r8d; | |
1007 | movl $24, %eax; | |
1008 | cmovel %eax, %r8d; /* max */ | |
1009 | ||
1010 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | |
1011 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | |
1012 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); | |
1013 | ||
1014 | movq %rsp, %r10; | |
1015 | cmpq %rsi, %rdx; | |
1016 | je .Lcbc_dec_use_stack; | |
1017 | ||
1018 | /* dst can be used as temporary storage, src is not overwritten. */ | |
1019 | movq %rsi, %rax; | |
1020 | jmp .Lcbc_dec_continue; | |
1021 | ||
1022 | .Lcbc_dec_use_stack: | |
1023 | /* | |
1024 | * dst still in-use (because dst == src), so use stack for temporary | |
1025 | * storage. | |
1026 | */ | |
1027 | subq $(16 * 32), %rsp; | |
1028 | movq %rsp, %rax; | |
1029 | ||
1030 | .Lcbc_dec_continue: | |
1031 | call __camellia_dec_blk32; | |
1032 | ||
1033 | vmovdqu %ymm7, (%rax); | |
1034 | vpxor %ymm7, %ymm7, %ymm7; | |
1035 | vinserti128 $1, (%rdx), %ymm7, %ymm7; | |
1036 | vpxor (%rax), %ymm7, %ymm7; | |
1037 | movq %r10, %rsp; | |
1038 | vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; | |
1039 | vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; | |
1040 | vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; | |
1041 | vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; | |
1042 | vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; | |
1043 | vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; | |
1044 | vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; | |
1045 | vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; | |
1046 | vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; | |
1047 | vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; | |
1048 | vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; | |
1049 | vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; | |
1050 | vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; | |
1051 | vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; | |
1052 | vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; | |
1053 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | |
1054 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | |
1055 | %ymm8, %rsi); | |
1056 | ||
1057 | vzeroupper; | |
1058 | ||
8691ccd7 | 1059 | FRAME_END |
f3f935a7 JK |
1060 | ret; |
1061 | ENDPROC(camellia_cbc_dec_32way) | |
1062 | ||
1063 | #define inc_le128(x, minus_one, tmp) \ | |
1064 | vpcmpeqq minus_one, x, tmp; \ | |
1065 | vpsubq minus_one, x, x; \ | |
1066 | vpslldq $8, tmp, tmp; \ | |
1067 | vpsubq tmp, x, x; | |
1068 | ||
1069 | #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ | |
1070 | vpcmpeqq minus_one, x, tmp1; \ | |
1071 | vpcmpeqq minus_two, x, tmp2; \ | |
1072 | vpsubq minus_two, x, x; \ | |
1073 | vpor tmp2, tmp1, tmp1; \ | |
1074 | vpslldq $8, tmp1, tmp1; \ | |
1075 | vpsubq tmp1, x, x; | |
1076 | ||
1077 | ENTRY(camellia_ctr_32way) | |
1078 | /* input: | |
1079 | * %rdi: ctx, CTX | |
1080 | * %rsi: dst (32 blocks) | |
1081 | * %rdx: src (32 blocks) | |
1082 | * %rcx: iv (little endian, 128bit) | |
1083 | */ | |
8691ccd7 | 1084 | FRAME_BEGIN |
f3f935a7 JK |
1085 | |
1086 | vzeroupper; | |
1087 | ||
1088 | movq %rsp, %r10; | |
1089 | cmpq %rsi, %rdx; | |
1090 | je .Lctr_use_stack; | |
1091 | ||
1092 | /* dst can be used as temporary storage, src is not overwritten. */ | |
1093 | movq %rsi, %rax; | |
1094 | jmp .Lctr_continue; | |
1095 | ||
1096 | .Lctr_use_stack: | |
1097 | subq $(16 * 32), %rsp; | |
1098 | movq %rsp, %rax; | |
1099 | ||
1100 | .Lctr_continue: | |
1101 | vpcmpeqd %ymm15, %ymm15, %ymm15; | |
1102 | vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ | |
1103 | vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */ | |
1104 | ||
1105 | /* load IV and byteswap */ | |
1106 | vmovdqu (%rcx), %xmm0; | |
1107 | vmovdqa %xmm0, %xmm1; | |
1108 | inc_le128(%xmm0, %xmm15, %xmm14); | |
1109 | vbroadcasti128 .Lbswap128_mask, %ymm14; | |
1110 | vinserti128 $1, %xmm0, %ymm1, %ymm0; | |
1111 | vpshufb %ymm14, %ymm0, %ymm13; | |
1112 | vmovdqu %ymm13, 15 * 32(%rax); | |
1113 | ||
1114 | /* construct IVs */ | |
1115 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */ | |
1116 | vpshufb %ymm14, %ymm0, %ymm13; | |
1117 | vmovdqu %ymm13, 14 * 32(%rax); | |
1118 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1119 | vpshufb %ymm14, %ymm0, %ymm13; | |
1120 | vmovdqu %ymm13, 13 * 32(%rax); | |
1121 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1122 | vpshufb %ymm14, %ymm0, %ymm13; | |
1123 | vmovdqu %ymm13, 12 * 32(%rax); | |
1124 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1125 | vpshufb %ymm14, %ymm0, %ymm13; | |
1126 | vmovdqu %ymm13, 11 * 32(%rax); | |
1127 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1128 | vpshufb %ymm14, %ymm0, %ymm10; | |
1129 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1130 | vpshufb %ymm14, %ymm0, %ymm9; | |
1131 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1132 | vpshufb %ymm14, %ymm0, %ymm8; | |
1133 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1134 | vpshufb %ymm14, %ymm0, %ymm7; | |
1135 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1136 | vpshufb %ymm14, %ymm0, %ymm6; | |
1137 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1138 | vpshufb %ymm14, %ymm0, %ymm5; | |
1139 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1140 | vpshufb %ymm14, %ymm0, %ymm4; | |
1141 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1142 | vpshufb %ymm14, %ymm0, %ymm3; | |
1143 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1144 | vpshufb %ymm14, %ymm0, %ymm2; | |
1145 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1146 | vpshufb %ymm14, %ymm0, %ymm1; | |
1147 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | |
1148 | vextracti128 $1, %ymm0, %xmm13; | |
1149 | vpshufb %ymm14, %ymm0, %ymm0; | |
1150 | inc_le128(%xmm13, %xmm15, %xmm14); | |
1151 | vmovdqu %xmm13, (%rcx); | |
1152 | ||
1153 | /* inpack32_pre: */ | |
1154 | vpbroadcastq (key_table)(CTX), %ymm15; | |
1155 | vpshufb .Lpack_bswap, %ymm15, %ymm15; | |
1156 | vpxor %ymm0, %ymm15, %ymm0; | |
1157 | vpxor %ymm1, %ymm15, %ymm1; | |
1158 | vpxor %ymm2, %ymm15, %ymm2; | |
1159 | vpxor %ymm3, %ymm15, %ymm3; | |
1160 | vpxor %ymm4, %ymm15, %ymm4; | |
1161 | vpxor %ymm5, %ymm15, %ymm5; | |
1162 | vpxor %ymm6, %ymm15, %ymm6; | |
1163 | vpxor %ymm7, %ymm15, %ymm7; | |
1164 | vpxor %ymm8, %ymm15, %ymm8; | |
1165 | vpxor %ymm9, %ymm15, %ymm9; | |
1166 | vpxor %ymm10, %ymm15, %ymm10; | |
1167 | vpxor 11 * 32(%rax), %ymm15, %ymm11; | |
1168 | vpxor 12 * 32(%rax), %ymm15, %ymm12; | |
1169 | vpxor 13 * 32(%rax), %ymm15, %ymm13; | |
1170 | vpxor 14 * 32(%rax), %ymm15, %ymm14; | |
1171 | vpxor 15 * 32(%rax), %ymm15, %ymm15; | |
1172 | ||
1173 | call __camellia_enc_blk32; | |
1174 | ||
1175 | movq %r10, %rsp; | |
1176 | ||
1177 | vpxor 0 * 32(%rdx), %ymm7, %ymm7; | |
1178 | vpxor 1 * 32(%rdx), %ymm6, %ymm6; | |
1179 | vpxor 2 * 32(%rdx), %ymm5, %ymm5; | |
1180 | vpxor 3 * 32(%rdx), %ymm4, %ymm4; | |
1181 | vpxor 4 * 32(%rdx), %ymm3, %ymm3; | |
1182 | vpxor 5 * 32(%rdx), %ymm2, %ymm2; | |
1183 | vpxor 6 * 32(%rdx), %ymm1, %ymm1; | |
1184 | vpxor 7 * 32(%rdx), %ymm0, %ymm0; | |
1185 | vpxor 8 * 32(%rdx), %ymm15, %ymm15; | |
1186 | vpxor 9 * 32(%rdx), %ymm14, %ymm14; | |
1187 | vpxor 10 * 32(%rdx), %ymm13, %ymm13; | |
1188 | vpxor 11 * 32(%rdx), %ymm12, %ymm12; | |
1189 | vpxor 12 * 32(%rdx), %ymm11, %ymm11; | |
1190 | vpxor 13 * 32(%rdx), %ymm10, %ymm10; | |
1191 | vpxor 14 * 32(%rdx), %ymm9, %ymm9; | |
1192 | vpxor 15 * 32(%rdx), %ymm8, %ymm8; | |
1193 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | |
1194 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | |
1195 | %ymm8, %rsi); | |
1196 | ||
1197 | vzeroupper; | |
1198 | ||
8691ccd7 | 1199 | FRAME_END |
f3f935a7 JK |
1200 | ret; |
1201 | ENDPROC(camellia_ctr_32way) | |
1202 | ||
1203 | #define gf128mul_x_ble(iv, mask, tmp) \ | |
1204 | vpsrad $31, iv, tmp; \ | |
1205 | vpaddq iv, iv, iv; \ | |
1206 | vpshufd $0x13, tmp, tmp; \ | |
1207 | vpand mask, tmp, tmp; \ | |
1208 | vpxor tmp, iv, iv; | |
1209 | ||
1210 | #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ | |
1211 | vpsrad $31, iv, tmp0; \ | |
1212 | vpaddq iv, iv, tmp1; \ | |
1213 | vpsllq $2, iv, iv; \ | |
1214 | vpshufd $0x13, tmp0, tmp0; \ | |
1215 | vpsrad $31, tmp1, tmp1; \ | |
1216 | vpand mask2, tmp0, tmp0; \ | |
1217 | vpshufd $0x13, tmp1, tmp1; \ | |
1218 | vpxor tmp0, iv, iv; \ | |
1219 | vpand mask1, tmp1, tmp1; \ | |
1220 | vpxor tmp1, iv, iv; | |
1221 | ||
1222 | .align 8 | |
1223 | camellia_xts_crypt_32way: | |
1224 | /* input: | |
1225 | * %rdi: ctx, CTX | |
1226 | * %rsi: dst (32 blocks) | |
1227 | * %rdx: src (32 blocks) | |
1228 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | |
1229 | * %r8: index for input whitening key | |
1230 | * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 | |
1231 | */ | |
8691ccd7 | 1232 | FRAME_BEGIN |
f3f935a7 JK |
1233 | |
1234 | vzeroupper; | |
1235 | ||
1236 | subq $(16 * 32), %rsp; | |
1237 | movq %rsp, %rax; | |
1238 | ||
1239 | vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; | |
1240 | ||
1241 | /* load IV and construct second IV */ | |
1242 | vmovdqu (%rcx), %xmm0; | |
1243 | vmovdqa %xmm0, %xmm15; | |
1244 | gf128mul_x_ble(%xmm0, %xmm12, %xmm13); | |
1245 | vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; | |
1246 | vinserti128 $1, %xmm0, %ymm15, %ymm0; | |
1247 | vpxor 0 * 32(%rdx), %ymm0, %ymm15; | |
1248 | vmovdqu %ymm15, 15 * 32(%rax); | |
1249 | vmovdqu %ymm0, 0 * 32(%rsi); | |
1250 | ||
1251 | /* construct IVs */ | |
1252 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1253 | vpxor 1 * 32(%rdx), %ymm0, %ymm15; | |
1254 | vmovdqu %ymm15, 14 * 32(%rax); | |
1255 | vmovdqu %ymm0, 1 * 32(%rsi); | |
1256 | ||
1257 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1258 | vpxor 2 * 32(%rdx), %ymm0, %ymm15; | |
1259 | vmovdqu %ymm15, 13 * 32(%rax); | |
1260 | vmovdqu %ymm0, 2 * 32(%rsi); | |
1261 | ||
1262 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1263 | vpxor 3 * 32(%rdx), %ymm0, %ymm15; | |
1264 | vmovdqu %ymm15, 12 * 32(%rax); | |
1265 | vmovdqu %ymm0, 3 * 32(%rsi); | |
1266 | ||
1267 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1268 | vpxor 4 * 32(%rdx), %ymm0, %ymm11; | |
1269 | vmovdqu %ymm0, 4 * 32(%rsi); | |
1270 | ||
1271 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1272 | vpxor 5 * 32(%rdx), %ymm0, %ymm10; | |
1273 | vmovdqu %ymm0, 5 * 32(%rsi); | |
1274 | ||
1275 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1276 | vpxor 6 * 32(%rdx), %ymm0, %ymm9; | |
1277 | vmovdqu %ymm0, 6 * 32(%rsi); | |
1278 | ||
1279 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1280 | vpxor 7 * 32(%rdx), %ymm0, %ymm8; | |
1281 | vmovdqu %ymm0, 7 * 32(%rsi); | |
1282 | ||
1283 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1284 | vpxor 8 * 32(%rdx), %ymm0, %ymm7; | |
1285 | vmovdqu %ymm0, 8 * 32(%rsi); | |
1286 | ||
1287 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1288 | vpxor 9 * 32(%rdx), %ymm0, %ymm6; | |
1289 | vmovdqu %ymm0, 9 * 32(%rsi); | |
1290 | ||
1291 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1292 | vpxor 10 * 32(%rdx), %ymm0, %ymm5; | |
1293 | vmovdqu %ymm0, 10 * 32(%rsi); | |
1294 | ||
1295 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1296 | vpxor 11 * 32(%rdx), %ymm0, %ymm4; | |
1297 | vmovdqu %ymm0, 11 * 32(%rsi); | |
1298 | ||
1299 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1300 | vpxor 12 * 32(%rdx), %ymm0, %ymm3; | |
1301 | vmovdqu %ymm0, 12 * 32(%rsi); | |
1302 | ||
1303 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1304 | vpxor 13 * 32(%rdx), %ymm0, %ymm2; | |
1305 | vmovdqu %ymm0, 13 * 32(%rsi); | |
1306 | ||
1307 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1308 | vpxor 14 * 32(%rdx), %ymm0, %ymm1; | |
1309 | vmovdqu %ymm0, 14 * 32(%rsi); | |
1310 | ||
1311 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | |
1312 | vpxor 15 * 32(%rdx), %ymm0, %ymm15; | |
1313 | vmovdqu %ymm15, 0 * 32(%rax); | |
1314 | vmovdqu %ymm0, 15 * 32(%rsi); | |
1315 | ||
1316 | vextracti128 $1, %ymm0, %xmm0; | |
1317 | gf128mul_x_ble(%xmm0, %xmm12, %xmm15); | |
1318 | vmovdqu %xmm0, (%rcx); | |
1319 | ||
1320 | /* inpack32_pre: */ | |
1321 | vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; | |
1322 | vpshufb .Lpack_bswap, %ymm15, %ymm15; | |
1323 | vpxor 0 * 32(%rax), %ymm15, %ymm0; | |
1324 | vpxor %ymm1, %ymm15, %ymm1; | |
1325 | vpxor %ymm2, %ymm15, %ymm2; | |
1326 | vpxor %ymm3, %ymm15, %ymm3; | |
1327 | vpxor %ymm4, %ymm15, %ymm4; | |
1328 | vpxor %ymm5, %ymm15, %ymm5; | |
1329 | vpxor %ymm6, %ymm15, %ymm6; | |
1330 | vpxor %ymm7, %ymm15, %ymm7; | |
1331 | vpxor %ymm8, %ymm15, %ymm8; | |
1332 | vpxor %ymm9, %ymm15, %ymm9; | |
1333 | vpxor %ymm10, %ymm15, %ymm10; | |
1334 | vpxor %ymm11, %ymm15, %ymm11; | |
1335 | vpxor 12 * 32(%rax), %ymm15, %ymm12; | |
1336 | vpxor 13 * 32(%rax), %ymm15, %ymm13; | |
1337 | vpxor 14 * 32(%rax), %ymm15, %ymm14; | |
1338 | vpxor 15 * 32(%rax), %ymm15, %ymm15; | |
1339 | ||
1340 | call *%r9; | |
1341 | ||
1342 | addq $(16 * 32), %rsp; | |
1343 | ||
1344 | vpxor 0 * 32(%rsi), %ymm7, %ymm7; | |
1345 | vpxor 1 * 32(%rsi), %ymm6, %ymm6; | |
1346 | vpxor 2 * 32(%rsi), %ymm5, %ymm5; | |
1347 | vpxor 3 * 32(%rsi), %ymm4, %ymm4; | |
1348 | vpxor 4 * 32(%rsi), %ymm3, %ymm3; | |
1349 | vpxor 5 * 32(%rsi), %ymm2, %ymm2; | |
1350 | vpxor 6 * 32(%rsi), %ymm1, %ymm1; | |
1351 | vpxor 7 * 32(%rsi), %ymm0, %ymm0; | |
1352 | vpxor 8 * 32(%rsi), %ymm15, %ymm15; | |
1353 | vpxor 9 * 32(%rsi), %ymm14, %ymm14; | |
1354 | vpxor 10 * 32(%rsi), %ymm13, %ymm13; | |
1355 | vpxor 11 * 32(%rsi), %ymm12, %ymm12; | |
1356 | vpxor 12 * 32(%rsi), %ymm11, %ymm11; | |
1357 | vpxor 13 * 32(%rsi), %ymm10, %ymm10; | |
1358 | vpxor 14 * 32(%rsi), %ymm9, %ymm9; | |
1359 | vpxor 15 * 32(%rsi), %ymm8, %ymm8; | |
1360 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | |
1361 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | |
1362 | %ymm8, %rsi); | |
1363 | ||
1364 | vzeroupper; | |
1365 | ||
8691ccd7 | 1366 | FRAME_END |
f3f935a7 JK |
1367 | ret; |
1368 | ENDPROC(camellia_xts_crypt_32way) | |
1369 | ||
1370 | ENTRY(camellia_xts_enc_32way) | |
1371 | /* input: | |
1372 | * %rdi: ctx, CTX | |
1373 | * %rsi: dst (32 blocks) | |
1374 | * %rdx: src (32 blocks) | |
1375 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | |
1376 | */ | |
1377 | ||
1378 | xorl %r8d, %r8d; /* input whitening key, 0 for enc */ | |
1379 | ||
1380 | leaq __camellia_enc_blk32, %r9; | |
1381 | ||
1382 | jmp camellia_xts_crypt_32way; | |
1383 | ENDPROC(camellia_xts_enc_32way) | |
1384 | ||
1385 | ENTRY(camellia_xts_dec_32way) | |
1386 | /* input: | |
1387 | * %rdi: ctx, CTX | |
1388 | * %rsi: dst (32 blocks) | |
1389 | * %rdx: src (32 blocks) | |
1390 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | |
1391 | */ | |
1392 | ||
1393 | cmpl $16, key_length(CTX); | |
1394 | movl $32, %r8d; | |
1395 | movl $24, %eax; | |
1396 | cmovel %eax, %r8d; /* input whitening key, last for dec */ | |
1397 | ||
1398 | leaq __camellia_dec_blk32, %r9; | |
1399 | ||
1400 | jmp camellia_xts_crypt_32way; | |
1401 | ENDPROC(camellia_xts_dec_32way) |