Commit | Line | Data |
---|---|---|
0db125c4 VN |
1 | #ifndef ASM_X86__XOR_32_H |
2 | #define ASM_X86__XOR_32_H | |
3 | ||
1da177e4 | 4 | /* |
1da177e4 LT |
5 | * Optimized RAID-5 checksumming functions for MMX and SSE. |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License as published by | |
9 | * the Free Software Foundation; either version 2, or (at your option) | |
10 | * any later version. | |
11 | * | |
12 | * You should have received a copy of the GNU General Public License | |
13 | * (for example /usr/src/linux/COPYING); if not, write to the Free | |
14 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
15 | */ | |
16 | ||
17 | /* | |
18 | * High-speed RAID5 checksumming functions utilizing MMX instructions. | |
19 | * Copyright (C) 1998 Ingo Molnar. | |
20 | */ | |
21 | ||
8fdf7655 JP |
/*
 * Assembler-string helpers for the pII MMX routines that follow.
 * x is a quadword index (scaled by 8 in the address), y is an MMX register
 * number.  The %N numbers are operands of the asm statement that expands
 * the macro:
 *   LD(x, y)  - movq from 8*x(%1) into %mmY
 *   ST(x, y)  - movq from %mmY back to 8*x(%1)
 *   XO1..XO4  - pxor 8*x(%2) .. 8*x(%5) into %mmY
 */
22 | #define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n" |
23 | #define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n" | |
24 | #define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n" | |
25 | #define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n" | |
26 | #define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" | |
27 | #define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" | |
1da177e4 LT |
28 | |
29 | #include <asm/i387.h> | |
30 | ||
/*
 * xor_pII_mmx_2 - XOR the buffer at p2 into the buffer at p1 using MMX.
 *
 * bytes: length of the buffers; bytes >> 7 loop iterations are executed.
 * p1:    destination, also the first source (loaded, XORed, stored back).
 * p2:    second source.
 *
 * Each iteration expands BLOCK() four times.  One BLOCK loads four
 * quadwords of p1 into %mm0-%mm3 and, for each of them, XORs in the
 * matching quadword of p2 and stores the result back to p1.  That is
 * 128 bytes per iteration, after which both pointers advance by 128.
 * The asm is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): the counter is only tested after the first pass (decl/jnz),
 * so bytes < 128 would not stop after zero iterations.  Presumably callers
 * always pass a non-zero multiple of 128 - confirm.
 */
31 | static void | |
32 | xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | |
33 | { | |
34 | unsigned long lines = bytes >> 7; | |
35 | ||
36 | kernel_fpu_begin(); | |
37 | ||
8fdf7655 | 38 | asm volatile( |
1da177e4 | 39 | #undef BLOCK |
8fdf7655 JP |
40 | #define BLOCK(i) \ |
41 | LD(i, 0) \ | |
42 | LD(i + 1, 1) \ | |
43 | LD(i + 2, 2) \ | |
44 | LD(i + 3, 3) \ | |
45 | XO1(i, 0) \ | |
46 | ST(i, 0) \ | |
47 | XO1(i+1, 1) \ | |
48 | ST(i+1, 1) \ | |
49 | XO1(i + 2, 2) \ | |
50 | ST(i + 2, 2) \ | |
51 | XO1(i + 3, 3) \ | |
52 | ST(i + 3, 3) | |
1da177e4 LT |
53 | |
54 | " .align 32 ;\n" | |
8fdf7655 | 55 | " 1: ;\n" |
1da177e4 LT |
56 | |
57 | BLOCK(0) | |
58 | BLOCK(4) | |
59 | BLOCK(8) | |
60 | BLOCK(12) | |
61 | ||
62 | " addl $128, %1 ;\n" | |
63 | " addl $128, %2 ;\n" | |
64 | " decl %0 ;\n" | |
65 | " jnz 1b ;\n" | |
66 | : "+r" (lines), | |
67 | "+r" (p1), "+r" (p2) | |
68 | : | |
69 | : "memory"); | |
70 | ||
71 | kernel_fpu_end(); | |
72 | } | |
73 | ||
/*
 * xor_pII_mmx_3 - compute p1 ^= p2 ^ p3 using MMX.
 *
 * bytes: length of the buffers; bytes >> 7 loop iterations are executed.
 * p1:    destination, also the first source.
 * p2/p3: additional sources.
 *
 * One BLOCK loads four quadwords of p1 into %mm0-%mm3, XORs in the matching
 * quadwords of p2 (XO1) for all four, then XORs in p3 (XO2) and stores each
 * register back to p1.  Four BLOCKs make 128 bytes per iteration, after
 * which all three pointers advance by 128.
 * The asm is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 128 is presumably never passed by callers - confirm.
 */
74 | static void | |
75 | xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
76 | unsigned long *p3) | |
77 | { | |
78 | unsigned long lines = bytes >> 7; | |
79 | ||
80 | kernel_fpu_begin(); | |
81 | ||
8fdf7655 | 82 | asm volatile( |
1da177e4 | 83 | #undef BLOCK |
8fdf7655 JP |
84 | #define BLOCK(i) \ |
85 | LD(i, 0) \ | |
86 | LD(i + 1, 1) \ | |
87 | LD(i + 2, 2) \ | |
88 | LD(i + 3, 3) \ | |
89 | XO1(i, 0) \ | |
90 | XO1(i + 1, 1) \ | |
91 | XO1(i + 2, 2) \ | |
92 | XO1(i + 3, 3) \ | |
93 | XO2(i, 0) \ | |
94 | ST(i, 0) \ | |
95 | XO2(i + 1, 1) \ | |
96 | ST(i + 1, 1) \ | |
97 | XO2(i + 2, 2) \ | |
98 | ST(i + 2, 2) \ | |
99 | XO2(i + 3, 3) \ | |
100 | ST(i + 3, 3) | |
1da177e4 LT |
101 | |
102 | " .align 32 ;\n" | |
103 | " 1: ;\n" | |
104 | ||
105 | BLOCK(0) | |
106 | BLOCK(4) | |
107 | BLOCK(8) | |
108 | BLOCK(12) | |
109 | ||
110 | " addl $128, %1 ;\n" | |
111 | " addl $128, %2 ;\n" | |
112 | " addl $128, %3 ;\n" | |
113 | " decl %0 ;\n" | |
114 | " jnz 1b ;\n" | |
115 | : "+r" (lines), | |
116 | "+r" (p1), "+r" (p2), "+r" (p3) | |
117 | : | |
118 | : "memory"); | |
119 | ||
120 | kernel_fpu_end(); | |
121 | } | |
122 | ||
/*
 * xor_pII_mmx_4 - compute p1 ^= p2 ^ p3 ^ p4 using MMX.
 *
 * bytes:    length of the buffers; bytes >> 7 loop iterations are executed.
 * p1:       destination, also the first source.
 * p2..p4:   additional sources.
 *
 * One BLOCK loads four quadwords of p1 into %mm0-%mm3, XORs in p2 (XO1)
 * and p3 (XO2) for all four registers, then XORs in p4 (XO3) and stores
 * each register back to p1.  Four BLOCKs make 128 bytes per iteration,
 * after which all four pointers advance by 128.
 * The asm is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 128 is presumably never passed by callers - confirm.
 */
123 | static void | |
124 | xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
125 | unsigned long *p3, unsigned long *p4) | |
126 | { | |
127 | unsigned long lines = bytes >> 7; | |
128 | ||
129 | kernel_fpu_begin(); | |
130 | ||
8fdf7655 | 131 | asm volatile( |
1da177e4 | 132 | #undef BLOCK |
8fdf7655 JP |
133 | #define BLOCK(i) \ |
134 | LD(i, 0) \ | |
135 | LD(i + 1, 1) \ | |
136 | LD(i + 2, 2) \ | |
137 | LD(i + 3, 3) \ | |
138 | XO1(i, 0) \ | |
139 | XO1(i + 1, 1) \ | |
140 | XO1(i + 2, 2) \ | |
141 | XO1(i + 3, 3) \ | |
142 | XO2(i, 0) \ | |
143 | XO2(i + 1, 1) \ | |
144 | XO2(i + 2, 2) \ | |
145 | XO2(i + 3, 3) \ | |
146 | XO3(i, 0) \ | |
147 | ST(i, 0) \ | |
148 | XO3(i + 1, 1) \ | |
149 | ST(i + 1, 1) \ | |
150 | XO3(i + 2, 2) \ | |
151 | ST(i + 2, 2) \ | |
152 | XO3(i + 3, 3) \ | |
153 | ST(i + 3, 3) | |
1da177e4 LT |
154 | |
155 | " .align 32 ;\n" | |
156 | " 1: ;\n" | |
157 | ||
158 | BLOCK(0) | |
159 | BLOCK(4) | |
160 | BLOCK(8) | |
161 | BLOCK(12) | |
162 | ||
163 | " addl $128, %1 ;\n" | |
164 | " addl $128, %2 ;\n" | |
165 | " addl $128, %3 ;\n" | |
166 | " addl $128, %4 ;\n" | |
167 | " decl %0 ;\n" | |
168 | " jnz 1b ;\n" | |
169 | : "+r" (lines), | |
170 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | |
171 | : | |
172 | : "memory"); | |
173 | ||
174 | kernel_fpu_end(); | |
175 | } | |
176 | ||
177 | ||
/*
 * xor_pII_mmx_5 - compute p1 ^= p2 ^ p3 ^ p4 ^ p5 using MMX.
 *
 * bytes:    length of the buffers; bytes >> 7 loop iterations are executed.
 * p1:       destination, also the first source.
 * p2..p5:   additional sources.
 *
 * One BLOCK loads four quadwords of p1 into %mm0-%mm3, XORs in p2, p3 and
 * p4 (XO1-XO3) for all four registers, then XORs in p5 (XO4) and stores
 * each register back to p1.  Four BLOCKs make 128 bytes per iteration,
 * after which all five pointers advance by 128.
 * The asm is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 *
 * p4 and p5 are passed as input-only "r" operands although the asm adds
 * 128 to them; the empty asm statements before and after the loop are the
 * workaround described in the comments inside the function.
 *
 * NOTE(review): GCC's extended-asm documentation says input-only operands
 * must not be modified.  If the GCC < 3.1 operand limit no longer matters
 * for the supported compilers, declaring p4/p5 as "+r" would express this
 * directly - confirm before changing.
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 128 is presumably never passed by callers - confirm.
 */
178 | static void | |
179 | xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
180 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | |
181 | { | |
182 | unsigned long lines = bytes >> 7; | |
183 | ||
184 | kernel_fpu_begin(); | |
185 | ||
186 | /* Make sure GCC forgets anything it knows about p4 or p5, | |
187 | such that it won't pass to the asm volatile below a | |
188 | register that is shared with any other variable. That's | |
189 | because we modify p4 and p5 there, but we can't mark them | |
190 | as read/write, otherwise we'd overflow the 10-asm-operands | |
191 | limit of GCC < 3.1. */ | |
8fdf7655 | 192 | asm("" : "+r" (p4), "+r" (p5)); |
1da177e4 | 193 | |
8fdf7655 | 194 | asm volatile( |
1da177e4 | 195 | #undef BLOCK |
8fdf7655 JP |
196 | #define BLOCK(i) \ |
197 | LD(i, 0) \ | |
198 | LD(i + 1, 1) \ | |
199 | LD(i + 2, 2) \ | |
200 | LD(i + 3, 3) \ | |
201 | XO1(i, 0) \ | |
202 | XO1(i + 1, 1) \ | |
203 | XO1(i + 2, 2) \ | |
204 | XO1(i + 3, 3) \ | |
205 | XO2(i, 0) \ | |
206 | XO2(i + 1, 1) \ | |
207 | XO2(i + 2, 2) \ | |
208 | XO2(i + 3, 3) \ | |
209 | XO3(i, 0) \ | |
210 | XO3(i + 1, 1) \ | |
211 | XO3(i + 2, 2) \ | |
212 | XO3(i + 3, 3) \ | |
213 | XO4(i, 0) \ | |
214 | ST(i, 0) \ | |
215 | XO4(i + 1, 1) \ | |
216 | ST(i + 1, 1) \ | |
217 | XO4(i + 2, 2) \ | |
218 | ST(i + 2, 2) \ | |
219 | XO4(i + 3, 3) \ | |
220 | ST(i + 3, 3) | |
1da177e4 LT |
221 | |
222 | " .align 32 ;\n" | |
223 | " 1: ;\n" | |
224 | ||
225 | BLOCK(0) | |
226 | BLOCK(4) | |
227 | BLOCK(8) | |
228 | BLOCK(12) | |
229 | ||
230 | " addl $128, %1 ;\n" | |
231 | " addl $128, %2 ;\n" | |
232 | " addl $128, %3 ;\n" | |
233 | " addl $128, %4 ;\n" | |
234 | " addl $128, %5 ;\n" | |
235 | " decl %0 ;\n" | |
236 | " jnz 1b ;\n" | |
237 | : "+r" (lines), | |
238 | "+r" (p1), "+r" (p2), "+r" (p3) | |
8fdf7655 | 239 | : "r" (p4), "r" (p5) |
1da177e4 LT |
240 | : "memory"); |
241 | ||
242 | /* p4 and p5 were modified, and now the variables are dead. | |
243 | Clobber them just to be sure nobody does something stupid | |
244 | like assuming they have some legal value. */ | |
8fdf7655 | 245 | asm("" : "=r" (p4), "=r" (p5)); |
1da177e4 LT |
246 | |
247 | kernel_fpu_end(); | |
248 | } | |
249 | ||
250 | #undef LD | |
251 | #undef XO1 | |
252 | #undef XO2 | |
253 | #undef XO3 | |
254 | #undef XO4 | |
255 | #undef ST | |
256 | #undef BLOCK | |
257 | ||
/*
 * xor_p5_mmx_2 - XOR the buffer at p2 into the buffer at p1 using MMX,
 * with a hand-interleaved instruction schedule.
 *
 * bytes: length of the buffers; bytes >> 6 loop iterations are executed.
 * p1:    destination, also the first source.
 * p2:    second source.
 *
 * Each iteration handles 64 bytes as eight quadwords in %mm0-%mm7: every
 * register is loaded from p1, XORed with the quadword at the same offset
 * in p2 and stored back to p1.  Loads, XORs and stores of different
 * registers are interleaved; then both pointers advance by 64.
 * The asm is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 64 is presumably never passed by callers - confirm.
 */
258 | static void | |
259 | xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | |
260 | { | |
261 | unsigned long lines = bytes >> 6; | |
262 | ||
263 | kernel_fpu_begin(); | |
264 | ||
8fdf7655 | 265 | asm volatile( |
1da177e4 LT |
266 | " .align 32 ;\n" |
267 | " 1: ;\n" | |
268 | " movq (%1), %%mm0 ;\n" | |
269 | " movq 8(%1), %%mm1 ;\n" | |
270 | " pxor (%2), %%mm0 ;\n" | |
271 | " movq 16(%1), %%mm2 ;\n" | |
272 | " movq %%mm0, (%1) ;\n" | |
273 | " pxor 8(%2), %%mm1 ;\n" | |
274 | " movq 24(%1), %%mm3 ;\n" | |
275 | " movq %%mm1, 8(%1) ;\n" | |
276 | " pxor 16(%2), %%mm2 ;\n" | |
277 | " movq 32(%1), %%mm4 ;\n" | |
278 | " movq %%mm2, 16(%1) ;\n" | |
279 | " pxor 24(%2), %%mm3 ;\n" | |
280 | " movq 40(%1), %%mm5 ;\n" | |
281 | " movq %%mm3, 24(%1) ;\n" | |
282 | " pxor 32(%2), %%mm4 ;\n" | |
283 | " movq 48(%1), %%mm6 ;\n" | |
284 | " movq %%mm4, 32(%1) ;\n" | |
285 | " pxor 40(%2), %%mm5 ;\n" | |
286 | " movq 56(%1), %%mm7 ;\n" | |
287 | " movq %%mm5, 40(%1) ;\n" | |
288 | " pxor 48(%2), %%mm6 ;\n" | |
289 | " pxor 56(%2), %%mm7 ;\n" | |
290 | " movq %%mm6, 48(%1) ;\n" | |
291 | " movq %%mm7, 56(%1) ;\n" | |
8fdf7655 | 292 | |
1da177e4 LT |
293 | " addl $64, %1 ;\n" |
294 | " addl $64, %2 ;\n" | |
295 | " decl %0 ;\n" | |
296 | " jnz 1b ;\n" | |
297 | : "+r" (lines), | |
298 | "+r" (p1), "+r" (p2) | |
299 | : | |
300 | : "memory"); | |
301 | ||
302 | kernel_fpu_end(); | |
303 | } | |
304 | ||
/*
 * xor_p5_mmx_3 - compute p1 ^= p2 ^ p3 using MMX, with a hand-interleaved
 * instruction schedule.
 *
 * bytes: length of the buffers; bytes >> 6 loop iterations are executed.
 * p1:    destination, also the first source.
 * p2/p3: additional sources.
 *
 * Each iteration handles 64 bytes as eight quadwords in %mm0-%mm7: every
 * register is loaded from p1, XORed with the quadwords at the same offset
 * in p2 and p3, and stored back to p1.  Then all three pointers advance
 * by 64.
 * The asm is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 64 is presumably never passed by callers - confirm.
 */
305 | static void | |
306 | xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
307 | unsigned long *p3) | |
308 | { | |
309 | unsigned long lines = bytes >> 6; | |
310 | ||
311 | kernel_fpu_begin(); | |
312 | ||
8fdf7655 | 313 | asm volatile( |
1da177e4 LT |
314 | " .align 32,0x90 ;\n" |
315 | " 1: ;\n" | |
316 | " movq (%1), %%mm0 ;\n" | |
317 | " movq 8(%1), %%mm1 ;\n" | |
318 | " pxor (%2), %%mm0 ;\n" | |
319 | " movq 16(%1), %%mm2 ;\n" | |
320 | " pxor 8(%2), %%mm1 ;\n" | |
321 | " pxor (%3), %%mm0 ;\n" | |
322 | " pxor 16(%2), %%mm2 ;\n" | |
323 | " movq %%mm0, (%1) ;\n" | |
324 | " pxor 8(%3), %%mm1 ;\n" | |
325 | " pxor 16(%3), %%mm2 ;\n" | |
326 | " movq 24(%1), %%mm3 ;\n" | |
327 | " movq %%mm1, 8(%1) ;\n" | |
328 | " movq 32(%1), %%mm4 ;\n" | |
329 | " movq 40(%1), %%mm5 ;\n" | |
330 | " pxor 24(%2), %%mm3 ;\n" | |
331 | " movq %%mm2, 16(%1) ;\n" | |
332 | " pxor 32(%2), %%mm4 ;\n" | |
333 | " pxor 24(%3), %%mm3 ;\n" | |
334 | " pxor 40(%2), %%mm5 ;\n" | |
335 | " movq %%mm3, 24(%1) ;\n" | |
336 | " pxor 32(%3), %%mm4 ;\n" | |
337 | " pxor 40(%3), %%mm5 ;\n" | |
338 | " movq 48(%1), %%mm6 ;\n" | |
339 | " movq %%mm4, 32(%1) ;\n" | |
340 | " movq 56(%1), %%mm7 ;\n" | |
341 | " pxor 48(%2), %%mm6 ;\n" | |
342 | " movq %%mm5, 40(%1) ;\n" | |
343 | " pxor 56(%2), %%mm7 ;\n" | |
344 | " pxor 48(%3), %%mm6 ;\n" | |
345 | " pxor 56(%3), %%mm7 ;\n" | |
346 | " movq %%mm6, 48(%1) ;\n" | |
347 | " movq %%mm7, 56(%1) ;\n" | |
8fdf7655 | 348 | |
1da177e4 LT |
349 | " addl $64, %1 ;\n" |
350 | " addl $64, %2 ;\n" | |
351 | " addl $64, %3 ;\n" | |
352 | " decl %0 ;\n" | |
353 | " jnz 1b ;\n" | |
354 | : "+r" (lines), | |
355 | "+r" (p1), "+r" (p2), "+r" (p3) | |
356 | : | |
357 | : "memory" ); | |
358 | ||
359 | kernel_fpu_end(); | |
360 | } | |
361 | ||
/*
 * xor_p5_mmx_4 - compute p1 ^= p2 ^ p3 ^ p4 using MMX, with a
 * hand-interleaved instruction schedule.
 *
 * bytes:    length of the buffers; bytes >> 6 loop iterations are executed.
 * p1:       destination, also the first source.
 * p2..p4:   additional sources.
 *
 * Each iteration handles 64 bytes as eight quadwords in %mm0-%mm7: every
 * register is loaded from p1, XORed with the quadwords at the same offset
 * in p2, p3 and p4, and stored back to p1.  Then all four pointers advance
 * by 64.
 * The asm is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 64 is presumably never passed by callers - confirm.
 */
362 | static void | |
363 | xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
364 | unsigned long *p3, unsigned long *p4) | |
365 | { | |
366 | unsigned long lines = bytes >> 6; | |
367 | ||
368 | kernel_fpu_begin(); | |
369 | ||
8fdf7655 | 370 | asm volatile( |
1da177e4 LT |
371 | " .align 32,0x90 ;\n" |
372 | " 1: ;\n" | |
373 | " movq (%1), %%mm0 ;\n" | |
374 | " movq 8(%1), %%mm1 ;\n" | |
375 | " pxor (%2), %%mm0 ;\n" | |
376 | " movq 16(%1), %%mm2 ;\n" | |
377 | " pxor 8(%2), %%mm1 ;\n" | |
378 | " pxor (%3), %%mm0 ;\n" | |
379 | " pxor 16(%2), %%mm2 ;\n" | |
380 | " pxor 8(%3), %%mm1 ;\n" | |
381 | " pxor (%4), %%mm0 ;\n" | |
382 | " movq 24(%1), %%mm3 ;\n" | |
383 | " pxor 16(%3), %%mm2 ;\n" | |
384 | " pxor 8(%4), %%mm1 ;\n" | |
385 | " movq %%mm0, (%1) ;\n" | |
386 | " movq 32(%1), %%mm4 ;\n" | |
387 | " pxor 24(%2), %%mm3 ;\n" | |
388 | " pxor 16(%4), %%mm2 ;\n" | |
389 | " movq %%mm1, 8(%1) ;\n" | |
390 | " movq 40(%1), %%mm5 ;\n" | |
391 | " pxor 32(%2), %%mm4 ;\n" | |
392 | " pxor 24(%3), %%mm3 ;\n" | |
393 | " movq %%mm2, 16(%1) ;\n" | |
394 | " pxor 40(%2), %%mm5 ;\n" | |
395 | " pxor 32(%3), %%mm4 ;\n" | |
396 | " pxor 24(%4), %%mm3 ;\n" | |
397 | " movq %%mm3, 24(%1) ;\n" | |
398 | " movq 56(%1), %%mm7 ;\n" | |
399 | " movq 48(%1), %%mm6 ;\n" | |
400 | " pxor 40(%3), %%mm5 ;\n" | |
401 | " pxor 32(%4), %%mm4 ;\n" | |
402 | " pxor 48(%2), %%mm6 ;\n" | |
403 | " movq %%mm4, 32(%1) ;\n" | |
404 | " pxor 56(%2), %%mm7 ;\n" | |
405 | " pxor 40(%4), %%mm5 ;\n" | |
406 | " pxor 48(%3), %%mm6 ;\n" | |
407 | " pxor 56(%3), %%mm7 ;\n" | |
408 | " movq %%mm5, 40(%1) ;\n" | |
409 | " pxor 48(%4), %%mm6 ;\n" | |
410 | " pxor 56(%4), %%mm7 ;\n" | |
411 | " movq %%mm6, 48(%1) ;\n" | |
412 | " movq %%mm7, 56(%1) ;\n" | |
8fdf7655 | 413 | |
1da177e4 LT |
414 | " addl $64, %1 ;\n" |
415 | " addl $64, %2 ;\n" | |
416 | " addl $64, %3 ;\n" | |
417 | " addl $64, %4 ;\n" | |
418 | " decl %0 ;\n" | |
419 | " jnz 1b ;\n" | |
420 | : "+r" (lines), | |
421 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | |
422 | : | |
423 | : "memory"); | |
424 | ||
425 | kernel_fpu_end(); | |
426 | } | |
427 | ||
/*
 * xor_p5_mmx_5 - compute p1 ^= p2 ^ p3 ^ p4 ^ p5 using MMX, with a
 * hand-interleaved instruction schedule.
 *
 * bytes:    length of the buffers; bytes >> 6 loop iterations are executed.
 * p1:       destination, also the first source.
 * p2..p5:   additional sources.
 *
 * Each iteration handles 64 bytes as eight quadwords in %mm0-%mm7: every
 * register is loaded from p1, XORed with the quadwords at the same offset
 * in p2, p3, p4 and p5, and stored back to p1.  Then all five pointers
 * advance by 64.
 * The asm is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 *
 * p4 and p5 are passed as input-only "r" operands although the asm adds
 * 64 to them; the empty asm statements before and after the loop are the
 * workaround described in the comments inside the function.
 *
 * NOTE(review): GCC's extended-asm documentation says input-only operands
 * must not be modified.  If the GCC < 3.1 operand limit no longer matters
 * for the supported compilers, declaring p4/p5 as "+r" would express this
 * directly - confirm before changing.
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 64 is presumably never passed by callers - confirm.
 */
428 | static void | |
429 | xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
430 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | |
431 | { | |
432 | unsigned long lines = bytes >> 6; | |
433 | ||
434 | kernel_fpu_begin(); | |
435 | ||
436 | /* Make sure GCC forgets anything it knows about p4 or p5, | |
437 | such that it won't pass to the asm volatile below a | |
438 | register that is shared with any other variable. That's | |
439 | because we modify p4 and p5 there, but we can't mark them | |
440 | as read/write, otherwise we'd overflow the 10-asm-operands | |
441 | limit of GCC < 3.1. */ | |
8fdf7655 | 442 | asm("" : "+r" (p4), "+r" (p5)); |
1da177e4 | 443 | |
8fdf7655 | 444 | asm volatile( |
1da177e4 LT |
445 | " .align 32,0x90 ;\n" |
446 | " 1: ;\n" | |
447 | " movq (%1), %%mm0 ;\n" | |
448 | " movq 8(%1), %%mm1 ;\n" | |
449 | " pxor (%2), %%mm0 ;\n" | |
450 | " pxor 8(%2), %%mm1 ;\n" | |
451 | " movq 16(%1), %%mm2 ;\n" | |
452 | " pxor (%3), %%mm0 ;\n" | |
453 | " pxor 8(%3), %%mm1 ;\n" | |
454 | " pxor 16(%2), %%mm2 ;\n" | |
455 | " pxor (%4), %%mm0 ;\n" | |
456 | " pxor 8(%4), %%mm1 ;\n" | |
457 | " pxor 16(%3), %%mm2 ;\n" | |
458 | " movq 24(%1), %%mm3 ;\n" | |
459 | " pxor (%5), %%mm0 ;\n" | |
460 | " pxor 8(%5), %%mm1 ;\n" | |
461 | " movq %%mm0, (%1) ;\n" | |
462 | " pxor 16(%4), %%mm2 ;\n" | |
463 | " pxor 24(%2), %%mm3 ;\n" | |
464 | " movq %%mm1, 8(%1) ;\n" | |
465 | " pxor 16(%5), %%mm2 ;\n" | |
466 | " pxor 24(%3), %%mm3 ;\n" | |
467 | " movq 32(%1), %%mm4 ;\n" | |
468 | " movq %%mm2, 16(%1) ;\n" | |
469 | " pxor 24(%4), %%mm3 ;\n" | |
470 | " pxor 32(%2), %%mm4 ;\n" | |
471 | " movq 40(%1), %%mm5 ;\n" | |
472 | " pxor 24(%5), %%mm3 ;\n" | |
473 | " pxor 32(%3), %%mm4 ;\n" | |
474 | " pxor 40(%2), %%mm5 ;\n" | |
475 | " movq %%mm3, 24(%1) ;\n" | |
476 | " pxor 32(%4), %%mm4 ;\n" | |
477 | " pxor 40(%3), %%mm5 ;\n" | |
478 | " movq 48(%1), %%mm6 ;\n" | |
479 | " movq 56(%1), %%mm7 ;\n" | |
480 | " pxor 32(%5), %%mm4 ;\n" | |
481 | " pxor 40(%4), %%mm5 ;\n" | |
482 | " pxor 48(%2), %%mm6 ;\n" | |
483 | " pxor 56(%2), %%mm7 ;\n" | |
484 | " movq %%mm4, 32(%1) ;\n" | |
485 | " pxor 48(%3), %%mm6 ;\n" | |
486 | " pxor 56(%3), %%mm7 ;\n" | |
487 | " pxor 40(%5), %%mm5 ;\n" | |
488 | " pxor 48(%4), %%mm6 ;\n" | |
489 | " pxor 56(%4), %%mm7 ;\n" | |
490 | " movq %%mm5, 40(%1) ;\n" | |
491 | " pxor 48(%5), %%mm6 ;\n" | |
492 | " pxor 56(%5), %%mm7 ;\n" | |
493 | " movq %%mm6, 48(%1) ;\n" | |
494 | " movq %%mm7, 56(%1) ;\n" | |
8fdf7655 | 495 | |
1da177e4 LT |
496 | " addl $64, %1 ;\n" |
497 | " addl $64, %2 ;\n" | |
498 | " addl $64, %3 ;\n" | |
499 | " addl $64, %4 ;\n" | |
500 | " addl $64, %5 ;\n" | |
501 | " decl %0 ;\n" | |
502 | " jnz 1b ;\n" | |
503 | : "+r" (lines), | |
504 | "+r" (p1), "+r" (p2), "+r" (p3) | |
505 | : "r" (p4), "r" (p5) | |
506 | : "memory"); | |
507 | ||
508 | /* p4 and p5 were modified, and now the variables are dead. | |
509 | Clobber them just to be sure nobody does something stupid | |
510 | like assuming they have some legal value. */ | |
8fdf7655 | 511 | asm("" : "=r" (p4), "=r" (p5)); |
1da177e4 LT |
512 | |
513 | kernel_fpu_end(); | |
514 | } | |
515 | ||
/*
 * Template named "pII_mmx": fills the do_2..do_5 slots with the
 * xor_pII_mmx_2..xor_pII_mmx_5 routines.  struct xor_block_template is
 * declared outside this file.
 */
516 | static struct xor_block_template xor_block_pII_mmx = { | |
517 | .name = "pII_mmx", | |
518 | .do_2 = xor_pII_mmx_2, | |
519 | .do_3 = xor_pII_mmx_3, | |
520 | .do_4 = xor_pII_mmx_4, | |
521 | .do_5 = xor_pII_mmx_5, | |
522 | }; | |
523 | ||
/*
 * Template named "p5_mmx": fills the do_2..do_5 slots with the
 * xor_p5_mmx_2..xor_p5_mmx_5 routines.  struct xor_block_template is
 * declared outside this file.
 */
524 | static struct xor_block_template xor_block_p5_mmx = { | |
525 | .name = "p5_mmx", | |
526 | .do_2 = xor_p5_mmx_2, | |
527 | .do_3 = xor_p5_mmx_3, | |
528 | .do_4 = xor_p5_mmx_4, | |
529 | .do_5 = xor_p5_mmx_5, | |
530 | }; | |
531 | ||
532 | /* | |
533 | * Cache avoiding checksumming functions utilizing KNI instructions | |
534 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | |
535 | */ | |
536 | ||
8fdf7655 JP |
/*
 * XMMS_SAVE - prologue for the SSE routines.
 *
 * Not hygienic: the expanding function must declare `cr0` and a buffer
 * `xmm_save` of at least 64 bytes (four 16-byte registers are stored at
 * offsets 0x00-0x30).  In order, it disables preemption, saves read_cr0()
 * into `cr0`, calls clts(), and stores %xmm0-%xmm3 into `xmm_save` with
 * unaligned moves.
 * Must be paired with XMMS_RESTORE, which re-enables preemption.
 *
 * NOTE(review): clts() presumably clears CR0.TS so the SSE instructions
 * do not fault - confirm against its definition.
 */
537 | #define XMMS_SAVE \ |
538 | do { \ | |
1da177e4 | 539 | preempt_disable(); \ |
4bb0d3ec ZA |
540 | cr0 = read_cr0(); \ |
541 | clts(); \ | |
8fdf7655 | 542 | asm volatile( \ |
4bb0d3ec ZA |
543 | "movups %%xmm0,(%0) ;\n\t" \ |
544 | "movups %%xmm1,0x10(%0) ;\n\t" \ | |
545 | "movups %%xmm2,0x20(%0) ;\n\t" \ | |
546 | "movups %%xmm3,0x30(%0) ;\n\t" \ | |
547 | : \ | |
1da177e4 LT |
548 | : "r" (xmm_save) \ |
549 | : "memory"); \ | |
8fdf7655 | 550 | } while (0) |
1da177e4 | 551 | |
8fdf7655 JP |
/*
 * XMMS_RESTORE - epilogue for the SSE routines, undoing XMMS_SAVE.
 *
 * Uses the expanding function's `xmm_save` and `cr0`.  In order, it issues
 * sfence, reloads %xmm0-%xmm3 from `xmm_save`, writes the saved value back
 * with write_cr0(cr0), and re-enables preemption.
 */
552 | #define XMMS_RESTORE \ |
553 | do { \ | |
554 | asm volatile( \ | |
1da177e4 | 555 | "sfence ;\n\t" \ |
4bb0d3ec ZA |
556 | "movups (%0),%%xmm0 ;\n\t" \ |
557 | "movups 0x10(%0),%%xmm1 ;\n\t" \ | |
558 | "movups 0x20(%0),%%xmm2 ;\n\t" \ | |
559 | "movups 0x30(%0),%%xmm3 ;\n\t" \ | |
1da177e4 | 560 | : \ |
4bb0d3ec | 561 | : "r" (xmm_save) \ |
1da177e4 | 562 | : "memory"); \ |
4bb0d3ec | 563 | write_cr0(cr0); \ |
1da177e4 | 564 | preempt_enable(); \ |
8fdf7655 | 565 | } while (0) |
1da177e4 LT |
566 | |
/*
 * Helpers for the SSE routines.  The %N numbers are operands of the asm
 * statement that expands the macro; x is a 16-byte slot index, y an XMM
 * register number.
 *   ALIGN16    - 16-byte alignment attribute (used on the xmm_save buffers)
 *   OFFS(x)    - byte offset string 16*x
 *   PF_OFFS(x) - byte offset string 256+16*x, used by the prefetches
 *   PF0..PF5   - prefetchnta at PF_OFFS(x) from %1 .. %6
 *   LD / ST    - movaps between OFFS(x)(%1) and %xmmY
 *   XO1..XO5   - xorps OFFS(x)(%2) .. OFFS(x)(%6) into %xmmY
 * PF5 and XO5 (operand %6) are not used by the routines below.
 *
 * NOTE(review): movaps/xorps with memory operands presumably require the
 * buffers to be 16-byte aligned - confirm with the callers.
 */
567 | #define ALIGN16 __attribute__((aligned(16))) | |
568 | ||
569 | #define OFFS(x) "16*("#x")" | |
570 | #define PF_OFFS(x) "256+16*("#x")" | |
571 | #define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" | |
8fdf7655 JP |
572 | #define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" |
573 | #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" | |
1da177e4 LT |
574 | #define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n" |
575 | #define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n" | |
576 | #define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n" | |
577 | #define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n" | |
578 | #define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n" | |
8fdf7655 JP |
579 | #define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" |
580 | #define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" | |
581 | #define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" | |
582 | #define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" | |
583 | #define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" | |
1da177e4 LT |
584 | |
585 | ||
/*
 * xor_sse_2 - XOR the buffer at p2 into the buffer at p1 using SSE.
 *
 * bytes: length of the buffers; bytes >> 8 loop iterations are executed.
 * p1:    destination, also the first source.
 * p2:    second source.
 *
 * XMMS_SAVE/XMMS_RESTORE bracket the work and use the local `xmm_save`
 * buffer and `cr0`.  Two prefetches of p1 are issued before the loop.
 * One BLOCK loads four 16-byte slots of p1 into %xmm0-%xmm3, XORs in the
 * matching slots of p2 and stores them back, with prefetchnta of both
 * buffers (at 256 bytes plus the slot offset) mixed in.  Four BLOCKs make
 * 256 bytes per iteration, after which both pointers advance by 256.
 *
 * NOTE(review): `cr0` is an int but receives read_cr0(); check that
 * function's return type.
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 256 is presumably never passed by callers - confirm.
 */
586 | static void | |
587 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | |
588 | { | |
8fdf7655 | 589 | unsigned long lines = bytes >> 8; |
1da177e4 LT |
590 | char xmm_save[16*4] ALIGN16; |
591 | int cr0; | |
592 | ||
593 | XMMS_SAVE; | |
594 | ||
8fdf7655 | 595 | asm volatile( |
1da177e4 | 596 | #undef BLOCK |
8fdf7655 JP |
597 | #define BLOCK(i) \ |
598 | LD(i, 0) \ | |
599 | LD(i + 1, 1) \ | |
1da177e4 | 600 | PF1(i) \ |
8fdf7655 JP |
601 | PF1(i + 2) \ |
602 | LD(i + 2, 2) \ | |
603 | LD(i + 3, 3) \ | |
604 | PF0(i + 4) \ | |
605 | PF0(i + 6) \ | |
606 | XO1(i, 0) \ | |
607 | XO1(i + 1, 1) \ | |
608 | XO1(i + 2, 2) \ | |
609 | XO1(i + 3, 3) \ | |
610 | ST(i, 0) \ | |
611 | ST(i + 1, 1) \ | |
612 | ST(i + 2, 2) \ | |
613 | ST(i + 3, 3) \ | |
1da177e4 LT |
614 | |
615 | ||
616 | PF0(0) | |
617 | PF0(2) | |
618 | ||
619 | " .align 32 ;\n" | |
8fdf7655 | 620 | " 1: ;\n" |
1da177e4 LT |
621 | |
622 | BLOCK(0) | |
623 | BLOCK(4) | |
624 | BLOCK(8) | |
625 | BLOCK(12) | |
626 | ||
8fdf7655 JP |
627 | " addl $256, %1 ;\n" |
628 | " addl $256, %2 ;\n" | |
629 | " decl %0 ;\n" | |
630 | " jnz 1b ;\n" | |
1da177e4 LT |
631 | : "+r" (lines), |
632 | "+r" (p1), "+r" (p2) | |
633 | : | |
8fdf7655 | 634 | : "memory"); |
1da177e4 LT |
635 | |
636 | XMMS_RESTORE; | |
637 | } | |
638 | ||
/*
 * xor_sse_3 - compute p1 ^= p2 ^ p3 using SSE.
 *
 * bytes: length of the buffers; bytes >> 8 loop iterations are executed.
 * p1:    destination, also the first source.
 * p2/p3: additional sources.
 *
 * XMMS_SAVE/XMMS_RESTORE bracket the work and use the local `xmm_save`
 * buffer and `cr0`.  Two prefetches of p1 are issued before the loop.
 * One BLOCK loads four 16-byte slots of p1 into %xmm0-%xmm3, XORs in the
 * matching slots of p2 (XO1) and p3 (XO2) and stores them back, with
 * prefetchnta of all three buffers mixed in.  Four BLOCKs make 256 bytes
 * per iteration, after which all three pointers advance by 256.
 *
 * NOTE(review): `cr0` is an int but receives read_cr0(); check that
 * function's return type.
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 256 is presumably never passed by callers - confirm.
 */
639 | static void | |
640 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
641 | unsigned long *p3) | |
642 | { | |
8fdf7655 | 643 | unsigned long lines = bytes >> 8; |
1da177e4 LT |
644 | char xmm_save[16*4] ALIGN16; |
645 | int cr0; | |
646 | ||
647 | XMMS_SAVE; | |
648 | ||
8fdf7655 | 649 | asm volatile( |
1da177e4 LT |
650 | #undef BLOCK |
651 | #define BLOCK(i) \ | |
652 | PF1(i) \ | |
8fdf7655 | 653 | PF1(i + 2) \ |
1da177e4 | 654 | LD(i,0) \ |
8fdf7655 JP |
655 | LD(i + 1, 1) \ |
656 | LD(i + 2, 2) \ | |
657 | LD(i + 3, 3) \ | |
1da177e4 | 658 | PF2(i) \ |
8fdf7655 JP |
659 | PF2(i + 2) \ |
660 | PF0(i + 4) \ | |
661 | PF0(i + 6) \ | |
1da177e4 | 662 | XO1(i,0) \ |
8fdf7655 JP |
663 | XO1(i + 1, 1) \ |
664 | XO1(i + 2, 2) \ | |
665 | XO1(i + 3, 3) \ | |
1da177e4 | 666 | XO2(i,0) \ |
8fdf7655 JP |
667 | XO2(i + 1, 1) \ |
668 | XO2(i + 2, 2) \ | |
669 | XO2(i + 3, 3) \ | |
1da177e4 | 670 | ST(i,0) \ |
8fdf7655 JP |
671 | ST(i + 1, 1) \ |
672 | ST(i + 2, 2) \ | |
673 | ST(i + 3, 3) \ | |
1da177e4 LT |
674 | |
675 | ||
676 | PF0(0) | |
677 | PF0(2) | |
678 | ||
679 | " .align 32 ;\n" | |
8fdf7655 | 680 | " 1: ;\n" |
1da177e4 LT |
681 | |
682 | BLOCK(0) | |
683 | BLOCK(4) | |
684 | BLOCK(8) | |
685 | BLOCK(12) | |
686 | ||
8fdf7655 JP |
687 | " addl $256, %1 ;\n" |
688 | " addl $256, %2 ;\n" | |
689 | " addl $256, %3 ;\n" | |
690 | " decl %0 ;\n" | |
691 | " jnz 1b ;\n" | |
1da177e4 LT |
692 | : "+r" (lines), |
693 | "+r" (p1), "+r"(p2), "+r"(p3) | |
694 | : | |
8fdf7655 | 695 | : "memory" ); |
1da177e4 LT |
696 | |
697 | XMMS_RESTORE; | |
698 | } | |
699 | ||
/*
 * xor_sse_4 - compute p1 ^= p2 ^ p3 ^ p4 using SSE.
 *
 * bytes:    length of the buffers; bytes >> 8 loop iterations are executed.
 * p1:       destination, also the first source.
 * p2..p4:   additional sources.
 *
 * XMMS_SAVE/XMMS_RESTORE bracket the work and use the local `xmm_save`
 * buffer and `cr0`.  Two prefetches of p1 are issued before the loop.
 * One BLOCK loads four 16-byte slots of p1 into %xmm0-%xmm3, XORs in the
 * matching slots of p2, p3 and p4 (XO1-XO3) and stores them back, with
 * prefetchnta of all four buffers mixed in.  Four BLOCKs make 256 bytes
 * per iteration, after which all four pointers advance by 256.
 *
 * NOTE(review): `cr0` is an int but receives read_cr0(); check that
 * function's return type.
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 256 is presumably never passed by callers - confirm.
 */
700 | static void | |
701 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
702 | unsigned long *p3, unsigned long *p4) | |
703 | { | |
8fdf7655 | 704 | unsigned long lines = bytes >> 8; |
1da177e4 LT |
705 | char xmm_save[16*4] ALIGN16; |
706 | int cr0; | |
707 | ||
708 | XMMS_SAVE; | |
709 | ||
8fdf7655 | 710 | asm volatile( |
1da177e4 LT |
711 | #undef BLOCK |
712 | #define BLOCK(i) \ | |
713 | PF1(i) \ | |
8fdf7655 | 714 | PF1(i + 2) \ |
1da177e4 | 715 | LD(i,0) \ |
8fdf7655 JP |
716 | LD(i + 1, 1) \ |
717 | LD(i + 2, 2) \ | |
718 | LD(i + 3, 3) \ | |
1da177e4 | 719 | PF2(i) \ |
8fdf7655 | 720 | PF2(i + 2) \ |
1da177e4 | 721 | XO1(i,0) \ |
8fdf7655 JP |
722 | XO1(i + 1, 1) \ |
723 | XO1(i + 2, 2) \ | |
724 | XO1(i + 3, 3) \ | |
1da177e4 | 725 | PF3(i) \ |
8fdf7655 JP |
726 | PF3(i + 2) \ |
727 | PF0(i + 4) \ | |
728 | PF0(i + 6) \ | |
1da177e4 | 729 | XO2(i,0) \ |
8fdf7655 JP |
730 | XO2(i + 1, 1) \ |
731 | XO2(i + 2, 2) \ | |
732 | XO2(i + 3, 3) \ | |
1da177e4 | 733 | XO3(i,0) \ |
8fdf7655 JP |
734 | XO3(i + 1, 1) \ |
735 | XO3(i + 2, 2) \ | |
736 | XO3(i + 3, 3) \ | |
1da177e4 | 737 | ST(i,0) \ |
8fdf7655 JP |
738 | ST(i + 1, 1) \ |
739 | ST(i + 2, 2) \ | |
740 | ST(i + 3, 3) \ | |
1da177e4 LT |
741 | |
742 | ||
743 | PF0(0) | |
744 | PF0(2) | |
745 | ||
746 | " .align 32 ;\n" | |
8fdf7655 | 747 | " 1: ;\n" |
1da177e4 LT |
748 | |
749 | BLOCK(0) | |
750 | BLOCK(4) | |
751 | BLOCK(8) | |
752 | BLOCK(12) | |
753 | ||
8fdf7655 JP |
754 | " addl $256, %1 ;\n" |
755 | " addl $256, %2 ;\n" | |
756 | " addl $256, %3 ;\n" | |
757 | " addl $256, %4 ;\n" | |
758 | " decl %0 ;\n" | |
759 | " jnz 1b ;\n" | |
1da177e4 LT |
760 | : "+r" (lines), |
761 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | |
762 | : | |
8fdf7655 | 763 | : "memory" ); |
1da177e4 LT |
764 | |
765 | XMMS_RESTORE; | |
766 | } | |
767 | ||
/*
 * xor_sse_5 - compute p1 ^= p2 ^ p3 ^ p4 ^ p5 using SSE.
 *
 * bytes:    length of the buffers; bytes >> 8 loop iterations are executed.
 * p1:       destination, also the first source.
 * p2..p5:   additional sources.
 *
 * XMMS_SAVE/XMMS_RESTORE bracket the work and use the local `xmm_save`
 * buffer and `cr0`.  Two prefetches of p1 are issued before the loop.
 * One BLOCK loads four 16-byte slots of p1 into %xmm0-%xmm3, XORs in the
 * matching slots of p2..p5 (XO1-XO4) and stores them back, with
 * prefetchnta of all five buffers mixed in.  Four BLOCKs make 256 bytes
 * per iteration, after which all five pointers advance by 256.
 *
 * p4 and p5 are passed as input-only "r" operands although the asm adds
 * 256 to them; the empty asm statements before and after the loop are the
 * workaround described in the comments inside the function.
 *
 * NOTE(review): GCC's extended-asm documentation says input-only operands
 * must not be modified.  If the GCC < 3.1 operand limit no longer matters
 * for the supported compilers, declaring p4/p5 as "+r" would express this
 * directly - confirm before changing.
 * NOTE(review): `cr0` is an int but receives read_cr0(); check that
 * function's return type.
 * NOTE(review): decl/jnz tests the counter after the first pass, so
 * bytes < 256 is presumably never passed by callers - confirm.
 */
768 | static void | |
769 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
770 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | |
771 | { | |
8fdf7655 | 772 | unsigned long lines = bytes >> 8; |
1da177e4 LT |
773 | char xmm_save[16*4] ALIGN16; |
774 | int cr0; | |
775 | ||
776 | XMMS_SAVE; | |
777 | ||
778 | /* Make sure GCC forgets anything it knows about p4 or p5, | |
779 | such that it won't pass to the asm volatile below a | |
780 | register that is shared with any other variable. That's | |
781 | because we modify p4 and p5 there, but we can't mark them | |
782 | as read/write, otherwise we'd overflow the 10-asm-operands | |
783 | limit of GCC < 3.1. */ | |
8fdf7655 | 784 | asm("" : "+r" (p4), "+r" (p5)); |
1da177e4 | 785 | |
8fdf7655 | 786 | asm volatile( |
1da177e4 LT |
787 | #undef BLOCK |
788 | #define BLOCK(i) \ | |
789 | PF1(i) \ | |
8fdf7655 | 790 | PF1(i + 2) \ |
1da177e4 | 791 | LD(i,0) \ |
8fdf7655 JP |
792 | LD(i + 1, 1) \ |
793 | LD(i + 2, 2) \ | |
794 | LD(i + 3, 3) \ | |
1da177e4 | 795 | PF2(i) \ |
8fdf7655 | 796 | PF2(i + 2) \ |
1da177e4 | 797 | XO1(i,0) \ |
8fdf7655 JP |
798 | XO1(i + 1, 1) \ |
799 | XO1(i + 2, 2) \ | |
800 | XO1(i + 3, 3) \ | |
1da177e4 | 801 | PF3(i) \ |
8fdf7655 | 802 | PF3(i + 2) \ |
1da177e4 | 803 | XO2(i,0) \ |
8fdf7655 JP |
804 | XO2(i + 1, 1) \ |
805 | XO2(i + 2, 2) \ | |
806 | XO2(i + 3, 3) \ | |
1da177e4 | 807 | PF4(i) \ |
8fdf7655 JP |
808 | PF4(i + 2) \ |
809 | PF0(i + 4) \ | |
810 | PF0(i + 6) \ | |
1da177e4 | 811 | XO3(i,0) \ |
8fdf7655 JP |
812 | XO3(i + 1, 1) \ |
813 | XO3(i + 2, 2) \ | |
814 | XO3(i + 3, 3) \ | |
1da177e4 | 815 | XO4(i,0) \ |
8fdf7655 JP |
816 | XO4(i + 1, 1) \ |
817 | XO4(i + 2, 2) \ | |
818 | XO4(i + 3, 3) \ | |
1da177e4 | 819 | ST(i,0) \ |
8fdf7655 JP |
820 | ST(i + 1, 1) \ |
821 | ST(i + 2, 2) \ | |
822 | ST(i + 3, 3) \ | |
1da177e4 LT |
823 | |
824 | ||
825 | PF0(0) | |
826 | PF0(2) | |
827 | ||
828 | " .align 32 ;\n" | |
8fdf7655 | 829 | " 1: ;\n" |
1da177e4 LT |
830 | |
831 | BLOCK(0) | |
832 | BLOCK(4) | |
833 | BLOCK(8) | |
834 | BLOCK(12) | |
835 | ||
8fdf7655 JP |
836 | " addl $256, %1 ;\n" |
837 | " addl $256, %2 ;\n" | |
838 | " addl $256, %3 ;\n" | |
839 | " addl $256, %4 ;\n" | |
840 | " addl $256, %5 ;\n" | |
841 | " decl %0 ;\n" | |
842 | " jnz 1b ;\n" | |
1da177e4 LT |
843 | : "+r" (lines), |
844 | "+r" (p1), "+r" (p2), "+r" (p3) | |
845 | : "r" (p4), "r" (p5) | |
846 | : "memory"); | |
847 | ||
848 | /* p4 and p5 were modified, and now the variables are dead. | |
849 | Clobber them just to be sure nobody does something stupid | |
850 | like assuming they have some legal value. */ | |
8fdf7655 | 851 | asm("" : "=r" (p4), "=r" (p5)); |
1da177e4 LT |
852 | |
853 | XMMS_RESTORE; | |
854 | } | |
855 | ||
/*
 * Template named "pIII_sse": fills the do_2..do_5 slots with the
 * xor_sse_2..xor_sse_5 routines.  struct xor_block_template is declared
 * outside this file.
 */
856 | static struct xor_block_template xor_block_pIII_sse = { | |
8fdf7655 JP |
857 | .name = "pIII_sse", |
858 | .do_2 = xor_sse_2, | |
859 | .do_3 = xor_sse_3, | |
860 | .do_4 = xor_sse_4, | |
861 | .do_5 = xor_sse_5, | |
1da177e4 LT |
862 | }; |
863 | ||
864 | /* Also try the generic routines. */ | |
865 | #include <asm-generic/xor.h> | |
866 | ||
/*
 * XOR_TRY_TEMPLATES - replaces any earlier definition and passes each
 * candidate template to xor_speed(): first the four xor_block_*regs*
 * templates (presumably provided by the asm-generic/xor.h include above -
 * confirm), then the pIII SSE template if cpu_has_xmm, then both MMX
 * templates if cpu_has_mmx.
 */
867 | #undef XOR_TRY_TEMPLATES | |
868 | #define XOR_TRY_TEMPLATES \ | |
8fdf7655 JP |
869 | do { \ |
870 | xor_speed(&xor_block_8regs); \ | |
871 | xor_speed(&xor_block_8regs_p); \ | |
872 | xor_speed(&xor_block_32regs); \ | |
873 | xor_speed(&xor_block_32regs_p); \ | |
874 | if (cpu_has_xmm) \ | |
875 | xor_speed(&xor_block_pIII_sse); \ | |
876 | if (cpu_has_mmx) { \ | |
877 | xor_speed(&xor_block_pII_mmx); \ | |
878 | xor_speed(&xor_block_p5_mmx); \ | |
879 | } \ | |
880 | } while (0) | |
1da177e4 LT |
881 | |
/*
 * XOR_SELECT_TEMPLATE(FASTEST) - evaluates to &xor_block_pIII_sse when
 * cpu_has_xmm is true, otherwise to the FASTEST argument unchanged.
 */
882 | /* We force the use of the SSE xor block because it can write around L2. | |
883 | We may also be able to load into the L1 only depending on how the cpu | |
884 | deals with a load to a line that is being prefetched. */ | |
8fdf7655 | 885 | #define XOR_SELECT_TEMPLATE(FASTEST) \ |
1da177e4 | 886 | (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) |
0db125c4 VN |
887 | |
888 | #endif /* ASM_X86__XOR_32_H */ |