Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * include/asm-i386/xor.h | |
3 | * | |
4 | * Optimized RAID-5 checksumming functions for MMX and SSE. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2, or (at your option) | |
9 | * any later version. | |
10 | * | |
11 | * You should have received a copy of the GNU General Public License | |
12 | * (for example /usr/src/linux/COPYING); if not, write to the Free | |
13 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
14 | */ | |
15 | ||
16 | /* | |
17 | * High-speed RAID5 checksumming functions utilizing MMX instructions. | |
18 | * Copyright (C) 1998 Ingo Molnar. | |
19 | */ | |
20 | ||
/*
 * Assembler-fragment helpers for the MMX loops below.  Each expands to a
 * single instruction string operating on quadword 'x' (byte offset 8*x)
 * of an inline-asm operand; "#x"/"#y" stringizing lets the C preprocessor
 * paste the offset expression and MMX register number into the asm text:
 *   LD(x,y)  - load quadword x of destination buffer %1 into %%mm<y>
 *   ST(x,y)  - store %%mm<y> back to quadword x of %1
 *   XOn(x,y) - XOR quadword x of source operand %<n+1> into %%mm<y>
 */
#define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"

/* for kernel_fpu_begin()/kernel_fpu_end() */
#include <asm/i387.h>
29 | ||
/*
 * p1 ^= p2 over 'bytes' bytes (bytes must be a multiple of 128).
 * Pentium II-tuned MMX version: each BLOCK loads four quadwords into
 * mm0-mm3, XORs in the corresponding quadwords of p2 and stores back,
 * with XOR and store interleaved per register to hide latency.  One
 * loop iteration (four BLOCKs) covers 128 bytes.
 */
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;	/* 128 bytes per iteration */

	kernel_fpu_begin();			/* MMX aliases the FPU registers */

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	ST(i,0) \
	XO1(i+1,1) \
	ST(i+1,1) \
	XO1(i+2,2) \
	ST(i+2,2) \
	XO1(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
72 | ||
/*
 * p1 ^= p2 ^ p3 over 'bytes' bytes (multiple of 128).  Same scheme as
 * xor_pII_mmx_2 but with a second source: XOR in all of p2 first, then
 * interleave the p3 XORs with the stores.  128 bytes per loop pass.
 */
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;	/* 128 bytes per iteration */

	kernel_fpu_begin();			/* MMX aliases the FPU registers */

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	ST(i,0) \
	XO2(i+1,1) \
	ST(i+1,1) \
	XO2(i+2,2) \
	ST(i+2,2) \
	XO2(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}
121 | ||
122 | static void | |
123 | xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
124 | unsigned long *p3, unsigned long *p4) | |
125 | { | |
126 | unsigned long lines = bytes >> 7; | |
127 | ||
128 | kernel_fpu_begin(); | |
129 | ||
130 | __asm__ __volatile__ ( | |
131 | #undef BLOCK | |
132 | #define BLOCK(i) \ | |
133 | LD(i,0) \ | |
134 | LD(i+1,1) \ | |
135 | LD(i+2,2) \ | |
136 | LD(i+3,3) \ | |
137 | XO1(i,0) \ | |
138 | XO1(i+1,1) \ | |
139 | XO1(i+2,2) \ | |
140 | XO1(i+3,3) \ | |
141 | XO2(i,0) \ | |
142 | XO2(i+1,1) \ | |
143 | XO2(i+2,2) \ | |
144 | XO2(i+3,3) \ | |
145 | XO3(i,0) \ | |
146 | ST(i,0) \ | |
147 | XO3(i+1,1) \ | |
148 | ST(i+1,1) \ | |
149 | XO3(i+2,2) \ | |
150 | ST(i+2,2) \ | |
151 | XO3(i+3,3) \ | |
152 | ST(i+3,3) | |
153 | ||
154 | " .align 32 ;\n" | |
155 | " 1: ;\n" | |
156 | ||
157 | BLOCK(0) | |
158 | BLOCK(4) | |
159 | BLOCK(8) | |
160 | BLOCK(12) | |
161 | ||
162 | " addl $128, %1 ;\n" | |
163 | " addl $128, %2 ;\n" | |
164 | " addl $128, %3 ;\n" | |
165 | " addl $128, %4 ;\n" | |
166 | " decl %0 ;\n" | |
167 | " jnz 1b ;\n" | |
168 | : "+r" (lines), | |
169 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | |
170 | : | |
171 | : "memory"); | |
172 | ||
173 | kernel_fpu_end(); | |
174 | } | |
175 | ||
176 | ||
/*
 * p1 ^= p2 ^ p3 ^ p4 ^ p5 over 'bytes' bytes (multiple of 128).
 * Four-source variant of the pII MMX loop.  p4 and p5 are passed as
 * input-only asm operands (see the comment below for why), even though
 * the asm advances them; the compiler barriers around the asm keep GCC
 * from caching stale values.
 */
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;	/* 128 bytes per iteration */

	kernel_fpu_begin();			/* MMX aliases the FPU registers */

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	__asm__ ("" : "+r" (p4), "+r" (p5));

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	XO3(i,0) \
	XO3(i+1,1) \
	XO3(i+2,2) \
	XO3(i+3,3) \
	XO4(i,0) \
	ST(i,0) \
	XO4(i+1,1) \
	ST(i+1,1) \
	XO4(i+2,2) \
	ST(i+2,2) \
	XO4(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" addl $128, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	__asm__ ("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}
248 | ||
/* The pII_mmx routines are done with these helper names; drop them so
   the SSE section below can redefine LD/ST/XO* (and BLOCK) for xmm
   registers without redefinition warnings.  */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
256 | ||
/*
 * p1 ^= p2 over 'bytes' bytes (multiple of 64).  Pentium (P5)-tuned
 * version: a fully hand-unrolled 64-byte loop using all eight MMX
 * registers, with loads, XORs and stores interleaved for the P5's
 * dual pipelines.  Instruction order is the whole point here.
 */
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;	/* 64 bytes per iteration */

	kernel_fpu_begin();			/* MMX aliases the FPU registers */

	__asm__ __volatile__ (
	" .align 32 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
303 | ||
/*
 * p1 ^= p2 ^ p3 over 'bytes' bytes (multiple of 64).  Two-source
 * P5-tuned loop; same hand-scheduled 64-byte unroll as xor_p5_mmx_2
 * with the second source's pxors folded into the schedule.
 */
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;	/* 64 bytes per iteration */

	kernel_fpu_begin();			/* MMX aliases the FPU registers */

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"			/* pad with NOPs (0x90) */
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory" );

	kernel_fpu_end();
}
360 | ||
/*
 * p1 ^= p2 ^ p3 ^ p4 over 'bytes' bytes (multiple of 64).  Three-source
 * P5-tuned loop; hand-scheduled 64-byte unroll.
 */
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;	/* 64 bytes per iteration */

	kernel_fpu_begin();			/* MMX aliases the FPU registers */

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"			/* pad with NOPs (0x90) */
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}
426 | ||
/*
 * p1 ^= p2 ^ p3 ^ p4 ^ p5 over 'bytes' bytes (multiple of 64).
 * Four-source P5-tuned loop.  p4 and p5 are passed as input-only asm
 * operands (see comment below for why) although the asm advances them;
 * the compiler barriers around the asm keep GCC honest about that.
 */
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;	/* 64 bytes per iteration */

	kernel_fpu_begin();			/* MMX aliases the FPU registers */

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	__asm__ ("" : "+r" (p4), "+r" (p5));

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"			/* pad with NOPs (0x90) */
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor (%5), %%mm0 ;\n"
	" pxor 8(%5), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%5), %%mm2 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%5), %%mm3 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 32(%5), %%mm4 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" pxor 40(%5), %%mm5 ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%5), %%mm6 ;\n"
	" pxor 56(%5), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" addl $64, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	__asm__ ("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}
514 | ||
515 | static struct xor_block_template xor_block_pII_mmx = { | |
516 | .name = "pII_mmx", | |
517 | .do_2 = xor_pII_mmx_2, | |
518 | .do_3 = xor_pII_mmx_3, | |
519 | .do_4 = xor_pII_mmx_4, | |
520 | .do_5 = xor_pII_mmx_5, | |
521 | }; | |
522 | ||
523 | static struct xor_block_template xor_block_p5_mmx = { | |
524 | .name = "p5_mmx", | |
525 | .do_2 = xor_p5_mmx_2, | |
526 | .do_3 = xor_p5_mmx_3, | |
527 | .do_4 = xor_p5_mmx_4, | |
528 | .do_5 = xor_p5_mmx_5, | |
529 | }; | |
530 | ||
/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * XMMS_SAVE / XMMS_RESTORE bracket the SSE loops below.  Both expect two
 * names in the caller's scope: 'cr0' (to stash the CR0 value across the
 * region) and 'xmm_save' (a 16-byte-aligned 64-byte buffer for xmm0-xmm3).
 * SAVE disables preemption, clears CR0.TS via clts() so the SSE insns
 * won't fault, and spills xmm0-xmm3; RESTORE reloads them, restores CR0
 * (which also restores TS) and re-enables preemption.  The sfence orders
 * the loop's stores before we give the registers back.
 */
#define XMMS_SAVE do { \
	preempt_disable(); \
	cr0 = read_cr0(); \
	clts(); \
	__asm__ __volatile__ ( \
		"movups %%xmm0,(%0) ;\n\t" \
		"movups %%xmm1,0x10(%0) ;\n\t" \
		"movups %%xmm2,0x20(%0) ;\n\t" \
		"movups %%xmm3,0x30(%0) ;\n\t" \
		: \
		: "r" (xmm_save) \
		: "memory"); \
} while(0)

#define XMMS_RESTORE do { \
	__asm__ __volatile__ ( \
		"sfence ;\n\t" \
		"movups (%0),%%xmm0 ;\n\t" \
		"movups 0x10(%0),%%xmm1 ;\n\t" \
		"movups 0x20(%0),%%xmm2 ;\n\t" \
		"movups 0x30(%0),%%xmm3 ;\n\t" \
		: \
		: "r" (xmm_save) \
		: "memory"); \
	write_cr0(cr0); \
	preempt_enable(); \
} while(0)

#define ALIGN16 __attribute__((aligned(16)))

/*
 * SSE variants of the asm-fragment helpers: 16-byte units (OFFS) and
 * prefetchnta 256 bytes (one loop iteration) ahead of each operand
 * (PF0 = destination %1, PF1..PF5 = sources %2..%6).
 */
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
#define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
581 | ||
582 | ||
583 | static void | |
584 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | |
585 | { | |
586 | unsigned long lines = bytes >> 8; | |
587 | char xmm_save[16*4] ALIGN16; | |
588 | int cr0; | |
589 | ||
590 | XMMS_SAVE; | |
591 | ||
592 | __asm__ __volatile__ ( | |
593 | #undef BLOCK | |
594 | #define BLOCK(i) \ | |
595 | LD(i,0) \ | |
596 | LD(i+1,1) \ | |
597 | PF1(i) \ | |
598 | PF1(i+2) \ | |
599 | LD(i+2,2) \ | |
600 | LD(i+3,3) \ | |
601 | PF0(i+4) \ | |
602 | PF0(i+6) \ | |
603 | XO1(i,0) \ | |
604 | XO1(i+1,1) \ | |
605 | XO1(i+2,2) \ | |
606 | XO1(i+3,3) \ | |
607 | ST(i,0) \ | |
608 | ST(i+1,1) \ | |
609 | ST(i+2,2) \ | |
610 | ST(i+3,3) \ | |
611 | ||
612 | ||
613 | PF0(0) | |
614 | PF0(2) | |
615 | ||
616 | " .align 32 ;\n" | |
617 | " 1: ;\n" | |
618 | ||
619 | BLOCK(0) | |
620 | BLOCK(4) | |
621 | BLOCK(8) | |
622 | BLOCK(12) | |
623 | ||
624 | " addl $256, %1 ;\n" | |
625 | " addl $256, %2 ;\n" | |
626 | " decl %0 ;\n" | |
627 | " jnz 1b ;\n" | |
628 | : "+r" (lines), | |
629 | "+r" (p1), "+r" (p2) | |
630 | : | |
631 | : "memory"); | |
632 | ||
633 | XMMS_RESTORE; | |
634 | } | |
635 | ||
636 | static void | |
637 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
638 | unsigned long *p3) | |
639 | { | |
640 | unsigned long lines = bytes >> 8; | |
641 | char xmm_save[16*4] ALIGN16; | |
642 | int cr0; | |
643 | ||
644 | XMMS_SAVE; | |
645 | ||
646 | __asm__ __volatile__ ( | |
647 | #undef BLOCK | |
648 | #define BLOCK(i) \ | |
649 | PF1(i) \ | |
650 | PF1(i+2) \ | |
651 | LD(i,0) \ | |
652 | LD(i+1,1) \ | |
653 | LD(i+2,2) \ | |
654 | LD(i+3,3) \ | |
655 | PF2(i) \ | |
656 | PF2(i+2) \ | |
657 | PF0(i+4) \ | |
658 | PF0(i+6) \ | |
659 | XO1(i,0) \ | |
660 | XO1(i+1,1) \ | |
661 | XO1(i+2,2) \ | |
662 | XO1(i+3,3) \ | |
663 | XO2(i,0) \ | |
664 | XO2(i+1,1) \ | |
665 | XO2(i+2,2) \ | |
666 | XO2(i+3,3) \ | |
667 | ST(i,0) \ | |
668 | ST(i+1,1) \ | |
669 | ST(i+2,2) \ | |
670 | ST(i+3,3) \ | |
671 | ||
672 | ||
673 | PF0(0) | |
674 | PF0(2) | |
675 | ||
676 | " .align 32 ;\n" | |
677 | " 1: ;\n" | |
678 | ||
679 | BLOCK(0) | |
680 | BLOCK(4) | |
681 | BLOCK(8) | |
682 | BLOCK(12) | |
683 | ||
684 | " addl $256, %1 ;\n" | |
685 | " addl $256, %2 ;\n" | |
686 | " addl $256, %3 ;\n" | |
687 | " decl %0 ;\n" | |
688 | " jnz 1b ;\n" | |
689 | : "+r" (lines), | |
690 | "+r" (p1), "+r"(p2), "+r"(p3) | |
691 | : | |
692 | : "memory" ); | |
693 | ||
694 | XMMS_RESTORE; | |
695 | } | |
696 | ||
697 | static void | |
698 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
699 | unsigned long *p3, unsigned long *p4) | |
700 | { | |
701 | unsigned long lines = bytes >> 8; | |
702 | char xmm_save[16*4] ALIGN16; | |
703 | int cr0; | |
704 | ||
705 | XMMS_SAVE; | |
706 | ||
707 | __asm__ __volatile__ ( | |
708 | #undef BLOCK | |
709 | #define BLOCK(i) \ | |
710 | PF1(i) \ | |
711 | PF1(i+2) \ | |
712 | LD(i,0) \ | |
713 | LD(i+1,1) \ | |
714 | LD(i+2,2) \ | |
715 | LD(i+3,3) \ | |
716 | PF2(i) \ | |
717 | PF2(i+2) \ | |
718 | XO1(i,0) \ | |
719 | XO1(i+1,1) \ | |
720 | XO1(i+2,2) \ | |
721 | XO1(i+3,3) \ | |
722 | PF3(i) \ | |
723 | PF3(i+2) \ | |
724 | PF0(i+4) \ | |
725 | PF0(i+6) \ | |
726 | XO2(i,0) \ | |
727 | XO2(i+1,1) \ | |
728 | XO2(i+2,2) \ | |
729 | XO2(i+3,3) \ | |
730 | XO3(i,0) \ | |
731 | XO3(i+1,1) \ | |
732 | XO3(i+2,2) \ | |
733 | XO3(i+3,3) \ | |
734 | ST(i,0) \ | |
735 | ST(i+1,1) \ | |
736 | ST(i+2,2) \ | |
737 | ST(i+3,3) \ | |
738 | ||
739 | ||
740 | PF0(0) | |
741 | PF0(2) | |
742 | ||
743 | " .align 32 ;\n" | |
744 | " 1: ;\n" | |
745 | ||
746 | BLOCK(0) | |
747 | BLOCK(4) | |
748 | BLOCK(8) | |
749 | BLOCK(12) | |
750 | ||
751 | " addl $256, %1 ;\n" | |
752 | " addl $256, %2 ;\n" | |
753 | " addl $256, %3 ;\n" | |
754 | " addl $256, %4 ;\n" | |
755 | " decl %0 ;\n" | |
756 | " jnz 1b ;\n" | |
757 | : "+r" (lines), | |
758 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | |
759 | : | |
760 | : "memory" ); | |
761 | ||
762 | XMMS_RESTORE; | |
763 | } | |
764 | ||
765 | static void | |
766 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
767 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | |
768 | { | |
769 | unsigned long lines = bytes >> 8; | |
770 | char xmm_save[16*4] ALIGN16; | |
771 | int cr0; | |
772 | ||
773 | XMMS_SAVE; | |
774 | ||
775 | /* Make sure GCC forgets anything it knows about p4 or p5, | |
776 | such that it won't pass to the asm volatile below a | |
777 | register that is shared with any other variable. That's | |
778 | because we modify p4 and p5 there, but we can't mark them | |
779 | as read/write, otherwise we'd overflow the 10-asm-operands | |
780 | limit of GCC < 3.1. */ | |
781 | __asm__ ("" : "+r" (p4), "+r" (p5)); | |
782 | ||
783 | __asm__ __volatile__ ( | |
784 | #undef BLOCK | |
785 | #define BLOCK(i) \ | |
786 | PF1(i) \ | |
787 | PF1(i+2) \ | |
788 | LD(i,0) \ | |
789 | LD(i+1,1) \ | |
790 | LD(i+2,2) \ | |
791 | LD(i+3,3) \ | |
792 | PF2(i) \ | |
793 | PF2(i+2) \ | |
794 | XO1(i,0) \ | |
795 | XO1(i+1,1) \ | |
796 | XO1(i+2,2) \ | |
797 | XO1(i+3,3) \ | |
798 | PF3(i) \ | |
799 | PF3(i+2) \ | |
800 | XO2(i,0) \ | |
801 | XO2(i+1,1) \ | |
802 | XO2(i+2,2) \ | |
803 | XO2(i+3,3) \ | |
804 | PF4(i) \ | |
805 | PF4(i+2) \ | |
806 | PF0(i+4) \ | |
807 | PF0(i+6) \ | |
808 | XO3(i,0) \ | |
809 | XO3(i+1,1) \ | |
810 | XO3(i+2,2) \ | |
811 | XO3(i+3,3) \ | |
812 | XO4(i,0) \ | |
813 | XO4(i+1,1) \ | |
814 | XO4(i+2,2) \ | |
815 | XO4(i+3,3) \ | |
816 | ST(i,0) \ | |
817 | ST(i+1,1) \ | |
818 | ST(i+2,2) \ | |
819 | ST(i+3,3) \ | |
820 | ||
821 | ||
822 | PF0(0) | |
823 | PF0(2) | |
824 | ||
825 | " .align 32 ;\n" | |
826 | " 1: ;\n" | |
827 | ||
828 | BLOCK(0) | |
829 | BLOCK(4) | |
830 | BLOCK(8) | |
831 | BLOCK(12) | |
832 | ||
833 | " addl $256, %1 ;\n" | |
834 | " addl $256, %2 ;\n" | |
835 | " addl $256, %3 ;\n" | |
836 | " addl $256, %4 ;\n" | |
837 | " addl $256, %5 ;\n" | |
838 | " decl %0 ;\n" | |
839 | " jnz 1b ;\n" | |
840 | : "+r" (lines), | |
841 | "+r" (p1), "+r" (p2), "+r" (p3) | |
842 | : "r" (p4), "r" (p5) | |
843 | : "memory"); | |
844 | ||
845 | /* p4 and p5 were modified, and now the variables are dead. | |
846 | Clobber them just to be sure nobody does something stupid | |
847 | like assuming they have some legal value. */ | |
848 | __asm__ ("" : "=r" (p4), "=r" (p5)); | |
849 | ||
850 | XMMS_RESTORE; | |
851 | } | |
852 | ||
853 | static struct xor_block_template xor_block_pIII_sse = { | |
854 | .name = "pIII_sse", | |
855 | .do_2 = xor_sse_2, | |
856 | .do_3 = xor_sse_3, | |
857 | .do_4 = xor_sse_4, | |
858 | .do_5 = xor_sse_5, | |
859 | }; | |
860 | ||
/* Also try the generic routines.  */
#include <asm-generic/xor.h>

/*
 * Candidates for the boot-time xor benchmark: always the four generic
 * integer-register variants, plus the SSE and MMX templates when the
 * CPU advertises the corresponding feature bits.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
	do { \
		xor_speed(&xor_block_8regs); \
		xor_speed(&xor_block_8regs_p); \
		xor_speed(&xor_block_32regs); \
		xor_speed(&xor_block_32regs_p); \
		if (cpu_has_xmm) \
			xor_speed(&xor_block_pIII_sse); \
		if (cpu_has_mmx) { \
			xor_speed(&xor_block_pII_mmx); \
			xor_speed(&xor_block_p5_mmx); \
		} \
	} while (0)
878 | ||
/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.
   Note this overrides the benchmark winner (FASTEST) whenever SSE is
   available; FASTEST is only used on non-SSE CPUs.  */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)