#include <linux/config.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/module.h>

#include <asm/i387.h>

/*
 * MMX 3DNow! library helper functions
 *
 * To do:
 *  We can use MMX just for prefetch in IRQ's. This may be a win.
 *   (reported so on K6-III)
 *  We should use a better code-neutral filler for the short jump
 *   leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *  We also want to clobber the filler register so we don't get any
 *   register forwarding stalls on the filler.
 *
 *  Add *user handling. Checksums are not a win with MMX on any CPU
 *  tested so far for any MMX solution figured.
 *
 *  22/09/2000 - Arjan van de Ven
 *   Improved for non-engineering-sample Athlons
 *
 */

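/*
 * MMX-accelerated memcpy: moves 64 bytes per iteration through the
 * MMX registers. In interrupt context the MMX/FPU registers must not
 * be touched, so we fall back to a plain __memcpy() there.
 */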
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
        void *p;
        int i;

        if (unlikely(in_interrupt()))
                return __memcpy(to, from, len);

        p = to;
        i = len >> 6;   /* len/64 */

        kernel_fpu_begin();

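        /*
         * Warm up the cache by prefetching the first five 64-byte
         * lines. Prefetch can fault on some CPUs (reportedly early
         * Athlon steppings), so the .fixup entry below patches the
         * instruction at label 1 with 0x1AEB (bytes EB 1A, a short
         * "jmp +26") so that the rest of the 28-byte prefetch set is
         * skipped and execution resumes at label 2.
         */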
        __asm__ __volatile__ (
                "1: prefetch (%0)\n"            /* This set is 28 bytes */
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2: \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n"         /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                ".section __ex_table,\"a\"\n"
                "       .align 4\n"
                "       .long 1b, 3b\n"
                ".previous"
                : : "r" (from));

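        /*
         * Main loop: copy 64 bytes per iteration through four MMX
         * registers, prefetching 320 bytes (five lines) ahead. Stop
         * five iterations early so the prefetch never runs past the
         * end of the source; the prefetch-free loop below finishes
         * the remaining chunks.
         */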
        for (; i > 5; i--) {
                __asm__ __volatile__ (
                        "1: prefetch 320(%0)\n"
                        "2: movq (%0), %%mm0\n"
                        "   movq 8(%0), %%mm1\n"
                        "   movq 16(%0), %%mm2\n"
                        "   movq 24(%0), %%mm3\n"
                        "   movq %%mm0, (%1)\n"
                        "   movq %%mm1, 8(%1)\n"
                        "   movq %%mm2, 16(%1)\n"
                        "   movq %%mm3, 24(%1)\n"
                        "   movq 32(%0), %%mm0\n"
                        "   movq 40(%0), %%mm1\n"
                        "   movq 48(%0), %%mm2\n"
                        "   movq 56(%0), %%mm3\n"
                        "   movq %%mm0, 32(%1)\n"
                        "   movq %%mm1, 40(%1)\n"
                        "   movq %%mm2, 48(%1)\n"
                        "   movq %%mm3, 56(%1)\n"
                        ".section .fixup, \"ax\"\n"
                        "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                        "   jmp 2b\n"
                        ".previous\n"
                        ".section __ex_table,\"a\"\n"
                        "       .align 4\n"
                        "       .long 1b, 3b\n"
                        ".previous"
                        : : "r" (from), "r" (to) : "memory");
                from += 64;
                to += 64;
        }

        for (; i > 0; i--) {
                __asm__ __volatile__ (
                        "   movq (%0), %%mm0\n"
                        "   movq 8(%0), %%mm1\n"
                        "   movq 16(%0), %%mm2\n"
                        "   movq 24(%0), %%mm3\n"
                        "   movq %%mm0, (%1)\n"
                        "   movq %%mm1, 8(%1)\n"
                        "   movq %%mm2, 16(%1)\n"
                        "   movq %%mm3, 24(%1)\n"
                        "   movq 32(%0), %%mm0\n"
                        "   movq 40(%0), %%mm1\n"
                        "   movq 48(%0), %%mm2\n"
                        "   movq 56(%0), %%mm3\n"
                        "   movq %%mm0, 32(%1)\n"
                        "   movq %%mm1, 40(%1)\n"
                        "   movq %%mm2, 48(%1)\n"
                        "   movq %%mm3, 56(%1)\n"
                        : : "r" (from), "r" (to) : "memory");
                from += 64;
                to += 64;
        }
        /*
         * Now do the tail (len % 64 bytes) of the block.
         */
        __memcpy(to, from, len & 63);
        kernel_fpu_end();
        return p;
}

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache-bypass load/store. The Cyrix III, K6 and
 * other MMX-using processors do not.
 */

static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

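        /*
         * movntq is a non-temporal ("streaming") store: it writes
         * straight to memory without allocating cache lines, so
         * clearing a whole page does not evict useful data from the
         * cache.
         */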
        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                        "  movntq %%mm0, (%0)\n"
                        "  movntq %%mm0, 8(%0)\n"
                        "  movntq %%mm0, 16(%0)\n"
                        "  movntq %%mm0, 24(%0)\n"
                        "  movntq %%mm0, 32(%0)\n"
                        "  movntq %%mm0, 40(%0)\n"
                        "  movntq %%mm0, 48(%0)\n"
                        "  movntq %%mm0, 56(%0)\n"
                        : : "r" (page) : "memory");
                page += 64;
        }
        /*
         * Since movntq is weakly-ordered, an sfence is needed to make
         * the stores ordered again.
         */
        __asm__ __volatile__ (
                "  sfence\n" : :
        );
        kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        /* maybe the prefetch stuff can go before the expensive fnsave...
         * but that is for later. -AV
         */
        __asm__ __volatile__ (
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2: \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n"         /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                ".section __ex_table,\"a\"\n"
                "       .align 4\n"
                "       .long 1b, 3b\n"
                ".previous"
                : : "r" (from));

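        /*
         * Copy all but the last 320 bytes of the page with the
         * prefetch running 320 bytes ahead; the final five lines are
         * copied by the loop below without prefetch, so we never
         * prefetch past the end of the source page.
         */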
        for (i = 0; i < (4096-320)/64; i++) {
                __asm__ __volatile__ (
                        "1: prefetch 320(%0)\n"
                        "2: movq (%0), %%mm0\n"
                        "   movntq %%mm0, (%1)\n"
                        "   movq 8(%0), %%mm1\n"
                        "   movntq %%mm1, 8(%1)\n"
                        "   movq 16(%0), %%mm2\n"
                        "   movntq %%mm2, 16(%1)\n"
                        "   movq 24(%0), %%mm3\n"
                        "   movntq %%mm3, 24(%1)\n"
                        "   movq 32(%0), %%mm4\n"
                        "   movntq %%mm4, 32(%1)\n"
                        "   movq 40(%0), %%mm5\n"
                        "   movntq %%mm5, 40(%1)\n"
                        "   movq 48(%0), %%mm6\n"
                        "   movntq %%mm6, 48(%1)\n"
                        "   movq 56(%0), %%mm7\n"
                        "   movntq %%mm7, 56(%1)\n"
                        ".section .fixup, \"ax\"\n"
                        "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                        "   jmp 2b\n"
                        ".previous\n"
                        ".section __ex_table,\"a\"\n"
                        "       .align 4\n"
                        "       .long 1b, 3b\n"
                        ".previous"
                        : : "r" (from), "r" (to) : "memory");
                from += 64;
                to += 64;
        }
        for (i = (4096-320)/64; i < 4096/64; i++) {
                __asm__ __volatile__ (
                        "2: movq (%0), %%mm0\n"
                        "   movntq %%mm0, (%1)\n"
                        "   movq 8(%0), %%mm1\n"
                        "   movntq %%mm1, 8(%1)\n"
                        "   movq 16(%0), %%mm2\n"
                        "   movntq %%mm2, 16(%1)\n"
                        "   movq 24(%0), %%mm3\n"
                        "   movntq %%mm3, 24(%1)\n"
                        "   movq 32(%0), %%mm4\n"
                        "   movntq %%mm4, 32(%1)\n"
                        "   movq 40(%0), %%mm5\n"
                        "   movntq %%mm5, 40(%1)\n"
                        "   movq 48(%0), %%mm6\n"
                        "   movntq %%mm6, 48(%1)\n"
                        "   movq 56(%0), %%mm7\n"
                        "   movntq %%mm7, 56(%1)\n"
                        : : "r" (from), "r" (to) : "memory");
                from += 64;
                to += 64;
        }
        /*
         * Since movntq is weakly-ordered, an sfence is needed to make
         * the stores ordered again.
         */
        __asm__ __volatile__ (
                "  sfence\n" : :
        );
        kernel_fpu_end();
}

#else

/*
 * Generic MMX implementation without the K7-specific streaming stores.
 */

static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

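        /*
         * Without movntq we use plain (cached) movq stores; write 128
         * bytes, i.e. sixteen quadwords, per iteration.
         */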
        for (i = 0; i < 4096/128; i++) {
                __asm__ __volatile__ (
                        "  movq %%mm0, (%0)\n"
                        "  movq %%mm0, 8(%0)\n"
                        "  movq %%mm0, 16(%0)\n"
                        "  movq %%mm0, 24(%0)\n"
                        "  movq %%mm0, 32(%0)\n"
                        "  movq %%mm0, 40(%0)\n"
                        "  movq %%mm0, 48(%0)\n"
                        "  movq %%mm0, 56(%0)\n"
                        "  movq %%mm0, 64(%0)\n"
                        "  movq %%mm0, 72(%0)\n"
                        "  movq %%mm0, 80(%0)\n"
                        "  movq %%mm0, 88(%0)\n"
                        "  movq %%mm0, 96(%0)\n"
                        "  movq %%mm0, 104(%0)\n"
                        "  movq %%mm0, 112(%0)\n"
                        "  movq %%mm0, 120(%0)\n"
                        : : "r" (page) : "memory");
                page += 128;
        }

        kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2: \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n"         /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                ".section __ex_table,\"a\"\n"
                "       .align 4\n"
                "       .long 1b, 3b\n"
                ".previous"
                : : "r" (from));

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                        "1: prefetch 320(%0)\n"
                        "2: movq (%0), %%mm0\n"
                        "   movq 8(%0), %%mm1\n"
                        "   movq 16(%0), %%mm2\n"
                        "   movq 24(%0), %%mm3\n"
                        "   movq %%mm0, (%1)\n"
                        "   movq %%mm1, 8(%1)\n"
                        "   movq %%mm2, 16(%1)\n"
                        "   movq %%mm3, 24(%1)\n"
                        "   movq 32(%0), %%mm0\n"
                        "   movq 40(%0), %%mm1\n"
                        "   movq 48(%0), %%mm2\n"
                        "   movq 56(%0), %%mm3\n"
                        "   movq %%mm0, 32(%1)\n"
                        "   movq %%mm1, 40(%1)\n"
                        "   movq %%mm2, 48(%1)\n"
                        "   movq %%mm3, 56(%1)\n"
                        ".section .fixup, \"ax\"\n"
                        "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                        "   jmp 2b\n"
                        ".previous\n"
                        ".section __ex_table,\"a\"\n"
                        "       .align 4\n"
                        "       .long 1b, 3b\n"
                        ".previous"
                        : : "r" (from), "r" (to) : "memory");
                from += 64;
                to += 64;
        }
        kernel_fpu_end();
}

#endif

/*
 * Favour MMX for page clear and copy.
 */

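/*
 * rep stosl/movsl fallbacks for interrupt context, where the MMX/FPU
 * registers must not be touched: 1024 longs = one 4096-byte page.
 */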
static void slow_zero_page(void *page)
{
        int d0, d1;
        __asm__ __volatile__(
                "cld\n\t"
                "rep ; stosl"
                : "=&c" (d0), "=&D" (d1)
                : "a" (0), "1" (page), "0" (1024)
                : "memory");
}

void mmx_clear_page(void *page)
{
        if (unlikely(in_interrupt()))
                slow_zero_page(page);
        else
                fast_clear_page(page);
}

static void slow_copy_page(void *to, void *from)
{
        int d0, d1, d2;
        __asm__ __volatile__(
                "cld\n\t"
                "rep ; movsl"
                : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                : "0" (1024), "1" ((long) to), "2" ((long) from)
                : "memory");
}

void mmx_copy_page(void *to, void *from)
{
        if (unlikely(in_interrupt()))
                slow_copy_page(to, from);
        else
                fast_copy_page(to, from);
}

EXPORT_SYMBOL(_mmx_memcpy);
EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);