[PARISC] Use F_EXTEND() for COMMAND_GLOBAL
[deliverable/linux.git] / arch / i386 / lib / mmx.c
CommitLineData
1da177e4
LT
1#include <linux/config.h>
2#include <linux/types.h>
3#include <linux/string.h>
4#include <linux/sched.h>
5#include <linux/hardirq.h>
129f6946 6#include <linux/module.h>
1da177e4
LT
7
8#include <asm/i387.h>
9
10
11/*
12 * MMX 3DNow! library helper functions
13 *
14 * To do:
15 * We can use MMX just for prefetch in IRQ's. This may be a win.
16 * (reported so on K6-III)
17 * We should use a better code neutral filler for the short jump
18 * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
19 * We also want to clobber the filler register so we don't get any
20 * register forwarding stalls on the filler.
21 *
22 * Add *user handling. Checksums are not a win with MMX on any CPU
23 * tested so far for any MMX solution figured.
24 *
25 * 22/09/2000 - Arjan van de Ven
26 * Improved for non-egineering-sample Athlons
27 *
28 */
29
/*
 * MMX-optimised memcpy().  Copies in 64-byte blocks using the MMX
 * registers, prefetching source cache lines ahead of the copy, then
 * hands the remaining (len % 64) bytes to __memcpy().  Returns the
 * destination pointer, like memcpy().
 *
 * The MMX state cannot be used from interrupt context (see the
 * in_interrupt() check), so it falls straight back to __memcpy() there.
 */
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;	/* saved return value (original destination) */
	int i;		/* number of whole 64-byte blocks remaining */

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

	/*
	 * Prime the cache with the first five source lines.  prefetch may
	 * fault on some CPUs, so the __ex_table entry routes a fault at 1b
	 * to the fixup at 3b, which self-modifies the code: it overwrites
	 * the first prefetch with the bytes 0xEB 0x1A ("jmp +26"), jumping
	 * over the remaining prefetch instructions from then on.
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from) );


	/*
	 * Main loop: one 64-byte block per iteration while more than five
	 * blocks remain, so "prefetch 320(%0)" (five blocks ahead) stays
	 * within the source buffer.  A faulting prefetch is patched to
	 * "jmp +5" (0xEB 0x05) by the fixup, skipping it thereafter.
	 */
	for(; i>5; i--)
	{
		__asm__ __volatile__ (
		"1:  prefetch 320(%0)\n"
		"2:  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}

	/*
	 * Remaining full blocks (at most five), close enough to the end
	 * of the buffer that no prefetch-ahead is issued.
	 */
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		"  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/*
	 * Now do the tail of the block
	 */
	__memcpy(to, from, len&63);
	kernel_fpu_end();
	return p;
}
124
125#ifdef CONFIG_MK7
126
127/*
128 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
129 * other MMX using processors do not.
130 */
131
/*
 * Clear one 4096-byte page using non-temporal (cache-bypassing)
 * movntq stores, 64 bytes per pass.  K7-only: other MMX CPUs lack
 * the streaming stores (see comment above this #ifdef branch).
 */
static void fast_clear_page(void *page)
{
	void *end = page + 4096;

	kernel_fpu_begin();

	/* %mm0 := 0 — the pattern every store below writes out. */
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (; page < end; page += 64) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
	}

	/* since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again.
	 */
	__asm__ __volatile__ (
		"  sfence \n" : :
	);
	kernel_fpu_end();
}
164
/*
 * Copy one 4096-byte page.  The source is read with MMX loads (with
 * prefetch-ahead) and the destination is written with non-temporal
 * movntq stores, which bypass the cache.  K7-only variant.
 */
static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/* maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	/*
	 * Prefetch the first five source lines.  A faulting prefetch is
	 * handled via the __ex_table entry (1b -> 3b): the fixup
	 * self-modifies the code, overwriting the first prefetch with
	 * 0xEB 0x1A ("jmp +26") so the whole prefetch run is skipped
	 * from then on.
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from) );

	/*
	 * All but the last five 64-byte blocks: prefetch 320 bytes
	 * (five blocks) ahead while copying.  A faulting prefetch gets
	 * patched to "jmp +5" (0xEB 0x05) by the fixup.
	 */
	for(i=0; i<(4096-320)/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/*
	 * Last five blocks: no prefetch, since 320 bytes ahead would
	 * reach past the end of the source page.
	 */
	for(i=(4096-320)/64; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/* since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again.
	 */
	__asm__ __volatile__ (
		"   sfence \n" : :
	);
	kernel_fpu_end();
}
254
255#else
256
257/*
258 * Generic MMX implementation without K7 specific streaming
259 */
260
/*
 * Clear one 4096-byte page using ordinary (cached) MMX stores,
 * 128 bytes per pass.  Generic variant for non-K7 MMX CPUs.
 */
static void fast_clear_page(void *page)
{
	void *end = page + 4096;

	kernel_fpu_begin();

	/* %mm0 := 0 — the pattern every store below writes out. */
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (; page < end; page += 128) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
	}

	kernel_fpu_end();
}
296
/*
 * Copy one 4096-byte page with MMX loads/stores (64 bytes per pass),
 * prefetching 320 bytes ahead of the copy.  Generic variant for
 * non-K7 MMX CPUs (no streaming stores).
 */
static void fast_copy_page(void *to, void *from)
{
	int i;


	kernel_fpu_begin();

	/*
	 * Prefetch the first five source lines.  A faulting prefetch is
	 * handled via the __ex_table entry (1b -> 3b): the fixup
	 * self-modifies the code, overwriting the first prefetch with
	 * 0xEB 0x1A ("jmp +26") so the prefetch run is skipped from
	 * then on.
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from) );

	/*
	 * Copy the whole page, 64 bytes per iteration, prefetching five
	 * blocks ahead.  Near the end of the page that prefetch reaches
	 * past the source; any resulting fault is caught and the prefetch
	 * is patched to "jmp +5" (0xEB 0x05) by the fixup.
	 */
	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	kernel_fpu_end();
}
355
356
357#endif
358
359/*
360 * Favour MMX for page clear and copy.
361 */
362
/*
 * Zero one 4096-byte page with a plain "rep ; stosl" (1024 dword
 * stores).  Used as the fallback when the MMX fast path is unavailable
 * (interrupt context, see mmx_clear_page()); touches no FPU/MMX state.
 *
 * Fix: dropped the stray line-continuation backslashes — they were
 * leftovers from a macro body and are meaningless in plain code.
 */
static void slow_zero_page(void * page)
{
	int d0, d1;

	/* d0/d1 exist only to tell GCC that ecx/edi are clobbered. */
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (d0), "=&D" (d1)
		:"a" (0),"1" (page),"0" (1024)
		:"memory");
}
373
/*
 * Zero a page, preferring the MMX fast path.  From interrupt context
 * the MMX registers are off limits, so use the rep-stosl fallback.
 */
void mmx_clear_page(void * page)
{
	if (unlikely(in_interrupt())) {
		slow_zero_page(page);
		return;
	}
	fast_clear_page(page);
}
381
/*
 * Copy one 4096-byte page with a plain "rep ; movsl" (1024 dword
 * moves).  Used as the fallback when the MMX fast path is unavailable
 * (interrupt context, see mmx_copy_page()); touches no FPU/MMX state.
 *
 * Fix: dropped the stray line-continuation backslashes — they were
 * leftovers from a macro body and are meaningless in plain code.
 */
static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	/* d0..d2 exist only to tell GCC that ecx/edi/esi are clobbered. */
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024),"1" ((long) to),"2" ((long) from)
		: "memory");
}
392
393
/*
 * Copy a page, preferring the MMX fast path.  From interrupt context
 * the MMX registers are off limits, so use the rep-movsl fallback.
 */
void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt())) {
		slow_copy_page(to, from);
		return;
	}
	fast_copy_page(to, from);
}
129f6946
AD
401
/* Export the MMX helpers so loadable modules can use them. */
EXPORT_SYMBOL(_mmx_memcpy);
EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);
This page took 0.109262 seconds and 5 git commands to generate.