/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump:
 *		leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register-forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/i387.h>
#include <asm/asm.h>

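/*
 * MMX-assisted memcpy for large copies. The MMX registers alias the x87
 * FPU state, which we cannot safely save and restore from interrupt
 * context here, hence the plain __memcpy() fallback below.
 */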
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6;	/* len/64 */

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));
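	/*
	 * On the .fixup sequences in this file: prefetch is a 3DNow!
	 * extension and can fault on CPUs that lack it. The fixup
	 * handler then patches the prefetch block into a short jump:
	 * 0x1AEB is "EB 1A" stored little-endian, a 2-byte "jmp +26"
	 * that skips the remaining 26 of the 28 bytes of prefetches
	 * above, and 0x05EB ("jmp +5") likewise skips a single 7-byte
	 * prefetch. Later calls then run without prefetch at all.
	 */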
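	/*
	 * Copy in 64-byte chunks, prefetching 320 bytes (five chunks)
	 * ahead. The main loop stops with five chunks still to go; the
	 * tail loop below copies those without prefetch, so nothing is
	 * prefetched past the end of the source buffer.
	 */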
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);
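/*
 * With CONFIG_X86_USE_3DNOW, <asm/string_32.h> routes memcpy() calls of
 * 512 bytes and more to _mmx_memcpy(); smaller copies stay with the
 * plain __memcpy() variants (see that header for the exact dispatch).
 */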

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache-bypassing stores (movntq). The Cyrix III,
 * K6 and other MMX-using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"   movntq %%mm0, (%0)\n"
		"   movntq %%mm0, 8(%0)\n"
		"   movntq %%mm0, 16(%0)\n"
		"   movntq %%mm0, 24(%0)\n"
		"   movntq %%mm0, 32(%0)\n"
		"   movntq %%mm0, 40(%0)\n"
		"   movntq %%mm0, 48(%0)\n"
		"   movntq %%mm0, 56(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to order
	 * the non-temporal stores with respect to later accesses:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * Maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

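	/*
	 * Copy 64 bytes per iteration, prefetching 320 bytes ahead. The
	 * first loop stops 320 bytes (five iterations) short of the end
	 * of the page, and the second loop finishes without prefetch,
	 * so nothing is prefetched past the end of the source page.
	 */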
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to order
	 * the non-temporal stores with respect to later accesses:
	 */
	__asm__ __volatile__("sfence\n"::);
	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 * Generic MMX implementation without K7-specific streaming stores:
 */
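/*
 * These variants use plain movq stores, which go through the cache
 * hierarchy like ordinary stores, so no trailing sfence is needed here.
 */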

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"   movq %%mm0, (%0)\n"
		"   movq %%mm0, 8(%0)\n"
		"   movq %%mm0, 16(%0)\n"
		"   movq %%mm0, 24(%0)\n"
		"   movq %%mm0, 32(%0)\n"
		"   movq %%mm0, 40(%0)\n"
		"   movq %%mm0, 48(%0)\n"
		"   movq %%mm0, 56(%0)\n"
		"   movq %%mm0, 64(%0)\n"
		"   movq %%mm0, 72(%0)\n"
		"   movq %%mm0, 80(%0)\n"
		"   movq %%mm0, 88(%0)\n"
		"   movq %%mm0, 96(%0)\n"
		"   movq %%mm0, 104(%0)\n"
		"   movq %%mm0, 112(%0)\n"
		"   movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy, with rep stosl/movsl fallbacks
 * for interrupt context:
 */
static void slow_zero_page(void *page)
{
	int d0, d1;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

			: "=&c" (d0), "=&D" (d1)
			: "a" (0), "1" (page), "0" (1024)	/* 1024 dwords == 4096 bytes */
			: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
			: "=&c" (d0), "=&D" (d1), "=&S" (d2)
			: "0" (1024), "1" ((long) to), "2" ((long) from)	/* 1024 dwords == 4096 bytes */
			: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
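/*
 * In kernels of this vintage, the x86_32 page.h header defines
 * clear_page()/copy_page() in terms of mmx_clear_page()/mmx_copy_page()
 * when CONFIG_X86_USE_3DNOW is enabled; the exact location of that
 * wiring varies by kernel version.
 */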