/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

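/*
 * COPY_16_BYTES moves 16 bytes with four word loads and four word stores.
 * r4 and r6 point 4 bytes before the current source/destination; the final
 * lwzu/stwu advance them by 16 for the next iteration.
 */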
#define COPY_16_BYTES \
        lwz r7,4(r4); \
        lwz r8,8(r4); \
        lwz r9,12(r4); \
        lwzu r10,16(r4); \
        stw r7,4(r6); \
        stw r8,8(r6); \
        stw r9,12(r6); \
        stwu r10,16(r6)

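/*
 * COPY_16_BYTES_WITHEX(n) is the same copy sequence, but each load and
 * store carries a numeric label (8n0-8n7) so that a fault taken on any
 * of them can be resolved through the exception table built below.
 */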
#define COPY_16_BYTES_WITHEX(n) \
8 ## n ## 0: \
        lwz r7,4(r4); \
8 ## n ## 1: \
        lwz r8,8(r4); \
8 ## n ## 2: \
        lwz r9,12(r4); \
8 ## n ## 3: \
        lwzu r10,16(r4); \
8 ## n ## 4: \
        stw r7,4(r6); \
8 ## n ## 5: \
        stw r8,8(r6); \
8 ## n ## 6: \
        stw r9,12(r6); \
8 ## n ## 7: \
        stwu r10,16(r6)

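/*
 * COPY_16_BYTES_EXCODE(n) provides the fixup for COPY_16_BYTES_WITHEX(n):
 * r5 is reduced by the 16*n bytes of the faulting cache line that had
 * already been copied, then control branches to the read-fault (104f) or
 * write-fault (105f) handler.  The .long pairs register each potentially
 * faulting instruction and its fixup in the __ex_table section.
 */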
#define COPY_16_BYTES_EXCODE(n) \
9 ## n ## 0: \
        addi r5,r5,-(16 * n); \
        b 104f; \
9 ## n ## 1: \
        addi r5,r5,-(16 * n); \
        b 105f; \
        .section __ex_table,"a"; \
        .align 2; \
        .long 8 ## n ## 0b,9 ## n ## 0b; \
        .long 8 ## n ## 1b,9 ## n ## 0b; \
        .long 8 ## n ## 2b,9 ## n ## 0b; \
        .long 8 ## n ## 3b,9 ## n ## 0b; \
        .long 8 ## n ## 4b,9 ## n ## 1b; \
        .long 8 ## n ## 5b,9 ## n ## 1b; \
        .long 8 ## n ## 6b,9 ## n ## 1b; \
        .long 8 ## n ## 7b,9 ## n ## 1b; \
        .text

        .text
        .stabs "arch/powerpc/lib/",N_SO,0,0,0f
        .stabs "copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero. This requires that the destination
 * area is cacheable. -- paulus
 */
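/*
 * cacheable_memzero: on entry r3 = destination, r4 = byte count.
 * The count is moved to r5 and r4 is reused as the zero fill word.
 */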
_GLOBAL(cacheable_memzero)
        mr r5,r4
        li r4,0
        addi r6,r3,-4
        cmplwi 0,r5,4
        blt 7f
        stwu r4,4(r6)
        beqlr
        andi. r0,r6,3
        add r5,r0,r5
        subf r6,r0,r6
        clrlwi r7,r6,32-LG_CACHELINE_BYTES
        add r8,r7,r5
        srwi r9,r8,LG_CACHELINE_BYTES
        addic. r9,r9,-1  /* total number of complete cachelines */
        ble 2f
        xori r0,r7,CACHELINE_MASK & ~3
        srwi. r0,r0,2
        beq 3f
        mtctr r0
4:      stwu r4,4(r6)
        bdnz 4b
3:      mtctr r9
        li r7,4
10:     dcbz r7,r6
        addi r6,r6,CACHELINE_BYTES
        bdnz 10b
        clrlwi r5,r8,32-LG_CACHELINE_BYTES
        addi r5,r5,4
2:      srwi r0,r5,2
        mtctr r0
        bdz 6f
1:      stwu r4,4(r6)
        bdnz 1b
6:      andi. r5,r5,3
7:      cmpwi 0,r5,0
        beqlr
        mtctr r5
        addi r6,r6,3
8:      stbu r4,1(r6)
        bdnz 8b
        blr

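/*
 * memset: r3 = destination, r4 = fill byte, r5 = byte count.
 * The two rlwimi instructions replicate the low byte of r4 into all
 * four bytes of the word before the word-at-a-time stores.
 */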
_GLOBAL(memset)
        rlwimi r4,r4,8,16,23
        rlwimi r4,r4,16,0,15
        addi r6,r3,-4
        cmplwi 0,r5,4
        blt 7f
        stwu r4,4(r6)
        beqlr
        andi. r0,r6,3
        add r5,r0,r5
        subf r6,r0,r6
        srwi r0,r5,2
        mtctr r0
        bdz 6f
1:      stwu r4,4(r6)
        bdnz 1b
6:      andi. r5,r5,3
7:      cmpwi 0,r5,0
        beqlr
        mtctr r5
        addi r6,r6,3
8:      stbu r4,1(r6)
        bdnz 8b
        blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic. This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
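/*
 * cacheable_memcpy: r3 = destination, r4 = source, r5 = byte count.
 * Falls back to plain memcpy when the two regions overlap.
 */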
_GLOBAL(cacheable_memcpy)
        add r7,r3,r5  /* test if the src & dst overlap */
        add r8,r4,r5
        cmplw 0,r4,r7
        cmplw 1,r3,r8
        crand 0,0,4  /* cr0.lt &= cr1.lt */
        blt memcpy  /* if regions overlap */

        addi r4,r4,-4
        addi r6,r3,-4
        neg r0,r3
        andi. r0,r0,CACHELINE_MASK  /* # bytes to start of cache line */
        beq 58f

        cmplw 0,r5,r0  /* is this more than total to do? */
        blt 63f  /* if not much to do */
        andi. r8,r0,3  /* get it word-aligned first */
        subf r5,r0,r5
        mtctr r8
        beq+ 61f
70:     lbz r9,4(r4)  /* do some bytes */
        stb r9,4(r6)
        addi r4,r4,1
        addi r6,r6,1
        bdnz 70b
61:     srwi. r0,r0,2
        mtctr r0
        beq 58f
72:     lwzu r9,4(r4)  /* do some words */
        stwu r9,4(r6)
        bdnz 72b

58:     srwi. r0,r5,LG_CACHELINE_BYTES  /* # complete cachelines */
        clrlwi r5,r5,32-LG_CACHELINE_BYTES
        li r11,4
        mtctr r0
        beq 63f
53:
        dcbz r11,r6
        COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
        COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
        COPY_16_BYTES
        COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
        COPY_16_BYTES
        COPY_16_BYTES
        COPY_16_BYTES
        COPY_16_BYTES
#endif
#endif
#endif
        bdnz 53b

63:     srwi. r0,r5,2
        mtctr r0
        beq 64f
30:     lwzu r0,4(r4)
        stwu r0,4(r6)
        bdnz 30b

64:     andi. r0,r5,3
        mtctr r0
        beq+ 65f
40:     lbz r0,4(r4)
        stb r0,4(r6)
        addi r4,r4,1
        addi r6,r6,1
        bdnz 40b
65:     blr

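/*
 * memmove: r3 = destination, r4 = source, r5 = byte count.
 * Copies forwards via memcpy when the destination is below the source,
 * otherwise copies backwards so overlapping regions are handled correctly.
 */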
_GLOBAL(memmove)
        cmplw 0,r3,r4
        bgt backwards_memcpy
        /* fall through */

_GLOBAL(memcpy)
        srwi. r7,r5,3
        addi r6,r3,-4
        addi r4,r4,-4
        beq 2f  /* if less than 8 bytes to do */
        andi. r0,r6,3  /* get dest word aligned */
        mtctr r7
        bne 5f
1:      lwz r7,4(r4)
        lwzu r8,8(r4)
        stw r7,4(r6)
        stwu r8,8(r6)
        bdnz 1b
        andi. r5,r5,7
2:      cmplwi 0,r5,4
        blt 3f
        lwzu r0,4(r4)
        addi r5,r5,-4
        stwu r0,4(r6)
3:      cmpwi 0,r5,0
        beqlr
        mtctr r5
        addi r4,r4,3
        addi r6,r6,3
4:      lbzu r0,1(r4)
        stbu r0,1(r6)
        bdnz 4b
        blr
5:      subfic r0,r0,4
        mtctr r0
6:      lbz r7,4(r4)
        addi r4,r4,1
        stb r7,4(r6)
        addi r6,r6,1
        bdnz 6b
        subf r5,r0,r5
        rlwinm. r7,r5,32-3,3,31
        beq 2b
        mtctr r7
        b 1b

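/*
 * backwards_memcpy: same register convention as memcpy, but the copy
 * runs from the end of the buffers toward the start, which is what
 * memmove needs when the destination lies above the source.
 */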
_GLOBAL(backwards_memcpy)
        rlwinm. r7,r5,32-3,3,31  /* r7 = r5 >> 3 */
        add r6,r3,r5
        add r4,r4,r5
        beq 2f
        andi. r0,r6,3
        mtctr r7
        bne 5f
1:      lwz r7,-4(r4)
        lwzu r8,-8(r4)
        stw r7,-4(r6)
        stwu r8,-8(r6)
        bdnz 1b
        andi. r5,r5,7
2:      cmplwi 0,r5,4
        blt 3f
        lwzu r0,-4(r4)
        subi r5,r5,4
        stwu r0,-4(r6)
3:      cmpwi 0,r5,0
        beqlr
        mtctr r5
4:      lbzu r0,-1(r4)
        stbu r0,-1(r6)
        bdnz 4b
        blr
5:      mtctr r0
6:      lbzu r7,-1(r4)
        stbu r7,-1(r6)
        bdnz 6b
        subf r5,r0,r5
        rlwinm. r7,r5,32-3,3,31
        beq 2b
        mtctr r7
        b 1b

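/*
 * __copy_tofrom_user: r3 = destination, r4 = source, r5 = byte count.
 * Either pointer may be a user address; every load and store that can
 * fault is listed in __ex_table, and the fixup code below computes the
 * number of bytes not copied, which is returned in r3 (0 on success).
 */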
_GLOBAL(__copy_tofrom_user)
        addi r4,r4,-4
        addi r6,r3,-4
        neg r0,r3
        andi. r0,r0,CACHELINE_MASK  /* # bytes to start of cache line */
        beq 58f

        cmplw 0,r5,r0  /* is this more than total to do? */
        blt 63f  /* if not much to do */
        andi. r8,r0,3  /* get it word-aligned first */
        mtctr r8
        beq+ 61f
70:     lbz r9,4(r4)  /* do some bytes */
71:     stb r9,4(r6)
        addi r4,r4,1
        addi r6,r6,1
        bdnz 70b
61:     subf r5,r0,r5
        srwi. r0,r0,2
        mtctr r0
        beq 58f
72:     lwzu r9,4(r4)  /* do some words */
73:     stwu r9,4(r6)
        bdnz 72b

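/* Exception table entries pair a faulting instruction with its fixup. */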
        .section __ex_table,"a"
        .align 2
        .long 70b,100f
        .long 71b,101f
        .long 72b,102f
        .long 73b,103f
        .text

58:     srwi. r0,r5,LG_CACHELINE_BYTES  /* # complete cachelines */
        clrlwi r5,r5,32-LG_CACHELINE_BYTES
        li r11,4
        beq 63f

        /* Here we decide how far ahead to prefetch the source */
        li r3,4
        cmpwi r0,1
        li r7,0
        ble 114f
        li r7,1
#if MAX_COPY_PREFETCH > 1
        /* Heuristically, for large transfers we prefetch
           MAX_COPY_PREFETCH cachelines ahead.  For small transfers
           we prefetch 1 cacheline ahead. */
        cmpwi r0,MAX_COPY_PREFETCH
        ble 112f
        li r7,MAX_COPY_PREFETCH
112:    mtctr r7
111:    dcbt r3,r4
        addi r3,r3,CACHELINE_BYTES
        bdnz 111b
#else
        dcbt r3,r4
        addi r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:    subf r8,r7,r0
        mr r0,r7
        mtctr r8

53:     dcbt r3,r4
54:     dcbz r11,r6
        .section __ex_table,"a"
        .align 2
        .long 54b,105f
        .text
/* the main body of the cacheline loop */
        COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
        COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
        COPY_16_BYTES_WITHEX(2)
        COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
        COPY_16_BYTES_WITHEX(4)
        COPY_16_BYTES_WITHEX(5)
        COPY_16_BYTES_WITHEX(6)
        COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
        bdnz 53b
        cmpwi r0,0
        li r3,4
        li r7,0
        bne 114b

63:     srwi. r0,r5,2
        mtctr r0
        beq 64f
30:     lwzu r0,4(r4)
31:     stwu r0,4(r6)
        bdnz 30b

64:     andi. r0,r5,3
        mtctr r0
        beq+ 65f
40:     lbz r0,4(r4)
41:     stb r0,4(r6)
        addi r4,r4,1
        addi r6,r6,1
        bdnz 40b
65:     li r3,0
        blr

/* read fault, initial single-byte copy */
100:    li r9,0
        b 90f
/* write fault, initial single-byte copy */
101:    li r9,1
90:     subf r5,r8,r5
        li r3,0
        b 99f
/* read fault, initial word copy */
102:    li r9,0
        b 91f
/* write fault, initial word copy */
103:    li r9,1
91:     li r3,2
        b 99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
        COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
        COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
        COPY_16_BYTES_EXCODE(2)
        COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
        COPY_16_BYTES_EXCODE(4)
        COPY_16_BYTES_EXCODE(5)
        COPY_16_BYTES_EXCODE(6)
        COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:    li r9,0
        b 92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:    li r9,1
92:     li r3,LG_CACHELINE_BYTES
        mfctr r8
        add r0,r0,r8
        b 106f
/* read fault in final word loop */
108:    li r9,0
        b 93f
/* write fault in final word loop */
109:    li r9,1
93:     andi. r5,r5,3
        li r3,2
        b 99f
/* read fault in final byte loop */
110:    li r9,0
        b 94f
/* write fault in final byte loop */
111:    li r9,1
94:     li r5,0
        li r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:     mfctr r0
106:    slw r3,r0,r3
        add. r3,r3,r5
        beq 120f  /* shouldn't happen */
        cmpwi 0,r9,0
        bne 120f
/* for a read fault, first try to continue the copy one byte at a time */
        mtctr r3
130:    lbz r0,4(r4)
131:    stb r0,4(r6)
        addi r4,r4,1
        addi r6,r6,1
        bdnz 130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:    mfctr r3
        srwi. r0,r3,2
        li r9,0
        mtctr r0
        beq 113f
112:    stwu r9,4(r6)
        bdnz 112b
113:    andi. r0,r3,3
        mtctr r0
        beq 120f
114:    stb r9,4(r6)
        addi r6,r6,1
        bdnz 114b
120:    blr

        .section __ex_table,"a"
        .align 2
        .long 30b,108b
        .long 31b,109b
        .long 40b,110b
        .long 41b,111b
        .long 130b,132b
        .long 131b,120b
        .long 112b,120b
        .long 114b,120b
        .text