Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * "memcpy" implementation of SuperH | |
3 | * | |
4 | * Copyright (C) 1999 Niibe Yutaka | |
5 | * Copyright (c) 2002 STMicroelectronics Ltd | |
6 | * Modified from memcpy.S and micro-optimised for SH4 | |
7 | * Stuart Menefy (stuart.menefy@st.com) | |
8 | * | |
9 | */ | |
10 | #include <linux/linkage.h> | |
1da177e4 LT |
11 | |
12 | /* | |
13 | * void *memcpy(void *dst, const void *src, size_t n); | |
14 | * | |
15 | * It is assumed that there is no overlap between src and dst. | |
16 | * If there is an overlap, then the results are undefined. | |
17 | */ | |
18 | ||
19 | ! | |
20 | ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. | |
21 | ! | |
22 | ||
23 | ! Size is 16 or greater, and may have trailing bytes | |
24 | ||
25 | .balign 32 | |
26 | .Lcase1: | |
27 | ! Read a long word and write a long word at once, working downwards from the end of both buffers | |
28 | ! At the start of each iteration, r7 contains last long load; each stored long is merged from r7 and the new load in r1 | |
29 | add #-1,r5 ! 79 EX: bias the source offset by -1 for the priming load below | |
30 | mov r4,r2 ! 5 MT (0 cycles latency): r2 = r4, turned into the loop bound below | |
31 | ||
32 | mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency): prime r7 with the first source long | |
33 | add #-4,r5 ! 50 EX: r5 now addresses the next lower source long | |
34 | ||
35 | add #7,r2 ! 79 EX: r2 = r4+7, bound for the cmp/hi in the loop | |
36 | ! The loop repeats while r0 > r4+7, tested before the delay-slot store moves r0 down by 4 | |
37 | #ifdef CONFIG_CPU_LITTLE_ENDIAN | |
38 | ! 6 cycles, 4 bytes per iteration | |
39 | 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK | |
40 | mov r7, r3 ! 5 MT (latency=0) ! RQPO | |
41 | ||
42 | cmp/hi r2,r0 ! 57 MT: T = (r0 > r2) unsigned, consumed by the bt/s below | |
43 | shll16 r3 ! 103 EX | |
44 | ||
45 | mov r1,r6 ! 5 MT (latency=0) | |
46 | shll8 r3 ! 102 EX ! Oxxx | |
47 | ||
48 | shlr8 r6 ! 106 EX ! xNML | |
49 | mov r1, r7 ! 5 MT (latency=0): keep this load for the next iteration | |
50 | ||
51 | or r6,r3 ! 82 EX ! ONML | |
52 | bt/s 3b ! 109 BR: delayed branch, the store below executes either way | |
53 | ||
54 | mov.l r3,@-r0 ! 30 LS: delay slot: pre-decrement r0 and store the merged long | |
55 | #else | |
56 | 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN | |
57 | mov r7,r3 ! 5 MT (latency=0) ! OPQR | |
58 | ||
59 | cmp/hi r2,r0 ! 57 MT: T = (r0 > r2) unsigned, consumed by the bt/s below | |
60 | shlr16 r3 ! 107 EX | |
61 | ||
62 | shlr8 r3 ! 106 EX ! xxxO | |
63 | mov r1,r6 ! 5 MT (latency=0) | |
64 | ||
65 | shll8 r6 ! 102 EX ! LMNx | |
66 | mov r1,r7 ! 5 MT (latency=0): keep this load for the next iteration | |
67 | ||
68 | or r6,r3 ! 82 EX ! LMNO | |
69 | bt/s 3b ! 109 BR: delayed branch, the store below executes either way | |
70 | ||
71 | mov.l r3,@-r0 ! 30 LS: delay slot: pre-decrement r0 and store the merged long | |
72 | #endif | |
73 | ! Finally, copy a byte at once, if necessary | |
74 | ||
75 | add #4,r5 ! 50 EX: undo the -4 above, leaving r5 at its normal value -1 for byte copies | |
76 | cmp/eq r4,r0 ! 54 MT: T = r0 has reached r4, nothing left to copy | |
77 | ||
78 | add #-6,r2 ! 50 EX: r2 = r4+1, bound for the byte loop | |
79 | bt 9f ! 109 BR | |
80 | ||
81 | 8: cmp/hi r2,r0 ! 57 MT: T = more bytes remain after the one stored below | |
82 | mov.b @(r0,r5),r1 ! 20 LS (latency=2) | |
83 | ||
84 | bt/s 8b ! 109 BR | |
85 | ||
86 | mov.b r1,@-r0 ! 29 LS: delay slot | |
87 | ||
88 | 9: rts | |
89 | nop | |
90 | ||
91 | ||
92 | ! | |
93 | ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... | |
94 | ! | |
95 | ||
96 | ! Size is 16 or greater, and may have trailing bytes | |
97 | ||
98 | .balign 32 | |
99 | .Lcase3: | |
100 | ! Read a long word and write a long word at once, working downwards from the end of both buffers | |
101 | ! At the start of each iteration, r7 contains last long load; each stored long is merged from r7 and the new load | |
102 | add #-3,r5 ! 79 EX: bias the source offset by -3 for the priming load below | |
103 | mov r4,r2 ! 5 MT (0 cycles latency): r2 = r4, turned into the loop bound below | |
104 | ||
105 | mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency): prime r7 with the first source long | |
106 | add #-4,r5 ! 50 EX: r5 now addresses the next lower source long | |
107 | ||
108 | add #7,r2 ! 79 EX: r2 = r4+7, bound for the cmp/hi in the loop | |
109 | ! The loop repeats while r0 > r4+7, tested before the delay-slot store moves r0 down by 4 | |
110 | #ifdef CONFIG_CPU_LITTLE_ENDIAN | |
111 | ! 6 cycles, 4 bytes per iteration | |
112 | 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK | |
113 | mov r7, r3 ! 5 MT (latency=0) ! RQPO | |
114 | ||
115 | cmp/hi r2,r0 ! 57 MT: T = (r0 > r2) unsigned, consumed by the bt/s below | |
116 | shll8 r3 ! 102 EX ! QPOx | |
117 | ||
118 | mov r1,r6 ! 5 MT (latency=0) | |
119 | shlr16 r6 ! 107 EX | |
120 | ||
121 | shlr8 r6 ! 106 EX ! xxxN | |
122 | mov r1, r7 ! 5 MT (latency=0): keep this load for the next iteration | |
123 | ||
124 | or r6,r3 ! 82 EX ! QPON | |
125 | bt/s 3b ! 109 BR: delayed branch, the store below executes either way | |
126 | ||
127 | mov.l r3,@-r0 ! 30 LS: delay slot: pre-decrement r0 and store the merged long | |
128 | #else | |
e08b954c | 129 | 3: mov r7,r3 ! OPQR (the previous load) |
1da177e4 | 130 | shlr8 r3 ! xOPQ |
e08b954c HS |
131 | mov.l @(r0,r5),r7 ! KLMN (new long goes straight into r7 for the next pass) |
132 | mov r7,r6 | |
1da177e4 LT |
133 | shll16 r6 |
134 | shll8 r6 ! Nxxx | |
135 | or r6,r3 ! NOPQ | |
136 | cmp/hi r2,r0 | |
137 | bt/s 3b | |
138 | mov.l r3,@-r0 | |
139 | #endif | |
140 | ||
141 | ! Finally, copy a byte at once, if necessary | |
142 | ||
143 | add #6,r5 ! 50 EX: -3 and -4 above plus 6 leaves r5 at its normal value -1 for byte copies | |
144 | cmp/eq r4,r0 ! 54 MT: T = r0 has reached r4, nothing left to copy | |
145 | ||
146 | add #-6,r2 ! 50 EX: r2 = r4+1, bound for the byte loop | |
147 | bt 9f ! 109 BR | |
148 | ||
149 | 8: cmp/hi r2,r0 ! 57 MT: T = more bytes remain after the one stored below | |
150 | mov.b @(r0,r5),r1 ! 20 LS (latency=2) | |
151 | ||
152 | bt/s 8b ! 109 BR | |
153 | ||
154 | mov.b r1,@-r0 ! 29 LS: delay slot | |
155 | ||
156 | 9: rts | |
157 | nop | |
158 | ||
159 | ENTRY(memcpy) | |
160 | ||
161 | ! Calculate the invariants which will be used in the remainder | |
162 | ! of the code: | |
163 | ! On entry r4 = dst, r5 = src, r6 = n (see the prototype at the top of the file). After setup: | |
164 | ! r4 --> [ ... ] DST [ ... ] SRC | |
165 | ! [ ... ] [ ... ] | |
166 | ! : : | |
167 | ! r0 --> [ ... ] r0+r5 --> [ ... ] | |
168 | ! i.e. r0 = dst + n (one past the end of dst) and r5 = src - dst, so r0+r5 | |
169 | ! tracks the matching source position; all loops copy downwards via @-r0 | |
170 | ||
171 | ! Short circuit the common case of src, dst and len being 32 bit aligned | |
172 | ! and test for zero length move | |
173 | ||
174 | mov r6, r0 ! 5 MT (0 cycle latency) | |
175 | or r4, r0 ! 82 EX | |
176 | ||
177 | or r5, r0 ! 82 EX: r0 = n OR dst OR src | |
178 | tst r6, r6 ! 86 MT: T = (n == 0) | |
179 | ||
180 | bt/s 99f ! 111 BR (zero len) | |
181 | tst #3, r0 ! 87 MT: delay slot: T = n, dst and src are all long word aligned | |
182 | ||
183 | mov r4, r0 ! 5 MT (0 cycle latency) | |
184 | add r6, r0 ! 49 EX: r0 = dst + n | |
185 | ||
186 | mov #16, r1 ! 6 EX: small-copy threshold, see below | |
187 | bt/s .Lcase00 ! 111 BR (aligned) | |
188 | ||
189 | sub r4, r5 ! 75 EX: delay slot: r5 = src - dst | |
190 | ||
191 | ! Arguments are not nicely long word aligned or zero len. | |
192 | ! Check for small copies, and if so do a simple byte at a time copy. | |
193 | ! | |
194 | ! Deciding on an exact value of 'small' is not easy, as the point at which | |
195 | ! using the optimised routines become worthwhile varies (these are the | |
196 | ! cycle counts for different sizes using byte-at-a-time vs. optimised): | |
197 | ! size byte-at-time long word byte | |
198 | ! 16 42 39-40 46-50 50-55 | |
199 | ! 24 58 43-44 54-58 62-67 | |
200 | ! 36 82 49-50 66-70 80-85 | |
201 | ! However the penalty for getting it 'wrong' is much higher for long word | |
202 | ! aligned data (and this is more common), so use a value of 16. | |
203 | ||
204 | cmp/gt r6,r1 ! 56 MT: T = (16 > n), signed compare | |
205 | ||
206 | add #-1,r5 ! 50 EX: r5 = normal value -1 for the byte loops | |
207 | bf/s 6f ! 108 BR (not small) | |
208 | ||
209 | mov r5, r3 ! 5 MT (latency=0): delay slot | |
210 | shlr r6 ! 104 EX: r6 = n / 2, T = old bit 0 (n odd) | |
211 | ||
212 | mov.b @(r0,r5),r1 ! 20 LS (latency=2) | |
213 | bf/s 4f ! 111 BR: n even: join the pair loop at its second load | |
214 | ||
215 | add #-1,r3 ! 50 EX: delay slot: r3 = r5 - 1, offset of the second byte of each pair | |
216 | tst r6, r6 ! 86 MT: n odd: T = no pairs follow the odd byte | |
217 | ||
218 | bt/s 98f ! 110 BR | |
219 | mov.b r1,@-r0 ! 29 LS: delay slot: store the odd byte | |
220 | ||
221 | ! 4 cycles, 2 bytes per iteration | |
222 | 3: mov.b @(r0,r5),r1 ! 20 LS (latency=2) | |
223 | ||
224 | 4: mov.b @(r0,r3),r2 ! 20 LS (latency=2) | |
225 | dt r6 ! 67 EX: decrement the pair count, T = (r6 == 0) | |
226 | ||
227 | mov.b r1,@-r0 ! 29 LS | |
228 | bf/s 3b ! 111 BR | |
229 | ||
230 | mov.b r2,@-r0 ! 29 LS: delay slot | |
231 | 98: | |
232 | rts | |
233 | nop | |
234 | ||
235 | 99: rts | |
236 | mov r4, r0 | |
237 | ||
238 | ! Size is not small, so it's worthwhile looking for optimisations. | |
239 | ! First align destination to a long word boundary. | |
240 | ! | |
241 | ! r5 = normal value -1 | |
242 | ||
243 | 6: tst #3, r0 ! 87 MT: T = r0 (end of dst) is already long word aligned | |
244 | mov #3, r3 ! 6 EX | |
245 | ||
246 | bt/s 2f ! 111 BR | |
247 | and r0,r3 ! 78 EX: delay slot: r3 = r0 AND 3, bytes to copy until r0 is aligned | |
248 | ||
249 | ! 3 cycles, 1 byte per iteration | |
250 | 1: dt r3 ! 67 EX | |
251 | mov.b @(r0,r5),r1 ! 19 LS (latency=2) | |
252 | ||
253 | add #-1, r6 ! 79 EX: keep r6 equal to the bytes still to copy | |
254 | bf/s 1b ! 109 BR | |
255 | ||
256 | mov.b r1,@-r0 ! 28 LS: delay slot | |
257 | ||
258 | 2: add #1, r5 ! 79 EX: back to the normal value (src - dst) | |
259 | ||
260 | ! Now select the appropriate bulk transfer code based on relative | |
261 | ! alignment of src and dst. | |
262 | ||
263 | mov r0, r3 ! 5 MT (latency=0): park the dst pointer, r0 is needed for tst #imm | |
264 | ||
265 | mov r5, r0 ! 5 MT (latency=0) | |
266 | tst #1, r0 ! 87 MT: test bit 0 of (src - dst) | |
267 | ||
268 | bf/s 1f ! 111 BR | |
269 | mov #64, r7 ! 6 EX: delay slot: size threshold for the 'big' variants | |
270 | ||
271 | ! bit 0 clear | |
272 | ||
273 | cmp/ge r7, r6 ! 55 MT: T = (r6 >= 64), signed compare | |
274 | ||
275 | bt/s 2f ! 111 BR | |
276 | tst #2, r0 ! 87 MT: delay slot: test bit 1 of (src - dst) | |
277 | ||
278 | ! small | |
279 | bt/s .Lcase0 | |
280 | mov r3, r0 | |
281 | ||
282 | bra .Lcase2 | |
283 | nop | |
284 | ||
285 | ! big | |
286 | 2: bt/s .Lcase0b | |
287 | mov r3, r0 | |
288 | ||
289 | bra .Lcase2b | |
290 | nop | |
291 | ||
292 | ! bit 0 set | |
293 | 1: tst #2, r0 ! 87 MT: bit 1 of (src - dst) selects case 1 (clear) or case 3 (set) | |
294 | ||
295 | bt/s .Lcase1 | |
296 | mov r3, r0 | |
297 | ||
298 | bra .Lcase3 | |
299 | nop | |
300 | ||
301 | ||
302 | ! | |
303 | ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR | |
304 | ! | |
305 | ||
306 | ! src, dst and size are all long word aligned | |
307 | ! size is non-zero | |
308 | ||
309 | .balign 32 | |
310 | .Lcase00: | |
311 | mov #64, r1 ! 6 EX: size threshold for the big loop | |
312 | mov r5, r3 ! 5 MT (latency=0) | |
313 | ||
314 | cmp/gt r6, r1 ! 56 MT: T = (64 > n), signed compare | |
315 | add #-4, r5 ! 50 EX: r0+r5 now addresses the long just below the current position | |
316 | ||
317 | bf .Lcase00b ! 108 BR (big loop) | |
318 | shlr2 r6 ! 105 EX: r6 = n / 4, number of long words | |
319 | ||
320 | shlr r6 ! 104 EX: r6 = number of pairs, T = old bit 0 (odd long count) | |
321 | mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
322 | ||
323 | bf/s 4f ! 111 BR: even count: join the pair loop at its second load | |
324 | add #-8, r3 ! 50 EX: delay slot: r3 = r5 - 4, offset of the second long of each pair | |
325 | ||
326 | tst r6, r6 ! 86 MT: odd count: T = no pairs follow the odd long | |
327 | bt/s 5f ! 110 BR | |
328 | ||
329 | mov.l r1,@-r0 ! 30 LS: delay slot: store the odd long | |
330 | ||
331 | ! 4 cycles, 2 long words per iteration | |
332 | 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
333 | ||
334 | 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) | |
335 | dt r6 ! 67 EX: decrement the pair count, T = (r6 == 0) | |
336 | ||
337 | mov.l r1, @-r0 ! 30 LS | |
338 | bf/s 3b ! 109 BR | |
339 | ||
340 | mov.l r2, @-r0 ! 30 LS: delay slot | |
341 | ||
342 | 5: rts | |
343 | nop | |
344 | ||
345 | ||
346 | ! Size is 16 or greater and less than 64, but may have trailing bytes | |
347 | ||
348 | .balign 32 | |
349 | .Lcase0: | |
350 | add #-4, r5 ! 50 EX: r0+r5 now addresses the long just below the current position | |
351 | mov r4, r7 ! 5 MT (latency=0) | |
352 | ||
353 | mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
354 | mov #4, r2 ! 6 EX | |
355 | ||
356 | add #11, r7 ! 50 EX: r7 = r4+11, bound for the pair loop | |
357 | tst r2, r6 ! 86 MT: T = bit 2 of r6 is clear | |
358 | ||
359 | mov r5, r3 ! 5 MT (latency=0) | |
360 | bt/s 4f ! 111 BR: bit clear: join the pair loop at its second load | |
361 | ||
362 | add #-4, r3 ! 50 EX: delay slot: r3 = r5 - 4, offset of the second long of each pair | |
363 | mov.l r1,@-r0 ! 30 LS: bit set: store a single long before the pair loop | |
364 | ||
365 | ! 4 cycles, 2 long words per iteration | |
366 | 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
367 | ||
368 | 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) | |
369 | cmp/hi r7, r0 | |
370 | ||
371 | mov.l r1, @-r0 ! 30 LS | |
372 | bt/s 3b ! 109 BR | |
373 | ||
374 | mov.l r2, @-r0 ! 30 LS: delay slot | |
375 | ||
376 | ! Copy the final 0-3 bytes | |
377 | ||
378 | add #3,r5 ! 50 EX: -4 above plus 3 leaves r5 at its normal value -1 for byte copies | |
379 | ||
380 | cmp/eq r0, r4 ! 54 MT: T = r0 has reached r4, nothing left to copy | |
381 | add #-10, r7 ! 50 EX: r7 = r4+1, bound for the byte loop | |
382 | ||
383 | bt 9f ! 110 BR | |
384 | ||
385 | ! 3 cycles, 1 byte per iteration | |
386 | 1: mov.b @(r0,r5),r1 ! 19 LS | |
387 | cmp/hi r7,r0 ! 57 MT: T = more bytes remain after the one stored below | |
388 | ||
389 | bt/s 1b ! 111 BR | |
390 | mov.b r1,@-r0 ! 28 LS: delay slot | |
391 | ||
392 | 9: rts | |
393 | nop | |
394 | ||
395 | ! Size is at least 64 bytes, so will be going round the big loop at least once. | |
396 | ! | |
397 | ! r2 = rounded up r4 | |
398 | ! r3 = rounded down r0 | |
399 | ||
400 | .balign 32 | |
401 | .Lcase0b: | |
402 | add #-4, r5 ! 50 EX: same -4 bias that .Lcase00 applies before branching to .Lcase00b | |
403 | ||
404 | .Lcase00b: | |
405 | mov r0, r3 ! 5 MT (latency=0) | |
406 | mov #(~0x1f), r1 ! 6 EX: mask for 32-byte alignment | |
407 | ||
408 | and r1, r3 ! 78 EX: r3 = r0 rounded down to a 32-byte boundary | |
409 | mov r4, r2 ! 5 MT (latency=0) | |
410 | ||
411 | cmp/eq r3, r0 ! 54 MT: T = r0 is already 32-byte aligned | |
412 | add #0x1f, r2 ! 50 EX | |
413 | ||
414 | bt/s 1f ! 110 BR | |
415 | and r1, r2 ! 78 EX: delay slot: r2 = r4 rounded up to a 32-byte boundary | |
416 | ||
417 | ! copy initial words until cache line aligned | |
418 | ||
419 | mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
420 | tst #4, r0 ! 87 MT: T = bit 2 of r0 is clear | |
421 | ||
422 | mov r5, r6 ! 5 MT (latency=0) | |
423 | add #-4, r6 ! 50 EX: r6 = r5 - 4, offset of the second long of each pair | |
424 | ||
425 | bt/s 4f ! 111 BR: bit clear: join the pair loop at its second load | |
426 | add #8, r3 ! 50 EX: delay slot: the pair loop exits when r0 == r3 before its last two stores | |
427 | ||
428 | tst #0x18, r0 ! 87 MT: T = bits 3 and 4 of r0 are clear | |
429 | ||
430 | bt/s 1f ! 109 BR | |
431 | mov.l r1,@-r0 ! 30 LS: delay slot: store a single long | |
432 | ||
433 | ! 4 cycles, 2 long words per iteration | |
434 | 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
435 | ||
436 | 4: mov.l @(r0, r6), r7 ! 21 LS (latency=2) | |
437 | cmp/eq r3, r0 ! 54 MT | |
438 | ||
439 | mov.l r1, @-r0 ! 30 LS | |
440 | bf/s 3b ! 109 BR | |
441 | ||
442 | mov.l r7, @-r0 ! 30 LS: delay slot | |
443 | ||
444 | ! Copy the cache line aligned blocks | |
445 | ! | |
446 | ! In use: r0, r2, r4, r5 | |
447 | ! Scratch: r1, r3, r6, r7 | |
448 | ! | |
449 | ! We could do this with the four scratch registers, but if src | |
450 | ! and dest hit the same cache line, this will thrash, so make | |
451 | ! use of additional registers. | |
452 | ! | |
453 | ! We also need r0 as a temporary (for movca), so 'undo' the invariant: | |
454 | ! r5: src (was r0+r5) | |
455 | ! r1: dest (was r0) | |
456 | ! this can be reversed at the end, so we don't need to save any extra | |
457 | ! state. | |
458 | ! | |
459 | 1: mov.l r8, @-r15 ! 30 LS: push r8-r11, used as extra data registers in the loop | |
460 | add r0, r5 ! 49 EX: r5 becomes an absolute source pointer | |
461 | ||
462 | mov.l r9, @-r15 ! 30 LS | |
463 | mov r0, r1 ! 5 MT (latency=0): r1 takes over as the destination pointer | |
464 | ||
465 | mov.l r10, @-r15 ! 30 LS | |
466 | add #-0x1c, r5 ! 50 EX: with the earlier -4, r5 is now 32 bytes below the source position | |
467 | ||
468 | mov.l r11, @-r15 ! 30 LS | |
469 | ||
470 | ! 16 cycles, 32 bytes per iteration | |
471 | 2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2) | |
472 | add #-0x20, r1 ! 50 EX: r1 = start of the 32-byte destination block written below | |
473 | mov.l @(0x04,r5),r3 ! 18 LS (latency=2) | |
474 | mov.l @(0x08,r5),r6 ! 18 LS (latency=2) | |
475 | mov.l @(0x0c,r5),r7 ! 18 LS (latency=2) | |
476 | mov.l @(0x10,r5),r8 ! 18 LS (latency=2) | |
477 | mov.l @(0x14,r5),r9 ! 18 LS (latency=2) | |
478 | mov.l @(0x18,r5),r10 ! 18 LS (latency=2) | |
479 | mov.l @(0x1c,r5),r11 ! 18 LS (latency=2) | |
480 | movca.l r0,@r1 ! 40 LS (latency=3-7): first long of the block, source must be r0 | |
481 | mov.l r3,@(0x04,r1) ! 33 LS | |
482 | mov.l r6,@(0x08,r1) ! 33 LS | |
483 | mov.l r7,@(0x0c,r1) ! 33 LS | |
484 | ||
485 | mov.l r8,@(0x10,r1) ! 33 LS | |
486 | add #-0x20, r5 ! 50 EX: step the source pointer down one block | |
487 | ||
488 | mov.l r9,@(0x14,r1) ! 33 LS | |
489 | cmp/eq r2,r1 ! 54 MT: T = r1 has reached r2 (r4 rounded up) | |
490 | ||
491 | mov.l r10,@(0x18,r1) ! 33 LS | |
492 | bf/s 2b ! 109 BR | |
493 | ||
494 | mov.l r11,@(0x1c,r1) ! 33 LS: delay slot | |
495 | ||
496 | mov r1, r0 ! 5 MT (latency=0): r0 is the destination pointer again | |
497 | ||
498 | mov.l @r15+, r11 ! 15 LS | |
499 | sub r1, r5 ! 75 EX: r5 is an offset relative to r0 again | |
500 | ||
501 | mov.l @r15+, r10 ! 15 LS | |
502 | cmp/eq r4, r0 ! 54 MT: T = r0 has reached r4, nothing left to copy | |
503 | ||
504 | bf/s 1f ! 109 BR | |
505 | mov.l @r15+, r9 ! 15 LS: delay slot | |
506 | ||
507 | rts | |
508 | 1: mov.l @r15+, r8 ! 15 LS: pops r8 on both paths: as the rts delay slot and as the bf/s target | |
509 | sub r4, r1 ! 75 EX (len remaining) | |
510 | ||
511 | ! number of trailing bytes is non-zero | |
512 | ! | |
513 | ! invariants restored (r5 already decremented by 4) | |
514 | ! also r1=num bytes remaining | |
515 | ||
516 | mov #4, r2 ! 6 EX | |
517 | mov r4, r7 ! 5 MT (latency=0) | |
518 | ||
519 | add #0x1c, r5 ! 50 EX (back to -4) | |
520 | cmp/hs r2, r1 ! 58 MT: T = (r1 >= 4) unsigned | |
521 | ||
522 | bf/s 5f ! 108 BR: fewer than 4 bytes left: straight to the byte loop | |
523 | add #11, r7 ! 50 EX: delay slot: r7 = r4+11, bound for the pair loop | |
524 | ||
525 | mov.l @(r0, r5), r6 ! 21 LS (latency=2) | |
526 | tst r2, r1 ! 86 MT: T = bit 2 of r1 is clear | |
527 | ||
528 | mov r5, r3 ! 5 MT (latency=0) | |
529 | bt/s 4f ! 111 BR: bit clear: join the pair loop at its second load | |
530 | ||
531 | add #-4, r3 ! 50 EX: delay slot: r3 = r5 - 4, offset of the second long of each pair | |
532 | cmp/hs r2, r1 ! 58 MT: NOTE(review): r1 and r2 are unchanged since the identical test above, so T looks always set here and the pair loop is skipped on this path - confirm intent | |
533 | ||
534 | bt/s 5f ! 111 BR | |
535 | mov.l r6,@-r0 ! 30 LS: delay slot: store a single long | |
536 | ||
537 | ! 4 cycles, 2 long words per iteration | |
538 | 3: mov.l @(r0, r5), r6 ! 21 LS (latency=2) | |
539 | ||
540 | 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) | |
541 | cmp/hi r7, r0 | |
542 | ||
543 | mov.l r6, @-r0 ! 30 LS | |
544 | bt/s 3b ! 109 BR | |
545 | ||
546 | mov.l r2, @-r0 ! 30 LS: delay slot | |
547 | ||
548 | ! Copy the final 0-3 bytes | |
549 | ||
550 | 5: cmp/eq r0, r4 ! 54 MT: T = r0 has reached r4, nothing left to copy | |
551 | add #-10, r7 ! 50 EX: r7 = r4+1, bound for the byte loop | |
552 | ||
553 | bt 9f ! 110 BR | |
554 | add #3,r5 ! 50 EX: plain bt has no delay slot, so this only runs when bytes remain; r5 = normal value -1 | |
555 | ||
556 | ! 3 cycles, 1 byte per iteration | |
557 | 1: mov.b @(r0,r5),r1 ! 19 LS | |
558 | cmp/hi r7,r0 ! 57 MT: T = more bytes remain after the one stored below | |
559 | ||
560 | bt/s 1b ! 111 BR | |
561 | mov.b r1,@-r0 ! 28 LS: delay slot | |
562 | ||
563 | 9: rts | |
564 | nop | |
565 | ||
566 | ! | |
567 | ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. | |
568 | ! | |
569 | ||
570 | .balign 32 | |
571 | .Lcase2: | |
572 | ! Size is 16 or greater and less than 64, but may have trailing bytes | |
573 | ||
574 | 2: mov r5, r6 ! 5 MT (latency=0) | |
575 | add #-2,r5 ! 50 EX: r5 = normal value -2, offset of the first short word of each pair | |
576 | ||
577 | mov r4,r2 ! 5 MT (latency=0) | |
578 | add #-4,r6 ! 50 EX: r6 = normal value -4, offset of the second short word of each pair | |
579 | ||
580 | add #7,r2 ! 50 EX: r2 = r4+7, bound for the loop | |
581 | 3: mov.w @(r0,r5),r1 ! 20 LS (latency=2) | |
582 | ||
583 | mov.w @(r0,r6),r3 ! 20 LS (latency=2) | |
584 | cmp/hi r2,r0 ! 57 MT: T = (r0 > r2) unsigned, tested before the two stores move r0 down by 4 | |
585 | ||
586 | mov.w r1,@-r0 ! 29 LS | |
587 | bt/s 3b ! 111 BR | |
588 | ||
589 | mov.w r3,@-r0 ! 29 LS: delay slot | |
590 | ||
591 | bra 10f | |
592 | nop | |
593 | ||
594 | ||
595 | .balign 32 | |
596 | .Lcase2b: | |
597 | ! Size is at least 64 bytes, so will be going round the big loop at least once. | |
598 | ! | |
599 | ! r2 = rounded up r4 | |
600 | ! r3 = rounded down r0 | |
601 | ||
602 | mov r0, r3 ! 5 MT (latency=0) | |
603 | mov #(~0x1f), r1 ! 6 EX: mask for 32-byte alignment | |
604 | ||
605 | and r1, r3 ! 78 EX: r3 = r0 rounded down to a 32-byte boundary | |
606 | mov r4, r2 ! 5 MT (latency=0) | |
607 | ||
608 | cmp/eq r3, r0 ! 54 MT: T = r0 is already 32-byte aligned | |
609 | add #0x1f, r2 ! 50 EX | |
610 | ||
611 | add #-2, r5 ! 50 EX: r5 = normal value -2 for short word copies | |
612 | bt/s 1f ! 110 BR | |
613 | and r1, r2 ! 78 EX: delay slot: r2 = r4 rounded up to a 32-byte boundary | |
614 | ||
615 | ! Copy a short word one at a time until we are cache line aligned | |
616 | ! Normal values: r0, r2, r3, r4 | |
617 | ! Unused: r1, r6, r7 | |
618 | ! Mod: r5 (=r5-2) | |
619 | ! | |
620 | add #2, r3 ! 50 EX: the loop below exits when r0 == r3 before its final store | |
621 | ||
622 | 2: mov.w @(r0,r5),r1 ! 20 LS (latency=2) | |
623 | cmp/eq r3,r0 ! 54 MT | |
624 | ||
625 | bf/s 2b ! 111 BR | |
626 | ||
627 | mov.w r1,@-r0 ! 29 LS: delay slot | |
628 | ||
629 | ! Copy the cache line aligned blocks | |
630 | ! | |
631 | ! In use: r0, r2, r4, r5 (=r5-2) | |
632 | ! Scratch: r1, r3, r6, r7 | |
633 | ! | |
634 | ! We could do this with the four scratch registers, but if src | |
635 | ! and dest hit the same cache line, this will thrash, so make | |
636 | ! use of additional registers. | |
637 | ! | |
638 | ! We also need r0 as a temporary (for movca), so 'undo' the invariant: | |
639 | ! r5: src (was r0+r5) | |
640 | ! r1: dest (was r0) | |
641 | ! this can be reversed at the end, so we don't need to save any extra | |
642 | ! state. | |
643 | ! | |
644 | 1: mov.l r8, @-r15 ! 30 LS: push r8-r12, used as extra data registers in the loop | |
645 | add r0, r5 ! 49 EX: r5 becomes an absolute source pointer | |
646 | ||
647 | mov.l r9, @-r15 ! 30 LS | |
648 | mov r0, r1 ! 5 MT (latency=0): r1 takes over as the destination pointer | |
649 | ||
650 | mov.l r10, @-r15 ! 30 LS | |
651 | add #-0x1e, r5 ! 50 EX: with the earlier -2, r5 is now 32 bytes below the source position | |
652 | ||
653 | mov.l r11, @-r15 ! 30 LS | |
654 | ||
655 | mov.l r12, @-r15 ! 30 LS | |
656 | ||
657 | ! 17 cycles, 32 bytes per iteration | |
658 | #ifdef CONFIG_CPU_LITTLE_ENDIAN | |
659 | 2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI | |
660 | add #-0x20, r1 ! 50 EX: r1 = start of the 32-byte destination block written below | |
661 | ||
662 | mov.l @r5+, r3 ! 15 LS (latency=2) NMLK | |
663 | ||
664 | mov.l @r5+, r6 ! 15 LS (latency=2) RQPO | |
665 | shll16 r0 ! 103 EX JI.. | |
666 | ||
667 | mov.l @r5+, r7 ! 15 LS (latency=2) | |
668 | xtrct r3, r0 ! 48 EX LKJI | |
669 | ||
670 | mov.l @r5+, r8 ! 15 LS (latency=2) | |
671 | xtrct r6, r3 ! 48 EX PONM | |
672 | ||
673 | mov.l @r5+, r9 ! 15 LS (latency=2) | |
674 | xtrct r7, r6 ! 48 EX | |
675 | ||
676 | mov.l @r5+, r10 ! 15 LS (latency=2) | |
677 | xtrct r8, r7 ! 48 EX | |
678 | ||
679 | mov.l @r5+, r11 ! 15 LS (latency=2) | |
680 | xtrct r9, r8 ! 48 EX | |
681 | ||
682 | mov.w @r5+, r12 ! 15 LS (latency=2) | |
683 | xtrct r10, r9 ! 48 EX | |
684 | ||
685 | movca.l r0,@r1 ! 40 LS (latency=3-7): first long of the block, source must be r0 | |
686 | xtrct r11, r10 ! 48 EX | |
687 | ||
688 | mov.l r3, @(0x04,r1) ! 33 LS | |
689 | xtrct r12, r11 ! 48 EX | |
690 | ||
691 | mov.l r6, @(0x08,r1) ! 33 LS | |
692 | ||
693 | mov.l r7, @(0x0c,r1) ! 33 LS | |
694 | ||
695 | mov.l r8, @(0x10,r1) ! 33 LS | |
696 | add #-0x40, r5 ! 50 EX: the post-increment loads advanced r5 by 32, so the net step is -32 | |
697 | ||
698 | mov.l r9, @(0x14,r1) ! 33 LS | |
699 | cmp/eq r2,r1 ! 54 MT: T = r1 has reached r2 (r4 rounded up) | |
700 | ||
701 | mov.l r10, @(0x18,r1) ! 33 LS | |
702 | bf/s 2b ! 109 BR | |
703 | ||
704 | mov.l r11, @(0x1c,r1) ! 33 LS: delay slot | |
705 | #else | |
706 | 2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) | |
707 | add #-2, r5 ! 50 EX | |
708 | ||
709 | mov.l @(0x1c,r5), r3 ! 18 LS (latency=2) | |
710 | add #-4, r1 ! 50 EX: r1 = address of the top long of the destination block | |
711 | ||
712 | mov.l @(0x18,r5), r6 ! 18 LS (latency=2) | |
713 | shll16 r0 ! 103 EX | |
714 | ||
715 | mov.l @(0x14,r5), r7 ! 18 LS (latency=2) | |
716 | xtrct r3, r0 ! 48 EX | |
717 | ||
718 | mov.l @(0x10,r5), r8 ! 18 LS (latency=2) | |
719 | xtrct r6, r3 ! 48 EX | |
720 | ||
721 | mov.l @(0x0c,r5), r9 ! 18 LS (latency=2) | |
722 | xtrct r7, r6 ! 48 EX | |
723 | ||
724 | mov.l @(0x08,r5), r10 ! 18 LS (latency=2) | |
725 | xtrct r8, r7 ! 48 EX | |
726 | ||
727 | mov.l @(0x04,r5), r11 ! 18 LS (latency=2) | |
728 | xtrct r9, r8 ! 48 EX | |
729 | ||
c7afb7e5 NI |
730 | mov.l @(0x00,r5), r12 ! 18 LS (latency=2) |
731 | xtrct r10, r9 ! 48 EX | |
1da177e4 LT |
732 |
733 | movca.l r0,@r1 ! 40 LS (latency=3-7): top long of the block, source must be r0 | |
734 | add #-0x1c, r1 ! 50 EX: with the -4 above, r1 has stepped down one 32-byte block | |
735 | ||
e08b954c | 736 | mov.l r3, @(0x18,r1) ! 33 LS |
1da177e4 LT |
737 | xtrct r11, r10 ! 48 EX |
738 | ||
e08b954c | 739 | mov.l r6, @(0x14,r1) ! 33 LS |
1da177e4 LT |
740 | xtrct r12, r11 ! 48 EX |
741 | ||
e08b954c | 742 | mov.l r7, @(0x10,r1) ! 33 LS |
1da177e4 | 743 | |
e08b954c HS |
744 | mov.l r8, @(0x0c,r1) ! 33 LS |
745 | add #-0x1e, r5 ! 50 EX: with the -2 above, the source pointer has stepped down 32 bytes | |
1da177e4 | 746 | |
e08b954c | 747 | mov.l r9, @(0x08,r1) ! 33 LS |
1da177e4 LT |
748 | cmp/eq r2,r1 ! 54 MT |
749 | ||
e08b954c | 750 | mov.l r10, @(0x04,r1) ! 33 LS |
1da177e4 LT |
751 | bf/s 2b ! 109 BR |
752 | ||
e08b954c | 753 | mov.l r11, @(0x00,r1) ! 33 LS |
1da177e4 LT |
754 | #endif |
755 | ||
756 | mov.l @r15+, r12 | |
757 | mov r1, r0 ! 5 MT (latency=0): r0 is the destination pointer again | |
758 | ||
759 | mov.l @r15+, r11 ! 15 LS | |
760 | sub r1, r5 ! 75 EX: r5 is an offset relative to r0 again | |
761 | ||
762 | mov.l @r15+, r10 ! 15 LS | |
763 | cmp/eq r4, r0 ! 54 MT: T = r0 has reached r4, nothing left to copy | |
764 | ||
765 | bf/s 1f ! 109 BR | |
766 | mov.l @r15+, r9 ! 15 LS: delay slot | |
767 | ||
768 | rts | |
769 | 1: mov.l @r15+, r8 ! 15 LS: pops r8 on both paths: as the rts delay slot and as the bf/s target | |
770 | ||
771 | add #0x1e, r5 ! 50 EX: r5 back to its invariant value -2 | |
772 | ||
773 | ! Finish off a short word at a time | |
774 | ! r5 must be invariant - 2 | |
775 | 10: mov r4,r2 ! 5 MT (latency=0) | |
776 | add #1,r2 ! 50 EX | |
777 | ||
778 | cmp/hi r2, r0 ! 57 MT: T = (r0 > r4+1), at least two bytes remain | |
779 | bf/s 1f ! 109 BR | |
780 | ||
781 | add #2, r2 ! 50 EX: delay slot: r2 = r4+3, bound for the short word loop | |
782 | ||
783 | 3: mov.w @(r0,r5),r1 ! 20 LS | |
784 | cmp/hi r2,r0 ! 57 MT: T = another short word remains after the one stored below | |
785 | ||
786 | bt/s 3b ! 109 BR | |
787 | ||
788 | mov.w r1,@-r0 ! 29 LS: delay slot | |
789 | 1: | |
790 | ||
791 | ! r5 is still invariant - 2 here; the add in the delay slot below makes it invariant - 1 | |
792 | ! Finally, copy the last byte if necessary | |
793 | cmp/eq r4,r0 ! 54 MT: T = r0 has reached r4, no odd byte left | |
794 | bt/s 9b | |
795 | add #1,r5 | |
796 | mov.b @(r0,r5),r1 | |
797 | rts | |
798 | mov.b r1,@-r0 | |
799 |