/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>

        .macro  src_b   r, w0, w1
#ifdef __XTENSA_EB__
        src     \r, \w0, \w1
#else
        src     \r, \w1, \w0
#endif
        .endm

        .macro  ssa8    r
#ifdef __XTENSA_EB__
        ssa8b   \r
#else
        ssa8l   \r
#endif
        .endm
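
/*
 * Reading aid (not assembled): ssa8 loads the shift-amount register
 * from the low bits of an address, and src_b funnel-shifts two
 * adjacent aligned words into the unaligned word that spans them,
 * picking the big- or little-endian instruction variant.  A hedged C
 * sketch of the little-endian case, with invented names (lo/hi are
 * the lower- and higher-addressed aligned words, off = addr & 3):
 *
 *	static inline unsigned int src_b_le(unsigned int lo,
 *					    unsigned int hi,
 *					    unsigned int off)
 *	{
 *		// off == 0 handled separately to avoid a shift by 32
 *		return off ? (lo >> (8 * off)) | (hi << (32 - 8 * off))
 *			   : lo;
 *	}
 *
 * The SRC instruction does this in one step once SAR has been set by
 * ssa8, which is what the unaligned-source loops below rely on.
 */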

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/  return address
 *	a1/  stack pointer
 *	a2/  return value
 *	a3/  src
 *	a4/  length
 *	a5/  dst
 *	a6/  tmp
 *	a7/  tmp
 *	a8/  tmp
 *	a9/  tmp
 *	a10/ tmp
 *	a11/ tmp
 */
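
/*
 * Reading aid (not assembled): a hedged C sketch of the general-case
 * flow described above, assuming <stddef.h>.  It is deliberately
 * simplified -- the real code below unrolls the word loop to 16 bytes
 * per iteration, uses the bltui thresholds 7 and 6 for short copies,
 * and handles an unaligned source with the SSA8/SRC shifting copy
 * instead of falling back to bytes.  The name memcpy_sketch is
 * invented here.
 *
 *	static void *memcpy_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if (len >= 7) {
 *			while ((unsigned long)d & 3) {	// align dst to 4
 *				*d++ = *s++;
 *				len--;
 *			}
 *			if (((unsigned long)s & 3) == 0) {	// aligned src
 *				while (len >= 4) {	// word copy loop
 *					*(unsigned int *)d =
 *						*(const unsigned int *)s;
 *					d += 4;
 *					s += 4;
 *					len -= 4;
 *				}
 *			}
 *		}
 *		while (len--)		// remaining tail bytes
 *			*d++ = *s++;
 *		return dst;
 *	}
 */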

        .text

/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbytecopydone
        add     a7, a3, a4      # a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        s8i     a6, a5, 0
        addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
        bne     a3, a7, .Lnextbyte      # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Ldst1mod2:     # dst is only byte aligned
        _bltui  a4, 7, .Lbytecopy       # do short copies byte by byte

        # copy 1 byte
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        addi    a4, a4, -1
        s8i     a6, a5, 0
        addi    a5, a5, 1
        _bbci.l a5, 1, .Ldstaligned     # if dst is now aligned, then
                                        # return to main algorithm
.Ldst2mod4:     # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbytecopy       # do short copies byte by byte
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        addi    a4, a4, -2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2
        j       .Ldstaligned    # dst is now aligned, return to main algorithm

        .align  4
        .global memcpy
        .type   memcpy,@function
memcpy:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lcommon:
        _bbsi.l a2, 0, .Ldst1mod2       # if dst is 1 mod 2
        _bbsi.l a2, 1, .Ldst2mod4       # if dst is 2 mod 4
.Ldstaligned:   # return here from .Ldst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lsrcunaligned  # then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop1done
        slli    a8, a7, 4
        add     a8, a8, a3      # a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
        l32i    a6, a3, 0
        l32i    a7, a3, 4
        s32i    a6, a5, 0
        l32i    a6, a3, 8
        s32i    a7, a5, 4
        l32i    a7, a3, 12
        s32i    a6, a5, 8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bne     a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
        bbci.l  a4, 3, .L2
        # copy 8 bytes
        l32i    a6, a3, 0
        l32i    a7, a3, 4
        addi    a3, a3, 8
        s32i    a6, a5, 0
        s32i    a7, a5, 4
        addi    a5, a5, 8
.L2:
        bbsi.l  a4, 2, .L3
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L3:
        # copy 4 bytes
        l32i    a6, a3, 0
        addi    a3, a3, 4
        s32i    a6, a5, 0
        addi    a5, a5, 4
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L4:
        # copy 2 bytes
        l16ui   a6, a3, 0
        addi    a3, a3, 2
        s16i    a6, a5, 0
        addi    a5, a5, 2
        bbsi.l  a4, 0, .L5
        retw
.L5:
        # copy 1 byte
        l8ui    a6, a3, 0
        s8i     a6, a5, 0
        retw

/*
 * Destination is aligned, Source is unaligned
 */

        .align  4
.Lsrcunaligned:
        _beqz   a4, .Ldone      # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT    1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop2done
        slli    a10, a7, 4
        add     a10, a10, a3    # a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
        l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5, 8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bne     a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
        bbci.l  a4, 3, .L12
        # copy 8 bytes
        l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a3, a3, 8
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        addi    a5, a5, 8
        mov     a6, a8
.L12:
        bbci.l  a4, 2, .L13
        # copy 4 bytes
        l32i    a7, a3, 4
        addi    a3, a3, 4
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a5, a5, 4
        mov     a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .L14
        bbsi.l  a4, 0, .L15
.Ldone: retw
.L14:
        # copy 2 bytes
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2
        bbsi.l  a4, 0, .L15
        retw
.L15:
        # copy 1 byte
        l8ui    a6, a3, 0
        s8i     a6, a5, 0
        retw


/*
 * void bcopy(const void *src, void *dest, size_t n);
 */
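
/*
 * Reading aid (not assembled): bcopy takes its operands in the
 * opposite order from memmove, so it is equivalent to the hedged
 * one-liner below; the three mov instructions that follow merely
 * swap a2 and a3 before falling into the shared .Lmovecommon path.
 *
 *	void bcopy(const void *src, void *dst, size_t n)
 *	{
 *		memmove(dst, src, n);
 *	}
 */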
        .align  4
        .global bcopy
        .type   bcopy,@function
bcopy:
        entry   sp, 16          # minimal stack frame
        # a2=src, a3=dst, a4=len
        mov     a5, a3
        mov     a3, a2
        mov     a2, a5
        j       .Lmovecommon    # go to common code for memmove+bcopy

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/  return address
 *	a1/  stack pointer
 *	a2/  return value
 *	a3/  src
 *	a4/  length
 *	a5/  dst
 *	a6/  tmp
 *	a7/  tmp
 *	a8/  tmp
 *	a9/  tmp
 *	a10/ tmp
 *	a11/ tmp
 */
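
/*
 * Reading aid (not assembled): a hedged C sketch of the dispatch done
 * at .Lmovecommon below, assuming <stdint.h> and <string.h>.  The
 * single unsigned comparison works because (uintptr_t)dst -
 * (uintptr_t)src wraps around when dst is below src, so it is >= len
 * exactly when a forward copy cannot overwrite source bytes that have
 * not been read yet.  The name memmove_sketch is invented here; the
 * real backward path copies in 16-byte chunks rather than bytes.
 *
 *	static void *memmove_sketch(void *dst, const void *src, size_t len)
 *	{
 *		if ((uintptr_t)dst - (uintptr_t)src >= len)
 *			return memcpy(dst, src, len);	// forward is safe
 *
 *		// overlapping: copy backwards from the highest address
 *		unsigned char *d = (unsigned char *)dst + len;
 *		const unsigned char *s = (const unsigned char *)src + len;
 *		while (len--)
 *			*--d = *--s;
 *		return dst;
 *	}
 */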

/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbackbytecopydone
        sub     a7, a3, a4      # a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
        addi    a3, a3, -1
        l8ui    a6, a3, 0
        addi    a5, a5, -1
        s8i     a6, a5, 0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a7, .Lbacknextbyte  # continue loop if
                                        # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Lbackdst1mod2: # dst is only byte aligned
        _bltui  a4, 7, .Lbackbytecopy   # do short copies byte by byte

        # copy 1 byte
        addi    a3, a3, -1
        l8ui    a6, a3, 0
        addi    a5, a5, -1
        s8i     a6, a5, 0
        addi    a4, a4, -1
        _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
                                        # return to main algorithm
.Lbackdst2mod4: # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbackbytecopy   # do short copies byte by byte
        addi    a3, a3, -2
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a5, a5, -2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a4, a4, -2
        j       .Lbackdstaligned        # dst is now aligned,
                                        # return to main algorithm

        .align  4
        .global memmove
        .type   memmove,@function
memmove:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lmovecommon:
        sub     a6, a5, a3
        bgeu    a6, a4, .Lcommon

        add     a5, a5, a4
        add     a3, a3, a4

        _bbsi.l a5, 0, .Lbackdst1mod2   # if dst is 1 mod 2
        _bbsi.l a5, 1, .Lbackdst2mod4   # if dst is 2 mod 4
.Lbackdstaligned:       # return here from .Lbackdst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lbacksrcunaligned      # then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
        loopnez a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .backLoop1done
        slli    a8, a7, 4
        sub     a8, a3, a8      # a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
        addi    a3, a3, -16
        l32i    a7, a3, 12
        l32i    a6, a3, 8
        addi    a5, a5, -16
        s32i    a7, a5, 12
        l32i    a7, a3, 4
        s32i    a6, a5, 8
        l32i    a6, a3, 0
        s32i    a7, a5, 4
        s32i    a6, a5, 0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a8, .backLoop1      # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
        bbci.l  a4, 3, .Lback2
        # copy 8 bytes
        addi    a3, a3, -8
        l32i    a6, a3, 0
        l32i    a7, a3, 4
        addi    a5, a5, -8
        s32i    a6, a5, 0
        s32i    a7, a5, 4
.Lback2:
        bbsi.l  a4, 2, .Lback3
        bbsi.l  a4, 1, .Lback4
        bbsi.l  a4, 0, .Lback5
        retw
.Lback3:
        # copy 4 bytes
        addi    a3, a3, -4
        l32i    a6, a3, 0
        addi    a5, a5, -4
        s32i    a6, a5, 0
        bbsi.l  a4, 1, .Lback4
        bbsi.l  a4, 0, .Lback5
        retw
.Lback4:
        # copy 2 bytes
        addi    a3, a3, -2
        l16ui   a6, a3, 0
        addi    a5, a5, -2
        s16i    a6, a5, 0
        bbsi.l  a4, 0, .Lback5
        retw
.Lback5:
        # copy 1 byte
        addi    a3, a3, -1
        l8ui    a6, a3, 0
        addi    a5, a5, -1
        s8i     a6, a5, 0
        retw

/*
 * Destination is aligned, Source is unaligned
 */

        .align  4
.Lbacksrcunaligned:
        _beqz   a4, .Lbackdone  # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS with
                                         * the lint or ferret client, or 0
                                         * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .backLoop2done
        slli    a10, a7, 4
        sub     a10, a3, a10    # a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
        addi    a3, a3, -16
        l32i    a7, a3, 12
        l32i    a8, a3, 8
        addi    a5, a5, -16
        src_b   a6, a7, a6
        s32i    a6, a5, 12
        l32i    a9, a3, 4
        src_b   a7, a8, a7
        s32i    a7, a5, 8
        l32i    a6, a3, 0
        src_b   a8, a9, a8
        s32i    a8, a5, 4
        src_b   a9, a6, a9
        s32i    a9, a5, 0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a10, .backLoop2     # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
        bbci.l  a4, 3, .Lback12
        # copy 8 bytes
        addi    a3, a3, -8
        l32i    a7, a3, 4
        l32i    a8, a3, 0
        addi    a5, a5, -8
        src_b   a6, a7, a6
        s32i    a6, a5, 4
        src_b   a7, a8, a7
        s32i    a7, a5, 0
        mov     a6, a8
.Lback12:
        bbci.l  a4, 2, .Lback13
        # copy 4 bytes
        addi    a3, a3, -4
        l32i    a7, a3, 0
        addi    a5, a5, -4
        src_b   a6, a7, a6
        s32i    a6, a5, 0
        mov     a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .Lback14
        bbsi.l  a4, 0, .Lback15
.Lbackdone:
        retw
.Lback14:
        # copy 2 bytes
        addi    a3, a3, -2
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a5, a5, -2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        bbsi.l  a4, 0, .Lback15
        retw
.Lback15:
        # copy 1 byte
        addi    a3, a3, -1
        addi    a5, a5, -1
        l8ui    a6, a3, 0
        s8i     a6, a5, 0
        retw


/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */