/*
 * arch/xtensa/lib/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>

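/*
 * src_b: extract one aligned destination word from the pair of source
 * words \w0 (lower address) and \w1 (higher address) using the SRC
 * funnel-shift instruction, which shifts the 64-bit concatenation of
 * its two operands right by SAR bits.  The operand order depends on
 * the core's endianness.
 */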
        .macro  src_b   r, w0, w1
#ifdef __XTENSA_EB__
        src     \r, \w0, \w1
#else
        src     \r, \w1, \w0
#endif
        .endm
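/*
 * ssa8: set the shift-amount register SAR from the low two bits of the
 * address in \r, so that SRC extracts bytes starting at that byte
 * offset (big- or little-endian variant as appropriate).
 */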
        .macro  ssa8    r
#ifdef __XTENSA_EB__
        ssa8b   \r
#else
        ssa8l   \r
#endif
        .endm

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and a length that is
 *     a multiple of 4 (or 8).
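 *
 *   Worked example: len = 23 with dst == 1 mod 4 copies 1 + 2 bytes
 *   to align dst (len becomes 20), runs one 16-byte loop iteration,
 *   and then picks up the remaining 4 bytes from bit 2 of the length.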
 *
 * Register use:
 *   a0/ return address
 *   a1/ stack pointer
 *   a2/ return value
 *   a3/ src
 *   a4/ length
 *   a5/ dst
 *   a6/ tmp
 *   a7/ tmp
 *   a8/ tmp
 *   a9/ tmp
 *   a10/ tmp
 *   a11/ tmp
 */

        .text

/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbytecopydone
        add     a7, a3, a4      # a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        s8i     a6, a5, 0
        addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
        bne     a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Ldst1mod2:     # dst is only byte aligned
        _bltui  a4, 7, .Lbytecopy       # do short copies byte by byte

        # copy 1 byte
        l8ui    a6, a3,  0
        addi    a3, a3,  1
        addi    a4, a4, -1
        s8i     a6, a5,  0
        addi    a5, a5,  1
        _bbci.l a5, 1, .Ldstaligned     # if dst is now aligned, then
                                        # return to main algorithm
.Ldst2mod4:     # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbytecopy       # do short copies byte by byte
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a3, a3,  2
        addi    a4, a4, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a5, a5,  2
        j       .Ldstaligned    # dst is now aligned, return to main algorithm

        .align  4
        .global memcpy
        .type   memcpy,@function
memcpy:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lcommon:
        _bbsi.l a2, 0, .Ldst1mod2       # if dst is 1 mod 2
        _bbsi.l a2, 1, .Ldst2mod4       # if dst is 2 mod 4
.Ldstaligned:   # return here from .Ldst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lsrcunaligned  # then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
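        # (loads and stores are interleaved so that each store uses a
        # value loaded two instructions earlier, which avoids load-use
        # stalls on simple in-order pipelines)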
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop1done
        slli    a8, a7, 4
        add     a8, a8, a3      # a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        s32i    a6, a5,  0
        l32i    a6, a3,  8
        s32i    a7, a5,  4
        l32i    a7, a3, 12
        s32i    a6, a5,  8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bne     a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
        bbci.l  a4, 3, .L2
        # copy 8 bytes
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        addi    a3, a3,  8
        s32i    a6, a5,  0
        s32i    a7, a5,  4
        addi    a5, a5,  8
.L2:
        bbsi.l  a4, 2, .L3
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L3:
        # copy 4 bytes
        l32i    a6, a3,  0
        addi    a3, a3,  4
        s32i    a6, a5,  0
        addi    a5, a5,  4
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L4:
        # copy 2 bytes
        l16ui   a6, a3,  0
        addi    a3, a3,  2
        s16i    a6, a5,  0
        addi    a5, a5,  2
        bbsi.l  a4, 0, .L5
        retw
.L5:
        # copy 1 byte
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw

/*
 * Destination is aligned, Source is unaligned
 */

        .align  4
.Lsrcunaligned:
        _beqz   a4, .Ldone      # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset
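        # (software-pipelined shifting copy: a6 always holds the most
        # recently loaded, partially consumed source word, and each
        # src_b funnels it together with the next word.  E.g. for
        # src == 1 mod 4 on a little-endian core, SAR is 8 and every
        # output word is the 64-bit pair of adjacent source words
        # shifted right by 8 bits.)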
#define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS
                                         * (simulator) with the lint or ferret
                                         * client, or 0 to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop2done
        slli    a10, a7, 4
        add     a10, a10, a3    # a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
        l32i    a7, a3,  4
        l32i    a8, a3,  8
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5,  4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5,  8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bne     a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
        bbci.l  a4, 3, .L12
        # copy 8 bytes
        l32i    a7, a3,  4
        l32i    a8, a3,  8
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        addi    a3, a3,  8
        src_b   a7, a7, a8
        s32i    a7, a5,  4
        addi    a5, a5,  8
        mov     a6, a8          # a6 = current unconsumed source word
.L12:
        bbci.l  a4, 2, .L13
        # copy 4 bytes
        l32i    a7, a3,  4
        addi    a3, a3,  4
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        addi    a5, a5,  4
        mov     a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .L14
        bbsi.l  a4, 0, .L15
.Ldone: retw
.L14:
        # copy 2 bytes
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a3, a3,  2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a5, a5,  2
        bbsi.l  a4, 0, .L15
        retw
.L15:
        # copy 1 byte
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw


/*
 * void bcopy(const void *src, void *dest, size_t n);
 */
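/*
 * bcopy takes its source argument first, the reverse of memmove, so
 * swap a2 and a3 and fall into the common memmove code.
 */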
        .align  4
        .global bcopy
        .type   bcopy,@function
bcopy:
        entry   sp, 16          # minimal stack frame
        # a2=src, a3=dst, a4=len
        mov     a5, a3
        mov     a3, a2
        mov     a2, a5
        j       .Lmovecommon    # go to common code for memmove+bcopy

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the destination does not overlap the source region that is
 *     still to be read, use the forward memcpy code.
 *   Otherwise copy backwards, starting from the highest address.
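 *   E.g. src=0x1000, dst=0x1004, len=16: dst - src = 4 < 16, so a
 *   forward copy would overwrite source bytes not yet read; the
 *   backward copy is used instead.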
 *
 * Register use:
 *   a0/ return address
 *   a1/ stack pointer
 *   a2/ return value
 *   a3/ src
 *   a4/ length
 *   a5/ dst
 *   a6/ tmp
 *   a7/ tmp
 *   a8/ tmp
 *   a9/ tmp
 *   a10/ tmp
 *   a11/ tmp
 */

/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbackbytecopydone
        sub     a7, a3, a4      # a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
        addi    a3, a3, -1
        l8ui    a6, a3, 0
        addi    a5, a5, -1
        s8i     a6, a5, 0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a7, .Lbacknextbyte # continue loop if
                                       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Lbackdst1mod2: # dst is only byte aligned
        _bltui  a4, 7, .Lbackbytecopy   # do short copies byte by byte

        # copy 1 byte
        addi    a3, a3, -1
        l8ui    a6, a3,  0
        addi    a5, a5, -1
        s8i     a6, a5,  0
        addi    a4, a4, -1
        _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
                                        # return to main algorithm
.Lbackdst2mod4: # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbackbytecopy   # do short copies byte by byte
        addi    a3, a3, -2
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a5, a5, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a4, a4, -2
        j       .Lbackdstaligned        # dst is now aligned,
                                        # return to main algorithm

        .align  4
        .global memmove
        .type   memmove,@function
memmove:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lmovecommon:
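        # Use the forward (memcpy) code whenever dst - src >= len as an
        # unsigned value: this covers both dst below src (the
        # subtraction wraps to a large value) and dst at least len
        # bytes above src.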
        sub     a6, a5, a3      # a6 = dst - src
        bgeu    a6, a4, .Lcommon        # forward copy is safe

        add     a5, a5, a4      # a5 = one past the end of dst
        add     a3, a3, a4      # a3 = one past the end of src

        _bbsi.l a5, 0, .Lbackdst1mod2   # if dst is 1 mod 2
        _bbsi.l a5, 1, .Lbackdst2mod4   # if dst is 2 mod 4
.Lbackdstaligned:       # return here from .Lbackdst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lbacksrcunaligned      # then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
        loopnez a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .backLoop1done
        slli    a8, a7, 4
        sub     a8, a3, a8      # a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
        addi    a3, a3, -16
        l32i    a7, a3, 12
        l32i    a6, a3,  8
        addi    a5, a5, -16
        s32i    a7, a5, 12
        l32i    a7, a3,  4
        s32i    a6, a5,  8
        l32i    a6, a3,  0
        s32i    a7, a5,  4
        s32i    a6, a5,  0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
        bbci.l  a4, 3, .Lback2
        # copy 8 bytes
        addi    a3, a3, -8
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        addi    a5, a5, -8
        s32i    a6, a5,  0
        s32i    a7, a5,  4
.Lback2:
        bbsi.l  a4, 2, .Lback3
        bbsi.l  a4, 1, .Lback4
        bbsi.l  a4, 0, .Lback5
        retw
.Lback3:
        # copy 4 bytes
        addi    a3, a3, -4
        l32i    a6, a3,  0
        addi    a5, a5, -4
        s32i    a6, a5,  0
        bbsi.l  a4, 1, .Lback4
        bbsi.l  a4, 0, .Lback5
        retw
.Lback4:
        # copy 2 bytes
        addi    a3, a3, -2
        l16ui   a6, a3,  0
        addi    a5, a5, -2
        s16i    a6, a5,  0
        bbsi.l  a4, 0, .Lback5
        retw
.Lback5:
        # copy 1 byte
        addi    a3, a3, -1
        l8ui    a6, a3,  0
        addi    a5, a5, -1
        s8i     a6, a5,  0
        retw

/*
 * Destination is aligned, Source is unaligned
 */

        .align  4
.Lbacksrcunaligned:
        _beqz   a4, .Lbackdone  # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset
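        # (software-pipelined shifting copy, run backwards: a6 always
        # holds the higher-address source word left over from the
        # previous step, and each src_b funnels the next lower word
        # into it)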
#define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS with
                                         * the lint or ferret client, or 0
                                         * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .backLoop2done
        slli    a10, a7, 4
        sub     a10, a3, a10    # a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
        addi    a3, a3, -16
        l32i    a7, a3, 12
        l32i    a8, a3,  8
        addi    a5, a5, -16
        src_b   a6, a7, a6
        s32i    a6, a5, 12
        l32i    a9, a3,  4
        src_b   a7, a8, a7
        s32i    a7, a5,  8
        l32i    a6, a3,  0
        src_b   a8, a9, a8
        s32i    a8, a5,  4
        src_b   a9, a6, a9
        s32i    a9, a5,  0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
        bbci.l  a4, 3, .Lback12
        # copy 8 bytes
        addi    a3, a3, -8
        l32i    a7, a3,  4
        l32i    a8, a3,  0
        addi    a5, a5, -8
        src_b   a6, a7, a6
        s32i    a6, a5,  4
        src_b   a7, a8, a7
        s32i    a7, a5,  0
        mov     a6, a8          # a6 = current unconsumed source word
.Lback12:
        bbci.l  a4, 2, .Lback13
        # copy 4 bytes
        addi    a3, a3, -4
        l32i    a7, a3,  0
        addi    a5, a5, -4
        src_b   a6, a7, a6
        s32i    a6, a5,  0
        mov     a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .Lback14
        bbsi.l  a4, 0, .Lback15
.Lbackdone:
        retw
.Lback14:
        # copy 2 bytes
        addi    a3, a3, -2
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a5, a5, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        bbsi.l  a4, 0, .Lback15
        retw
.Lback15:
        # copy 1 byte
        addi    a3, a3, -1
        addi    a5, a5, -1
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw


/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */