/*
 * arch/xtensa/lib/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>

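/*
 * src_b: extract one aligned destination word from the pair of source
 * words \w0 (lower address) and \w1 (higher address) using the SRC
 * funnel-shift instruction, which shifts the 64-bit concatenation of
 * its two operands right by SAR bits.  The operand order depends on
 * the core's endianness.
 */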
        .macro  src_b   r, w0, w1
#ifdef __XTENSA_EB__
        src     \r, \w0, \w1
#else
        src     \r, \w1, \w0
#endif
        .endm
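/*
 * ssa8: set the shift-amount register SAR from the low two bits of the
 * address in \r, so that SRC extracts bytes starting at that byte
 * offset (big- or little-endian variant as appropriate).
 */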
        .macro  ssa8    r
#ifdef __XTENSA_EB__
        ssa8b   \r
#else
        ssa8l   \r
#endif
        .endm

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and a length that is
 *     a multiple of 4 (or 8).
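 *
 *   Worked example: len = 23 with dst == 1 mod 4 copies 1 + 2 bytes
 *   to align dst (len becomes 20), runs one 16-byte loop iteration,
 *   and then picks up the remaining 4 bytes from bit 2 of the length.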
 *
 * Register use:
 *   a0/ return address
 *   a1/ stack pointer
 *   a2/ return value
 *   a3/ src
 *   a4/ length
 *   a5/ dst
 *   a6/ tmp
 *   a7/ tmp
 *   a8/ tmp
 *   a9/ tmp
 *   a10/ tmp
 *   a11/ tmp
 */

        .text

/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbytecopydone
        add     a7, a3, a4      # a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        s8i     a6, a5, 0
        addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
        bne     a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Ldst1mod2:     # dst is only byte aligned
        _bltui  a4, 7, .Lbytecopy       # do short copies byte by byte

        # copy 1 byte
        l8ui    a6, a3,  0
        addi    a3, a3,  1
        addi    a4, a4, -1
        s8i     a6, a5,  0
        addi    a5, a5,  1
        _bbci.l a5, 1, .Ldstaligned     # if dst is now aligned, then
                                        # return to main algorithm
.Ldst2mod4:     # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbytecopy       # do short copies byte by byte
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a3, a3,  2
        addi    a4, a4, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a5, a5,  2
        j       .Ldstaligned    # dst is now aligned, return to main algorithm

        .align  4
        .global memcpy
        .type   memcpy,@function
memcpy:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lcommon:
        _bbsi.l a2, 0, .Ldst1mod2       # if dst is 1 mod 2
        _bbsi.l a2, 1, .Ldst2mod4       # if dst is 2 mod 4
.Ldstaligned:   # return here from .Ldst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lsrcunaligned  # then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
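        # (loads and stores are interleaved so that each store uses a
        # value loaded two instructions earlier, which avoids load-use
        # stalls on simple in-order pipelines)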
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop1done
        slli    a8, a7, 4
        add     a8, a8, a3      # a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        s32i    a6, a5,  0
        l32i    a6, a3,  8
        s32i    a7, a5,  4
        l32i    a7, a3, 12
        s32i    a6, a5,  8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bne     a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
        bbci.l  a4, 3, .L2
        # copy 8 bytes
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        addi    a3, a3,  8
        s32i    a6, a5,  0
        s32i    a7, a5,  4
        addi    a5, a5,  8
.L2:
        bbsi.l  a4, 2, .L3
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L3:
        # copy 4 bytes
        l32i    a6, a3,  0
        addi    a3, a3,  4
        s32i    a6, a5,  0
        addi    a5, a5,  4
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L4:
        # copy 2 bytes
        l16ui   a6, a3,  0
        addi    a3, a3,  2
        s16i    a6, a5,  0
        addi    a5, a5,  2
        bbsi.l  a4, 0, .L5
        retw
.L5:
        # copy 1 byte
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw

/*
 * Destination is aligned, Source is unaligned
 */

        .align  4
.Lsrcunaligned:
        _beqz   a4, .Ldone      # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset
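        # (software-pipelined shifting copy: a6 always holds the most
        # recently loaded, partially consumed source word, and each
        # src_b funnels it together with the next word.  E.g. for
        # src == 1 mod 4 on a little-endian core, SAR is 8 and every
        # output word is the 64-bit pair of adjacent source words
        # shifted right by 8 bits.)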
#define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS
                                         * (simulator) with the lint or ferret
                                         * client, or 0 to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop2done
        slli    a10, a7, 4
        add     a10, a10, a3    # a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
        l32i    a7, a3,  4
        l32i    a8, a3,  8
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5,  4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5,  8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bne     a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
        bbci.l  a4, 3, .L12
        # copy 8 bytes
        l32i    a7, a3,  4
        l32i    a8, a3,  8
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        addi    a3, a3,  8
        src_b   a7, a7, a8
        s32i    a7, a5,  4
        addi    a5, a5,  8
        mov     a6, a8          # a6 = current unconsumed source word
.L12:
        bbci.l  a4, 2, .L13
        # copy 4 bytes
        l32i    a7, a3,  4
        addi    a3, a3,  4
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        addi    a5, a5,  4
        mov     a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .L14
        bbsi.l  a4, 0, .L15
.Ldone: retw
.L14:
        # copy 2 bytes
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a3, a3,  2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a5, a5,  2
        bbsi.l  a4, 0, .L15
        retw
.L15:
        # copy 1 byte
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw


/*
 * void bcopy(const void *src, void *dest, size_t n);
 */
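/*
 * bcopy takes its source argument first, the reverse of memmove, so
 * swap a2 and a3 and fall into the common memmove code.
 */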
        .align  4
        .global bcopy
        .type   bcopy,@function
bcopy:
        entry   sp, 16          # minimal stack frame
        # a2=src, a3=dst, a4=len
        mov     a5, a3
        mov     a3, a2
        mov     a2, a5
        j       .Lmovecommon    # go to common code for memmove+bcopy

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the destination does not overlap the source region that is
 *     still to be read, use the forward memcpy code.
 *   Otherwise copy backwards, starting from the highest address.
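 *   E.g. src=0x1000, dst=0x1004, len=16: dst - src = 4 < 16, so a
 *   forward copy would overwrite source bytes not yet read; the
 *   backward copy is used instead.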
 *
 * Register use:
 *   a0/ return address
 *   a1/ stack pointer
 *   a2/ return value
 *   a3/ src
 *   a4/ length
 *   a5/ dst
 *   a6/ tmp
 *   a7/ tmp
 *   a8/ tmp
 *   a9/ tmp
 *   a10/ tmp
 *   a11/ tmp
 */

/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbackbytecopydone
        sub     a7, a3, a4      # a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
        addi    a3, a3, -1
        l8ui    a6, a3, 0
        addi    a5, a5, -1
        s8i     a6, a5, 0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a7, .Lbacknextbyte # continue loop if
                                       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Lbackdst1mod2: # dst is only byte aligned
        _bltui  a4, 7, .Lbackbytecopy   # do short copies byte by byte

        # copy 1 byte
        addi    a3, a3, -1
        l8ui    a6, a3,  0
        addi    a5, a5, -1
        s8i     a6, a5,  0
        addi    a4, a4, -1
        _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
                                        # return to main algorithm
.Lbackdst2mod4: # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbackbytecopy   # do short copies byte by byte
        addi    a3, a3, -2
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a5, a5, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a4, a4, -2
        j       .Lbackdstaligned        # dst is now aligned,
                                        # return to main algorithm

        .align  4
        .global memmove
        .type   memmove,@function
memmove:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lmovecommon:
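        # Use the forward (memcpy) code whenever dst - src >= len as an
        # unsigned value: this covers both dst below src (the
        # subtraction wraps to a large value) and dst at least len
        # bytes above src.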
        sub     a6, a5, a3      # a6 = dst - src
        bgeu    a6, a4, .Lcommon        # forward copy is safe

        add     a5, a5, a4      # a5 = one past the end of dst
        add     a3, a3, a4      # a3 = one past the end of src

        _bbsi.l a5, 0, .Lbackdst1mod2   # if dst is 1 mod 2
        _bbsi.l a5, 1, .Lbackdst2mod4   # if dst is 2 mod 4
.Lbackdstaligned:       # return here from .Lbackdst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lbacksrcunaligned      # then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
        loopnez a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .backLoop1done
        slli    a8, a7, 4
        sub     a8, a3, a8      # a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
        addi    a3, a3, -16
        l32i    a7, a3, 12
        l32i    a6, a3,  8
        addi    a5, a5, -16
        s32i    a7, a5, 12
        l32i    a7, a3,  4
        s32i    a6, a5,  8
        l32i    a6, a3,  0
        s32i    a7, a5,  4
        s32i    a6, a5,  0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
        bbci.l  a4, 3, .Lback2
        # copy 8 bytes
        addi    a3, a3, -8
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        addi    a5, a5, -8
        s32i    a6, a5,  0
        s32i    a7, a5,  4
.Lback2:
        bbsi.l  a4, 2, .Lback3
        bbsi.l  a4, 1, .Lback4
        bbsi.l  a4, 0, .Lback5
        retw
.Lback3:
        # copy 4 bytes
        addi    a3, a3, -4
        l32i    a6, a3,  0
        addi    a5, a5, -4
        s32i    a6, a5,  0
        bbsi.l  a4, 1, .Lback4
        bbsi.l  a4, 0, .Lback5
        retw
.Lback4:
        # copy 2 bytes
        addi    a3, a3, -2
        l16ui   a6, a3,  0
        addi    a5, a5, -2
        s16i    a6, a5,  0
        bbsi.l  a4, 0, .Lback5
        retw
.Lback5:
        # copy 1 byte
        addi    a3, a3, -1
        l8ui    a6, a3,  0
        addi    a5, a5, -1
        s8i     a6, a5,  0
        retw

/*
 * Destination is aligned, Source is unaligned
 */

        .align  4
.Lbacksrcunaligned:
        _beqz   a4, .Lbackdone  # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset
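        # (software-pipelined shifting copy, run backwards: a6 always
        # holds the higher-address source word left over from the
        # previous step, and each src_b funnels the next lower word
        # into it)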
#define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS with
                                         * the lint or ferret client, or 0
                                         * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .backLoop2done
        slli    a10, a7, 4
        sub     a10, a3, a10    # a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
        addi    a3, a3, -16
        l32i    a7, a3, 12
        l32i    a8, a3,  8
        addi    a5, a5, -16
        src_b   a6, a7, a6
        s32i    a6, a5, 12
        l32i    a9, a3,  4
        src_b   a7, a8, a7
        s32i    a7, a5,  8
        l32i    a6, a3,  0
        src_b   a8, a9, a8
        s32i    a8, a5,  4
        src_b   a9, a6, a9
        s32i    a9, a5,  0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
        bbci.l  a4, 3, .Lback12
        # copy 8 bytes
        addi    a3, a3, -8
        l32i    a7, a3,  4
        l32i    a8, a3,  0
        addi    a5, a5, -8
        src_b   a6, a7, a6
        s32i    a6, a5,  4
        src_b   a7, a8, a7
        s32i    a7, a5,  0
        mov     a6, a8          # a6 = current unconsumed source word
.Lback12:
        bbci.l  a4, 2, .Lback13
        # copy 4 bytes
        addi    a3, a3, -4
        l32i    a7, a3,  0
        addi    a5, a5, -4
        src_b   a6, a7, a6
        s32i    a6, a5,  0
        mov     a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .Lback14
        bbsi.l  a4, 0, .Lback15
.Lbackdone:
        retw
.Lback14:
        # copy 2 bytes
        addi    a3, a3, -2
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a5, a5, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        bbsi.l  a4, 0, .Lback15
        retw
.Lback15:
        # copy 1 byte
        addi    a3, a3, -1
        addi    a5, a5, -1
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw


/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */