arch/alpha/lib/ev6-memset.S

   1 /*
   2  * arch/alpha/lib/ev6-memset.S
   3  *
   4  * This is an efficient (and relatively small) implementation of the C library
   5  * "memset()" function for the 21264 implementation of Alpha.
   6  *
   7  * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
   8  *
   9  * Much of the information about 21264 scheduling/coding comes from:
  10  *      Compiler Writer's Guide for the Alpha 21264
  11  *      abbreviated as 'CWG' in other comments here
  12  *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  13  * Scheduling notation:
  14  *      E       - either cluster
  15  *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  16  *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  17  * The algorithm for the leading and trailing quadwords remains the same,
  18  * however the loop has been unrolled to enable better memory throughput,
  19  * and the code has been replicated for each of the entry points: __memset
  20  * and __memsetw to permit better scheduling to eliminate the stalling
  21  * encountered during the mask replication.
  22  * A future enhancement might be to put in a byte store loop for really
  23  * small (say < 32 bytes) memset()s.  Whether or not that change would be
  24  * a win in the kernel would depend upon the contextual usage.
  25  * WARNING: Maintaining this is going to be more work than the above version,
  26  * as fixes will need to be made in multiple places.  The performance gain
  27  * is worth it.
  28  */
  29
  30         .set noat               # assembler may not use $at ($28) behind our back
  31         .set noreorder          # keep the hand-scheduled instruction order as written
  32 .text
  33         .globl memset           # alias of ___memset (assigned at end of file)
  34         .globl __memset         # alias of ___memset (assigned at end of file)
  35         .globl ___memset        # byte fill; replicates the low byte of $17
  36         .globl __memsetw        # 16-bit fill; replicates the low word of $17
  37         .globl __constant_c_memset # fill with $17 used as-is (no replication)
  38
  39         .ent ___memset
  40 .align 5
  41 ___memset:
  42         .frame $30,0,$26,0
  43         .prologue 0
  44
  45         /*
  46          * In:  $16 = dest, $17 = fill byte (low 8 bits), $18 = byte count.
  47          * Out: $0 = original dest.  Clobbers $1-$7 and $16-$18.
  48          * Replicating the byte across a quadword stalls badly, so it is
  49          * interleaved with the length/alignment checks below.  That makes
  50          * maintenance tougher, but it is important.
  51          */
  52         and $17,255,$1          # E : 00000000000000ch
  53         insbl $17,1,$2          # U : 000000000000ch00
  54         bis $16,$16,$0          # E : return value
  55         ble $18,end_b           # U : zero (or negative) length requested?
  56
  57         addq $18,$16,$6         # E : end address (dest + count)
  58         bis     $1,$2,$17       # E : 000000000000chch
  59         insbl   $1,2,$3         # U : 0000000000ch0000
  60         insbl   $1,3,$4         # U : 00000000ch000000
  61
  62         or      $3,$4,$3        # E : 00000000chch0000
  63         inswl   $17,4,$5        # U : 0000chch00000000
  64         xor     $16,$6,$1       # E : will complete write be within one quadword?
  65         inswl   $17,6,$2        # U : chch000000000000
  66
  67         or      $17,$3,$17      # E : 00000000chchchch
  68         or      $2,$5,$2        # E : chchchch00000000
  69         bic     $1,7,$1         # E : fit within a single quadword?
  70         and     $16,7,$3        # E : Target addr misalignment
  71
  72         or      $17,$2,$17      # E : chchchchchchchch
  73         beq     $1,within_quad_b # U :
  74         nop                     # E :
  75         beq     $3,aligned_b    # U : target is 0mod8
  76
  77         /*
  78          * Target address is misaligned, and won't fit within a quadword
  79          */
  80         ldq_u $4,0($16)         # L : Fetch first partial
  81         bis $16,$16,$5          # E : Save the address
  82         insql $17,$16,$2        # U : Insert new bytes
  83         subq $3,8,$3            # E : Invert (for addressing uses)
  84
  85         addq $18,$3,$18         # E : $18 is new count ($3 is negative)
  86         mskql $4,$16,$4         # U : clear relevant parts of the quad
  87         subq $16,$3,$16         # E : $16 is new aligned destination
  88         bis $2,$4,$1            # E : Final bytes
  89
  90         nop
  91         stq_u $1,0($5)          # L : Store result
  92         nop
  93         nop
  94
  95 .align 4
  96 aligned_b:
  97         /*
  98          * We are now guaranteed to be quad aligned; $18 holds the bytes
  99          * still to write (whole quads first, then a partial quad).
 100          */
 101
 102         sra $18,3,$3            # U : Number of remaining quads to write
 103         and $18,7,$18           # E : Number of trailing bytes to write
 104         bis $16,$16,$5          # E : Save dest address
 105         beq $3,no_quad_b        # U : tail stuff only
 106
 107         /*
 108          * it's worth the effort to unroll this and use wh64 if possible
 109          * Lifted a bunch of code from clear_user.S
 110          * At this point, entry values are:
 111          * $16  Current destination address
 112          * $5   A copy of $16
 113          * $6   End address (original dest + count)
 114          * $18  Number trailer bytes
 115          * $3   Number quads to write
 116          */
 117
 118         and     $16, 0x3f, $2   # E : offset within 64-byte block (0 == already 0mod64)
 119         subq    $3, 16, $4      # E : Only try to unroll if >= 16 quads (128 bytes)
 120         subq    $2, 0x40, $1    # E : bias counter: -(bytes up to next 0mod64)
 121         blt     $4, loop_b      # U : fewer than 16 quads: simple loop
 122
 123         /*
 124          * At least 16 quads remain, so the unrolled loop runs at least once.
 125          * If $2 == 0 the target is already 0mod64: skip the single-quad
 126          * alignment stores ($1 is -64..-8 here, so it can never be zero).
 127          */
 128
 129         nop                     # E :
 130         nop                     # E :
 131         bis     $5, $5, $4      # E : wh64 address when no alignment stores run
 132         beq     $2, $bigalign_b # U : already 0mod64
 133
 134 $alignmod64_b:
 135         stq     $17, 0($5)      # L :
 136         subq    $3, 1, $3       # E : For consistency later
 137         addq    $1, 8, $1       # E : Increment towards zero for alignment
 138         addq    $5, 8, $4       # E : Initial wh64 address (filler instruction)
 139
 140         nop
 141         nop
 142         addq    $5, 8, $5       # E : Inc address
 143         blt     $1, $alignmod64_b # U :
 144
 145 $bigalign_b:
 146         /*
 147          * $3 - number quads left to go (at least 8)
 148          * $4 - address for the first wh64 (== $5 on entry)
 149          * $5 - target address (aligned 0mod64)
 150          * $17 - mask of stuff to store
 151          * Scratch registers available: $7, $2, $1
 152          * We know that we'll be taking a minimum of one trip through the loop.
 153          * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 154          * The wh64 is issued for the starting destination address of trip +2
 155          * through the loop; if fewer than 24 quads remain, it is issued for the
 156          * next trip instead (which is the current trip when it executes).
 157          */
 158
 159 $do_wh64_b:
 160         wh64    ($4)            # L1 : memory subsystem write hint
 161         subq    $3, 24, $2      # E : For determining future wh64 addresses
 162         stq     $17, 0($5)      # L :
 163         nop                     # E :
 164
 165         addq    $5, 128, $4     # E : speculative target of next wh64
 166         stq     $17, 8($5)      # L :
 167         stq     $17, 16($5)     # L :
 168         addq    $5, 64, $7      # E : Fallback address for wh64 (== next trip addr)
 169
 170         stq     $17, 24($5)     # L :
 171         stq     $17, 32($5)     # L :
 172         cmovlt  $2, $7, $4      # E : Latency 2, extra mapping cycle
 173         nop
 174
 175         stq     $17, 40($5)     # L :
 176         stq     $17, 48($5)     # L :
 177         subq    $3, 16, $2      # E : Repeat the loop at least once more?
 178         nop
 179
 180         stq     $17, 56($5)     # L :
 181         addq    $5, 64, $5      # E :
 182         subq    $3, 8, $3       # E :
 183         bge     $2, $do_wh64_b  # U :
 184
 185         nop
 186         nop
 187         nop
 188         beq     $3, no_quad_b   # U : Might have finished already
 189
 190 .align 4
 191         /*
 192          * Simple loop for trailing quadwords, or for small amounts
 193          * of data (where we can't use an unrolled loop and wh64)
 194          */
 195 loop_b:
 196         stq $17,0($5)           # L :
 197         subq $3,1,$3            # E : Decrement number quads left
 198         addq $5,8,$5            # E : Inc address
 199         bne $3,loop_b           # U : more?
 200
 201 no_quad_b:
 202         /*
 203          * Write 0..7 trailing bytes.
 204          */
 205         nop                     # E :
 206         beq $18,end_b           # U : All done?
 207         ldq $7,0($5)            # L :
 208         mskqh $7,$6,$2          # U : Mask final quad (keep bytes at/after end)
 209
 210         insqh $17,$6,$4         # U : New bits
 211         bis $2,$4,$1            # E : Put it all together
 212         stq $1,0($5)            # L : And back to memory
 213         ret $31,($26),1         # L0 :
 214
 215 within_quad_b:
 216         ldq_u $1,0($16)         # L :
 217         insql $17,$16,$2        # U : New bits
 218         mskql $1,$16,$4         # U : Clear old
 219         bis $2,$4,$2            # E : New result
 220
 221         mskql $2,$6,$4          # U : merged bytes below the end address
 222         mskqh $1,$6,$2          # U : original bytes at/after the end address
 223         bis $2,$4,$1            # E :
 224         stq_u $1,0($16)         # L :
 225
 226 end_b:
 227         nop
 228         nop
 229         nop
 230         ret $31,($26),1         # L0 :
 231         .end ___memset
 232
 233         /*
 234          * Original body, prior to replication and rescheduling; kept because
 235          * there may be callers.  In: $16 = dest, $17 = fill quadword (stored
 236          * as-is, not replicated here), $18 = byte count.  Out: $0 = dest.
 237          */
 238 .align 4
 239         .ent __constant_c_memset
 240 __constant_c_memset:
 241         .frame $30,0,$26,0
 242         .prologue 0
 243
 244         addq $18,$16,$6         # E : end address (dest + count)
 245         bis $16,$16,$0          # E : return value
 246         xor $16,$6,$1           # E : will complete write be within one quadword?
 247         ble $18,end             # U : zero (or negative) length requested?
 248
 249         bic $1,7,$1             # E : fit within a single quadword
 250         beq $1,within_one_quad  # U :
 251         and $16,7,$3            # E : Target addr misalignment
 252         beq $3,aligned          # U : target is 0mod8
 253
 254         /*
 255          * Target address is misaligned, and won't fit within a quadword
 256          */
 257         ldq_u $4,0($16)         # L : Fetch first partial
 258         bis $16,$16,$5          # E : Save the address
 259         insql $17,$16,$2        # U : Insert new bytes
 260         subq $3,8,$3            # E : Invert (for addressing uses)
 261
 262         addq $18,$3,$18         # E : $18 is new count ($3 is negative)
 263         mskql $4,$16,$4         # U : clear relevant parts of the quad
 264         subq $16,$3,$16         # E : $16 is new aligned destination
 265         bis $2,$4,$1            # E : Final bytes
 266
 267         nop
 268         stq_u $1,0($5)          # L : Store result
 269         nop
 270         nop
 271
 272 .align 4
 273 aligned:
 274         /*
 275          * We are now guaranteed to be quad aligned; $18 holds the bytes
 276          * still to write (whole quads first, then a partial quad).
 277          */
 278
 279         sra $18,3,$3            # U : Number of remaining quads to write
 280         and $18,7,$18           # E : Number of trailing bytes to write
 281         bis $16,$16,$5          # E : Save dest address
 282         beq $3,no_quad          # U : tail stuff only
 283
 284         /*
 285          * it's worth the effort to unroll this and use wh64 if possible
 286          * Lifted a bunch of code from clear_user.S
 287          * At this point, entry values are:
 288          * $16  Current destination address
 289          * $5   A copy of $16
 290          * $6   End address (original dest + count)
 291          * $18  Number trailer bytes
 292          * $3   Number quads to write
 293          */
 294
 295         and     $16, 0x3f, $2   # E : offset within 64-byte block (0 == already 0mod64)
 296         subq    $3, 16, $4      # E : Only try to unroll if >= 16 quads (128 bytes)
 297         subq    $2, 0x40, $1    # E : bias counter: -(bytes up to next 0mod64)
 298         blt     $4, loop        # U : fewer than 16 quads: simple loop
 299
 300         /*
 301          * At least 16 quads remain, so the unrolled loop runs at least once.
 302          * If $2 == 0 the target is already 0mod64: skip the single-quad
 303          * alignment stores ($1 is -64..-8 here, so it can never be zero).
 304          */
 305
 306         nop                     # E :
 307         nop                     # E :
 308         bis     $5, $5, $4      # E : wh64 address when no alignment stores run
 309         beq     $2, $bigalign   # U : already 0mod64
 310
 311 $alignmod64:
 312         stq     $17, 0($5)      # L :
 313         subq    $3, 1, $3       # E : For consistency later
 314         addq    $1, 8, $1       # E : Increment towards zero for alignment
 315         addq    $5, 8, $4       # E : Initial wh64 address (filler instruction)
 316
 317         nop
 318         nop
 319         addq    $5, 8, $5       # E : Inc address
 320         blt     $1, $alignmod64 # U :
 321
 322 $bigalign:
 323         /*
 324          * $3 - number quads left to go (at least 8)
 325          * $4 - address for the first wh64 (== $5 on entry)
 326          * $5 - target address (aligned 0mod64)
 327          * $17 - mask of stuff to store
 328          * Scratch registers available: $7, $2, $1
 329          * We know that we'll be taking a minimum of one trip through the loop.
 330          * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 331          * The wh64 is issued for the starting destination address of trip +2
 332          * through the loop; if fewer than 24 quads remain, it is issued for the
 333          * next trip instead (which is the current trip when it executes).
 334          */
 335
 336 $do_wh64:
 337         wh64    ($4)            # L1 : memory subsystem write hint
 338         subq    $3, 24, $2      # E : For determining future wh64 addresses
 339         stq     $17, 0($5)      # L :
 340         nop                     # E :
 341
 342         addq    $5, 128, $4     # E : speculative target of next wh64
 343         stq     $17, 8($5)      # L :
 344         stq     $17, 16($5)     # L :
 345         addq    $5, 64, $7      # E : Fallback address for wh64 (== next trip addr)
 346
 347         stq     $17, 24($5)     # L :
 348         stq     $17, 32($5)     # L :
 349         cmovlt  $2, $7, $4      # E : Latency 2, extra mapping cycle
 350         nop
 351
 352         stq     $17, 40($5)     # L :
 353         stq     $17, 48($5)     # L :
 354         subq    $3, 16, $2      # E : Repeat the loop at least once more?
 355         nop
 356
 357         stq     $17, 56($5)     # L :
 358         addq    $5, 64, $5      # E :
 359         subq    $3, 8, $3       # E :
 360         bge     $2, $do_wh64    # U :
 361
 362         nop
 363         nop
 364         nop
 365         beq     $3, no_quad     # U : Might have finished already
 366
 367 .align 4
 368         /*
 369          * Simple loop for trailing quadwords, or for small amounts
 370          * of data (where we can't use an unrolled loop and wh64)
 371          */
 372 loop:
 373         stq $17,0($5)           # L :
 374         subq $3,1,$3            # E : Decrement number quads left
 375         addq $5,8,$5            # E : Inc address
 376         bne $3,loop             # U : more?
 377
 378 no_quad:
 379         /*
 380          * Write 0..7 trailing bytes.
 381          */
 382         nop                     # E :
 383         beq $18,end             # U : All done?
 384         ldq $7,0($5)            # L :
 385         mskqh $7,$6,$2          # U : Mask final quad (keep bytes at/after end)
 386
 387         insqh $17,$6,$4         # U : New bits
 388         bis $2,$4,$1            # E : Put it all together
 389         stq $1,0($5)            # L : And back to memory
 390         ret $31,($26),1         # L0 :
 391
 392 within_one_quad:
 393         ldq_u $1,0($16)         # L :
 394         insql $17,$16,$2        # U : New bits
 395         mskql $1,$16,$4         # U : Clear old
 396         bis $2,$4,$2            # E : New result
 397
 398         mskql $2,$6,$4          # U : merged bytes below the end address
 399         mskqh $1,$6,$2          # U : original bytes at/after the end address
 400         bis $2,$4,$1            # E :
 401         stq_u $1,0($16)         # L :
 402
 403 end:
 404         nop
 405         nop
 406         nop
 407         ret $31,($26),1         # L0 :
 408         .end __constant_c_memset
 409
 410         /*
 411          * Replicant of __constant_c_memset, rescheduled to mask stalls (labels
 412          * renamed _w).  $16 = dest, $17 = 16-bit fill, $18 = bytes; $0 = dest.
 413          */
 414         .align 5
 415         .ent __memsetw
 416
 417 __memsetw:
 418         .frame $30,0,$26,0
 419         .prologue 0
 420
 421         inswl $17,0,$5          # U : 000000000000c1c2
 422         inswl $17,2,$2          # U : 00000000c1c20000
 423         bis $16,$16,$0          # E : return value
 424         addq    $18,$16,$6      # E : end address (dest + count)
 425
 426         ble $18, end_w          # U : zero (or negative) length requested?
 427         inswl   $17,4,$3        # U : 0000c1c200000000
 428         inswl   $17,6,$4        # U : c1c2000000000000
 429         xor     $16,$6,$1       # E : will complete write be within one quadword?
 430
 431         or      $2,$5,$2        # E : 00000000c1c2c1c2
 432         or      $3,$4,$17       # E : c1c2c1c200000000
 433         bic     $1,7,$1         # E : fit within a single quadword
 434         and     $16,7,$3        # E : Target addr misalignment
 435
 436         or      $17,$2,$17      # E : c1c2c1c2c1c2c1c2
 437         beq $1,within_quad_w    # U :
 438         nop
 439         beq $3,aligned_w        # U : target is 0mod8
 440
 441         /*
 442          * Target address is misaligned, and won't fit within a quadword
 443          */
 444         ldq_u $4,0($16)         # L : Fetch first partial
 445         bis $16,$16,$5          # E : Save the address
 446         insql $17,$16,$2        # U : Insert new bytes
 447         subq $3,8,$3            # E : Invert (for addressing uses)
 448
 449         addq $18,$3,$18         # E : $18 is new count ($3 is negative)
 450         mskql $4,$16,$4         # U : clear relevant parts of the quad
 451         subq $16,$3,$16         # E : $16 is new aligned destination
 452         bis $2,$4,$1            # E : Final bytes
 453
 454         nop
 455         stq_u $1,0($5)          # L : Store result
 456         nop
 457         nop
 458
 459 .align 4
 460 aligned_w:
 461         /*
 462          * We are now guaranteed to be quad aligned; $18 holds the bytes
 463          * still to write (whole quads first, then a partial quad).
 464          */
 465
 466         sra $18,3,$3            # U : Number of remaining quads to write
 467         and $18,7,$18           # E : Number of trailing bytes to write
 468         bis $16,$16,$5          # E : Save dest address
 469         beq $3,no_quad_w        # U : tail stuff only
 470
 471         /*
 472          * it's worth the effort to unroll this and use wh64 if possible
 473          * Lifted a bunch of code from clear_user.S
 474          * At this point, entry values are:
 475          * $16  Current destination address
 476          * $5   A copy of $16
 477          * $6   End address (original dest + count)
 478          * $18  Number trailer bytes
 479          * $3   Number quads to write
 480          */
 481
 482         and     $16, 0x3f, $2   # E : offset within 64-byte block (0 == already 0mod64)
 483         subq    $3, 16, $4      # E : Only try to unroll if >= 16 quads (128 bytes)
 484         subq    $2, 0x40, $1    # E : bias counter: -(bytes up to next 0mod64)
 485         blt     $4, loop_w      # U : fewer than 16 quads: simple loop
 486
 487         /*
 488          * At least 16 quads remain, so the unrolled loop runs at least once.
 489          * If $2 == 0 the target is already 0mod64: skip the single-quad
 490          * alignment stores ($1 is -64..-8 here, so it can never be zero).
 491          */
 492
 493         nop                     # E :
 494         nop                     # E :
 495         bis     $5, $5, $4      # E : wh64 address when no alignment stores run
 496         beq     $2, $bigalign_w # U : already 0mod64
 497
 498 $alignmod64_w:
 499         stq     $17, 0($5)      # L :
 500         subq    $3, 1, $3       # E : For consistency later
 501         addq    $1, 8, $1       # E : Increment towards zero for alignment
 502         addq    $5, 8, $4       # E : Initial wh64 address (filler instruction)
 503
 504         nop
 505         nop
 506         addq    $5, 8, $5       # E : Inc address
 507         blt     $1, $alignmod64_w       # U :
 508
 509 $bigalign_w:
 510         /*
 511          * $3 - number quads left to go (at least 8)
 512          * $4 - address for the first wh64 (== $5 on entry)
 513          * $5 - target address (aligned 0mod64)
 514          * $17 - mask of stuff to store
 515          * Scratch registers available: $7, $2, $1
 516          * We know that we'll be taking a minimum of one trip through the loop.
 517          * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 518          * The wh64 is issued for the starting destination address of trip +2
 519          * through the loop; if fewer than 24 quads remain, it is issued for the
 520          * next trip instead (which is the current trip when it executes).
 521          */
 522
 523 $do_wh64_w:
 524         wh64    ($4)            # L1 : memory subsystem write hint
 525         subq    $3, 24, $2      # E : For determining future wh64 addresses
 526         stq     $17, 0($5)      # L :
 527         nop                     # E :
 528
 529         addq    $5, 128, $4     # E : speculative target of next wh64
 530         stq     $17, 8($5)      # L :
 531         stq     $17, 16($5)     # L :
 532         addq    $5, 64, $7      # E : Fallback address for wh64 (== next trip addr)
 533
 534         stq     $17, 24($5)     # L :
 535         stq     $17, 32($5)     # L :
 536         cmovlt  $2, $7, $4      # E : Latency 2, extra mapping cycle
 537         nop
 538
 539         stq     $17, 40($5)     # L :
 540         stq     $17, 48($5)     # L :
 541         subq    $3, 16, $2      # E : Repeat the loop at least once more?
 542         nop
 543
 544         stq     $17, 56($5)     # L :
 545         addq    $5, 64, $5      # E :
 546         subq    $3, 8, $3       # E :
 547         bge     $2, $do_wh64_w  # U :
 548
 549         nop
 550         nop
 551         nop
 552         beq     $3, no_quad_w   # U : Might have finished already
 553
 554 .align 4
 555         /*
 556          * Simple loop for trailing quadwords, or for small amounts
 557          * of data (where we can't use an unrolled loop and wh64)
 558          */
 559 loop_w:
 560         stq $17,0($5)           # L :
 561         subq $3,1,$3            # E : Decrement number quads left
 562         addq $5,8,$5            # E : Inc address
 563         bne $3,loop_w           # U : more?
 564
 565 no_quad_w:
 566         /*
 567          * Write 0..7 trailing bytes.
 568          */
 569         nop                     # E :
 570         beq $18,end_w           # U : All done?
 571         ldq $7,0($5)            # L :
 572         mskqh $7,$6,$2          # U : Mask final quad (keep bytes at/after end)
 573
 574         insqh $17,$6,$4         # U : New bits
 575         bis $2,$4,$1            # E : Put it all together
 576         stq $1,0($5)            # L : And back to memory
 577         ret $31,($26),1         # L0 :
 578
 579 within_quad_w:
 580         ldq_u $1,0($16)         # L :
 581         insql $17,$16,$2        # U : New bits
 582         mskql $1,$16,$4         # U : Clear old
 583         bis $2,$4,$2            # E : New result
 584
 585         mskql $2,$6,$4          # U : merged bytes below the end address
 586         mskqh $1,$6,$2          # U : original bytes at/after the end address
 587         bis $2,$4,$1            # E :
 588         stq_u $1,0($16)         # L :
 589
 590 end_w:
 591         nop
 592         nop
 593         nop
 594         ret $31,($26),1         # L0 :
 595
 596         .end __memsetw
 597
 598 memset = ___memset              # exported memset resolves to the byte-fill entry point
 599 __memset = ___memset            # same entry point under the double-underscore name