linux/arch/alpha/lib/ev6-memset.S
<<
>>
Prefs
   1/*
   2 * arch/alpha/lib/ev6-memset.S
   3 *
   4 * This is an efficient (and relatively small) implementation of the C library
   5 * "memset()" function for the 21264 implementation of Alpha.
   6 *
   7 * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
   8 *
   9 * Much of the information about 21264 scheduling/coding comes from:
  10 *      Compiler Writer's Guide for the Alpha 21264
  11 *      abbreviated as 'CWG' in other comments here
  12 *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  13 * Scheduling notation:
  14 *      E       - either cluster
  15 *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  16 *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  17 * The algorithm for the leading and trailing quadwords remains the same,
  18 * however the loop has been unrolled to enable better memory throughput,
  19 * and the code has been replicated for each of the entry points: __memset
  20 * and __memsetw to permit better scheduling to eliminate the stalling
  21 * encountered during the mask replication.
  22 * A future enhancement might be to put in a byte store loop for really
  23 * small (say < 32 bytes) memset()s.  Whether or not that change would be
  24 * a win in the kernel would depend upon the contextual usage.
  25 * WARNING: Maintaining this is going to be more work than the above version,
  26 * as fixes will need to be made in multiple places.  The performance gain
  27 * is worth it.
  28 */
  29
  30        .set noat               # macro expansions may not clobber the assembler temporary $at
  31        .set noreorder          # ask the assembler to leave the instruction order as written
  32.text
  33        .globl memset           # alias of ___memset, assigned at the end of this file
  34        .globl __memset         # alias of ___memset, assigned at the end of this file
  35        .globl ___memset        # byte fill, defined below
  36        .globl __memsetw        # 16-bit pattern fill, defined below
  37        .globl __constant_c_memset  # fill with a caller-supplied quadword pattern
  38
  39        .ent ___memset
            /*
             * ___memset -- fill $18 bytes starting at $16 with the low byte of $17.
             * The symbols memset and __memset are assigned to this entry point
             * at the end of the file.
             *
             * As read from the code below:
             *   In:   $16 = destination address
             *         $17 = fill value; only the low byte is used
             *         $18 = length in bytes; a value <= 0 returns without storing
             *         $26 = return address
             *   Out:  $0  = original destination address
             *   Overwritten: $1-$7, $16, $17, $18
             *
             * Shape of the routine: replicate the fill byte across a quadword,
             * patch the unaligned leading bytes with a read-modify-write, store
             * whole quadwords (64 bytes per trip, with wh64 hints, once at least
             * 16 quadwords remain), then patch the 1..7 trailing bytes with a
             * second read-modify-write.  A fill whose start and end fall inside
             * one aligned quadword is handled separately at within_quad_b.
             */
  40.align 5
  41___memset:
  42        .frame $30,0,$26,0
  43        .prologue 0
  44
  45        /*
  46         * Serious stalling happens.  The only way to mitigate this is to
  47         * undertake a major re-write to interleave the constant materialization
  48         * with other parts of the fall-through code.  This is important, even
  49         * though it makes maintenance tougher.
  50         * Do this later.
  51         */
            /*
             * The next groups build the 8-byte fill pattern in $17 from the low
             * byte of the incoming $17 ("ch" in the diagrams), interleaved with
             * the length check and the single-quadword test.
             * $6 = dest + len, i.e. one byte past the last byte to fill.
             */
  52        and $17,255,$1          # E : 00000000000000ch
  53        insbl $17,1,$2          # U : 000000000000ch00
  54        bis $16,$16,$0          # E : return value
  55        ble $18,end_b           # U : zero length requested?
  56
  57        addq $18,$16,$6         # E : max address to write to
  58        bis     $1,$2,$17       # E : 000000000000chch
  59        insbl   $1,2,$3         # U : 0000000000ch0000
  60        insbl   $1,3,$4         # U : 00000000ch000000
  61
  62        or      $3,$4,$3        # E : 00000000chch0000
  63        inswl   $17,4,$5        # U : 0000chch00000000
  64        xor     $16,$6,$1       # E : will complete write be within one quadword?
  65        inswl   $17,6,$2        # U : chch000000000000
  66
  67        or      $17,$3,$17      # E : 00000000chchchch
  68        or      $2,$5,$2        # E : chchchch00000000
  69        bic     $1,7,$1         # E : fit within a single quadword?
  70        and     $16,7,$3        # E : Target addr misalignment
  71
            /*
             * $1 = ($16 ^ $6) & ~7 is zero when dest and dest + len lie in the
             * same aligned quadword.  $3 = dest & 7.
             */
  72        or      $17,$2,$17      # E : chchchchchchchch
  73        beq     $1,within_quad_b # U :
  74        nop                     # E :
  75        beq     $3,aligned_b    # U : target is 0mod8
  76
  77        /*
  78         * Target address is misaligned, and won't fit within a quadword
  79         */
            /*
             * Read-modify-write of the first, partial quadword: mskql keeps the
             * old bytes below the start offset, insql supplies fill bytes from
             * the start offset upward.  $3 becomes (dest & 7) - 8, the negated
             * number of head bytes, so the addq/subq below shrink the count and
             * round $16 up to the next quadword boundary.  $5 keeps the original
             * address for the stq_u.
             */
  80        ldq_u $4,0($16)         # L : Fetch first partial
  81        bis $16,$16,$5          # E : Save the address
  82        insql $17,$16,$2        # U : Insert new bytes
  83        subq $3,8,$3            # E : Invert (for addressing uses)
  84
  85        addq $18,$3,$18         # E : $18 is new count ($3 is negative)
  86        mskql $4,$16,$4         # U : clear relevant parts of the quad
  87        subq $16,$3,$16         # E : $16 is new aligned destination
  88        bis $2,$4,$1            # E : Final bytes
  89
  90        nop
  91        stq_u $1,0($5)          # L : Store result
  92        nop
  93        nop
  94
  95.align 4
  96aligned_b:
  97        /*
  98         * We are now guaranteed to be quad aligned, with at least
  99         * one partial quad to write.
 100         */
            /*
             * NOTE(review): when the head store above consumed every byte, both
             * $3 and $18 are zero here and control reaches end_b through
             * no_quad_b without storing anything further.
             */
 101
 102        sra $18,3,$3            # U : Number of remaining quads to write
 103        and $18,7,$18           # E : Number of trailing bytes to write
 104        bis $16,$16,$5          # E : Save dest address
 105        beq $3,no_quad_b        # U : tail stuff only
 106
 107        /*
 108         * it's worth the effort to unroll this and use wh64 if possible
 109         * Lifted a bunch of code from clear_user.S
 110         * At this point, entry values are:
 111         * $16  Current destination address
 112         * $5   A copy of $16
 113         * $6   The max quadword address to write to
 114         * $18  Number trailer bytes
 115         * $3   Number quads to write
 116         */
            /*
             * $2 = dest & 0x3f; $1 = $2 - 0x40 is the negated byte distance to
             * the next 64-byte boundary; $4 = $3 - 16 selects the simple loop
             * when fewer than 16 quadwords remain.  $6 still holds dest + len
             * from the entry sequence; only its low three bits are used later.
             */
 117
 118        and     $16, 0x3f, $2   # E : Forward work (only useful for unrolled loop)
 119        subq    $3, 16, $4      # E : Only try to unroll if > 128 bytes
 120        subq    $2, 0x40, $1    # E : bias counter (aligning stuff 0mod64)
 121        blt     $4, loop_b      # U :
 122
 123        /*
 124         * We know we've got at least 16 quads, minimum of one trip
 125         * through unrolled loop.  Do a quad at a time to get us 0mod64
 126         * aligned.
 127         */
            /*
             * NOTE(review): $16 is quadword aligned here, so $1 lies in -64..-8
             * and is never zero; the beq below therefore looks unreachable, and
             * an already 64-byte-aligned destination takes eight trips through
             * $alignmod64_b.  That loop is also what loads $4 with the first
             * wh64 address (before it, $4 still holds $3 - 16).  Confirm before
             * changing the branch.
             */
 128
 129        nop                     # E :
 130        nop                     # E :
 131        nop                     # E :
 132        beq     $1, $bigalign_b # U :
 133
 134$alignmod64_b:
 135        stq     $17, 0($5)      # L :
 136        subq    $3, 1, $3       # E : For consistency later
 137        addq    $1, 8, $1       # E : Increment towards zero for alignment
 138        addq    $5, 8, $4       # E : Initial wh64 address (filler instruction)
 139
 140        nop
 141        nop
 142        addq    $5, 8, $5       # E : Inc address
 143        blt     $1, $alignmod64_b # U :
 144
 145$bigalign_b:
 146        /*
 147         * $3 - number quads left to go
 148         * $5 - target address (aligned 0mod64)
 149         * $17 - mask of stuff to store
 150         * Scratch registers available: $7, $2, $4, $1
 151         * we know that we'll be taking a minimum of one trip through
 152         * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 153         * Assumes the wh64 needs to be for 2 trips through the loop in the future
 154         * The wh64 is issued on for the starting destination address for trip +2
 155         * through the loop, and if there are less than two trips left, the target
 156         * address will be for the current trip.
 157         */
            /*
             * Per trip: eight stq cover one 64-byte block at $5.  The first
             * $2 = $3 - 24 is negative when fewer than three trips (counting
             * this one) remain; cmovlt then replaces the +128 hint address in $4
             * with $7 = $5 + 64.  The second $2 = $3 - 16 decides whether another
             * full trip follows.  $3 drops by 8 quadwords and $5 advances by 64
             * bytes per trip.
             */
 158
 159$do_wh64_b:
 160        wh64    ($4)            # L1 : memory subsystem write hint
 161        subq    $3, 24, $2      # E : For determining future wh64 addresses
 162        stq     $17, 0($5)      # L :
 163        nop                     # E :
 164
 165        addq    $5, 128, $4     # E : speculative target of next wh64
 166        stq     $17, 8($5)      # L :
 167        stq     $17, 16($5)     # L :
 168        addq    $5, 64, $7      # E : Fallback address for wh64 (== next trip addr)
 169
 170        stq     $17, 24($5)     # L :
 171        stq     $17, 32($5)     # L :
 172        cmovlt  $2, $7, $4      # E : Latency 2, extra mapping cycle
 173        nop
 174
 175        stq     $17, 40($5)     # L :
 176        stq     $17, 48($5)     # L :
 177        subq    $3, 16, $2      # E : Repeat the loop at least once more?
 178        nop
 179
 180        stq     $17, 56($5)     # L :
 181        addq    $5, 64, $5      # E :
 182        subq    $3, 8, $3       # E :
 183        bge     $2, $do_wh64_b  # U :
 184
 185        nop
 186        nop
 187        nop
 188        beq     $3, no_quad_b   # U : Might have finished already
 189
 190.align 4
 191        /*
 192         * Simple loop for trailing quadwords, or for small amounts
 193         * of data (where we can't use an unrolled loop and wh64)
 194         */
 195loop_b:
 196        stq $17,0($5)           # L :
 197        subq $3,1,$3            # E : Decrement number quads left
 198        addq $5,8,$5            # E : Inc address
 199        bne $3,loop_b           # U : more?
 200
 201no_quad_b:
 202        /*
 203         * Write 0..7 trailing bytes.
 204         */
            /*
             * Read-modify-write of the last quadword, keyed on the low three
             * bits of $6 (dest + len): mskqh keeps the old bytes from that offset
             * upward, insqh supplies fill bytes below it.  Skipped entirely when
             * $18 (trailing byte count) is zero.
             */
 205        nop                     # E :
 206        beq $18,end_b           # U : All done?
 207        ldq $7,0($5)            # L :
 208        mskqh $7,$6,$2          # U : Mask final quad
 209
 210        insqh $17,$6,$4         # U : New bits
 211        bis $2,$4,$1            # E : Put it all together
 212        stq $1,0($5)            # L : And back to memory
 213        ret $31,($26),1         # L0 :
 214
 215within_quad_b:
            /*
             * The whole fill lies inside one quadword.  First merge fill bytes
             * from the start offset upward into $2, then take the old bytes back
             * from the end offset ($6 & 7) upward, and store the quadword once.
             */
 216        ldq_u $1,0($16)         # L :
 217        insql $17,$16,$2        # U : New bits
 218        mskql $1,$16,$4         # U : Clear old
 219        bis $2,$4,$2            # E : New result
 220
 221        mskql $2,$6,$4          # U :
 222        mskqh $1,$6,$2          # U :
 223        bis $2,$4,$1            # E :
 224        stq_u $1,0($16)         # L :
 225
 226end_b:
 227        nop
 228        nop
 229        nop
 230        ret $31,($26),1         # L0 :
 231        .end ___memset
 232
 233        /*
 234         * This is the original body of code, prior to replication and
 235         * rescheduling.  Leave it here, as there may be calls to this
 236         * entry point.
 237         */
            /*
             * __constant_c_memset -- fill $18 bytes starting at $16 with the
             * quadword pattern in $17.
             *
             * As read from the code below:
             *   In:   $16 = destination address
             *         $17 = 64-bit pattern; it is stored as-is, nothing here
             *               replicates a byte or word, so the caller is expected
             *               to pass the already-replicated value
             *         $18 = length in bytes; a value <= 0 returns without storing
             *         $26 = return address
             *   Out:  $0  = original destination address
             *   Overwritten: $1-$7, $16, $18
             */
 238.align 4
 239        .ent __constant_c_memset
 240__constant_c_memset:
 241        .frame $30,0,$26,0
 242        .prologue 0
 243
            /*
             * $6 = dest + len, one byte past the last byte to fill.
             * $1 = ($16 ^ $6) & ~7 is zero when dest and dest + len lie in the
             * same aligned quadword.  $3 = dest & 7.
             */
 244        addq $18,$16,$6         # E : max address to write to
 245        bis $16,$16,$0          # E : return value
 246        xor $16,$6,$1           # E : will complete write be within one quadword?
 247        ble $18,end             # U : zero length requested?
 248
 249        bic $1,7,$1             # E : fit within a single quadword
 250        beq $1,within_one_quad  # U :
 251        and $16,7,$3            # E : Target addr misalignment
 252        beq $3,aligned          # U : target is 0mod8
 253
 254        /*
 255         * Target address is misaligned, and won't fit within a quadword
 256         */
            /*
             * Read-modify-write of the first, partial quadword: mskql keeps the
             * old bytes below the start offset, insql supplies pattern bytes
             * from the start offset upward.  $3 becomes (dest & 7) - 8, so the
             * addq/subq below shrink the count and round $16 up to the next
             * quadword boundary.
             */
 257        ldq_u $4,0($16)         # L : Fetch first partial
 258        bis $16,$16,$5          # E : Save the address
 259        insql $17,$16,$2        # U : Insert new bytes
 260        subq $3,8,$3            # E : Invert (for addressing uses)
 261
 262        addq $18,$3,$18         # E : $18 is new count ($3 is negative)
 263        mskql $4,$16,$4         # U : clear relevant parts of the quad
 264        subq $16,$3,$16         # E : $16 is new aligned destination
 265        bis $2,$4,$1            # E : Final bytes
 266
 267        nop
 268        stq_u $1,0($5)          # L : Store result
 269        nop
 270        nop
 271
 272.align 4
 273aligned:
 274        /*
 275         * We are now guaranteed to be quad aligned, with at least
 276         * one partial quad to write.
 277         */
            /*
             * NOTE(review): when the head store above consumed every byte, both
             * $3 and $18 are zero here and control reaches end through no_quad
             * without storing anything further.
             */
 278
 279        sra $18,3,$3            # U : Number of remaining quads to write
 280        and $18,7,$18           # E : Number of trailing bytes to write
 281        bis $16,$16,$5          # E : Save dest address
 282        beq $3,no_quad          # U : tail stuff only
 283
 284        /*
 285         * it's worth the effort to unroll this and use wh64 if possible
 286         * Lifted a bunch of code from clear_user.S
 287         * At this point, entry values are:
 288         * $16  Current destination address
 289         * $5   A copy of $16
 290         * $6   The max quadword address to write to
 291         * $18  Number trailer bytes
 292         * $3   Number quads to write
 293         */
            /*
             * $2 = dest & 0x3f; $1 = $2 - 0x40 is the negated byte distance to
             * the next 64-byte boundary; $4 = $3 - 16 selects the simple loop
             * when fewer than 16 quadwords remain.
             */
 294
 295        and     $16, 0x3f, $2   # E : Forward work (only useful for unrolled loop)
 296        subq    $3, 16, $4      # E : Only try to unroll if > 128 bytes
 297        subq    $2, 0x40, $1    # E : bias counter (aligning stuff 0mod64)
 298        blt     $4, loop        # U :
 299
 300        /*
 301         * We know we've got at least 16 quads, minimum of one trip
 302         * through unrolled loop.  Do a quad at a time to get us 0mod64
 303         * aligned.
 304         */
            /*
             * NOTE(review): $16 is quadword aligned here, so $1 lies in -64..-8
             * and is never zero; the beq below therefore looks unreachable, and
             * an already 64-byte-aligned destination takes eight trips through
             * $alignmod64.  That loop is also what loads $4 with the first wh64
             * address (before it, $4 still holds $3 - 16).  Confirm before
             * changing the branch.
             */
 305
 306        nop                     # E :
 307        nop                     # E :
 308        nop                     # E :
 309        beq     $1, $bigalign   # U :
 310
 311$alignmod64:
 312        stq     $17, 0($5)      # L :
 313        subq    $3, 1, $3       # E : For consistency later
 314        addq    $1, 8, $1       # E : Increment towards zero for alignment
 315        addq    $5, 8, $4       # E : Initial wh64 address (filler instruction)
 316
 317        nop
 318        nop
 319        addq    $5, 8, $5       # E : Inc address
 320        blt     $1, $alignmod64 # U :
 321
 322$bigalign:
 323        /*
 324         * $3 - number quads left to go
 325         * $5 - target address (aligned 0mod64)
 326         * $17 - mask of stuff to store
 327         * Scratch registers available: $7, $2, $4, $1
 328         * we know that we'll be taking a minimum of one trip through
 329         * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 330         * Assumes the wh64 needs to be for 2 trips through the loop in the future
 331         * The wh64 is issued on for the starting destination address for trip +2
 332         * through the loop, and if there are less than two trips left, the target
 333         * address will be for the current trip.
 334         */
            /*
             * Per trip: eight stq cover one 64-byte block at $5.  The first
             * $2 = $3 - 24 is negative when fewer than three trips (counting
             * this one) remain; cmovlt then replaces the +128 hint address in $4
             * with $7 = $5 + 64.  The second $2 = $3 - 16 decides whether another
             * full trip follows.
             */
 335
 336$do_wh64:
 337        wh64    ($4)            # L1 : memory subsystem write hint
 338        subq    $3, 24, $2      # E : For determining future wh64 addresses
 339        stq     $17, 0($5)      # L :
 340        nop                     # E :
 341
 342        addq    $5, 128, $4     # E : speculative target of next wh64
 343        stq     $17, 8($5)      # L :
 344        stq     $17, 16($5)     # L :
 345        addq    $5, 64, $7      # E : Fallback address for wh64 (== next trip addr)
 346
 347        stq     $17, 24($5)     # L :
 348        stq     $17, 32($5)     # L :
 349        cmovlt  $2, $7, $4      # E : Latency 2, extra mapping cycle
 350        nop
 351
 352        stq     $17, 40($5)     # L :
 353        stq     $17, 48($5)     # L :
 354        subq    $3, 16, $2      # E : Repeat the loop at least once more?
 355        nop
 356
 357        stq     $17, 56($5)     # L :
 358        addq    $5, 64, $5      # E :
 359        subq    $3, 8, $3       # E :
 360        bge     $2, $do_wh64    # U :
 361
 362        nop
 363        nop
 364        nop
 365        beq     $3, no_quad     # U : Might have finished already
 366
 367.align 4
 368        /*
 369         * Simple loop for trailing quadwords, or for small amounts
 370         * of data (where we can't use an unrolled loop and wh64)
 371         */
 372loop:
 373        stq $17,0($5)           # L :
 374        subq $3,1,$3            # E : Decrement number quads left
 375        addq $5,8,$5            # E : Inc address
 376        bne $3,loop             # U : more?
 377
 378no_quad:
 379        /*
 380         * Write 0..7 trailing bytes.
 381         */
            /*
             * Read-modify-write of the last quadword, keyed on the low three
             * bits of $6 (dest + len): mskqh keeps the old bytes from that offset
             * upward, insqh supplies pattern bytes below it.
             */
 382        nop                     # E :
 383        beq $18,end             # U : All done?
 384        ldq $7,0($5)            # L :
 385        mskqh $7,$6,$2          # U : Mask final quad
 386
 387        insqh $17,$6,$4         # U : New bits
 388        bis $2,$4,$1            # E : Put it all together
 389        stq $1,0($5)            # L : And back to memory
 390        ret $31,($26),1         # L0 :
 391
 392within_one_quad:
            /*
             * The whole fill lies inside one quadword.  First merge pattern
             * bytes from the start offset upward into $2, then take the old
             * bytes back from the end offset ($6 & 7) upward, and store once.
             */
 393        ldq_u $1,0($16)         # L :
 394        insql $17,$16,$2        # U : New bits
 395        mskql $1,$16,$4         # U : Clear old
 396        bis $2,$4,$2            # E : New result
 397
 398        mskql $2,$6,$4          # U :
 399        mskqh $1,$6,$2          # U :
 400        bis $2,$4,$1            # E :
 401        stq_u $1,0($16)         # L :
 402
 403end:
 404        nop
 405        nop
 406        nop
 407        ret $31,($26),1         # L0 :
 408        .end __constant_c_memset
 409
 410        /*
 411         * This is a replicant of the __constant_c_memset code, rescheduled
 412         * to mask stalls.  Note that entry point names also had to change
 413         */
            /*
             * __memsetw -- fill $18 bytes starting at $16 with the low 16 bits of
             * $17 repeated.
             *
             * As read from the code below:
             *   In:   $16 = destination address
             *         $17 = fill value; the low 16 bits ("c1c2" in the diagrams)
             *               are replicated into all four halfwords of a quadword
             *         $18 = length in bytes (it is added to $16 to form the end
             *               address and shifted right by 3 for the quadword
             *               count); a value <= 0 returns without storing
             *         $26 = return address
             *   Out:  $0  = original destination address
             *   Overwritten: $1-$7, $16, $17, $18
             *
             * NOTE(review): the head and tail merges shift the pattern by byte
             * offsets, which presumably assumes a 2-byte-aligned destination;
             * nothing here checks that -- confirm against the callers.
             */
 414        .align 5
 415        .ent __memsetw
 416
 417__memsetw:
 418        .frame $30,0,$26,0
 419        .prologue 0
 420
            /*
             * Pattern build interleaved with the length check.
             * $6 = dest + len, one byte past the last byte to fill.
             * $1 = ($16 ^ $6) & ~7 is zero when dest and dest + len lie in the
             * same aligned quadword.  $3 is reused: first as a pattern partial,
             * then as dest & 7.
             */
 421        inswl $17,0,$5          # U : 000000000000c1c2
 422        inswl $17,2,$2          # U : 00000000c1c20000
 423        bis $16,$16,$0          # E : return value
 424        addq    $18,$16,$6      # E : max address to write to
 425
 426        ble $18, end_w          # U : zero length requested?
 427        inswl   $17,4,$3        # U : 0000c1c200000000
 428        inswl   $17,6,$4        # U : c1c2000000000000
 429        xor     $16,$6,$1       # E : will complete write be within one quadword?
 430
 431        or      $2,$5,$2        # E : 00000000c1c2c1c2
 432        or      $3,$4,$17       # E : c1c2c1c200000000
 433        bic     $1,7,$1         # E : fit within a single quadword
 434        and     $16,7,$3        # E : Target addr misalignment
 435
 436        or      $17,$2,$17      # E : c1c2c1c2c1c2c1c2
 437        beq $1,within_quad_w    # U :
 438        nop
 439        beq $3,aligned_w        # U : target is 0mod8
 440
 441        /*
 442         * Target address is misaligned, and won't fit within a quadword
 443         */
            /*
             * Read-modify-write of the first, partial quadword: mskql keeps the
             * old bytes below the start offset, insql supplies pattern bytes
             * from the start offset upward.  $3 becomes (dest & 7) - 8, so the
             * addq/subq below shrink the count and round $16 up to the next
             * quadword boundary.
             */
 444        ldq_u $4,0($16)         # L : Fetch first partial
 445        bis $16,$16,$5          # E : Save the address
 446        insql $17,$16,$2        # U : Insert new bytes
 447        subq $3,8,$3            # E : Invert (for addressing uses)
 448
 449        addq $18,$3,$18         # E : $18 is new count ($3 is negative)
 450        mskql $4,$16,$4         # U : clear relevant parts of the quad
 451        subq $16,$3,$16         # E : $16 is new aligned destination
 452        bis $2,$4,$1            # E : Final bytes
 453
 454        nop
 455        stq_u $1,0($5)          # L : Store result
 456        nop
 457        nop
 458
 459.align 4
 460aligned_w:
 461        /*
 462         * We are now guaranteed to be quad aligned, with at least
 463         * one partial quad to write.
 464         */
            /*
             * NOTE(review): when the head store above consumed every byte, both
             * $3 and $18 are zero here and control reaches end_w through
             * no_quad_w without storing anything further.
             */
 465
 466        sra $18,3,$3            # U : Number of remaining quads to write
 467        and $18,7,$18           # E : Number of trailing bytes to write
 468        bis $16,$16,$5          # E : Save dest address
 469        beq $3,no_quad_w        # U : tail stuff only
 470
 471        /*
 472         * it's worth the effort to unroll this and use wh64 if possible
 473         * Lifted a bunch of code from clear_user.S
 474         * At this point, entry values are:
 475         * $16  Current destination address
 476         * $5   A copy of $16
 477         * $6   The max quadword address to write to
 478         * $18  Number trailer bytes
 479         * $3   Number quads to write
 480         */
            /*
             * $2 = dest & 0x3f; $1 = $2 - 0x40 is the negated byte distance to
             * the next 64-byte boundary; $4 = $3 - 16 selects the simple loop
             * when fewer than 16 quadwords remain.
             */
 481
 482        and     $16, 0x3f, $2   # E : Forward work (only useful for unrolled loop)
 483        subq    $3, 16, $4      # E : Only try to unroll if > 128 bytes
 484        subq    $2, 0x40, $1    # E : bias counter (aligning stuff 0mod64)
 485        blt     $4, loop_w      # U :
 486
 487        /*
 488         * We know we've got at least 16 quads, minimum of one trip
 489         * through unrolled loop.  Do a quad at a time to get us 0mod64
 490         * aligned.
 491         */
            /*
             * NOTE(review): $16 is quadword aligned here, so $1 lies in -64..-8
             * and is never zero; the beq below therefore looks unreachable, and
             * an already 64-byte-aligned destination takes eight trips through
             * $alignmod64_w.  That loop is also what loads $4 with the first
             * wh64 address (before it, $4 still holds $3 - 16).  Confirm before
             * changing the branch.
             */
 492
 493        nop                     # E :
 494        nop                     # E :
 495        nop                     # E :
 496        beq     $1, $bigalign_w # U :
 497
 498$alignmod64_w:
 499        stq     $17, 0($5)      # L :
 500        subq    $3, 1, $3       # E : For consistency later
 501        addq    $1, 8, $1       # E : Increment towards zero for alignment
 502        addq    $5, 8, $4       # E : Initial wh64 address (filler instruction)
 503
 504        nop
 505        nop
 506        addq    $5, 8, $5       # E : Inc address
 507        blt     $1, $alignmod64_w       # U :
 508
 509$bigalign_w:
 510        /*
 511         * $3 - number quads left to go
 512         * $5 - target address (aligned 0mod64)
 513         * $17 - mask of stuff to store
 514         * Scratch registers available: $7, $2, $4, $1
 515         * we know that we'll be taking a minimum of one trip through
 516         * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 517         * Assumes the wh64 needs to be for 2 trips through the loop in the future
 518         * The wh64 is issued on for the starting destination address for trip +2
 519         * through the loop, and if there are less than two trips left, the target
 520         * address will be for the current trip.
 521         */
            /*
             * Per trip: eight stq cover one 64-byte block at $5.  The first
             * $2 = $3 - 24 is negative when fewer than three trips (counting
             * this one) remain; cmovlt then replaces the +128 hint address in $4
             * with $7 = $5 + 64.  The second $2 = $3 - 16 decides whether another
             * full trip follows.
             */
 522
 523$do_wh64_w:
 524        wh64    ($4)            # L1 : memory subsystem write hint
 525        subq    $3, 24, $2      # E : For determining future wh64 addresses
 526        stq     $17, 0($5)      # L :
 527        nop                     # E :
 528
 529        addq    $5, 128, $4     # E : speculative target of next wh64
 530        stq     $17, 8($5)      # L :
 531        stq     $17, 16($5)     # L :
 532        addq    $5, 64, $7      # E : Fallback address for wh64 (== next trip addr)
 533
 534        stq     $17, 24($5)     # L :
 535        stq     $17, 32($5)     # L :
 536        cmovlt  $2, $7, $4      # E : Latency 2, extra mapping cycle
 537        nop
 538
 539        stq     $17, 40($5)     # L :
 540        stq     $17, 48($5)     # L :
 541        subq    $3, 16, $2      # E : Repeat the loop at least once more?
 542        nop
 543
 544        stq     $17, 56($5)     # L :
 545        addq    $5, 64, $5      # E :
 546        subq    $3, 8, $3       # E :
 547        bge     $2, $do_wh64_w  # U :
 548
 549        nop
 550        nop
 551        nop
 552        beq     $3, no_quad_w   # U : Might have finished already
 553
 554.align 4
 555        /*
 556         * Simple loop for trailing quadwords, or for small amounts
 557         * of data (where we can't use an unrolled loop and wh64)
 558         */
 559loop_w:
 560        stq $17,0($5)           # L :
 561        subq $3,1,$3            # E : Decrement number quads left
 562        addq $5,8,$5            # E : Inc address
 563        bne $3,loop_w           # U : more?
 564
 565no_quad_w:
 566        /*
 567         * Write 0..7 trailing bytes.
 568         */
            /*
             * Read-modify-write of the last quadword, keyed on the low three
             * bits of $6 (dest + len): mskqh keeps the old bytes from that offset
             * upward, insqh supplies pattern bytes below it.
             */
 569        nop                     # E :
 570        beq $18,end_w           # U : All done?
 571        ldq $7,0($5)            # L :
 572        mskqh $7,$6,$2          # U : Mask final quad
 573
 574        insqh $17,$6,$4         # U : New bits
 575        bis $2,$4,$1            # E : Put it all together
 576        stq $1,0($5)            # L : And back to memory
 577        ret $31,($26),1         # L0 :
 578
 579within_quad_w:
            /*
             * The whole fill lies inside one quadword.  First merge pattern
             * bytes from the start offset upward into $2, then take the old
             * bytes back from the end offset ($6 & 7) upward, and store once.
             */
 580        ldq_u $1,0($16)         # L :
 581        insql $17,$16,$2        # U : New bits
 582        mskql $1,$16,$4         # U : Clear old
 583        bis $2,$4,$2            # E : New result
 584
 585        mskql $2,$6,$4          # U :
 586        mskqh $1,$6,$2          # U :
 587        bis $2,$4,$1            # E :
 588        stq_u $1,0($16)         # L :
 589
 590end_w:
 591        nop
 592        nop
 593        nop
 594        ret $31,($26),1         # L0 :
 595
 596        .end __memsetw
 597
 598memset = ___memset               # memset is the same address as the ___memset entry point
 599__memset = ___memset             # __memset likewise; both names are exported via .globl above
 600