linux/arch/alpha/lib/ev6-memcpy.S
<<
>>
Prefs
   1/*
   2 * arch/alpha/lib/ev6-memcpy.S
   3 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
   4 *
   5 * Reasonably optimized memcpy() routine for the Alpha 21264
   6 *
   7 *      - bulk data moved as aligned quadwords; head/tail bytes use ldbu/stb
   8 *      - mutually misaligned buffers are merged with ldq_u/extql/extqh
   9 *
  10 * Much of the information about 21264 scheduling/coding comes from:
  11 *      Compiler Writer's Guide for the Alpha 21264
  12 *      abbreviated as 'CWG' in other comments here
  13 *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  14 * Scheduling notation:
  15 *      E       - either cluster
  16 *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  17 *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  18 *
  19 * Temp usage notes:
  20 *      $1,$2           - scratch; $3-$6 - copy data ($4 = dest pointer when misaligned); $7 - wh64 address
  21 */
  22
  23        .set noreorder
  24        .set noat
  25
  26        .align  4
  27        .globl memcpy
  28        .ent memcpy
            /*
             * memcpy: copy $18 bytes from the address in $17 to the address
             * in $16.
             *
             *   In:   $16 = dest, $17 = src, $18 = byte count
             *   Out:  $0  = the dest pointer as passed in
             *   Returns through $26.
             *
             * The count is tested with signed branches, so a zero or negative
             * count copies nothing.
             * Registers written: $0-$7 and $16-$18.
             */
  29memcpy:
  30        .frame $30,0,$26,0
  31        .prologue 0
  32
  33        mov     $16, $0                 # E : copy dest to return
  34        ble     $18, $nomoredata        # U : done with the copy?
  35        xor     $16, $17, $1            # E : are source and dest alignments the same?
  36        and     $1, 7, $1               # E : are they the same mod 8?
  37
  38        bne     $1, $misaligned         # U : Nope - gotta do this the slow way
  39        /* source and dest are same mod 8 address */
  40        and     $16, 7, $1              # E : Are both 0mod8?
  41        beq     $1, $both_0mod8         # U : Yes
  42        nop                             # E :
  43
  44        /*
  45         * source and dest are same misalignment.  move a byte at a time
  46         * until a 0mod8 alignment for both is reached.
  47         * At least one byte more to move
  48         */
  49
  50$head_align:
  51        ldbu    $1, 0($17)              # L : grab a byte
  52        subq    $18, 1, $18             # E : count--
  53        addq    $17, 1, $17             # E : src++
  54        stb     $1, 0($16)              # L :
  55        addq    $16, 1, $16             # E : dest++
  56        and     $16, 7, $1              # E : Are we at 0mod8 yet?
  57        ble     $18, $nomoredata        # U : done with the copy?
  58        bne     $1, $head_align         # U :
  59
            /*
             * Both pointers are 0mod8 from here on.  With 127 or fewer bytes
             * left the unrolled loop is skipped; otherwise single quads are
             * copied until dest is 0mod64.
             */
  60$both_0mod8:
  61        cmple   $18, 127, $1            # E : Can we unroll the loop?
  62        bne     $1, $no_unroll          # U :
  63        and     $16, 63, $1             # E : get mod64 alignment
  64        beq     $1, $do_unroll          # U : no single quads to fiddle
  65
  66$single_head_quad:
  67        ldq     $1, 0($17)              # L : get 8 bytes
  68        subq    $18, 8, $18             # E : count -= 8
  69        addq    $17, 8, $17             # E : src += 8
  70        nop                             # E :
  71
  72        stq     $1, 0($16)              # L : store
  73        addq    $16, 8, $16             # E : dest += 8
  74        and     $16, 63, $1             # E : get mod64 alignment
  75        bne     $1, $single_head_quad   # U : still not fully aligned
  76
            /*
             * $7 is the address handed to wh64; it starts 64 bytes past the
             * current dest.  The count is checked again because the head
             * quads above may have brought it down to 127 or less.
             */
  77$do_unroll:
  78        addq    $16, 64, $7             # E : Initial (+1 trip) wh64 address
  79        cmple   $18, 127, $1            # E : Can we go through the unrolled loop?
  80        bne     $1, $tail_quads         # U : Nope
  81        nop                             # E : 
  82
            /*
             * Each trip copies 64 bytes as two groups of four quads held in
             * $6, $4, $5 and $3.  $2 = count - 192, taken from the count at
             * the top of the trip; when it is negative, cmovlt replaces the
             * advanced $7 with $1, which is this trip's starting dest + 64.
             */
  83$unroll_body:
  84        wh64    ($7)                    # L1 : memory subsystem hint: 64 bytes at
  85                                        # ($7) are about to be over-written
  86        ldq     $6, 0($17)              # L0 : bytes 0..7
  87        nop                             # E :
  88        nop                             # E :
  89
  90        ldq     $4, 8($17)              # L : bytes 8..15
  91        ldq     $5, 16($17)             # L : bytes 16..23
  92        addq    $7, 64, $7              # E : Update next wh64 address
  93        nop                             # E :
  94
  95        ldq     $3, 24($17)             # L : bytes 24..31
  96        addq    $16, 64, $1             # E : fallback value for wh64
  97        nop                             # E :
  98        nop                             # E :
  99
 100        addq    $17, 32, $17            # E : src += 32 bytes
 101        stq     $6, 0($16)              # L : bytes 0..7
 102        nop                             # E :
 103        nop                             # E :
 104
 105        stq     $4, 8($16)              # L : bytes 8..15
 106        stq     $5, 16($16)             # L : bytes 16..23
 107        subq    $18, 192, $2            # E : At least two more trips to go?
 108        nop                             # E :
 109
 110        stq     $3, 24($16)             # L : bytes 24..31
 111        addq    $16, 32, $16            # E : dest += 32 bytes
 112        nop                             # E :
 113        nop                             # E :
 114
 115        ldq     $6, 0($17)              # L : bytes 0..7
 116        ldq     $4, 8($17)              # L : bytes 8..15
 117        cmovlt  $2, $1, $7              # E : Latency 2, extra map slot - Use
 118                                        # fallback wh64 address if < 2 more trips
 119        nop                             # E :
 120
 121        ldq     $5, 16($17)             # L : bytes 16..23
 122        ldq     $3, 24($17)             # L : bytes 24..31
 123        addq    $16, 32, $16            # E : dest += 32
 124        subq    $18, 64, $18            # E : count -= 64
 125
            /* dest was already advanced, so the second group is stored at negative offsets */
 126        addq    $17, 32, $17            # E : src += 32
 127        stq     $6, -32($16)            # L : bytes 0..7
 128        stq     $4, -24($16)            # L : bytes 8..15
 129        cmple   $18, 63, $1             # E : At least one more trip?
 130
 131        stq     $5, -16($16)            # L : bytes 16..23
 132        stq     $3, -8($16)             # L : bytes 24..31
 133        nop                             # E :
            /* $1 == 0 means more than 63 bytes remain: take another trip */
 134        beq     $1, $unroll_body
 135
            /*
             * The count is kept biased by -8 while whole quads are copied, so
             * a non-negative value means another full quad remains.
             * $less_than_8 adds the 8 back before the trailing byte loop.
             */
 136$tail_quads:
 137$no_unroll:
 138        .align 4
 139        subq    $18, 8, $18             # E : At least a quad left?
 140        blt     $18, $less_than_8       # U : Nope
 141        nop                             # E :
 142        nop                             # E :
 143
 144$move_a_quad:
 145        ldq     $1, 0($17)              # L : fetch 8
 146        subq    $18, 8, $18             # E : count -= 8
 147        addq    $17, 8, $17             # E : src += 8
 148        nop                             # E :
 149
 150        stq     $1, 0($16)              # L : store 8
 151        addq    $16, 8, $16             # E : dest += 8
 152        bge     $18, $move_a_quad       # U :
 153        nop                             # E :
 154
 155$less_than_8:
 156        .align 4
 157        addq    $18, 8, $18             # E : add back for trailing bytes
 158        ble     $18, $nomoredata        # U : All-done
 159        nop                             # E :
 160        nop                             # E :
 161
 162        /* Trailing bytes */
 163$tail_bytes:
 164        subq    $18, 1, $18             # E : count--
 165        ldbu    $1, 0($17)              # L : fetch a byte
 166        addq    $17, 1, $17             # E : src++
 167        nop                             # E :
 168
 169        stb     $1, 0($16)              # L : store a byte
 170        addq    $16, 1, $16             # E : dest++
 171        bgt     $18, $tail_bytes        # U : more to be done?
 172        nop                             # E :
 173
 174        /* branching to exit takes 3 extra cycles, so replicate exit here */
 175        ret     $31, ($26), 1           # L0 :
 176        nop                             # E :
 177        nop                             # E :
 178        nop                             # E :
 179
            /*
             * Source and dest differ mod 8.  $4 becomes the working dest
             * pointer because $16 is reused for source data in $mis_quad.
             * Dest is first brought to 0mod8 a byte at a time.
             */
 180$misaligned:
 181        mov     $0, $4                  # E : dest temp
 182        and     $0, 7, $1               # E : dest alignment mod8
 183        beq     $1, $dest_0mod8         # U : life doesnt totally suck
 184        nop
 185
 186$aligndest:
 187        ble     $18, $nomoredata        # U :
 188        ldbu    $1, 0($17)              # L : fetch a byte
 189        subq    $18, 1, $18             # E : count--
 190        addq    $17, 1, $17             # E : src++
 191
 192        stb     $1, 0($4)               # L : store it
 193        addq    $4, 1, $4               # E : dest++
 194        and     $4, 7, $1               # E : dest 0mod8 yet?
 195        bne     $1, $aligndest          # U : go until we are aligned.
 196
 197        /* Source has unknown alignment, but dest is known to be 0mod8 */
 198$dest_0mod8:
 199        subq    $18, 8, $18             # E : At least a quad left?
 200        blt     $18, $misalign_tail     # U : Nope
 201        ldq_u   $3, 0($17)              # L : seed (rotating load) of 8 bytes
 202        nop                             # E :
 203
            /*
             * $3 carries the source quad loaded on the previous trip (seeded
             * above) and $16 receives the next one.  extql/extqh take their
             * byte offset from $17, and bis merges the two pieces into the
             * aligned quad stored at ($4).
             */
 204$mis_quad:
 205        ldq_u   $16, 8($17)             # L : Fetch next 8
 206        extql   $3, $17, $3             # U : masking
 207        extqh   $16, $17, $1            # U : masking
 208        bis     $3, $1, $1              # E : merged bytes to store
 209
 210        subq    $18, 8, $18             # E : count -= 8
 211        addq    $17, 8, $17             # E : src += 8
 212        stq     $1, 0($4)               # L : store 8 (aligned)
 213        mov     $16, $3                 # E : "rotate" source data
 214
 215        addq    $4, 8, $4               # E : dest += 8
 216        bge     $18, $mis_quad          # U : More quads to move
 217        nop
 218        nop
 219
            /* Undo the -8 bias, then copy whatever is left a byte at a time */
 220$misalign_tail:
 221        addq    $18, 8, $18             # E : account for tail stuff
 222        ble     $18, $nomoredata        # U :
 223        nop
 224        nop
 225
 226$misalign_byte:
 227        ldbu    $1, 0($17)              # L : fetch 1
 228        subq    $18, 1, $18             # E : count--
 229        addq    $17, 1, $17             # E : src++
 230        nop                             # E :
 231
 232        stb     $1, 0($4)               # L : store
 233        addq    $4, 1, $4               # E : dest++
 234        bgt     $18, $misalign_byte     # U : more to go?
 235        nop
 236
 237
            /* Common exit: $0 still holds the dest pointer saved on entry */
 238$nomoredata:
 239        ret     $31, ($26), 1           # L0 :
 240        nop                             # E :
 241        nop                             # E :
 242        nop                             # E :
 243
 244        .end memcpy
 245
 246/* For backwards module compatibility: __memcpy is defined as an alias of memcpy and exported as a global symbol.  */
 247__memcpy = memcpy
 248.globl __memcpy
 249