/* linux/arch/microblaze/lib/fastcopy.S */
   1/*
   2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
   3 * Copyright (C) 2008-2009 PetaLogix
   4 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
   5 *
   6 * This file is subject to the terms and conditions of the GNU General
   7 * Public License.  See the file COPYING in the main directory of this
   8 * archive for more details.
   9 *
  10 * Written by Jim Law <jlaw@irispower.com>
  11 *
  12 * intended to replace:
  13 *      memcpy in memcpy.c and
  14 *      memmove in memmove.c
  15 * ... in arch/microblaze/lib
  16 *
  17 *
  18 * assly_fastcopy.S
  19 *
  20 * Attempt at quicker memcpy and memmove for MicroBlaze
  21 *      Input : Operand1 in Reg r5 - destination address
  22 *              Operand2 in Reg r6 - source address
  23 *              Operand3 in Reg r7 - number of bytes to transfer
 *      Output: Result in Reg r3 - starting destination address
  25 *
  26 *
  27 * Explanation:
  28 *      Perform (possibly unaligned) copy of a block of memory
  29 *      between mem locations with size of xfer spec'd in bytes
  30 */
  31
  32#ifdef __MICROBLAZEEL__
  33#error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM.
  34#endif
  35
  36#include <linux/linkage.h>
  37        .text
  38        .globl  memcpy
  39        .type  memcpy, @function
  40        .ent    memcpy
  41
  42memcpy:
  43fast_memcpy_ascending:
  44        /* move d to return register as value of function */
  45        addi    r3, r5, 0
  46
  47        addi    r4, r0, 4       /* n = 4 */
  48        cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
  49        blti    r4, a_xfer_end  /* if n < 0, less than one word to transfer */
  50
  51        /* transfer first 0~3 bytes to get aligned dest address */
  52        andi    r4, r5, 3               /* n = d & 3 */
  53        /* if zero, destination already aligned */
  54        beqi    r4, a_dalign_done
  55        /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
  56        rsubi   r4, r4, 4
  57        rsub    r7, r4, r7              /* c = c - n adjust c */
  58
  59a_xfer_first_loop:
  60        /* if no bytes left to transfer, transfer the bulk */
  61        beqi    r4, a_dalign_done
  62        lbui    r11, r6, 0              /* h = *s */
  63        sbi     r11, r5, 0              /* *d = h */
  64        addi    r6, r6, 1               /* s++ */
  65        addi    r5, r5, 1               /* d++ */
  66        brid    a_xfer_first_loop       /* loop */
  67        addi    r4, r4, -1              /* n-- (IN DELAY SLOT) */
  68
  69a_dalign_done:
  70        addi    r4, r0, 32              /* n = 32 */
  71        cmpu    r4, r4, r7              /* n = c - n  (unsigned) */
  72        /* if n < 0, less than one block to transfer */
  73        blti    r4, a_block_done
  74
  75a_block_xfer:
  76        andi    r4, r7, 0xffffffe0      /* n = c & ~31 */
  77        rsub    r7, r4, r7              /* c = c - n */
  78
  79        andi    r9, r6, 3               /* t1 = s & 3 */
  80        /* if temp != 0, unaligned transfers needed */
  81        bnei    r9, a_block_unaligned
  82
  83a_block_aligned:
  84        lwi     r9, r6, 0               /* t1 = *(s + 0) */
  85        lwi     r10, r6, 4              /* t2 = *(s + 4) */
  86        lwi     r11, r6, 8              /* t3 = *(s + 8) */
  87        lwi     r12, r6, 12             /* t4 = *(s + 12) */
  88        swi     r9, r5, 0               /* *(d + 0) = t1 */
  89        swi     r10, r5, 4              /* *(d + 4) = t2 */
  90        swi     r11, r5, 8              /* *(d + 8) = t3 */
  91        swi     r12, r5, 12             /* *(d + 12) = t4 */
  92        lwi     r9, r6, 16              /* t1 = *(s + 16) */
  93        lwi     r10, r6, 20             /* t2 = *(s + 20) */
  94        lwi     r11, r6, 24             /* t3 = *(s + 24) */
  95        lwi     r12, r6, 28             /* t4 = *(s + 28) */
  96        swi     r9, r5, 16              /* *(d + 16) = t1 */
  97        swi     r10, r5, 20             /* *(d + 20) = t2 */
  98        swi     r11, r5, 24             /* *(d + 24) = t3 */
  99        swi     r12, r5, 28             /* *(d + 28) = t4 */
 100        addi    r6, r6, 32              /* s = s + 32 */
 101        addi    r4, r4, -32             /* n = n - 32 */
 102        bneid   r4, a_block_aligned     /* while (n) loop */
 103        addi    r5, r5, 32              /* d = d + 32 (IN DELAY SLOT) */
 104        bri     a_block_done
 105
 106a_block_unaligned:
 107        andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
 108        add     r6, r6, r4              /* s = s + n */
 109        lwi     r11, r8, 0              /* h = *(as + 0) */
 110
 111        addi    r9, r9, -1
 112        beqi    r9, a_block_u1          /* t1 was 1 => 1 byte offset */
 113        addi    r9, r9, -1
 114        beqi    r9, a_block_u2          /* t1 was 2 => 2 byte offset */
 115
 116a_block_u3:
 117        bslli   r11, r11, 24    /* h = h << 24 */
 118a_bu3_loop:
 119        lwi     r12, r8, 4      /* v = *(as + 4) */
 120        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 121        or      r9, r11, r9     /* t1 = h | t1 */
 122        swi     r9, r5, 0       /* *(d + 0) = t1 */
 123        bslli   r11, r12, 24    /* h = v << 24 */
 124        lwi     r12, r8, 8      /* v = *(as + 8) */
 125        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 126        or      r9, r11, r9     /* t1 = h | t1 */
 127        swi     r9, r5, 4       /* *(d + 4) = t1 */
 128        bslli   r11, r12, 24    /* h = v << 24 */
 129        lwi     r12, r8, 12     /* v = *(as + 12) */
 130        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 131        or      r9, r11, r9     /* t1 = h | t1 */
 132        swi     r9, r5, 8       /* *(d + 8) = t1 */
 133        bslli   r11, r12, 24    /* h = v << 24 */
 134        lwi     r12, r8, 16     /* v = *(as + 16) */
 135        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 136        or      r9, r11, r9     /* t1 = h | t1 */
 137        swi     r9, r5, 12      /* *(d + 12) = t1 */
 138        bslli   r11, r12, 24    /* h = v << 24 */
 139        lwi     r12, r8, 20     /* v = *(as + 20) */
 140        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 141        or      r9, r11, r9     /* t1 = h | t1 */
 142        swi     r9, r5, 16      /* *(d + 16) = t1 */
 143        bslli   r11, r12, 24    /* h = v << 24 */
 144        lwi     r12, r8, 24     /* v = *(as + 24) */
 145        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 146        or      r9, r11, r9     /* t1 = h | t1 */
 147        swi     r9, r5, 20      /* *(d + 20) = t1 */
 148        bslli   r11, r12, 24    /* h = v << 24 */
 149        lwi     r12, r8, 28     /* v = *(as + 28) */
 150        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 151        or      r9, r11, r9     /* t1 = h | t1 */
 152        swi     r9, r5, 24      /* *(d + 24) = t1 */
 153        bslli   r11, r12, 24    /* h = v << 24 */
 154        lwi     r12, r8, 32     /* v = *(as + 32) */
 155        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 156        or      r9, r11, r9     /* t1 = h | t1 */
 157        swi     r9, r5, 28      /* *(d + 28) = t1 */
 158        bslli   r11, r12, 24    /* h = v << 24 */
 159        addi    r8, r8, 32      /* as = as + 32 */
 160        addi    r4, r4, -32     /* n = n - 32 */
 161        bneid   r4, a_bu3_loop  /* while (n) loop */
 162        addi    r5, r5, 32      /* d = d + 32 (IN DELAY SLOT) */
 163        bri     a_block_done
 164
 165a_block_u1:
 166        bslli   r11, r11, 8     /* h = h << 8 */
 167a_bu1_loop:
 168        lwi     r12, r8, 4      /* v = *(as + 4) */
 169        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 170        or      r9, r11, r9     /* t1 = h | t1 */
 171        swi     r9, r5, 0       /* *(d + 0) = t1 */
 172        bslli   r11, r12, 8     /* h = v << 8 */
 173        lwi     r12, r8, 8      /* v = *(as + 8) */
 174        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 175        or      r9, r11, r9     /* t1 = h | t1 */
 176        swi     r9, r5, 4       /* *(d + 4) = t1 */
 177        bslli   r11, r12, 8     /* h = v << 8 */
 178        lwi     r12, r8, 12     /* v = *(as + 12) */
 179        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 180        or      r9, r11, r9     /* t1 = h | t1 */
 181        swi     r9, r5, 8       /* *(d + 8) = t1 */
 182        bslli   r11, r12, 8     /* h = v << 8 */
 183        lwi     r12, r8, 16     /* v = *(as + 16) */
 184        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 185        or      r9, r11, r9     /* t1 = h | t1 */
 186        swi     r9, r5, 12      /* *(d + 12) = t1 */
 187        bslli   r11, r12, 8     /* h = v << 8 */
 188        lwi     r12, r8, 20     /* v = *(as + 20) */
 189        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 190        or      r9, r11, r9     /* t1 = h | t1 */
 191        swi     r9, r5, 16      /* *(d + 16) = t1 */
 192        bslli   r11, r12, 8     /* h = v << 8 */
 193        lwi     r12, r8, 24     /* v = *(as + 24) */
 194        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 195        or      r9, r11, r9     /* t1 = h | t1 */
 196        swi     r9, r5, 20      /* *(d + 20) = t1 */
 197        bslli   r11, r12, 8     /* h = v << 8 */
 198        lwi     r12, r8, 28     /* v = *(as + 28) */
 199        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 200        or      r9, r11, r9     /* t1 = h | t1 */
 201        swi     r9, r5, 24      /* *(d + 24) = t1 */
 202        bslli   r11, r12, 8     /* h = v << 8 */
 203        lwi     r12, r8, 32     /* v = *(as + 32) */
 204        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 205        or      r9, r11, r9     /* t1 = h | t1 */
 206        swi     r9, r5, 28      /* *(d + 28) = t1 */
 207        bslli   r11, r12, 8     /* h = v << 8 */
 208        addi    r8, r8, 32      /* as = as + 32 */
 209        addi    r4, r4, -32     /* n = n - 32 */
 210        bneid   r4, a_bu1_loop  /* while (n) loop */
 211        addi    r5, r5, 32      /* d = d + 32 (IN DELAY SLOT) */
 212        bri     a_block_done
 213
 214a_block_u2:
 215        bslli   r11, r11, 16    /* h = h << 16 */
 216a_bu2_loop:
 217        lwi     r12, r8, 4      /* v = *(as + 4) */
 218        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 219        or      r9, r11, r9     /* t1 = h | t1 */
 220        swi     r9, r5, 0       /* *(d + 0) = t1 */
 221        bslli   r11, r12, 16    /* h = v << 16 */
 222        lwi     r12, r8, 8      /* v = *(as + 8) */
 223        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 224        or      r9, r11, r9     /* t1 = h | t1 */
 225        swi     r9, r5, 4       /* *(d + 4) = t1 */
 226        bslli   r11, r12, 16    /* h = v << 16 */
 227        lwi     r12, r8, 12     /* v = *(as + 12) */
 228        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 229        or      r9, r11, r9     /* t1 = h | t1 */
 230        swi     r9, r5, 8       /* *(d + 8) = t1 */
 231        bslli   r11, r12, 16    /* h = v << 16 */
 232        lwi     r12, r8, 16     /* v = *(as + 16) */
 233        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 234        or      r9, r11, r9     /* t1 = h | t1 */
 235        swi     r9, r5, 12      /* *(d + 12) = t1 */
 236        bslli   r11, r12, 16    /* h = v << 16 */
 237        lwi     r12, r8, 20     /* v = *(as + 20) */
 238        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 239        or      r9, r11, r9     /* t1 = h | t1 */
 240        swi     r9, r5, 16      /* *(d + 16) = t1 */
 241        bslli   r11, r12, 16    /* h = v << 16 */
 242        lwi     r12, r8, 24     /* v = *(as + 24) */
 243        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 244        or      r9, r11, r9     /* t1 = h | t1 */
 245        swi     r9, r5, 20      /* *(d + 20) = t1 */
 246        bslli   r11, r12, 16    /* h = v << 16 */
 247        lwi     r12, r8, 28     /* v = *(as + 28) */
 248        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 249        or      r9, r11, r9     /* t1 = h | t1 */
 250        swi     r9, r5, 24      /* *(d + 24) = t1 */
 251        bslli   r11, r12, 16    /* h = v << 16 */
 252        lwi     r12, r8, 32     /* v = *(as + 32) */
 253        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 254        or      r9, r11, r9     /* t1 = h | t1 */
 255        swi     r9, r5, 28      /* *(d + 28) = t1 */
 256        bslli   r11, r12, 16    /* h = v << 16 */
 257        addi    r8, r8, 32      /* as = as + 32 */
 258        addi    r4, r4, -32     /* n = n - 32 */
 259        bneid   r4, a_bu2_loop  /* while (n) loop */
 260        addi    r5, r5, 32      /* d = d + 32 (IN DELAY SLOT) */
 261
 262a_block_done:
 263        addi    r4, r0, 4       /* n = 4 */
 264        cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
 265        blti    r4, a_xfer_end  /* if n < 0, less than one word to transfer */
 266
 267a_word_xfer:
 268        andi    r4, r7, 0xfffffffc      /* n = c & ~3 */
 269        addi    r10, r0, 0              /* offset = 0 */
 270
 271        andi    r9, r6, 3               /* t1 = s & 3 */
 272        /* if temp != 0, unaligned transfers needed */
 273        bnei    r9, a_word_unaligned
 274
 275a_word_aligned:
 276        lw      r9, r6, r10             /* t1 = *(s+offset) */
 277        sw      r9, r5, r10             /* *(d+offset) = t1 */
 278        addi    r4, r4,-4               /* n-- */
 279        bneid   r4, a_word_aligned      /* loop */
 280        addi    r10, r10, 4             /* offset++ (IN DELAY SLOT) */
 281
 282        bri     a_word_done
 283
 284a_word_unaligned:
 285        andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
 286        lwi     r11, r8, 0              /* h = *(as + 0) */
 287        addi    r8, r8, 4               /* as = as + 4 */
 288
 289        addi    r9, r9, -1
 290        beqi    r9, a_word_u1           /* t1 was 1 => 1 byte offset */
 291        addi    r9, r9, -1
 292        beqi    r9, a_word_u2           /* t1 was 2 => 2 byte offset */
 293
 294a_word_u3:
 295        bslli   r11, r11, 24    /* h = h << 24 */
 296a_wu3_loop:
 297        lw      r12, r8, r10    /* v = *(as + offset) */
 298        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 299        or      r9, r11, r9     /* t1 = h | t1 */
 300        sw      r9, r5, r10     /* *(d + offset) = t1 */
 301        bslli   r11, r12, 24    /* h = v << 24 */
 302        addi    r4, r4,-4       /* n = n - 4 */
 303        bneid   r4, a_wu3_loop  /* while (n) loop */
 304        addi    r10, r10, 4     /* offset = ofset + 4 (IN DELAY SLOT) */
 305
 306        bri     a_word_done
 307
 308a_word_u1:
 309        bslli   r11, r11, 8     /* h = h << 8 */
 310a_wu1_loop:
 311        lw      r12, r8, r10    /* v = *(as + offset) */
 312        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 313        or      r9, r11, r9     /* t1 = h | t1 */
 314        sw      r9, r5, r10     /* *(d + offset) = t1 */
 315        bslli   r11, r12, 8     /* h = v << 8 */
 316        addi    r4, r4,-4       /* n = n - 4 */
 317        bneid   r4, a_wu1_loop  /* while (n) loop */
 318        addi    r10, r10, 4     /* offset = ofset + 4 (IN DELAY SLOT) */
 319
 320        bri     a_word_done
 321
 322a_word_u2:
 323        bslli   r11, r11, 16    /* h = h << 16 */
 324a_wu2_loop:
 325        lw      r12, r8, r10    /* v = *(as + offset) */
 326        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 327        or      r9, r11, r9     /* t1 = h | t1 */
 328        sw      r9, r5, r10     /* *(d + offset) = t1 */
 329        bslli   r11, r12, 16    /* h = v << 16 */
 330        addi    r4, r4,-4       /* n = n - 4 */
 331        bneid   r4, a_wu2_loop  /* while (n) loop */
 332        addi    r10, r10, 4     /* offset = ofset + 4 (IN DELAY SLOT) */
 333
 334a_word_done:
 335        add     r5, r5, r10     /* d = d + offset */
 336        add     r6, r6, r10     /* s = s + offset */
 337        rsub    r7, r10, r7     /* c = c - offset */
 338
 339a_xfer_end:
 340a_xfer_end_loop:
 341        beqi    r7, a_done              /* while (c) */
 342        lbui    r9, r6, 0               /* t1 = *s */
 343        addi    r6, r6, 1               /* s++ */
 344        sbi     r9, r5, 0               /* *d = t1 */
 345        addi    r7, r7, -1              /* c-- */
 346        brid    a_xfer_end_loop         /* loop */
 347        addi    r5, r5, 1               /* d++ (IN DELAY SLOT) */
 348
 349a_done:
 350        rtsd    r15, 8
 351        nop
 352
 353.size  memcpy, . - memcpy
 354.end memcpy
 355/*----------------------------------------------------------------------------*/
	.globl	memmove
	.type  memmove, @function
	.ent	memmove

/*
 * void *memmove(void *d, const void *s, size_t c)
 *
 * Overlap-safe copy.  If s >= d an ascending copy is safe and we reuse
 * memcpy's fast_memcpy_ascending path; otherwise copy descending from
 * the ends of the buffers.  Same register contract as memcpy above:
 * In:       r5 = d, r6 = s, r7 = c; Out: r3 = original d.
 * Clobbers: r4, r8-r12.
 */
memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4, fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c (point past the ends) */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	/* copying downward, d & 3 is directly the byte count to align d */
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

/* d and s both word aligned: 8-word unrolled copy, descending */
d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

/*
 * s is 1, 2 or 3 bytes past a word boundary: read aligned words
 * downward from as = s & ~3 and merge adjacent words with shifts.
 * Dispatch on the byte offset (s & 3).
 */
d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n (pre-retreat past block) */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1		/* t1-- */
	beqi	r9, d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1		/* t1-- */
	beqi	r9, d_block_u2		/* t1 was 2 => 2 byte offset */

/* 3-byte offset: each output word = (high >> 8) | (low << 24) */
d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

/* 1-byte offset: each output word = (high >> 24) | (low << 8) */
d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

/* 2-byte offset: each output word = (high >> 16) | (low << 16) */
d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
	/* fall through to d_block_done */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

/* copy the remaining whole words, descending (4 <= c, c < 32 here) */
d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4,-4		/* n -= 4 */
	lw	r9, r6, r4		/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1		/* t1-- */
	beqi	r9, d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1		/* t1-- */
	beqi	r9, d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

/* copy the remaining 0~3 bytes one at a time, descending */
d_xfer_end:
d_xfer_end_loop:
	/*
	 * NOTE(review): branches to a_done (memcpy's epilogue) rather
	 * than d_done below; both are an identical rtsd/nop pair, so
	 * behavior is the same, but d_done is consequently unreachable.
	 */
	beqi	r7, a_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size  memmove, . - memmove
.end memmove
 671