/* linux/arch/microblaze/lib/fastcopy.S */
/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input : Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */
  32#include <linux/linkage.h>
  33        .text
  34        .globl  memcpy
  35        .type  memcpy, @function
  36        .ent    memcpy
  37
  38memcpy:
  39fast_memcpy_ascending:
  40        /* move d to return register as value of function */
  41        addi    r3, r5, 0
  42
  43        addi    r4, r0, 4       /* n = 4 */
  44        cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
  45        blti    r4, a_xfer_end  /* if n < 0, less than one word to transfer */
  46
  47        /* transfer first 0~3 bytes to get aligned dest address */
  48        andi    r4, r5, 3               /* n = d & 3 */
  49        /* if zero, destination already aligned */
  50        beqi    r4, a_dalign_done
  51        /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
  52        rsubi   r4, r4, 4
  53        rsub    r7, r4, r7              /* c = c - n adjust c */
  54
  55a_xfer_first_loop:
  56        /* if no bytes left to transfer, transfer the bulk */
  57        beqi    r4, a_dalign_done
  58        lbui    r11, r6, 0              /* h = *s */
  59        sbi     r11, r5, 0              /* *d = h */
  60        addi    r6, r6, 1               /* s++ */
  61        addi    r5, r5, 1               /* d++ */
  62        brid    a_xfer_first_loop       /* loop */
  63        addi    r4, r4, -1              /* n-- (IN DELAY SLOT) */
  64
  65a_dalign_done:
  66        addi    r4, r0, 32              /* n = 32 */
  67        cmpu    r4, r4, r7              /* n = c - n  (unsigned) */
  68        /* if n < 0, less than one block to transfer */
  69        blti    r4, a_block_done
  70
  71a_block_xfer:
  72        andi    r4, r7, 0xffffffe0      /* n = c & ~31 */
  73        rsub    r7, r4, r7              /* c = c - n */
  74
  75        andi    r9, r6, 3               /* t1 = s & 3 */
  76        /* if temp != 0, unaligned transfers needed */
  77        bnei    r9, a_block_unaligned
  78
  79a_block_aligned:
  80        lwi     r9, r6, 0               /* t1 = *(s + 0) */
  81        lwi     r10, r6, 4              /* t2 = *(s + 4) */
  82        lwi     r11, r6, 8              /* t3 = *(s + 8) */
  83        lwi     r12, r6, 12             /* t4 = *(s + 12) */
  84        swi     r9, r5, 0               /* *(d + 0) = t1 */
  85        swi     r10, r5, 4              /* *(d + 4) = t2 */
  86        swi     r11, r5, 8              /* *(d + 8) = t3 */
  87        swi     r12, r5, 12             /* *(d + 12) = t4 */
  88        lwi     r9, r6, 16              /* t1 = *(s + 16) */
  89        lwi     r10, r6, 20             /* t2 = *(s + 20) */
  90        lwi     r11, r6, 24             /* t3 = *(s + 24) */
  91        lwi     r12, r6, 28             /* t4 = *(s + 28) */
  92        swi     r9, r5, 16              /* *(d + 16) = t1 */
  93        swi     r10, r5, 20             /* *(d + 20) = t2 */
  94        swi     r11, r5, 24             /* *(d + 24) = t3 */
  95        swi     r12, r5, 28             /* *(d + 28) = t4 */
  96        addi    r6, r6, 32              /* s = s + 32 */
  97        addi    r4, r4, -32             /* n = n - 32 */
  98        bneid   r4, a_block_aligned     /* while (n) loop */
  99        addi    r5, r5, 32              /* d = d + 32 (IN DELAY SLOT) */
 100        bri     a_block_done
 101
 102a_block_unaligned:
 103        andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
 104        add     r6, r6, r4              /* s = s + n */
 105        lwi     r11, r8, 0              /* h = *(as + 0) */
 106
 107        addi    r9, r9, -1
 108        beqi    r9, a_block_u1          /* t1 was 1 => 1 byte offset */
 109        addi    r9, r9, -1
 110        beqi    r9, a_block_u2          /* t1 was 2 => 2 byte offset */
 111
 112a_block_u3:
 113        bslli   r11, r11, 24    /* h = h << 24 */
 114a_bu3_loop:
 115        lwi     r12, r8, 4      /* v = *(as + 4) */
 116        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 117        or      r9, r11, r9     /* t1 = h | t1 */
 118        swi     r9, r5, 0       /* *(d + 0) = t1 */
 119        bslli   r11, r12, 24    /* h = v << 24 */
 120        lwi     r12, r8, 8      /* v = *(as + 8) */
 121        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 122        or      r9, r11, r9     /* t1 = h | t1 */
 123        swi     r9, r5, 4       /* *(d + 4) = t1 */
 124        bslli   r11, r12, 24    /* h = v << 24 */
 125        lwi     r12, r8, 12     /* v = *(as + 12) */
 126        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 127        or      r9, r11, r9     /* t1 = h | t1 */
 128        swi     r9, r5, 8       /* *(d + 8) = t1 */
 129        bslli   r11, r12, 24    /* h = v << 24 */
 130        lwi     r12, r8, 16     /* v = *(as + 16) */
 131        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 132        or      r9, r11, r9     /* t1 = h | t1 */
 133        swi     r9, r5, 12      /* *(d + 12) = t1 */
 134        bslli   r11, r12, 24    /* h = v << 24 */
 135        lwi     r12, r8, 20     /* v = *(as + 20) */
 136        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 137        or      r9, r11, r9     /* t1 = h | t1 */
 138        swi     r9, r5, 16      /* *(d + 16) = t1 */
 139        bslli   r11, r12, 24    /* h = v << 24 */
 140        lwi     r12, r8, 24     /* v = *(as + 24) */
 141        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 142        or      r9, r11, r9     /* t1 = h | t1 */
 143        swi     r9, r5, 20      /* *(d + 20) = t1 */
 144        bslli   r11, r12, 24    /* h = v << 24 */
 145        lwi     r12, r8, 28     /* v = *(as + 28) */
 146        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 147        or      r9, r11, r9     /* t1 = h | t1 */
 148        swi     r9, r5, 24      /* *(d + 24) = t1 */
 149        bslli   r11, r12, 24    /* h = v << 24 */
 150        lwi     r12, r8, 32     /* v = *(as + 32) */
 151        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 152        or      r9, r11, r9     /* t1 = h | t1 */
 153        swi     r9, r5, 28      /* *(d + 28) = t1 */
 154        bslli   r11, r12, 24    /* h = v << 24 */
 155        addi    r8, r8, 32      /* as = as + 32 */
 156        addi    r4, r4, -32     /* n = n - 32 */
 157        bneid   r4, a_bu3_loop  /* while (n) loop */
 158        addi    r5, r5, 32      /* d = d + 32 (IN DELAY SLOT) */
 159        bri     a_block_done
 160
 161a_block_u1:
 162        bslli   r11, r11, 8     /* h = h << 8 */
 163a_bu1_loop:
 164        lwi     r12, r8, 4      /* v = *(as + 4) */
 165        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 166        or      r9, r11, r9     /* t1 = h | t1 */
 167        swi     r9, r5, 0       /* *(d + 0) = t1 */
 168        bslli   r11, r12, 8     /* h = v << 8 */
 169        lwi     r12, r8, 8      /* v = *(as + 8) */
 170        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 171        or      r9, r11, r9     /* t1 = h | t1 */
 172        swi     r9, r5, 4       /* *(d + 4) = t1 */
 173        bslli   r11, r12, 8     /* h = v << 8 */
 174        lwi     r12, r8, 12     /* v = *(as + 12) */
 175        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 176        or      r9, r11, r9     /* t1 = h | t1 */
 177        swi     r9, r5, 8       /* *(d + 8) = t1 */
 178        bslli   r11, r12, 8     /* h = v << 8 */
 179        lwi     r12, r8, 16     /* v = *(as + 16) */
 180        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 181        or      r9, r11, r9     /* t1 = h | t1 */
 182        swi     r9, r5, 12      /* *(d + 12) = t1 */
 183        bslli   r11, r12, 8     /* h = v << 8 */
 184        lwi     r12, r8, 20     /* v = *(as + 20) */
 185        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 186        or      r9, r11, r9     /* t1 = h | t1 */
 187        swi     r9, r5, 16      /* *(d + 16) = t1 */
 188        bslli   r11, r12, 8     /* h = v << 8 */
 189        lwi     r12, r8, 24     /* v = *(as + 24) */
 190        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 191        or      r9, r11, r9     /* t1 = h | t1 */
 192        swi     r9, r5, 20      /* *(d + 20) = t1 */
 193        bslli   r11, r12, 8     /* h = v << 8 */
 194        lwi     r12, r8, 28     /* v = *(as + 28) */
 195        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 196        or      r9, r11, r9     /* t1 = h | t1 */
 197        swi     r9, r5, 24      /* *(d + 24) = t1 */
 198        bslli   r11, r12, 8     /* h = v << 8 */
 199        lwi     r12, r8, 32     /* v = *(as + 32) */
 200        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 201        or      r9, r11, r9     /* t1 = h | t1 */
 202        swi     r9, r5, 28      /* *(d + 28) = t1 */
 203        bslli   r11, r12, 8     /* h = v << 8 */
 204        addi    r8, r8, 32      /* as = as + 32 */
 205        addi    r4, r4, -32     /* n = n - 32 */
 206        bneid   r4, a_bu1_loop  /* while (n) loop */
 207        addi    r5, r5, 32      /* d = d + 32 (IN DELAY SLOT) */
 208        bri     a_block_done
 209
 210a_block_u2:
 211        bslli   r11, r11, 16    /* h = h << 16 */
 212a_bu2_loop:
 213        lwi     r12, r8, 4      /* v = *(as + 4) */
 214        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 215        or      r9, r11, r9     /* t1 = h | t1 */
 216        swi     r9, r5, 0       /* *(d + 0) = t1 */
 217        bslli   r11, r12, 16    /* h = v << 16 */
 218        lwi     r12, r8, 8      /* v = *(as + 8) */
 219        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 220        or      r9, r11, r9     /* t1 = h | t1 */
 221        swi     r9, r5, 4       /* *(d + 4) = t1 */
 222        bslli   r11, r12, 16    /* h = v << 16 */
 223        lwi     r12, r8, 12     /* v = *(as + 12) */
 224        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 225        or      r9, r11, r9     /* t1 = h | t1 */
 226        swi     r9, r5, 8       /* *(d + 8) = t1 */
 227        bslli   r11, r12, 16    /* h = v << 16 */
 228        lwi     r12, r8, 16     /* v = *(as + 16) */
 229        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 230        or      r9, r11, r9     /* t1 = h | t1 */
 231        swi     r9, r5, 12      /* *(d + 12) = t1 */
 232        bslli   r11, r12, 16    /* h = v << 16 */
 233        lwi     r12, r8, 20     /* v = *(as + 20) */
 234        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 235        or      r9, r11, r9     /* t1 = h | t1 */
 236        swi     r9, r5, 16      /* *(d + 16) = t1 */
 237        bslli   r11, r12, 16    /* h = v << 16 */
 238        lwi     r12, r8, 24     /* v = *(as + 24) */
 239        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 240        or      r9, r11, r9     /* t1 = h | t1 */
 241        swi     r9, r5, 20      /* *(d + 20) = t1 */
 242        bslli   r11, r12, 16    /* h = v << 16 */
 243        lwi     r12, r8, 28     /* v = *(as + 28) */
 244        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 245        or      r9, r11, r9     /* t1 = h | t1 */
 246        swi     r9, r5, 24      /* *(d + 24) = t1 */
 247        bslli   r11, r12, 16    /* h = v << 16 */
 248        lwi     r12, r8, 32     /* v = *(as + 32) */
 249        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 250        or      r9, r11, r9     /* t1 = h | t1 */
 251        swi     r9, r5, 28      /* *(d + 28) = t1 */
 252        bslli   r11, r12, 16    /* h = v << 16 */
 253        addi    r8, r8, 32      /* as = as + 32 */
 254        addi    r4, r4, -32     /* n = n - 32 */
 255        bneid   r4, a_bu2_loop  /* while (n) loop */
 256        addi    r5, r5, 32      /* d = d + 32 (IN DELAY SLOT) */
 257
 258a_block_done:
 259        addi    r4, r0, 4       /* n = 4 */
 260        cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
 261        blti    r4, a_xfer_end  /* if n < 0, less than one word to transfer */
 262
 263a_word_xfer:
 264        andi    r4, r7, 0xfffffffc      /* n = c & ~3 */
 265        addi    r10, r0, 0              /* offset = 0 */
 266
 267        andi    r9, r6, 3               /* t1 = s & 3 */
 268        /* if temp != 0, unaligned transfers needed */
 269        bnei    r9, a_word_unaligned
 270
 271a_word_aligned:
 272        lw      r9, r6, r10             /* t1 = *(s+offset) */
 273        sw      r9, r5, r10             /* *(d+offset) = t1 */
 274        addi    r4, r4,-4               /* n-- */
 275        bneid   r4, a_word_aligned      /* loop */
 276        addi    r10, r10, 4             /* offset++ (IN DELAY SLOT) */
 277
 278        bri     a_word_done
 279
 280a_word_unaligned:
 281        andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
 282        lwi     r11, r8, 0              /* h = *(as + 0) */
 283        addi    r8, r8, 4               /* as = as + 4 */
 284
 285        addi    r9, r9, -1
 286        beqi    r9, a_word_u1           /* t1 was 1 => 1 byte offset */
 287        addi    r9, r9, -1
 288        beqi    r9, a_word_u2           /* t1 was 2 => 2 byte offset */
 289
 290a_word_u3:
 291        bslli   r11, r11, 24    /* h = h << 24 */
 292a_wu3_loop:
 293        lw      r12, r8, r10    /* v = *(as + offset) */
 294        bsrli   r9, r12, 8      /* t1 = v >> 8 */
 295        or      r9, r11, r9     /* t1 = h | t1 */
 296        sw      r9, r5, r10     /* *(d + offset) = t1 */
 297        bslli   r11, r12, 24    /* h = v << 24 */
 298        addi    r4, r4,-4       /* n = n - 4 */
 299        bneid   r4, a_wu3_loop  /* while (n) loop */
 300        addi    r10, r10, 4     /* offset = ofset + 4 (IN DELAY SLOT) */
 301
 302        bri     a_word_done
 303
 304a_word_u1:
 305        bslli   r11, r11, 8     /* h = h << 8 */
 306a_wu1_loop:
 307        lw      r12, r8, r10    /* v = *(as + offset) */
 308        bsrli   r9, r12, 24     /* t1 = v >> 24 */
 309        or      r9, r11, r9     /* t1 = h | t1 */
 310        sw      r9, r5, r10     /* *(d + offset) = t1 */
 311        bslli   r11, r12, 8     /* h = v << 8 */
 312        addi    r4, r4,-4       /* n = n - 4 */
 313        bneid   r4, a_wu1_loop  /* while (n) loop */
 314        addi    r10, r10, 4     /* offset = ofset + 4 (IN DELAY SLOT) */
 315
 316        bri     a_word_done
 317
 318a_word_u2:
 319        bslli   r11, r11, 16    /* h = h << 16 */
 320a_wu2_loop:
 321        lw      r12, r8, r10    /* v = *(as + offset) */
 322        bsrli   r9, r12, 16     /* t1 = v >> 16 */
 323        or      r9, r11, r9     /* t1 = h | t1 */
 324        sw      r9, r5, r10     /* *(d + offset) = t1 */
 325        bslli   r11, r12, 16    /* h = v << 16 */
 326        addi    r4, r4,-4       /* n = n - 4 */
 327        bneid   r4, a_wu2_loop  /* while (n) loop */
 328        addi    r10, r10, 4     /* offset = ofset + 4 (IN DELAY SLOT) */
 329
 330a_word_done:
 331        add     r5, r5, r10     /* d = d + offset */
 332        add     r6, r6, r10     /* s = s + offset */
 333        rsub    r7, r10, r7     /* c = c - offset */
 334
 335a_xfer_end:
 336a_xfer_end_loop:
 337        beqi    r7, a_done              /* while (c) */
 338        lbui    r9, r6, 0               /* t1 = *s */
 339        addi    r6, r6, 1               /* s++ */
 340        sbi     r9, r5, 0               /* *d = t1 */
 341        addi    r7, r7, -1              /* c-- */
 342        brid    a_xfer_end_loop         /* loop */
 343        addi    r5, r5, 1               /* d++ (IN DELAY SLOT) */
 344
 345a_done:
 346        rtsd    r15, 8
 347        nop
 348
 349.size  memcpy, . - memcpy
 350.end memcpy
 351/*----------------------------------------------------------------------------*/
 352        .globl  memmove
 353        .type  memmove, @function
 354        .ent    memmove
 355
 356memmove:
 357        cmpu    r4, r5, r6      /* n = s - d */
 358        bgei    r4,fast_memcpy_ascending
 359
 360fast_memcpy_descending:
 361        /* move d to return register as value of function */
 362        addi    r3, r5, 0
 363
 364        add     r5, r5, r7      /* d = d + c */
 365        add     r6, r6, r7      /* s = s + c */
 366
 367        addi    r4, r0, 4       /* n = 4 */
 368        cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
 369        blti    r4,d_xfer_end   /* if n < 0, less than one word to transfer */
 370
 371        /* transfer first 0~3 bytes to get aligned dest address */
 372        andi    r4, r5, 3               /* n = d & 3 */
 373        /* if zero, destination already aligned */
 374        beqi    r4,d_dalign_done
 375        rsub    r7, r4, r7              /* c = c - n adjust c */
 376
 377d_xfer_first_loop:
 378        /* if no bytes left to transfer, transfer the bulk */
 379        beqi    r4,d_dalign_done
 380        addi    r6, r6, -1              /* s-- */
 381        addi    r5, r5, -1              /* d-- */
 382        lbui    r11, r6, 0              /* h = *s */
 383        sbi     r11, r5, 0              /* *d = h */
 384        brid    d_xfer_first_loop       /* loop */
 385        addi    r4, r4, -1              /* n-- (IN DELAY SLOT) */
 386
 387d_dalign_done:
 388        addi    r4, r0, 32      /* n = 32 */
 389        cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
 390        /* if n < 0, less than one block to transfer */
 391        blti    r4, d_block_done
 392
 393d_block_xfer:
 394        andi    r4, r7, 0xffffffe0      /* n = c & ~31 */
 395        rsub    r7, r4, r7              /* c = c - n */
 396
 397        andi    r9, r6, 3               /* t1 = s & 3 */
 398        /* if temp != 0, unaligned transfers needed */
 399        bnei    r9, d_block_unaligned
 400
 401d_block_aligned:
 402        addi    r6, r6, -32             /* s = s - 32 */
 403        addi    r5, r5, -32             /* d = d - 32 */
 404        lwi     r9, r6, 28              /* t1 = *(s + 28) */
 405        lwi     r10, r6, 24             /* t2 = *(s + 24) */
 406        lwi     r11, r6, 20             /* t3 = *(s + 20) */
 407        lwi     r12, r6, 16             /* t4 = *(s + 16) */
 408        swi     r9, r5, 28              /* *(d + 28) = t1 */
 409        swi     r10, r5, 24             /* *(d + 24) = t2 */
 410        swi     r11, r5, 20             /* *(d + 20) = t3 */
 411        swi     r12, r5, 16             /* *(d + 16) = t4 */
 412        lwi     r9, r6, 12              /* t1 = *(s + 12) */
 413        lwi     r10, r6, 8              /* t2 = *(s + 8) */
 414        lwi     r11, r6, 4              /* t3 = *(s + 4) */
 415        lwi     r12, r6, 0              /* t4 = *(s + 0) */
 416        swi     r9, r5, 12              /* *(d + 12) = t1 */
 417        swi     r10, r5, 8              /* *(d + 8) = t2 */
 418        swi     r11, r5, 4              /* *(d + 4) = t3 */
 419        addi    r4, r4, -32             /* n = n - 32 */
 420        bneid   r4, d_block_aligned     /* while (n) loop */
 421        swi     r12, r5, 0              /* *(d + 0) = t4 (IN DELAY SLOT) */
 422        bri     d_block_done
 423
 424d_block_unaligned:
 425        andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
 426        rsub    r6, r4, r6              /* s = s - n */
 427        lwi     r11, r8, 0              /* h = *(as + 0) */
 428
 429        addi    r9, r9, -1
 430        beqi    r9,d_block_u1           /* t1 was 1 => 1 byte offset */
 431        addi    r9, r9, -1
 432        beqi    r9,d_block_u2           /* t1 was 2 => 2 byte offset */
 433
 434d_block_u3:
 435        bsrli   r11, r11, 8     /* h = h >> 8 */
 436d_bu3_loop:
 437        addi    r8, r8, -32     /* as = as - 32 */
 438        addi    r5, r5, -32     /* d = d - 32 */
 439        lwi     r12, r8, 28     /* v = *(as + 28) */
 440        bslli   r9, r12, 24     /* t1 = v << 24 */
 441        or      r9, r11, r9     /* t1 = h | t1 */
 442        swi     r9, r5, 28      /* *(d + 28) = t1 */
 443        bsrli   r11, r12, 8     /* h = v >> 8 */
 444        lwi     r12, r8, 24     /* v = *(as + 24) */
 445        bslli   r9, r12, 24     /* t1 = v << 24 */
 446        or      r9, r11, r9     /* t1 = h | t1 */
 447        swi     r9, r5, 24      /* *(d + 24) = t1 */
 448        bsrli   r11, r12, 8     /* h = v >> 8 */
 449        lwi     r12, r8, 20     /* v = *(as + 20) */
 450        bslli   r9, r12, 24     /* t1 = v << 24 */
 451        or      r9, r11, r9     /* t1 = h | t1 */
 452        swi     r9, r5, 20      /* *(d + 20) = t1 */
 453        bsrli   r11, r12, 8     /* h = v >> 8 */
 454        lwi     r12, r8, 16     /* v = *(as + 16) */
 455        bslli   r9, r12, 24     /* t1 = v << 24 */
 456        or      r9, r11, r9     /* t1 = h | t1 */
 457        swi     r9, r5, 16      /* *(d + 16) = t1 */
 458        bsrli   r11, r12, 8     /* h = v >> 8 */
 459        lwi     r12, r8, 12     /* v = *(as + 12) */
 460        bslli   r9, r12, 24     /* t1 = v << 24 */
 461        or      r9, r11, r9     /* t1 = h | t1 */
 462        swi     r9, r5, 12      /* *(d + 112) = t1 */
 463        bsrli   r11, r12, 8     /* h = v >> 8 */
 464        lwi     r12, r8, 8      /* v = *(as + 8) */
 465        bslli   r9, r12, 24     /* t1 = v << 24 */
 466        or      r9, r11, r9     /* t1 = h | t1 */
 467        swi     r9, r5, 8       /* *(d + 8) = t1 */
 468        bsrli   r11, r12, 8     /* h = v >> 8 */
 469        lwi     r12, r8, 4      /* v = *(as + 4) */
 470        bslli   r9, r12, 24     /* t1 = v << 24 */
 471        or      r9, r11, r9     /* t1 = h | t1 */
 472        swi     r9, r5, 4       /* *(d + 4) = t1 */
 473        bsrli   r11, r12, 8     /* h = v >> 8 */
 474        lwi     r12, r8, 0      /* v = *(as + 0) */
 475        bslli   r9, r12, 24     /* t1 = v << 24 */
 476        or      r9, r11, r9     /* t1 = h | t1 */
 477        swi     r9, r5, 0       /* *(d + 0) = t1 */
 478        addi    r4, r4, -32     /* n = n - 32 */
 479        bneid   r4, d_bu3_loop  /* while (n) loop */
 480        bsrli   r11, r12, 8     /* h = v >> 8 (IN DELAY SLOT) */
 481        bri     d_block_done
 482
 483d_block_u1:
 484        bsrli   r11, r11, 24    /* h = h >> 24 */
 485d_bu1_loop:
 486        addi    r8, r8, -32     /* as = as - 32 */
 487        addi    r5, r5, -32     /* d = d - 32 */
 488        lwi     r12, r8, 28     /* v = *(as + 28) */
 489        bslli   r9, r12, 8      /* t1 = v << 8 */
 490        or      r9, r11, r9     /* t1 = h | t1 */
 491        swi     r9, r5, 28      /* *(d + 28) = t1 */
 492        bsrli   r11, r12, 24    /* h = v >> 24 */
 493        lwi     r12, r8, 24     /* v = *(as + 24) */
 494        bslli   r9, r12, 8      /* t1 = v << 8 */
 495        or      r9, r11, r9     /* t1 = h | t1 */
 496        swi     r9, r5, 24      /* *(d + 24) = t1 */
 497        bsrli   r11, r12, 24    /* h = v >> 24 */
 498        lwi     r12, r8, 20     /* v = *(as + 20) */
 499        bslli   r9, r12, 8      /* t1 = v << 8 */
 500        or      r9, r11, r9     /* t1 = h | t1 */
 501        swi     r9, r5, 20      /* *(d + 20) = t1 */
 502        bsrli   r11, r12, 24    /* h = v >> 24 */
 503        lwi     r12, r8, 16     /* v = *(as + 16) */
 504        bslli   r9, r12, 8      /* t1 = v << 8 */
 505        or      r9, r11, r9     /* t1 = h | t1 */
 506        swi     r9, r5, 16      /* *(d + 16) = t1 */
 507        bsrli   r11, r12, 24    /* h = v >> 24 */
 508        lwi     r12, r8, 12     /* v = *(as + 12) */
 509        bslli   r9, r12, 8      /* t1 = v << 8 */
 510        or      r9, r11, r9     /* t1 = h | t1 */
 511        swi     r9, r5, 12      /* *(d + 112) = t1 */
 512        bsrli   r11, r12, 24    /* h = v >> 24 */
 513        lwi     r12, r8, 8      /* v = *(as + 8) */
 514        bslli   r9, r12, 8      /* t1 = v << 8 */
 515        or      r9, r11, r9     /* t1 = h | t1 */
 516        swi     r9, r5, 8       /* *(d + 8) = t1 */
 517        bsrli   r11, r12, 24    /* h = v >> 24 */
 518        lwi     r12, r8, 4      /* v = *(as + 4) */
 519        bslli   r9, r12, 8      /* t1 = v << 8 */
 520        or      r9, r11, r9     /* t1 = h | t1 */
 521        swi     r9, r5, 4       /* *(d + 4) = t1 */
 522        bsrli   r11, r12, 24    /* h = v >> 24 */
 523        lwi     r12, r8, 0      /* v = *(as + 0) */
 524        bslli   r9, r12, 8      /* t1 = v << 8 */
 525        or      r9, r11, r9     /* t1 = h | t1 */
 526        swi     r9, r5, 0       /* *(d + 0) = t1 */
 527        addi    r4, r4, -32     /* n = n - 32 */
 528        bneid   r4, d_bu1_loop  /* while (n) loop */
 529        bsrli   r11, r12, 24    /* h = v >> 24 (IN DELAY SLOT) */
 530        bri     d_block_done
 531
 532d_block_u2:
 533        bsrli   r11, r11, 16    /* h = h >> 16 */
 534d_bu2_loop:
 535        addi    r8, r8, -32     /* as = as - 32 */
 536        addi    r5, r5, -32     /* d = d - 32 */
 537        lwi     r12, r8, 28     /* v = *(as + 28) */
 538        bslli   r9, r12, 16     /* t1 = v << 16 */
 539        or      r9, r11, r9     /* t1 = h | t1 */
 540        swi     r9, r5, 28      /* *(d + 28) = t1 */
 541        bsrli   r11, r12, 16    /* h = v >> 16 */
 542        lwi     r12, r8, 24     /* v = *(as + 24) */
 543        bslli   r9, r12, 16     /* t1 = v << 16 */
 544        or      r9, r11, r9     /* t1 = h | t1 */
 545        swi     r9, r5, 24      /* *(d + 24) = t1 */
 546        bsrli   r11, r12, 16    /* h = v >> 16 */
 547        lwi     r12, r8, 20     /* v = *(as + 20) */
 548        bslli   r9, r12, 16     /* t1 = v << 16 */
 549        or      r9, r11, r9     /* t1 = h | t1 */
 550        swi     r9, r5, 20      /* *(d + 20) = t1 */
 551        bsrli   r11, r12, 16    /* h = v >> 16 */
 552        lwi     r12, r8, 16     /* v = *(as + 16) */
 553        bslli   r9, r12, 16     /* t1 = v << 16 */
 554        or      r9, r11, r9     /* t1 = h | t1 */
 555        swi     r9, r5, 16      /* *(d + 16) = t1 */
 556        bsrli   r11, r12, 16    /* h = v >> 16 */
 557        lwi     r12, r8, 12     /* v = *(as + 12) */
 558        bslli   r9, r12, 16     /* t1 = v << 16 */
 559        or      r9, r11, r9     /* t1 = h | t1 */
 560        swi     r9, r5, 12      /* *(d + 112) = t1 */
 561        bsrli   r11, r12, 16    /* h = v >> 16 */
 562        lwi     r12, r8, 8      /* v = *(as + 8) */
 563        bslli   r9, r12, 16     /* t1 = v << 16 */
 564        or      r9, r11, r9     /* t1 = h | t1 */
 565        swi     r9, r5, 8       /* *(d + 8) = t1 */
 566        bsrli   r11, r12, 16    /* h = v >> 16 */
 567        lwi     r12, r8, 4      /* v = *(as + 4) */
 568        bslli   r9, r12, 16     /* t1 = v << 16 */
 569        or      r9, r11, r9     /* t1 = h | t1 */
 570        swi     r9, r5, 4       /* *(d + 4) = t1 */
 571        bsrli   r11, r12, 16    /* h = v >> 16 */
 572        lwi     r12, r8, 0      /* v = *(as + 0) */
 573        bslli   r9, r12, 16     /* t1 = v << 16 */
 574        or      r9, r11, r9     /* t1 = h | t1 */
 575        swi     r9, r5, 0       /* *(d + 0) = t1 */
 576        addi    r4, r4, -32     /* n = n - 32 */
 577        bneid   r4, d_bu2_loop  /* while (n) loop */
 578        bsrli   r11, r12, 16    /* h = v >> 16 (IN DELAY SLOT) */
 579
 580d_block_done:
 581        addi    r4, r0, 4       /* n = 4 */
 582        cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
 583        blti    r4,d_xfer_end   /* if n < 0, less than one word to transfer */
 584
 585d_word_xfer:
 586        andi    r4, r7, 0xfffffffc      /* n = c & ~3 */
 587        rsub    r5, r4, r5              /* d = d - n */
 588        rsub    r6, r4, r6              /* s = s - n */
 589        rsub    r7, r4, r7              /* c = c - n */
 590
 591        andi    r9, r6, 3               /* t1 = s & 3 */
 592        /* if temp != 0, unaligned transfers needed */
 593        bnei    r9, d_word_unaligned
 594
 595d_word_aligned:
 596        addi    r4, r4,-4               /* n-- */
 597        lw      r9, r6, r4              /* t1 = *(s+n) */
 598        bneid   r4, d_word_aligned      /* loop */
 599        sw      r9, r5, r4              /* *(d+n) = t1 (IN DELAY SLOT) */
 600
 601        bri     d_word_done
 602
 603d_word_unaligned:
 604        andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
 605        lw      r11, r8, r4             /* h = *(as + n) */
 606
 607        addi    r9, r9, -1
 608        beqi    r9,d_word_u1            /* t1 was 1 => 1 byte offset */
 609        addi    r9, r9, -1
 610        beqi    r9,d_word_u2            /* t1 was 2 => 2 byte offset */
 611
 612d_word_u3:
 613        bsrli   r11, r11, 8     /* h = h >> 8 */
 614d_wu3_loop:
 615        addi    r4, r4,-4       /* n = n - 4 */
 616        lw      r12, r8, r4     /* v = *(as + n) */
 617        bslli   r9, r12, 24     /* t1 = v << 24 */
 618        or      r9, r11, r9     /* t1 = h | t1 */
 619        sw      r9, r5, r4      /* *(d + n) = t1 */
 620        bneid   r4, d_wu3_loop  /* while (n) loop */
 621        bsrli   r11, r12, 8     /* h = v >> 8 (IN DELAY SLOT) */
 622
 623        bri     d_word_done
 624
 625d_word_u1:
 626        bsrli   r11, r11, 24    /* h = h >> 24 */
 627d_wu1_loop:
 628        addi    r4, r4,-4       /* n = n - 4 */
 629        lw      r12, r8, r4     /* v = *(as + n) */
 630        bslli   r9, r12, 8      /* t1 = v << 8 */
 631        or      r9, r11, r9     /* t1 = h | t1 */
 632        sw      r9, r5, r4      /* *(d + n) = t1 */
 633        bneid   r4, d_wu1_loop  /* while (n) loop */
 634        bsrli   r11, r12, 24    /* h = v >> 24 (IN DELAY SLOT) */
 635
 636        bri     d_word_done
 637
 638d_word_u2:
 639        bsrli   r11, r11, 16    /* h = h >> 16 */
 640d_wu2_loop:
 641        addi    r4, r4,-4       /* n = n - 4 */
 642        lw      r12, r8, r4     /* v = *(as + n) */
 643        bslli   r9, r12, 16     /* t1 = v << 16 */
 644        or      r9, r11, r9     /* t1 = h | t1 */
 645        sw      r9, r5, r4      /* *(d + n) = t1 */
 646        bneid   r4, d_wu2_loop  /* while (n) loop */
 647        bsrli   r11, r12, 16    /* h = v >> 16 (IN DELAY SLOT) */
 648
 649d_word_done:
 650
 651d_xfer_end:
 652d_xfer_end_loop:
 653        beqi    r7, a_done              /* while (c) */
 654        addi    r6, r6, -1              /* s-- */
 655        lbui    r9, r6, 0               /* t1 = *s */
 656        addi    r5, r5, -1              /* d-- */
 657        sbi     r9, r5, 0               /* *d = t1 */
 658        brid    d_xfer_end_loop         /* loop */
 659        addi    r7, r7, -1              /* c-- (IN DELAY SLOT) */
 660
 661d_done:
 662        rtsd    r15, 8
 663        nop
 664
 665.size  memmove, . - memmove
 666.end memmove