linux/arch/tile/lib/memcpy_32.S
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 */

#include <arch/chip.h>


/*
 * This file shares the implementation of the userspace memcpy and
 * the kernel's memcpy, copy_to_user and copy_from_user.
 */
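
/* Roughly, the C-level contracts implemented here are (a sketch, not the
 * authoritative prototypes):
 *
 *   void *memcpy(void *to, const void *from, size_t n)
 *       copies n bytes and returns "to";
 *
 *   unsigned long __copy_to_user_inatomic(to, from, n)
 *   unsigned long __copy_from_user_inatomic(to, from, n)
 *   unsigned long __copy_from_user_zeroing(to, from, n)
 *       copy n bytes and return the number of bytes that could NOT be
 *       copied (0 on success); the zeroing variant also clears the
 *       uncopied tail of the destination.
 */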

#include <linux/linkage.h>

/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
#define memcpy __memcpy_asm
#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
#endif

#define IS_MEMCPY         0
#define IS_COPY_FROM_USER  1
#define IS_COPY_FROM_USER_ZEROING  2
#define IS_COPY_TO_USER   -1
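
/* These values let the shared code dispatch on r29 with cheap tests:
 * zero means plain memcpy (see the "mz" at .Ldone, which returns the
 * original dest only in that case), a negative value means copy_to_user
 * (tested with blzt in the fixup code), and the low bit distinguishes
 * __copy_from_user_inatomic (1) from the zeroing variant (2) via the
 * "bbs" test in the fixup path.
 */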

        .section .text.memcpy_common, "ax"
        .align 64

/* Use this to preface each bundle that can cause an exception so
 * the kernel can clean up properly. The special cleanup code should
 * not use these, since it knows what it is doing.
 */
#define EX \
        .pushsection __ex_table, "a"; \
        .word 9f, memcpy_common_fixup; \
        .popsection; \
        9

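/* For example, a use like
 *
 *     EX:     { lw r3, r1 }
 *
 * expands (roughly) to
 *
 *     .pushsection __ex_table, "a"
 *     .word 9f, memcpy_common_fixup
 *     .popsection
 * 9:  { lw r3, r1 }
 *
 * i.e. each use records a (faulting address, fixup address) pair in
 * __ex_table, so a fault in that bundle resumes at memcpy_common_fixup.
 */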

/* __copy_from_user_inatomic takes the kernel target address in r0,
 * the user source in r1, and the bytes to copy in r2.
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
 */
ENTRY(__copy_from_user_inatomic)
.type __copy_from_user_inatomic, @function
        FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
          .text.memcpy_common, \
          .Lend_memcpy_common - __copy_from_user_inatomic)
        { movei r29, IS_COPY_FROM_USER; j memcpy_common }
        .size __copy_from_user_inatomic, . - __copy_from_user_inatomic

/* __copy_from_user_zeroing is like __copy_from_user_inatomic, but
 * any uncopiable bytes are zeroed in the target.
 */
ENTRY(__copy_from_user_zeroing)
.type __copy_from_user_zeroing, @function
        FEEDBACK_REENTER(__copy_from_user_inatomic)
        { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
        .size __copy_from_user_zeroing, . - __copy_from_user_zeroing

/* __copy_to_user_inatomic takes the user target address in r0,
 * the kernel source in r1, and the bytes to copy in r2.
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
 */
ENTRY(__copy_to_user_inatomic)
.type __copy_to_user_inatomic, @function
        FEEDBACK_REENTER(__copy_from_user_inatomic)
        { movei r29, IS_COPY_TO_USER; j memcpy_common }
        .size __copy_to_user_inatomic, . - __copy_to_user_inatomic

ENTRY(memcpy)
.type memcpy, @function
        FEEDBACK_REENTER(__copy_from_user_inatomic)
        { movei r29, IS_MEMCPY }
        .size memcpy, . - memcpy
        /* Fall through */

        .type memcpy_common, @function
memcpy_common:
        /* On entry, r29 holds one of the IS_* macro values from above. */


        /* r0 is the dest, r1 is the source, r2 is the size. */

        /* Save aside original dest so we can return it at the end. */
        { sw sp, lr; move r23, r0; or r4, r0, r1 }

        /* Check for an empty size. */
        { bz r2, .Ldone; andi r4, r4, 3 }

        /* Save aside original values in case of a fault. */
        { move r24, r1; move r25, r2 }
        move r27, lr

        /* Check for an unaligned source or dest. */
        { bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 }

.Lcheck_aligned_copy_size:
        /* If we are copying < 256 bytes, branch to simple case. */
        { blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 }

        /* Copying >= 256 bytes, so jump to complex prefetching loop. */
        { andi r6, r1, 63; j .Lcopy_many }

/*
 *
 * Aligned 4 byte at a time copy loop
 *
 */

.Lcopy_8_loop:
        /* Copy two words at a time to hide load latency. */
EX:     { lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 }
EX:     { lw r4, r1; addi r1, r1, 4 }
EX:     { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
EX:     { sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 }
.Lcopy_8_check:
        { bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 }

        /* Copy odd leftover word, if any. */
        { bnzt r4, .Lcheck_odd_stragglers }
EX:     { lw r3, r1; addi r1, r1, 4 }
EX:     { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }

.Lcheck_odd_stragglers:
        { bnz r2, .Lcopy_unaligned_few }

.Ldone:
        /* For memcpy return original dest address, else zero. */
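        /* (mz writes r23, the saved original dest, to r0 when
         *  r29 == IS_MEMCPY == 0, and writes 0 otherwise.) */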
        { mz r0, r29, r23; jrp lr }


/*
 *
 * Prefetching multiple cache line copy handler (for large transfers).
 *
 */

        /* Copy words until r1 is cache-line-aligned. */
.Lalign_loop:
EX:     { lw r3, r1; addi r1, r1, 4 }
        { andi r6, r1, 63 }
EX:     { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
.Lcopy_many:
        { bnzt r6, .Lalign_loop; addi r9, r0, 63 }

        { addi r3, r1, 60; andi r9, r9, -64 }
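        /* At this point r1 is cache-line-aligned: r3 = r1 + 60 points at the
         * last word of source line 0 and becomes the prefetch cursor, and
         * r9 = (r0 + 63) & -64 rounds the current dest up to the next
         * 64-byte line, for the wh64/prefetch below.
         */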

#if CHIP_HAS_WH64()
        /* No need to prefetch dst, we'll just do the wh64
         * right before we copy a line.
         */
#endif

EX:     { lw r5, r3; addi r3, r3, 64; movei r4, 1 }
        /* Intentionally stall for a few cycles to leave L2 cache alone. */
        { bnzt zero, .; move r27, lr }
EX:     { lw r6, r3; addi r3, r3, 64 }
        /* Intentionally stall for a few cycles to leave L2 cache alone. */
        { bnzt zero, . }
EX:     { lw r7, r3; addi r3, r3, 64 }
#if !CHIP_HAS_WH64()
        /* Prefetch the dest */
        /* Intentionally stall for a few cycles to leave L2 cache alone. */
        { bnzt zero, . }
        /* Use a real load to cause a TLB miss if necessary.  We aren't using
         * r28, so this should be fine.
         */
EX:     { lw r28, r9; addi r9, r9, 64 }
        /* Intentionally stall for a few cycles to leave L2 cache alone. */
        { bnzt zero, . }
        { prefetch r9; addi r9, r9, 64 }
        /* Intentionally stall for a few cycles to leave L2 cache alone. */
        { bnzt zero, . }
        { prefetch r9; addi r9, r9, 64 }
#endif
        /* Intentionally stall for a few cycles to leave L2 cache alone. */
        { bz zero, .Lbig_loop2 }

        /* On entry to this loop:
         * - r0 points to the start of dst line 0
         * - r1 points to start of src line 0
         * - r2 >= (256 - 60), only the first time the loop trips.
         * - r3 contains r1 + 128 + 60    [pointer to end of source line 2]
         *   This is our prefetch address. When we get near the end,
         *   rather than prefetching off the end, it is changed to point
         *   to some "safe" recently loaded address.
         * - r5 contains *(r1 + 60)       [i.e. last word of source line 0]
         * - r6 contains *(r1 + 64 + 60)  [i.e. last word of source line 1]
         * - r9 contains ((r0 + 63) & -64)
         *     [start of next dst cache line.]
         */

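        /* Each trip through .Lbig_loop copies three 64-byte source lines
         * (via the three .Lcopy_line calls below), keeping the prefetch
         * cursor r3 a couple of cache lines ahead of the line being copied.
         */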
.Lbig_loop:
        { jal .Lcopy_line2; add r15, r1, r2 }

.Lbig_loop2:
        /* Copy line 0, first stalling until r5 is ready. */
EX:     { move r12, r5; lw r16, r1 }
        { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
        /* Prefetch several lines ahead. */
EX:     { lw r5, r3; addi r3, r3, 64 }
        { jal .Lcopy_line }

        /* Copy line 1, first stalling until r6 is ready. */
EX:     { move r12, r6; lw r16, r1 }
        { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
        /* Prefetch several lines ahead. */
EX:     { lw r6, r3; addi r3, r3, 64 }
        { jal .Lcopy_line }

        /* Copy line 2, first stalling until r7 is ready. */
EX:     { move r12, r7; lw r16, r1 }
        { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
        /* Prefetch several lines ahead. */
EX:     { lw r7, r3; addi r3, r3, 64 }
        /* Use up a caches-busy cycle by jumping back to the top of the
         * loop. Might as well get it out of the way now.
         */
        { j .Lbig_loop }


        /* On entry:
         * - r0 points to the destination line.
         * - r1 points to the source line.
         * - r3 is the next prefetch address.
         * - r9 holds the last address used for wh64.
         * - r12 = WORD_15
         * - r16 = WORD_0.
         * - r17 == r1 + 16.
         * - r27 holds saved lr to restore.
         *
         * On exit:
         * - r0 is incremented by 64.
         * - r1 is incremented by 64, unless that would point to a word
         *   beyond the end of the source array, in which case it is redirected
         *   to point to an arbitrary word already in the cache.
         * - r2 is decremented by 64.
         * - r3 is unchanged, unless it points to a word beyond the
         *   end of the source array, in which case it is redirected
         *   to point to an arbitrary word already in the cache.
         *   Redirecting is OK since if we are that close to the end
         *   of the array we will not come back to this subroutine
         *   and use the contents of the prefetched address.
         * - r4 is nonzero iff r2 >= 64.
         * - r9 is incremented by 64, unless it points beyond the
         *   end of the last full destination cache line, in which
         *   case it is redirected to a "safe address" that can be
         *   clobbered (sp - 64)
         * - lr contains the value in r27.
         */

/* r26 unused */

.Lcopy_line:
        /* TODO: when r3 goes past the end, we would like to redirect it
         * to prefetch the last partial cache line (if any) just once, for the
         * benefit of the final cleanup loop. But we don't want to
         * prefetch that line more than once, or subsequent prefetches
         * will go into the RTF. But then .Lbig_loop should unconditionally
         * branch to top of loop to execute final prefetch, and its
         * nop should become a conditional branch.
         */

        /* We need two non-memory cycles here to cover the resources
         * used by the loads initiated by the caller.
         */
        { add r15, r1, r2 }
.Lcopy_line2:
        { slt_u r13, r3, r15; addi r17, r1, 16 }

        /* NOTE: this will stall for one cycle as L1 is busy. */

        /* Fill second L1D line. */
EX:     { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */

#if CHIP_HAS_WH64()
        /* Prepare destination line for writing. */
EX:     { wh64 r9; addi r9, r9, 64 }
#else
        /* Prefetch dest line */
        { prefetch r9; addi r9, r9, 64 }
#endif
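        /* (wh64, a "write hint", prepares the whole 64-byte destination line
         * for writing without fetching its old contents from memory, which
         * is safe here since every byte of the line is about to be
         * overwritten.)
         */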
        /* Load seven words that are L1D hits to cover wh64 L2 usage. */

        /* Load the three remaining words from the last L1D line, which
         * we know has already filled the L1D.
         */
EX:     { lw r4, r1;  addi r1, r1, 4;   addi r20, r1, 16 }   /* r4 = WORD_12 */
EX:     { lw r8, r1;  addi r1, r1, 4;   slt_u r13, r20, r15 }/* r8 = WORD_13 */
EX:     { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 }  /* r11 = WORD_14 */

        /* Load the three remaining words from the first L1D line, first
         * stalling until it has filled by "looking at" r16.
         */
EX:     { lw r13, r1; addi r1, r1, 4; move zero, r16 }   /* r13 = WORD_1 */
EX:     { lw r14, r1; addi r1, r1, 4 }                   /* r14 = WORD_2 */
EX:     { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */

        /* Load second word from the second L1D line, first
         * stalling until it has filled by "looking at" r17.
         */
EX:     { lw r19, r1; addi r1, r1, 4; move zero, r17 }  /* r19 = WORD_5 */

        /* Store last word to the destination line, potentially dirtying it
         * for the first time, which keeps the L2 busy for two cycles.
         */
EX:     { sw r10, r12 }                                 /* store(WORD_15) */

        /* Use two L1D hits to cover the sw L2 access above. */
EX:     { lw r10, r1; addi r1, r1, 4 }                  /* r10 = WORD_6 */
EX:     { lw r12, r1; addi r1, r1, 4 }                  /* r12 = WORD_7 */

        /* Fill third L1D line. */
EX:     { lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */

        /* Store first L1D line. */
EX:     { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
EX:     { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
EX:     { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
#if CHIP_HAS_WH64()
EX:     { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
#else
        /* Back r9 up to a cache line we are already storing to
         * if it gets past the end of the dest vector.  Strictly speaking,
         * we don't need to back up to the start of a cache line, but it's free
         * and tidy, so why not?
         */
EX:     { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
#endif
        /* Store second L1D line. */
EX:     { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
EX:     { sw r0, r19; addi r0, r0, 4 }                  /* store(WORD_5) */
EX:     { sw r0, r10; addi r0, r0, 4 }                  /* store(WORD_6) */
EX:     { sw r0, r12; addi r0, r0, 4 }                  /* store(WORD_7) */

EX:     { lw r13, r1; addi r1, r1, 4; move zero, r18 }  /* r13 = WORD_9 */
EX:     { lw r14, r1; addi r1, r1, 4 }                  /* r14 = WORD_10 */
EX:     { lw r15, r1; move r1, r20   }                  /* r15 = WORD_11 */

        /* Store third L1D line. */
EX:     { sw r0, r18; addi r0, r0, 4 }                  /* store(WORD_8) */
EX:     { sw r0, r13; addi r0, r0, 4 }                  /* store(WORD_9) */
EX:     { sw r0, r14; addi r0, r0, 4 }                  /* store(WORD_10) */
EX:     { sw r0, r15; addi r0, r0, 4 }                  /* store(WORD_11) */

        /* Store rest of fourth L1D line. */
EX:     { sw r0, r4;  addi r0, r0, 4 }                  /* store(WORD_12) */
        {
EX:     sw r0, r8                                       /* store(WORD_13) */
        addi r0, r0, 4
        /* Will r2 be > 64 after we subtract 64 below? */
        shri r4, r2, 7
        }
        {
EX:     sw r0, r11                                      /* store(WORD_14) */
        addi r0, r0, 8
        /* Record 64 bytes successfully copied. */
        addi r2, r2, -64
        }

        { jrp lr; move lr, r27 }

        /* Convey to the backtrace library that the stack frame is size
         * zero, and the real return address is on the stack rather than
         * in 'lr'.
         */
        { info 8 }

        .align 64
.Lcopy_unaligned_maybe_many:
        /* Skip the setup overhead if we aren't copying many bytes. */
        { slti_u r8, r2, 20; sub r4, zero, r0 }
        { bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 }
        { bz r4, .Ldest_is_word_aligned; add r18, r1, r2 }

/*
 *
 * unaligned 4 byte at a time copy handler.
 *
 */

        /* Copy single bytes until r0 == 0 mod 4, so we can store words. */
.Lalign_dest_loop:
EX:     { lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 }
EX:     { sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }
        { bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 }

        /* If source and dest are now *both* aligned, do an aligned copy. */
        { bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 }

.Ldest_is_word_aligned:

#if CHIP_HAS_DWORD_ALIGN()
EX:     { andi r8, r0, 63; lwadd_na r6, r1, 4}
        { slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }

        /* This copies unaligned words until either there are fewer
         * than 4 bytes left to copy, or until the destination pointer
         * is cache-aligned, whichever comes first.
         *
         * On entry:
         * - r0 is the next store address.
         * - r1 points 4 bytes past the load address corresponding to r0.
         * - r2 >= 4
         * - r6 is the next aligned word loaded.
         */
.Lcopy_unaligned_src_words:
EX:     { lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 }
        /* stall */
        { dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 }
EX:     { swadd r0, r6, 4; addi r2, r2, -4 }
        { bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 }
        { bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 }

        /* On entry:
         * - r0 is the next store address.
         * - r1 points 4 bytes past the load address corresponding to r0.
         * - r2 >= 4 (# of bytes left to store).
         * - r6 is the next aligned src word value.
         * - r9 = (r2 < 64U).
         * - r18 points one byte past the end of source memory.
         */
.Ldest_is_L2_line_aligned:

        {
        /* Not a full cache line remains. */
        bnz r9, .Lcleanup_unaligned_words
        move r7, r6
        }

        /* r2 >= 64 */

        /* Kick off two prefetches, but don't go past the end. */
        { addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 }
        { prefetch r3; move r3, r8; slt_u r8, r8, r18 }
        { mvz r3, r8, r1; addi r8, r3, 64 }
        { prefetch r3; move r3, r8; slt_u r8, r8, r18 }
        { mvz r3, r8, r1; movei r17, 0 }
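        /* (mvz redirects the prefetch pointer r3 back to r1, an address we
         * are already reading, whenever the next candidate would reach r18,
         * one byte past the end of the source, so we never prefetch beyond
         * the source buffer.)
         */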

.Lcopy_unaligned_line:
        /* Prefetch another line. */
        { prefetch r3; addi r15, r1, 60; addi r3, r3, 64 }
        /* Fire off a load of the last word we are about to copy. */
EX:     { lw_na r15, r15; slt_u r8, r3, r18 }

EX:     { mvz r3, r8, r1; wh64 r0 }

        /* This loop runs twice.
         *
         * On entry:
         * - r17 is even before the first iteration, and odd before
         *   the second.  It is incremented inside the loop.  Encountering
         *   an even value at the end of the loop makes it stop.
         */
.Lcopy_half_an_unaligned_line:
EX:     {
        /* Stall until the last byte is ready. In the steady state this
         * guarantees all words to load below will be in the L2 cache, which
         * avoids shunting the loads to the RTF.
         */
        move zero, r15
        lwadd_na r7, r1, 16
        }
EX:     { lwadd_na r11, r1, 12 }
EX:     { lwadd_na r14, r1, -24 }
EX:     { lwadd_na r8, r1, 4 }
EX:     { lwadd_na r9, r1, 4 }
EX:     {
        lwadd_na r10, r1, 8
        /* r16 = (r2 < 64), after we subtract 32 from r2 below. */
        slti_u r16, r2, 64 + 32
        }
EX:     { lwadd_na r12, r1, 4; addi r17, r17, 1 }
EX:     { lwadd_na r13, r1, 8; dword_align r6, r7, r1 }
EX:     { swadd r0, r6,  4; dword_align r7,  r8,  r1 }
EX:     { swadd r0, r7,  4; dword_align r8,  r9,  r1 }
EX:     { swadd r0, r8,  4; dword_align r9,  r10, r1 }
EX:     { swadd r0, r9,  4; dword_align r10, r11, r1 }
EX:     { swadd r0, r10, 4; dword_align r11, r12, r1 }
EX:     { swadd r0, r11, 4; dword_align r12, r13, r1 }
EX:     { swadd r0, r12, 4; dword_align r13, r14, r1 }
EX:     { swadd r0, r13, 4; addi r2, r2, -32 }
        { move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line }

        { bzt r16, .Lcopy_unaligned_line; move r7, r6 }

        /* On entry:
         * - r0 is the next store address.
         * - r1 points 4 bytes past the load address corresponding to r0.
         * - r2 >= 0 (# of bytes left to store).
         * - r7 is the next aligned src word value.
         */
.Lcleanup_unaligned_words:
        /* Handle any trailing bytes. */
        { bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 }
        { bzt r8, .Lcopy_unaligned_src_words; move r6, r7 }

        /* Move r1 back to the point where it corresponds to r0. */
        { addi r1, r1, -4 }

#else /* !CHIP_HAS_DWORD_ALIGN() */

        /* Compute right/left shift counts and load initial source words. */
        { andi r5, r1, -4; andi r3, r1, 3 }
EX:     { lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
EX:     { lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
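        /* (r3 is the right-shift count, 8 * (src & 3) bits; r4 = -r3 serves
         * as the complementary left-shift count, assuming shift amounts are
         * taken modulo the word size; each output word below is then built
         * as (r6 >> r3) | (r7 << r4).)
         */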

        /* Load and store one word at a time, using shifts and ORs
         * to correct for the misaligned src.
         */
.Lcopy_unaligned_src_loop:
        { shr r6, r6, r3; shl r8, r7, r4 }
EX:     { lw r7, r5; or r8, r8, r6; move r6, r7 }
EX:     { sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
        { addi r5, r5, 4; slti_u r8, r2, 8 }
        { bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }

        { bz r2, .Lcopy_unaligned_done }
#endif /* !CHIP_HAS_DWORD_ALIGN() */

        /* Fall through */

/*
 *
 * 1 byte at a time copy handler.
 *
 */

.Lcopy_unaligned_few:
EX:     { lb_u r3, r1; addi r1, r1, 1 }
EX:     { sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }
        { bnzt r2, .Lcopy_unaligned_few }

.Lcopy_unaligned_done:

        /* For memcpy return original dest address, else zero. */
        { mz r0, r29, r23; jrp lr }

.Lend_memcpy_common:
        .size memcpy_common, .Lend_memcpy_common - memcpy_common

        .section .fixup,"ax"
memcpy_common_fixup:
        .type memcpy_common_fixup, @function

        /* Skip any bytes we already successfully copied.
         * r2 (num remaining) is correct, but r0 (dst) and r1 (src)
         * may not be quite right because of unrolling and prefetching.
         * So we need to recompute their values as the address just
         * after the last byte we are sure was successfully loaded and
         * then stored.
         */

        /* Determine how many bytes we successfully copied. */
        { sub r3, r25, r2 }

        /* Add this to the original r0 and r1 to get their new values. */
        { add r0, r23, r3; add r1, r24, r3 }

        { bzt r29, memcpy_fixup_loop }
        { blzt r29, copy_to_user_fixup_loop }

copy_from_user_fixup_loop:
        /* Try copying the rest one byte at a time, expecting a load fault. */
.Lcfu:  { lb_u r3, r1; addi r1, r1, 1 }
        { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
        { bnzt r2, copy_from_user_fixup_loop }

.Lcopy_from_user_fixup_zero_remainder:
        { bbs r29, 2f }  /* low bit set means IS_COPY_FROM_USER */
        /* byte-at-a-time loop faulted, so zero the rest. */
        { move r3, r2; bz r2, 2f /* should be impossible, but handle it. */ }
1:      { sb r0, zero; addi r0, r0, 1; addi r3, r3, -1 }
        { bnzt r3, 1b }
2:      move lr, r27
        { move r0, r2; jrp lr }

copy_to_user_fixup_loop:
        /* Try copying the rest one byte at a time, expecting a store fault. */
        { lb_u r3, r1; addi r1, r1, 1 }
.Lctu:  { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
        { bnzt r2, copy_to_user_fixup_loop }
.Lcopy_to_user_fixup_done:
        move lr, r27
        { move r0, r2; jrp lr }

memcpy_fixup_loop:
        /* Try copying the rest one byte at a time. We expect a disastrous
         * fault to happen since we are in fixup code, but let it happen.
         */
        { lb_u r3, r1; addi r1, r1, 1 }
        { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
        { bnzt r2, memcpy_fixup_loop }
        /* This should be unreachable; we should have faulted again.
         * But be paranoid and handle it in case some interrupt changed
         * the TLB or something.
         */
        move lr, r27
        { move r0, r23; jrp lr }

        .size memcpy_common_fixup, . - memcpy_common_fixup

        .section __ex_table,"a"
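        /* These entries handle faults in the fixup loops themselves: a load
         * fault at .Lcfu resumes at .Lcopy_from_user_fixup_zero_remainder
         * (which zeroes what's left for the zeroing variant), and a store
         * fault at .Lctu resumes at .Lcopy_to_user_fixup_done, returning the
         * remaining count.
         */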
        .word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
        .word .Lctu, .Lcopy_to_user_fixup_done