linux/arch/mips/lib/memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond
 * the end of memory.  It's also a seriously bad idea on non dma-coherent
 * systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
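
/*
 * A minimal C model of the __copy_user contract above (an illustrative
 * sketch only, not the kernel implementation; the hypothetical
 * fault_on_load()/fault_on_store() stand in for the MMU exceptions that
 * the __ex_table entries below actually handle):
 *
 *	size_t copy_user_model(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (len) {
 *			if (fault_on_load(s)) {
 *				memset(d, 0, len);	// never leak old data
 *				return len;		// bytes left uncopied
 *			}
 *			if (fault_on_store(d))
 *				return len;		// no zeroing needed
 *			*d++ = *s++;
 *			len--;
 *		}
 *		return 0;
 *	}
 */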

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
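
/*
 * A small worked example of how (1)-(3) are used for recovery (assumed
 * numbers, for illustration only): suppose src_entry = 0x1000,
 * dst_entry = 0x5000 and len_entry = 0x100, so AT = 0x1100 per (1).
 * If a load faults at address 0x1040, the handler reads the bad address
 * from THREAD_BUADDR and, relying on (2) and (3), computes
 *
 *	uncopied len = AT - 0x1040           = 0xc0
 *	dst          = dst + (0x1040 - src)  = first byte left to clear
 *
 * which is exactly what the .Ll_exc code at the bottom of this file does
 * before zeroing the remainder of the destination.
 */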

#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
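
/*
 * For reference, EXC(LOAD t0, UNIT(0)(src), .Ll_exc) expands to roughly:
 *
 *	9:	LOAD	t0, UNIT(0)(src)
 *		.section __ex_table, "a"
 *		PTR	9b, .Ll_exc
 *		.previous
 *
 * i.e. every potentially faulting access gets an __ex_table entry pairing
 * its address with a fixup label, so a fault resumes at the named handler
 * instead of killing the kernel.
 */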

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing code with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
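
/*
 * LDFIRST/LDREST (lwl/lwr or ldl/ldr, chosen per endianness above) let an
 * unaligned NBYTES-wide word be read with two accesses that never cross
 * an alignment boundary; FIRST(unit)/REST(unit) supply the two offsets
 * for the unit'th word of the copy.  A portable C sketch of the combined
 * effect, byte at a time and purely illustrative (the real instruction
 * pair needs no loop):
 *
 *	// little-endian shown; big-endian places p[0] in the top byte
 *	unsigned long load_unaligned(const unsigned char *p)
 *	{
 *		unsigned long v = 0;
 *		int i;
 *
 *		for (i = 0; i < NBYTES; i++)
 *			v |= (unsigned long)p[i] << (8 * i);
 *		return v;
 *	}
 */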

        .text
        .set    noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        .set    noat
#else
        .set    at=v1
#endif

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
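
/*
 * Rough shape of the code below, as a sketch (illustrative only; the
 * labels in parentheses are the ones actually used):
 *
 *	if (len < NBYTES)
 *		copy byte by byte                    (.Lcopy_bytes_checklen)
 *	else if (dst & ADDRMASK)
 *		copy a partial word to align dst,    (.Ldst_unaligned)
 *		then continue with one of the two cases below
 *	else if (src & ADDRMASK)
 *		word loop using LDFIRST/LDREST       (.Lsrc_unaligned_dst_aligned)
 *	else
 *		8-word unrolled loop, 1-word loop,   (.Lboth_aligned)
 *		then the sub-word tail
 */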
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
        move    v0, dst                         /* return value */
.L__memcpy:
FEXPORT(__copy_user)
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
#define rem t8

        R10KCBARRIER(0(ra))
        /*
         * The "issue break"s below are very approximate.
         * Issue delays for dcache fills will perturb the schedule, as will
         * load queue full replay traps, etc.
         *
         * If len < NBYTES use byte operations.
         */
        PREF(   0, 0(src) )
        PREF(   1, 0(dst) )
        sltu    t2, len, NBYTES
        and     t1, dst, ADDRMASK
        PREF(   0, 1*32(src) )
        PREF(   1, 1*32(dst) )
        bnez    t2, .Lcopy_bytes_checklen
         and    t0, src, ADDRMASK
        PREF(   0, 2*32(src) )
        PREF(   1, 2*32(dst) )
        bnez    t1, .Ldst_unaligned
         nop
        bnez    t0, .Lsrc_unaligned_dst_aligned
        /*
         * use delay slot for fall-through
         * src and dst are aligned; need to compute rem
         */
.Lboth_aligned:
         SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
        beqz    t0, .Lcleanup_both_aligned # len < 8*NBYTES
         and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
        PREF(   0, 3*32(src) )
        PREF(   1, 3*32(dst) )
        .align  4
1:
        R10KCBARRIER(0(ra))
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
EXC(    LOAD    t7, UNIT(5)(src),       .Ll_exc_copy)
EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p7u)
EXC(    LOAD    t0, UNIT(6)(src),       .Ll_exc_copy)
EXC(    LOAD    t1, UNIT(7)(src),       .Ll_exc_copy)
        ADD     src, src, 8*NBYTES
        ADD     dst, dst, 8*NBYTES
EXC(    STORE   t2, UNIT(-6)(dst),      .Ls_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst),      .Ls_exc_p5u)
EXC(    STORE   t4, UNIT(-4)(dst),      .Ls_exc_p4u)
EXC(    STORE   t7, UNIT(-3)(dst),      .Ls_exc_p3u)
EXC(    STORE   t0, UNIT(-2)(dst),      .Ls_exc_p2u)
EXC(    STORE   t1, UNIT(-1)(dst),      .Ls_exc_p1u)
        PREF(   0, 8*32(src) )
        PREF(   1, 8*32(dst) )
        bne     len, rem, 1b
         nop

        /*
         * len == rem == the number of bytes left to copy < 8*NBYTES
         */
.Lcleanup_both_aligned:
        beqz    len, .Ldone
         sltu   t0, len, 4*NBYTES
        bnez    t0, .Lless_than_4units
         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
        /*
         * len >= 4*NBYTES
         */
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
        ADD     src, src, 4*NBYTES
        R10KCBARRIER(0(ra))
EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        beqz    len, .Ldone
        .set    noreorder
.Lless_than_4units:
        /*
         * rem = len % NBYTES
         */
        beq     rem, len, .Lcopy_bytes
         nop
1:
        R10KCBARRIER(0(ra))
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     rem, len, 1b
        .set    noreorder

        /*
         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
         * A loop would do only a byte at a time with possible branch
         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
         * because can't assume read-access to dst.  Instead, use
         * STREST dst, which doesn't require read access to dst.
         *
         * This code should perform better than a simple loop on modern,
         * wide-issue mips processors because the code has fewer branches and
         * more instruction-level parallelism.
         */
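        /*
         * Worked example of the trick above (assumed numbers, 32-bit
         * case): with NBYTES = 4 and len = 3, rem = 24 bits are kept and
         * bits = 8 bits discarded.  SHIFT_DISCARD shifts the one unwanted
         * byte out of the loaded word (the direction depends on
         * endianness) and STREST at -1(t1) = dst + len - 1 writes exactly
         * the three surviving bytes, giving the same result as
         *
         *	for (i = 0; i < len; i++)
         *		dst[i] = src[i];
         *
         * with a single load, a single store, and no read of dst.
         */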
#define bits t2
        beqz    len, .Ldone
         ADD    t1, dst, len    # t1 is just past last byte of dst
        li      bits, 8*NBYTES
        SLL     rem, len, 3     # rem = number of bits to keep
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        SUB     bits, bits, rem # bits = number of bits to discard
        SHIFT_DISCARD t0, t0, bits
EXC(    STREST  t0, -1(t1),             .Ls_exc)
        jr      ra
         move   len, zero
.Ldst_unaligned:
        /*
         * dst is unaligned
         * t0 = src & ADDRMASK
         * t1 = dst & ADDRMASK; t1 > 0
         * len >= NBYTES
         *
         * Copy enough bytes to align dst
         * Set match = (src and dst have same alignment)
         */
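        /*
         * Example (assumed numbers, 64-bit case): NBYTES = 8,
         * dst & ADDRMASK = 3 and src & ADDRMASK = 3.  Then
         * t2 = 8 - 3 = 5 bytes are copied with one unaligned load /
         * partial store pair, after which dst is 8-byte aligned;
         * match = t0 ^ t1 is zero here, so src is now aligned as well and
         * the code branches back to .Lboth_aligned.  Had the alignments
         * differed, it would fall through to .Lsrc_unaligned_dst_aligned.
         */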
#define match rem
EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
        ADD     t2, zero, NBYTES
EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
        SUB     t2, t2, t1      # t2 = number of bytes copied
        xor     match, t0, t1
        R10KCBARRIER(0(ra))
EXC(    STFIRST t3, FIRST(0)(dst),      .Ls_exc)
        beq     len, t2, .Ldone
         SUB    len, len, t2
        ADD     dst, dst, t2
        beqz    match, .Lboth_aligned
         ADD    src, src, t2

.Lsrc_unaligned_dst_aligned:
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        PREF(   0, 3*32(src) )
        beqz    t0, .Lcleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
        PREF(   1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
        R10KCBARRIER(0(ra))
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
        PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
        ADD     src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
        nop                             # improves slotting
#endif
EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
        PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcleanup_src_unaligned:
        beqz    len, .Ldone
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, .Lcopy_bytes
         nop
1:
        R10KCBARRIER(0(ra))
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcopy_bytes_checklen:
        beqz    len, .Ldone
         nop
.Lcopy_bytes:
        /* 0 < len < NBYTES  */
        R10KCBARRIER(0(ra))
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), .Ll_exc);   \
        SUB     len, len, 1;            \
        beqz    len, .Ldone;            \
EXC(     sb     t0, N(dst), .Ls_exc_p1)

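/*
 * Each COPY_BYTE(N) expands (via EXC) to roughly:
 *
 *	lb	t0, N(src)		# a faulting load goes to .Ll_exc
 *	SUB	len, len, 1
 *	beqz	len, .Ldone
 *	 sb	t0, N(dst)		# delay slot: the byte is stored even
 *					# when exiting; a faulting store
 *					# goes to .Ls_exc_p1
 *
 * so each unrolled step copies one byte and leaves as soon as len hits 0.
 */
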
        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
EXC(    lb      t0, NBYTES-2(src), .Ll_exc)
        SUB     len, len, 1
        jr      ra
EXC(     sb     t0, NBYTES-2(dst), .Ls_exc_p1)
.Ldone:
        jr      ra
         nop
        END(memcpy)

.Ll_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     .Ll_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 1
        bne     src, t0, 1b
        .set    noreorder
.Ll_exc:
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len = number of uncopied bytes
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         *   See (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address in dst
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
        .set    reorder                         /* DADDI_WAR */
        SUB     src, len, 1
        beqz    len, .Ldone
        .set    noreorder
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        bnez    src, 1b
         SUB    src, src, 1
#else
        .set    push
        .set    noat
        li      v1, 1
        bnez    src, 1b
         SUB    src, src, v1
        .set    pop
#endif
        jr      ra
         nop


#define SEXC(n)                                                 \
        .set    reorder;                        /* DADDI_WAR */ \
.Ls_exc_p ## n ## u:                                            \
        ADD     len, len, n*NBYTES;                             \
        jr      ra;                                             \
        .set    noreorder

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

.Ls_exc_p1:
        .set    reorder                         /* DADDI_WAR */
        ADD     len, len, 1
        jr      ra
        .set    noreorder
.Ls_exc:
        jr      ra
         nop

        .align  5
LEAF(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0                      # dst + len <= src -> memcpy
        sltu    t1, a0, t1                      # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, .L__memcpy
         move   v0, a0                          /* return value */
        beqz    a2, .Lr_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
         sltu   t0, a1, a0
        beqz    t0, .Lr_end_bytes_up            # src >= dst
         nop
        ADD     a0, a2                          # dst = dst + len
        ADD     a1, a2                          # src = src + len

.Lr_end_bytes:
        R10KCBARRIER(0(ra))
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        .set    reorder                         /* DADDI_WAR */
        SUB     a0, a0, 0x1
        bnez    a2, .Lr_end_bytes
        .set    noreorder

.Lr_out:
        jr      ra
         move   a2, zero

.Lr_end_bytes_up:
        R10KCBARRIER(0(ra))
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        .set    reorder                         /* DADDI_WAR */
        ADD     a0, a0, 0x1
        bnez    a2, .Lr_end_bytes_up
        .set    noreorder

        jr      ra
         move   a2, zero
        END(__rmemcpy)

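/*
 * A C-level sketch of the memmove/__rmemcpy policy above (illustrative
 * only, not the kernel implementation): take the fast forward path when
 * the regions cannot collide, otherwise copy bytes in whichever direction
 * is safe for the overlap.
 *
 *	void *memmove_model(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if (s >= d + len || d >= s + len)
 *			return memcpy(dst, src, len);	// no overlap
 *		if (s < d)				// copy backwards
 *			while (len--)
 *				d[len] = s[len];
 *		else					// copy forwards
 *			while (len--)
 *				*d++ = *s++;
 *		return dst;
 *	}
 */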