linux/arch/mips/lib/memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond
 * the end of memory.  It's also a seriously bad idea on non-dma-coherent
 * systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
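/*
 * Illustrative example with hypothetical numbers: a copy_from_user of
 * len = 128 bytes that takes a fault partway through returns with a2
 * holding an upper bound on the bytes not copied, e.g. a2 = 40 when at
 * least 88 bytes made it across; a2 = 0 means the whole buffer was copied.
 */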

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry).
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
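/*
 * Worked example of these invariants (hypothetical addresses): entering
 * __copy_user with src_entry = 0x1000 and len = 0x40 makes AT = 0x1040.
 * If a load then faults at 0x1028, the fixup below reads that address
 * from THREAD_BUADDR and computes len = AT - 0x1028 = 0x18 uncopied
 * bytes, while invariant (3) lets it locate the matching spot in dst
 * that still needs to be zeroed.
 */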

#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
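/*
 * For illustration, one invocation from the aligned copy loop,
 *
 *      EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 *
 * expands roughly to
 *
 *      9:      LOAD    t0, UNIT(0)(src);
 *              .section __ex_table,"a";
 *              PTR     9b, .Ll_exc;
 *              .previous
 *
 * i.e. the access itself plus an __ex_table entry mapping its address
 * to the .Ll_exc fixup code.
 */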

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing a code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the registers from the
 * n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
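/*
 * For reference, with NBYTES == 8 (64-bit build): FIRST(0) == 0,
 * REST(0) == 7, FIRST(1) == 8, REST(1) == 15 and ADDRMASK == 0x7;
 * with NBYTES == 4 the same macros give 0/3, 4/7 and ADDRMASK == 0x3.
 * FIRST/REST bracket the unaligned unit accessed by an LDFIRST/LDREST
 * (or STFIRST/STREST) pair.
 */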

        .text
        .set    noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        .set    noat
#else
        .set    at=v1
#endif

/*
 * t6 is used as a flag to note inatomic mode.
 */
LEAF(__copy_user_inatomic)
        b       __copy_user_common
         li     t6, 1
        END(__copy_user_inatomic)

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
        move    v0, dst                         /* return value */
.L__memcpy:
FEXPORT(__copy_user)
        li      t6, 0   /* not inatomic */
__copy_user_common:
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
#define rem t8

        R10KCBARRIER(0(ra))
        /*
         * The "issue break"s below are very approximate.
         * Issue delays for dcache fills will perturb the schedule, as will
         * load queue full replay traps, etc.
         *
         * If len < NBYTES use byte operations.
         */
        PREF(   0, 0(src) )
        PREF(   1, 0(dst) )
        sltu    t2, len, NBYTES
        and     t1, dst, ADDRMASK
        PREF(   0, 1*32(src) )
        PREF(   1, 1*32(dst) )
        bnez    t2, .Lcopy_bytes_checklen
         and    t0, src, ADDRMASK
        PREF(   0, 2*32(src) )
        PREF(   1, 2*32(dst) )
        bnez    t1, .Ldst_unaligned
         nop
        bnez    t0, .Lsrc_unaligned_dst_aligned
        /*
         * use delay slot for fall-through
         * src and dst are aligned; need to compute rem
         */
.Lboth_aligned:
         SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
        beqz    t0, .Lcleanup_both_aligned # len < 8*NBYTES
         and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
        PREF(   0, 3*32(src) )
        PREF(   1, 3*32(dst) )
        .align  4
1:
        R10KCBARRIER(0(ra))
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
EXC(    LOAD    t7, UNIT(5)(src),       .Ll_exc_copy)
EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p7u)
EXC(    LOAD    t0, UNIT(6)(src),       .Ll_exc_copy)
EXC(    LOAD    t1, UNIT(7)(src),       .Ll_exc_copy)
        ADD     src, src, 8*NBYTES
        ADD     dst, dst, 8*NBYTES
EXC(    STORE   t2, UNIT(-6)(dst),      .Ls_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst),      .Ls_exc_p5u)
EXC(    STORE   t4, UNIT(-4)(dst),      .Ls_exc_p4u)
EXC(    STORE   t7, UNIT(-3)(dst),      .Ls_exc_p3u)
EXC(    STORE   t0, UNIT(-2)(dst),      .Ls_exc_p2u)
EXC(    STORE   t1, UNIT(-1)(dst),      .Ls_exc_p1u)
        PREF(   0, 8*32(src) )
        PREF(   1, 8*32(dst) )
        bne     len, rem, 1b
         nop

        /*
         * len == rem == the number of bytes left to copy < 8*NBYTES
         */
.Lcleanup_both_aligned:
        beqz    len, .Ldone
         sltu   t0, len, 4*NBYTES
        bnez    t0, .Lless_than_4units
         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
        /*
         * len >= 4*NBYTES
         */
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
        ADD     src, src, 4*NBYTES
        R10KCBARRIER(0(ra))
EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        beqz    len, .Ldone
        .set    noreorder
.Lless_than_4units:
        /*
         * rem = len % NBYTES
         */
        beq     rem, len, .Lcopy_bytes
         nop
1:
        R10KCBARRIER(0(ra))
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     rem, len, 1b
        .set    noreorder

        /*
         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
         * A loop would do only a byte at a time with possible branch
         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
         * because can't assume read-access to dst.  Instead, use
         * STREST dst, which doesn't require read access to dst.
         *
         * This code should perform better than a simple loop on modern,
         * wide-issue mips processors because the code has fewer branches and
         * more instruction-level parallelism.
         */
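        /*
         * Worked example (NBYTES == 8): for len == 3 the code below
         * computes rem = 3*8 = 24 bits to keep and bits = 64 - 24 = 40
         * bits to discard; SHIFT_DISCARD shifts the bytes beyond len out
         * of t0 and STREST then writes just the surviving 3 bytes,
         * ending at the byte before t1 = dst + len.
         */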
#define bits t2
        beqz    len, .Ldone
         ADD    t1, dst, len    # t1 is just past last byte of dst
        li      bits, 8*NBYTES
        SLL     rem, len, 3     # rem = number of bits to keep
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        SUB     bits, bits, rem # bits = number of bits to discard
        SHIFT_DISCARD t0, t0, bits
EXC(    STREST  t0, -1(t1),             .Ls_exc)
        jr      ra
         move   len, zero
.Ldst_unaligned:
        /*
         * dst is unaligned
         * t0 = src & ADDRMASK
         * t1 = dst & ADDRMASK; t1 > 0
         * len >= NBYTES
         *
         * Copy enough bytes to align dst
         * Set match = (src and dst have same alignment)
         */
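        /*
         * Example (NBYTES == 8): with dst & ADDRMASK == 3 the code below
         * stores t2 = 8 - 3 = 5 bytes so that dst becomes 8-byte aligned;
         * match = (src & ADDRMASK) ^ (dst & ADDRMASK) is zero only when
         * src had the same misalignment, in which case both pointers are
         * now aligned and the copy resumes at .Lboth_aligned.
         */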
#define match rem
EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
        ADD     t2, zero, NBYTES
EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
        SUB     t2, t2, t1      # t2 = number of bytes copied
        xor     match, t0, t1
        R10KCBARRIER(0(ra))
EXC(    STFIRST t3, FIRST(0)(dst),      .Ls_exc)
        beq     len, t2, .Ldone
         SUB    len, len, t2
        ADD     dst, dst, t2
        beqz    match, .Lboth_aligned
         ADD    src, src, t2

.Lsrc_unaligned_dst_aligned:
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        PREF(   0, 3*32(src) )
        beqz    t0, .Lcleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
        PREF(   1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
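/*
 * Roughly speaking, for src & ADDRMASK == 5 (NBYTES == 8) the LDFIRST
 * below fetches the 3 bytes from src up to the next doubleword boundary
 * and the matching LDREST fetches the remaining 5, so each pair
 * assembles one unaligned doubleword in a single register.
 */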
        R10KCBARRIER(0(ra))
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
        PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
        ADD     src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
        nop                             # improves slotting
#endif
EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
        PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcleanup_src_unaligned:
        beqz    len, .Ldone
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, .Lcopy_bytes
         nop
1:
        R10KCBARRIER(0(ra))
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcopy_bytes_checklen:
        beqz    len, .Ldone
         nop
.Lcopy_bytes:
        /* 0 < len < NBYTES  */
        R10KCBARRIER(0(ra))
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), .Ll_exc);   \
        SUB     len, len, 1;            \
        beqz    len, .Ldone;            \
EXC(     sb     t0, N(dst), .Ls_exc_p1)

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
EXC(    lb      t0, NBYTES-2(src), .Ll_exc)
        SUB     len, len, 1
        jr      ra
EXC(     sb     t0, NBYTES-2(dst), .Ls_exc_p1)
.Ldone:
        jr      ra
         nop
        END(memcpy)

.Ll_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     .Ll_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 1
        bne     src, t0, 1b
        .set    noreorder
.Ll_exc:
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len number of uncopied bytes
        bnez    t6, .Ldone      /* Skip the zeroing part if inatomic */
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         *   See (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address in dst (a0)
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
        .set    reorder                         /* DADDI_WAR */
        SUB     src, len, 1
        beqz    len, .Ldone
        .set    noreorder
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        bnez    src, 1b
         SUB    src, src, 1
#else
        .set    push
        .set    noat
        li      v1, 1
        bnez    src, 1b
         SUB    src, src, v1
        .set    pop
#endif
        jr      ra
         nop


#define SEXC(n)                                                 \
        .set    reorder;                        /* DADDI_WAR */ \
.Ls_exc_p ## n ## u:                                            \
        ADD     len, len, n*NBYTES;                             \
        jr      ra;                                             \
        .set    noreorder

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)
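/*
 * For illustration, SEXC(4) above expands roughly to
 *
 *      .Ls_exc_p4u:
 *              ADD     len, len, 4*NBYTES
 *              jr      ra
 *
 * i.e. the fixup taken when a store faults with four units of the
 * current iteration still unwritten: it adds those 4*NBYTES back onto
 * len (already subtracted by the copy loop) and returns.
 */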

.Ls_exc_p1:
        .set    reorder                         /* DADDI_WAR */
        ADD     len, len, 1
        jr      ra
        .set    noreorder
.Ls_exc:
        jr      ra
         nop

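/*
 * Worked example of the overlap test below (hypothetical addresses):
 * with dst = 0x1000, src = 0x1008 and len = 0x10, both (src < dst + len)
 * and (dst < src + len) hold, so the buffers overlap and memmove falls
 * through to __rmemcpy; if either test fails the buffers are disjoint
 * and a plain memcpy is used instead.
 */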
        .align  5
LEAF(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0                      # dst + len <= src -> memcpy
        sltu    t1, a0, t1                      # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, .L__memcpy
         move   v0, a0                          /* return value */
        beqz    a2, .Lr_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
         sltu   t0, a1, a0
        beqz    t0, .Lr_end_bytes_up            # src >= dst
         nop
        ADD     a0, a2                          # dst = dst + len
        ADD     a1, a2                          # src = src + len

.Lr_end_bytes:
        R10KCBARRIER(0(ra))
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        .set    reorder                         /* DADDI_WAR */
        SUB     a0, a0, 0x1
        bnez    a2, .Lr_end_bytes
        .set    noreorder

.Lr_out:
        jr      ra
         move   a2, zero

.Lr_end_bytes_up:
        R10KCBARRIER(0(ra))
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        .set    reorder                         /* DADDI_WAR */
        ADD     a0, a0, 0x1
        bnez    a2, .Lr_end_bytes_up
        .set    noreorder

        jr      ra
         move   a2, zero
        END(__rmemcpy)