linux/arch/mips/lib/memcpy-inatomic.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non-dma-coherent
 * systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
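/*
 * For illustration, EXC(LOAD t0, 0(src), .Ll_exc) expands to
 *
 *   9:      LOAD t0, 0(src)
 *           .section __ex_table,"a"
 *           PTR     9b, .Ll_exc
 *           .previous
 *
 * i.e. the access instruction gets a local label and an __ex_table entry
 * pairing its address with the fixup handler, so a fault at that
 * instruction resumes at the handler instead of killing the kernel.
 */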

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
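/*
 * FIRST(u) and REST(u) address the first and last byte of unit u; the
 * LDFIRST/LDREST (and STFIRST/STREST) pair uses them to access one
 * NBYTES-wide unit at an arbitrary alignment.  ADDRMASK picks out the
 * sub-unit alignment bits of a pointer.
 */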

        .text
        .set    noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        .set    noat
#else
        .set    at=v1
#endif
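/*
 * AT is reserved here: it holds the end-of-source address the load fault
 * handler relies on (see above), so plain builds run with .set noat.  When
 * the DADDI workarounds are enabled, the assembler is told to use v1
 * instead of AT as its temporary for any synthesized sequences.
 */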

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
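/*
 * Note: only the __copy_user_inatomic entry point is assembled in this
 * file.  It follows the __copy_user conventions described above, and its
 * fault handlers (at the end of the file) report the number of uncopied
 * bytes without touching the rest of the destination buffer.
 */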
        .align  5
LEAF(__copy_user_inatomic)
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
#define rem t8

        /*
         * The "issue break"s below are very approximate.
         * Issue delays for dcache fills will perturb the schedule, as will
         * load queue full replay traps, etc.
         *
         * If len < NBYTES use byte operations.
         */
        PREF(   0, 0(src) )
        PREF(   1, 0(dst) )
        sltu    t2, len, NBYTES
        and     t1, dst, ADDRMASK
        PREF(   0, 1*32(src) )
        PREF(   1, 1*32(dst) )
        bnez    t2, .Lcopy_bytes_checklen
         and    t0, src, ADDRMASK
        PREF(   0, 2*32(src) )
        PREF(   1, 2*32(dst) )
        bnez    t1, .Ldst_unaligned
         nop
        bnez    t0, .Lsrc_unaligned_dst_aligned
        /*
         * use delay slot for fall-through
         * src and dst are aligned; need to compute rem
         */
.Lboth_aligned:
         SRL    t0, len, LOG_NBYTES+3           # +3 for 8 units/iter
        beqz    t0, .Lcleanup_both_aligned      # len < 8*NBYTES
         and    rem, len, (8*NBYTES-1)          # rem = len % (8*NBYTES)
        PREF(   0, 3*32(src) )
        PREF(   1, 3*32(dst) )
        .align  4
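        /*
         * Main 8-units-per-iteration loop for the fully aligned case: the
         * loads are grouped ahead of the stores to expose ILP on
         * wide-issue cores.  The first load faults straight to .Ll_exc
         * (nothing from this iteration is pending yet); the later ones go
         * via .Ll_exc_copy, so bytes that are still readable ahead of the
         * fault get copied to dst before the uncopied count is reported.
         */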
1:
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
EXC(    LOAD    t7, UNIT(5)(src),       .Ll_exc_copy)
        STORE   t0, UNIT(0)(dst)
        STORE   t1, UNIT(1)(dst)
EXC(    LOAD    t0, UNIT(6)(src),       .Ll_exc_copy)
EXC(    LOAD    t1, UNIT(7)(src),       .Ll_exc_copy)
        ADD     src, src, 8*NBYTES
        ADD     dst, dst, 8*NBYTES
        STORE   t2, UNIT(-6)(dst)
        STORE   t3, UNIT(-5)(dst)
        STORE   t4, UNIT(-4)(dst)
        STORE   t7, UNIT(-3)(dst)
        STORE   t0, UNIT(-2)(dst)
        STORE   t1, UNIT(-1)(dst)
        PREF(   0, 8*32(src) )
        PREF(   1, 8*32(dst) )
        bne     len, rem, 1b
         nop

        /*
         * len == rem == the number of bytes left to copy < 8*NBYTES
         */
.Lcleanup_both_aligned:
        beqz    len, .Ldone
         sltu   t0, len, 4*NBYTES
        bnez    t0, .Lless_than_4units
         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
        /*
         * len >= 4*NBYTES
         */
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
        ADD     src, src, 4*NBYTES
        STORE   t0, UNIT(0)(dst)
        STORE   t1, UNIT(1)(dst)
        STORE   t2, UNIT(2)(dst)
        STORE   t3, UNIT(3)(dst)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        beqz    len, .Ldone
        .set    noreorder
.Lless_than_4units:
        /*
         * rem = len % NBYTES
         */
        beq     rem, len, .Lcopy_bytes
         nop
1:
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
        STORE   t0, 0(dst)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     rem, len, 1b
        .set    noreorder

        /*
         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
         * A loop would do only a byte at a time with possible branch
         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
         * because can't assume read-access to dst.  Instead, use
         * STREST dst, which doesn't require read access to dst.
         *
         * This code should perform better than a simple loop on modern,
         * wide-issue mips processors because the code has fewer branches and
         * more instruction-level parallelism.
         */
#define bits t2
        beqz    len, .Ldone
         ADD    t1, dst, len    # t1 is just past last byte of dst
        li      bits, 8*NBYTES
        SLL     rem, len, 3     # rem = number of bits to keep
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        SUB     bits, bits, rem # bits = number of bits to discard
        SHIFT_DISCARD t0, t0, bits
        STREST  t0, -1(t1)
        jr      ra
         move   len, zero
.Ldst_unaligned:
        /*
         * dst is unaligned
         * t0 = src & ADDRMASK
         * t1 = dst & ADDRMASK; t1 > 0
         * len >= NBYTES
         *
         * Copy enough bytes to align dst
         * Set match = (src and dst have same alignment)
         */
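        /*
         * match = (src ^ dst) & ADDRMASK, so match == 0 means both
         * pointers reach an NBYTES boundary together and the copy can
         * continue at .Lboth_aligned after the fix-up below.
         */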
#define match rem
EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
        ADD     t2, zero, NBYTES
EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
        SUB     t2, t2, t1      # t2 = number of bytes copied
        xor     match, t0, t1
        STFIRST t3, FIRST(0)(dst)
        beq     len, t2, .Ldone
         SUB    len, len, t2
        ADD     dst, dst, t2
        beqz    match, .Lboth_aligned
         ADD    src, src, t2

.Lsrc_unaligned_dst_aligned:
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        PREF(   0, 3*32(src) )
        beqz    t0, .Lcleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
        PREF(   1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
        PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
        ADD     src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
        nop                             # improves slotting
#endif
        STORE   t0, UNIT(0)(dst)
        STORE   t1, UNIT(1)(dst)
        STORE   t2, UNIT(2)(dst)
        STORE   t3, UNIT(3)(dst)
        PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcleanup_src_unaligned:
        beqz    len, .Ldone
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, .Lcopy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
        STORE   t0, 0(dst)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcopy_bytes_checklen:
        beqz    len, .Ldone
         nop
.Lcopy_bytes:
        /* 0 < len < NBYTES  */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), .Ll_exc);   \
        SUB     len, len, 1;            \
        beqz    len, .Ldone;            \
         sb     t0, N(dst)

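        /*
         * COPY_BYTE(N) copies the byte at offset N and returns through
         * .Ldone as soon as len reaches zero; the sb sits in the branch
         * delay slot so the byte is stored even when the branch is taken.
         */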
        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
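        /*
         * len < NBYTES here, so at most the byte at offset NBYTES-2 is
         * left after the COPY_BYTE()s above; copy it and return, with the
         * sb in the jr delay slot.
         */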
EXC(    lb      t0, NBYTES-2(src), .Ll_exc)
        SUB     len, len, 1
        jr      ra
         sb     t0, NBYTES-2(dst)
.Ldone:
        jr      ra
         nop
        END(__copy_user_inatomic)

.Ll_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     .Ll_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 1
        bne     src, t0, 1b
        .set    noreorder
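        /*
         * .Ll_exc: AT was set up by the caller to hold the address one
         * past the end of the source (see the rules above), and
         * THREAD_BUADDR holds the first faulting address, so AT - BUADDR
         * is the number of bytes that were not copied.  Note that this
         * inatomic variant leaves the uncopied part of the destination
         * untouched rather than zeroing it.
         */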
.Ll_exc:
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len = number of uncopied bytes
        jr      ra
         nop
