linux/arch/mips/cavium-octeon/octeon-memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
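
/*
 * As a rough C-level sketch of the two contracts (illustrative only:
 * the real __copy_user returns its result in the len register, as
 * described above, and the kernel prototypes live in
 * arch/mips/include/asm/uaccess.h):
 *
 *     void *memcpy(void *dst, const void *src, size_t len);
 *             // always copies len bytes; returns dst
 *
 *     size_t __copy_user(void *dst, const void *src, size_t len);
 *             // returns 0 on success, otherwise an upper bound on
 *             // the number of bytes left uncopied after a fault
 */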

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
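
/*
 * A worked example of the fixup arithmetic (hypothetical numbers):
 * suppose __copy_user was entered with src_entry = 0x1000,
 * dst_entry = 0x2000 and len = 64, so AT = 0x1040.  If a faulting
 * load reports a bad address of 0x1030, the handler computes
 * len = AT - 0x1030 = 16 uncopied bytes, and invariant (3) lets it
 * recover the first destination byte to clear as
 * dst + (0x1030 - src) = 0x2030.
 */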

#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
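
/*
 * For example, EXC(LOAD t0, 0(src), l_exc) expands (with LOAD = ld on
 * this 64-bit kernel) roughly to:
 *
 *     9:      ld      t0, 0(src)
 *             .section __ex_table,"a"
 *             PTR     9b, l_exc
 *             .previous
 *
 * i.e. the load is tagged in the kernel exception table so that a
 * fault on it is redirected to the l_exc fixup handler (PTR is the
 * pointer-sized data directive from <asm/asm.h>).
 */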

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register names from the
 * n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif
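
/*
 * An unaligned doubleword is assembled from two partial loads.  For
 * example, with src = 0x1003 on a big-endian kernel:
 *
 *     LDFIRST t0, FIRST(0)(src)   # ldl: bytes 0x1003-0x1007 into the
 *                                 # most-significant end of t0
 *     LDREST  t0, REST(0)(src)    # ldr: bytes 0x1008-0x100a into the
 *                                 # least-significant end of t0
 *
 * On little-endian kernels the roles of ldl/ldr (and sdl/sdr) swap,
 * which is all the #ifdef above encodes.
 */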

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
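
/*
 * For example, with NBYTES = 8: FIRST(1) = 8 and REST(1) = 15 are the
 * first and last byte of the second 8-byte unit, and ADDRMASK = 7
 * extracts an address's misalignment within a doubleword.
 */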

        .text
        .set    noreorder
        .set    noat

/*
 * t7 is used as a flag to note inatomic mode.
 */
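/*
 * In inatomic mode the load-fault fixup skips zeroing the rest of the
 * destination buffer (see the t7 test at l_exc below); len is still
 * set so the caller can deal with the partial copy.
 */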
LEAF(__copy_user_inatomic)
        b       __copy_user_common
         li     t7, 1
        END(__copy_user_inatomic)

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
        move    v0, dst                         /* return value */
__memcpy:
FEXPORT(__copy_user)
        li      t7, 0                           /* not inatomic */
__copy_user_common:
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps: t0-t3 are scratch, t7 flags inatomic mode, and t8
         * (rem) is used on the unaligned path.
         */
        #
        # Octeon doesn't care if the destination is unaligned. The hardware
        # can fix it faster than we can special case the assembly.
        #
        pref    0, 0(src)
        sltu    t0, len, NBYTES         # Check if < 1 word
        bnez    t0, copy_bytes_checklen
         and    t0, src, ADDRMASK       # Check if src unaligned
        bnez    t0, src_unaligned
         sltu   t0, len, 4*NBYTES       # Check if < 4 words
        bnez    t0, less_than_4units
         sltu   t0, len, 8*NBYTES       # Check if < 8 words
        bnez    t0, less_than_8units
         sltu   t0, len, 16*NBYTES      # Check if < 16 words
        bnez    t0, cleanup_both_aligned
         sltu   t0, len, 128+1          # Check if len < 129
        bnez    t0, 1f                  # Skip prefetch if len is too short
         sltu   t0, len, 256+1          # Check if len < 257
        bnez    t0, 1f                  # Skip prefetch if len is too short
         pref   0, 128(src)             # We must not prefetch invalid addresses
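        #
        # A prefetch at N(src) is only issued once len > N, so neither
        # the pref above nor the one in the loop below can touch memory
        # past the end of the source buffer.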
        #
        # This is where we loop if there are more than 128 bytes left
2:      pref    0, 256(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if we can't prefetch anymore
1:
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 16*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p16u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p15u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p14u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p13u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p12u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p11u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p10u)
        ADD     src, src, 16*NBYTES
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p9u)
        ADD     dst, dst, 16*NBYTES
EXC(    LOAD    t0, UNIT(-8)(src),      l_exc_copy)
EXC(    LOAD    t1, UNIT(-7)(src),      l_exc_copy)
EXC(    LOAD    t2, UNIT(-6)(src),      l_exc_copy)
EXC(    LOAD    t3, UNIT(-5)(src),      l_exc_copy)
EXC(    STORE   t0, UNIT(-8)(dst),      s_exc_p8u)
EXC(    STORE   t1, UNIT(-7)(dst),      s_exc_p7u)
EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
EXC(    LOAD    t0, UNIT(-4)(src),      l_exc_copy)
EXC(    LOAD    t1, UNIT(-3)(src),      l_exc_copy)
EXC(    LOAD    t2, UNIT(-2)(src),      l_exc_copy)
EXC(    LOAD    t3, UNIT(-1)(src),      l_exc_copy)
EXC(    STORE   t0, UNIT(-4)(dst),      s_exc_p4u)
EXC(    STORE   t1, UNIT(-3)(dst),      s_exc_p3u)
EXC(    STORE   t2, UNIT(-2)(dst),      s_exc_p2u)
EXC(    STORE   t3, UNIT(-1)(dst),      s_exc_p1u)
        sltu    t0, len, 256+1          # See if we can prefetch more
        beqz    t0, 2b
         sltu   t0, len, 128            # See if we can loop one more time
        beqz    t0, 1b
         nop
        #
        # Jump here if there are less than 16*NBYTES left.
        #
cleanup_both_aligned:
        beqz    len, done
         sltu   t0, len, 8*NBYTES
        bnez    t0, less_than_8units
         nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p6u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p5u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p1u)
        ADD     src, src, 8*NBYTES
        beqz    len, done
         ADD    dst, dst, 8*NBYTES
        #
        # Jump here if there are less than 8*NBYTES left.
        #
less_than_8units:
        sltu    t0, len, 4*NBYTES
        bnez    t0, less_than_4units
         nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        ADD     src, src, 4*NBYTES
        beqz    len, done
         ADD    dst, dst, 4*NBYTES
        #
        # Jump here if there are less than 4*NBYTES left. This means
        # we may need to copy up to 3 NBYTES words.
        #
less_than_4units:
        sltu    t0, len, 1*NBYTES
        bnez    t0, copy_bytes_checklen
         nop
        #
        # 1) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 2) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 3) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        ADD     src, src, NBYTES
        ADD     dst, dst, NBYTES
        b copy_bytes_checklen
EXC(     STORE  t0, -8(dst),            s_exc_p1u)

src_unaligned:
#define rem t8
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        beqz    t0, cleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        bne     len, rem, 1b
         ADD    dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, copy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bne     len, rem, 1b
         ADD    dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
         nop
copy_bytes:
        /* 0 < len < NBYTES  */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), l_exc);     \
        SUB     len, len, 1;            \
        beqz    len, done;              \
EXC(     sb     t0, N(dst), s_exc_p1)

        COPY_BYTE(0)
        COPY_BYTE(1)
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
EXC(    lb      t0, NBYTES-2(src), l_exc)
        SUB     len, len, 1
        jr      ra
EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)
done:
        jr      ra
         nop
        END(memcpy)

l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        bne     src, t0, 1b
         ADD    dst, dst, 1
l_exc:
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
        SUB     len, AT, t0             # len = number of uncopied bytes
        bnez    t7, 2f          /* Skip the zeroing out part if inatomic */
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         *   see (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address in dst (a0)
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
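        /*
         * src is dead at this point, so it is reused below as a simple
         * down-counter (len - 1 .. 0) while len itself stays intact
         * for the caller.
         */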
        beqz    len, done
         SUB    src, len, 1
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        bnez    src, 1b
         SUB    src, src, 1
2:      jr      ra
         nop


#define SEXC(n)                         \
s_exc_p ## n ## u:                      \
        jr      ra;                     \
         ADD    len, len, n*NBYTES

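/*
 * For example, SEXC(4) expands to:
 *
 *     s_exc_p4u:
 *             jr      ra
 *              ADD    len, len, 4*NBYTES
 *
 * i.e. a store fault with 4 units still outstanding adds those
 * 4*NBYTES back onto len (which was decremented before the stores),
 * so len again bounds the number of uncopied bytes.
 */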
SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr      ra
         ADD    len, len, 1
s_exc:
        jr      ra
         nop

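/*
 * The overlap test below is, in rough C terms (an illustrative
 * sketch, not actual kernel source):
 *
 *     if (src < dst + len && dst < src + len)
 *             __rmemcpy(dst, src, len);       // overlap: byte copier
 *     else
 *             __memcpy(dst, src, len);        // disjoint: plain memcpy
 *
 * with a zero len on the overlap path returning straight through
 * r_out.
 */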
        .align  5
LEAF(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0                      # dst + len <= src -> memcpy
        sltu    t1, a0, t1                      # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, __memcpy
         move   v0, a0                          /* return value */
        beqz    a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
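/*
 * __rmemcpy handles the overlapping cases: when src < dst it copies
 * from the end of the buffers backwards (r_end_bytes) so overlapping
 * source bytes are read before they are overwritten; when src >= dst
 * a simple forward byte loop (r_end_bytes_up) is safe.
 */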
LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
         sltu   t0, a1, a0
        beqz    t0, r_end_bytes_up              # src >= dst
         nop
        ADD     a0, a2                          # dst = dst + len
        ADD     a1, a2                          # src = src + len

r_end_bytes:
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        bnez    a2, r_end_bytes
         SUB    a0, a0, 0x1

r_out:
        jr      ra
         move   a2, zero

r_end_bytes_up:
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        bnez    a2, r_end_bytes_up
         ADD    a0, a0, 0x1

        jr      ra
         move   a2, zero
        END(__rmemcpy)