linux/arch/mips/cavium-octeon/octeon-memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
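/*
 * Worked example of the contract above: a 100-byte copy_from_user that
 * faults on the 61st source byte returns with len == 40 and the last
 * 40 bytes of dst zero-filled by the load-exception handler below.
 * Store faults (see the SEXC handlers) round len up to whole copy
 * units, which is why len is only an upper bound in general.
 */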

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry).
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
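/*
 * Illustration of the invariants (values are made up): entering
 * __copy_user with src_entry == 0x1000, dst_entry == 0x2000, len == 64
 * gives AT == 0x1040 for the whole run (1), src only moves forward
 * within [0x1000, 0x1040) (2), and dst - src == 0x1000 at every load
 * (3), which is what l_exc below uses to find the bytes to clear.
 */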

#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
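/*
 * For reference, EXC(LOAD t0, UNIT(0)(src), l_exc) expands to roughly:
 *
 *      9:      LOAD    t0, UNIT(0)(src)
 *              .section __ex_table,"a"
 *              PTR     9b, l_exc
 *              .previous
 *
 * i.e. the 9: label marks the instruction that may fault, and the
 * __ex_table entry routes that fault to the named handler.
 */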

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * Since we share a code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the registers from their
 * n64 ABI names to the o32 ABI names.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15
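/*
 * ($8..$15 are t0..t7 under o32; under n64 they are a4..a7 and t0..t3,
 * so pinning the register numbers keeps the two namings consistent.)
 */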

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif
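/*
 * Example of the idiom these macros implement, for big-endian and
 * NBYTES == 4 with src == 0x1001 (unaligned):
 *
 *      LDFIRST t0, FIRST(0)(src)       # lwl 0x1001: bytes 0x1001..0x1003
 *                                      #  into the high 24 bits of t0
 *      LDREST  t0, REST(0)(src)        # lwr 0x1004: byte 0x1004 into the
 *                                      #  low 8 bits of t0
 *
 * Together the pair assembles one full word from any source alignment.
 */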

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
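/*
 * E.g. with NBYTES == 8, FIRST(1) == 8 and REST(1) == 15 address the
 * two ends of the same doubleword, and ADDRMASK == 7 extracts a
 * pointer's misalignment.
 */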

        .text
        .set    noreorder
        .set    noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
        move    v0, dst                         /* return value */
__memcpy:
FEXPORT(__copy_user)
        /*
         * Note: dst & src may be unaligned, len may be 0
         */
        #
        # Octeon doesn't care if the destination is unaligned. The hardware
        # can fix it faster than we can special case the assembly.
        #
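        #
        # Dispatch: byte copy for len < NBYTES, the LDFIRST/LDREST path
        # for an unaligned src, otherwise progressively larger aligned
        # word loops (4, 8, then 16 units per iteration).
        #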
        pref    0, 0(src)
        sltu    t0, len, NBYTES         # Check if < 1 word
        bnez    t0, copy_bytes_checklen
         and    t0, src, ADDRMASK       # Check if src unaligned
        bnez    t0, src_unaligned
         sltu   t0, len, 4*NBYTES       # Check if < 4 words
        bnez    t0, less_than_4units
         sltu   t0, len, 8*NBYTES       # Check if < 8 words
        bnez    t0, less_than_8units
         sltu   t0, len, 16*NBYTES      # Check if < 16 words
        bnez    t0, cleanup_both_aligned
         sltu   t0, len, 128+1          # Check if len < 129
        bnez    t0, 1f                  # Skip prefetch if len is too short
         sltu   t0, len, 256+1          # Check if len < 257
        bnez    t0, 1f                  # Skip prefetch if len is too short
         pref   0, 128(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if there are more than 128 bytes left
2:      pref    0, 256(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if we can't prefetch anymore
1:
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 16*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p16u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p15u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p14u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p13u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p12u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p11u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p10u)
        ADD     src, src, 16*NBYTES
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p9u)
        ADD     dst, dst, 16*NBYTES
EXC(    LOAD    t0, UNIT(-8)(src),      l_exc_copy)
EXC(    LOAD    t1, UNIT(-7)(src),      l_exc_copy)
EXC(    LOAD    t2, UNIT(-6)(src),      l_exc_copy)
EXC(    LOAD    t3, UNIT(-5)(src),      l_exc_copy)
EXC(    STORE   t0, UNIT(-8)(dst),      s_exc_p8u)
EXC(    STORE   t1, UNIT(-7)(dst),      s_exc_p7u)
EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
EXC(    LOAD    t0, UNIT(-4)(src),      l_exc_copy)
EXC(    LOAD    t1, UNIT(-3)(src),      l_exc_copy)
EXC(    LOAD    t2, UNIT(-2)(src),      l_exc_copy)
EXC(    LOAD    t3, UNIT(-1)(src),      l_exc_copy)
EXC(    STORE   t0, UNIT(-4)(dst),      s_exc_p4u)
EXC(    STORE   t1, UNIT(-3)(dst),      s_exc_p3u)
EXC(    STORE   t2, UNIT(-2)(dst),      s_exc_p2u)
EXC(    STORE   t3, UNIT(-1)(dst),      s_exc_p1u)
        sltu    t0, len, 256+1          # See if we can prefetch more
        beqz    t0, 2b
         sltu   t0, len, 128            # See if we can loop once more
        beqz    t0, 1b
         nop
        #
        # Jump here if there are less than 16*NBYTES left.
        #
cleanup_both_aligned:
        beqz    len, done
         sltu   t0, len, 8*NBYTES
        bnez    t0, less_than_8units
         nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p6u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p5u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p1u)
        ADD     src, src, 8*NBYTES
        beqz    len, done
         ADD    dst, dst, 8*NBYTES
        #
        # Jump here if there are less than 8*NBYTES left.
        #
less_than_8units:
        sltu    t0, len, 4*NBYTES
        bnez    t0, less_than_4units
         nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        ADD     src, src, 4*NBYTES
        beqz    len, done
         ADD    dst, dst, 4*NBYTES
        #
        # Jump here if there are less than 4*NBYTES left. This means
        # we may need to copy up to 3 NBYTES words.
        #
less_than_4units:
        sltu    t0, len, 1*NBYTES
        bnez    t0, copy_bytes_checklen
         nop
        #
        # 1) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 2) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 3) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        ADD     src, src, NBYTES
        ADD     dst, dst, NBYTES
        b copy_bytes_checklen
EXC(     STORE  t0, -8(dst),            s_exc_p1u)
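        #
        # (The literal 8 in the sltu checks above and in -8(dst) assumes
        # NBYTES == 8, which always holds here: Octeon kernels are
        # 64-bit, so USE_DOUBLE is defined.)
        #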

src_unaligned:
#define rem t8
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        beqz    t0, cleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        bne     len, rem, 1b
         ADD    dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, copy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bne     len, rem, 1b
         ADD    dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
         nop
copy_bytes:
        /* 0 < len < NBYTES  */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), l_exc);     \
        SUB     len, len, 1;            \
        beqz    len, done;              \
EXC(     sb     t0, N(dst), s_exc_p1)

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
EXC(    lb      t0, NBYTES-2(src), l_exc)
        SUB     len, len, 1
        jr      ra
EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)
done:
        jr      ra
         nop
        END(memcpy)

l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        bne     src, t0, 1b
         ADD    dst, dst, 1
l_exc:
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len = number of uncopied bytes
        /*
         * Here's where we rely on src and dst being incremented in tandem;
         *   see (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
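         *
         * e.g. a fault at t0 == 0x1040 with src == 0x1030, dst == 0x2030
         * gives dst = 0x2030 + 0x1040 - 0x1030 == 0x2040, the first
         * byte the copy never wrote.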
         */
        ADD     dst, t0                 # dst += (fault addr - src):
        SUB     dst, src                #  the first byte to clear
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
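         * (src is dead here, so it is reused as the loop counter,
         * counting len-1 down to 0.)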
         */
        beqz    len, done
         SUB    src, len, 1
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        bnez    src, 1b
         SUB    src, src, 1
        jr      ra
         nop


#define SEXC(n)                         \
s_exc_p ## n ## u:                      \
        jr      ra;                     \
         ADD    len, len, n*NBYTES

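/*
 * For reference, SEXC(16) expands to roughly:
 *
 *      s_exc_p16u:
 *              jr      ra
 *               ADD    len, len, 16*NBYTES
 *
 * i.e. a fault on the first store of a 16-unit block adds all 16
 * not-yet-written units back to len before returning.
 */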
SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr      ra
         ADD    len, len, 1
s_exc:
        jr      ra
         nop

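/*
 * memmove: when [dst, dst+len) and [src, src+len) don't overlap, the
 * work is handed straight to __memcpy; otherwise fall through to
 * __rmemcpy, which copies backwards when dst > src.  E.g. dst == 0x100,
 * src == 0xf8, len == 0x10 overlaps (0xf8 < 0x110 and 0x100 < 0x108),
 * and since src < dst the copy runs from the end down.
 */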
        .align  5
LEAF(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0                      # dst + len <= src -> memcpy
        sltu    t1, a0, t1                      # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, __memcpy
         move   v0, a0                          /* return value */
        beqz    a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
         sltu   t0, a1, a0
        beqz    t0, r_end_bytes_up              # src >= dst
         nop
        ADD     a0, a2                          # dst = dst + len
        ADD     a1, a2                          # src = src + len

r_end_bytes:
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        bnez    a2, r_end_bytes
         SUB    a0, a0, 0x1

r_out:
        jr      ra
         move   a2, zero

r_end_bytes_up:
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        bnez    a2, r_end_bytes_up
         ADD    a0, a0, 0x1

        jr      ra
         move   a2, zero
        END(__rmemcpy)