linux/arch/mips/lib/csum_partial.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007  Maciej W. Rozycki
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we are sharing the code base with the mips32 tree (which uses the
 * o32 ABI register definitions), we need to redefine the register
 * definitions from the n64 ABI naming to match the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

#define ADDC(sum,reg)                                           \
        ADD     sum, reg;                                       \
        sltu    v1, sum, reg;                                   \
        ADD     sum, v1;                                        \

#define ADDC32(sum,reg)                                         \
        addu    sum, reg;                                       \
        sltu    v1, sum, reg;                                   \
        addu    sum, v1;                                        \

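/*
 * ADDC/ADDC32 add with end-around carry: sltu sets v1 when the addition
 * wrapped, and that carry is folded back into the running sum, which is
 * what the ones'-complement checksum requires.  Roughly, in C (a sketch
 * for the 32-bit ADDC32 case):  sum += reg; sum += (sum < reg);
 */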
#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)    \
        LOAD    _t0, (offset + UNIT(0))(src);                   \
        LOAD    _t1, (offset + UNIT(1))(src);                   \
        LOAD    _t2, (offset + UNIT(2))(src);                   \
        LOAD    _t3, (offset + UNIT(3))(src);                   \
        ADDC(sum, _t0);                                         \
        ADDC(sum, _t1);                                         \
        ADDC(sum, _t2);                                         \
        ADDC(sum, _t3)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
        CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
        CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);   \
        CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
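/*
 * Either way CSUM_BIGCHUNK consumes 0x20 bytes: four 8-byte loads with
 * USE_DOUBLE, otherwise two CSUM_BIGCHUNK1 passes of four 4-byte loads.
 */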

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */

#define src a0
#define sum v0

        .text
        .set    noreorder
        .align  5
LEAF(csum_partial)
        move    sum, zero
        move    t7, zero

        sltiu   t8, a1, 0x8
        bnez    t8, .Lsmall_csumcpy             /* < 8 bytes to copy */
         move   t2, a1

        andi    t7, src, 0x1                    /* odd buffer? */

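/*
 * Align src step by step: an odd byte, then a halfword, a word, and so
 * on, so the main loops below can use full-width aligned loads.  t7
 * records whether the buffer started on an odd address, in which case
 * the final sum is byte-swapped at the end.
 */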
.Lhword_align:
        beqz    t7, .Lword_align
         andi   t8, src, 0x2

        lbu     t0, (src)
        LONG_SUBU       a1, a1, 0x1
#ifdef __MIPSEL__
        sll     t0, t0, 8
#endif
        ADDC(sum, t0)
        PTR_ADDU        src, src, 0x1
        andi    t8, src, 0x2

.Lword_align:
        beqz    t8, .Ldword_align
         sltiu  t8, a1, 56

        lhu     t0, (src)
        LONG_SUBU       a1, a1, 0x2
        ADDC(sum, t0)
        sltiu   t8, a1, 56
        PTR_ADDU        src, src, 0x2

.Ldword_align:
        bnez    t8, .Ldo_end_words
         move   t8, a1

        andi    t8, src, 0x4
        beqz    t8, .Lqword_align
         andi   t8, src, 0x8

        LOAD32  t0, 0x00(src)
        LONG_SUBU       a1, a1, 0x4
        ADDC(sum, t0)
        PTR_ADDU        src, src, 0x4
        andi    t8, src, 0x8

.Lqword_align:
        beqz    t8, .Loword_align
         andi   t8, src, 0x10

#ifdef USE_DOUBLE
        ld      t0, 0x00(src)
        LONG_SUBU       a1, a1, 0x8
        ADDC(sum, t0)
#else
        lw      t0, 0x00(src)
        lw      t1, 0x04(src)
        LONG_SUBU       a1, a1, 0x8
        ADDC(sum, t0)
        ADDC(sum, t1)
#endif
        PTR_ADDU        src, src, 0x8
        andi    t8, src, 0x10

.Loword_align:
        beqz    t8, .Lbegin_movement
         LONG_SRL       t8, a1, 0x7

#ifdef USE_DOUBLE
        ld      t0, 0x00(src)
        ld      t1, 0x08(src)
        ADDC(sum, t0)
        ADDC(sum, t1)
#else
        CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
        LONG_SUBU       a1, a1, 0x10
        PTR_ADDU        src, src, 0x10
        LONG_SRL        t8, a1, 0x7

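/* t8 now holds the number of full 128-byte blocks still to sum. */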
.Lbegin_movement:
        beqz    t8, 1f
         andi   t2, a1, 0x40

.Lmove_128bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
        LONG_SUBU       t8, t8, 0x01
        .set    reorder                         /* DADDI_WAR */
        PTR_ADDU        src, src, 0x80
        bnez    t8, .Lmove_128bytes
        .set    noreorder

1:
        beqz    t2, 1f
         andi   t2, a1, 0x20

.Lmove_64bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
        PTR_ADDU        src, src, 0x40

1:
        beqz    t2, .Ldo_end_words
         andi   t8, a1, 0x1c

.Lmove_32bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        andi    t8, a1, 0x1c
        PTR_ADDU        src, src, 0x20

.Ldo_end_words:
        beqz    t8, .Lsmall_csumcpy
         andi   t2, a1, 0x3
        LONG_SRL        t8, t8, 0x2

.Lend_words:
        LOAD32  t0, (src)
        LONG_SUBU       t8, t8, 0x1
        ADDC(sum, t0)
        .set    reorder                         /* DADDI_WAR */
        PTR_ADDU        src, src, 0x4
        bnez    t8, .Lend_words
        .set    noreorder

/* unknown src alignment and < 8 bytes to go  */
.Lsmall_csumcpy:
        move    a1, t2

        andi    t0, a1, 4
        beqz    t0, 1f
         andi   t0, a1, 2

        /* Still a full word to go  */
        ulw     t1, (src)
        PTR_ADDIU       src, 4
#ifdef USE_DOUBLE
        dsll    t1, t1, 32                      /* clear lower 32bit */
#endif
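        /*
         * ulw (lwl/lwr) sign-extends on 64-bit CPUs; shifting the word
         * into the upper half discards any sign-extension bits, and the
         * 32-bit fold below adds the halves back together, so nothing
         * is lost from the sum.
         */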
        ADDC(sum, t1)

1:      move    t1, zero
        beqz    t0, 1f
         andi   t0, a1, 1

        /* Still a halfword to go  */
        ulhu    t1, (src)
        PTR_ADDIU       src, 2

1:      beqz    t0, 1f
         sll    t1, t1, 16

        lbu     t2, (src)
         nop

#ifdef __MIPSEB__
        sll     t2, t2, 8
#endif
        or      t1, t2

1:      ADDC(sum, t1)

        /* fold checksum */
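        /*
         * With USE_DOUBLE, fold the 64-bit accumulator into 32 bits:
         * adding sum<<32 to sum leaves (upper half + lower half) in the
         * top 32 bits, dsra32 brings that down, and the sltu result adds
         * back the carry out of bit 63.  Roughly,
         * sum = (u32)sum + (u32)(sum >> 32), plus the end-around carry.
         */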
#ifdef USE_DOUBLE
        dsll32  v1, sum, 0
        daddu   sum, v1
        sltu    v1, sum, v1
        dsra32  sum, sum, 0
        addu    sum, v1
#endif

        /* odd buffer alignment? */
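        /*
         * If the buffer started on an odd address, every byte was summed
         * in the wrong lane of its halfword, so swap the bytes of the
         * 32-bit sum: wsbh does it directly on MIPS R2; the fallback
         * builds the 0x00ff00ff mask with lui/addu and swaps by masking
         * and shifting.
         */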
#ifdef CONFIG_CPU_MIPSR2
        wsbh    v1, sum
        movn    sum, v1, t7
#else
        beqz    t7, 1f                  /* odd buffer alignment? */
         lui    v1, 0x00ff
        addu    v1, 0x00ff
        and     t0, sum, v1
        sll     t0, t0, 8
        srl     sum, sum, 8
        and     sum, sum, v1
        or      sum, sum, t0
1:
#endif
        .set    reorder
        /* Add the passed partial csum.  */
        ADDC32(sum, a2)
        jr      ra
        .set    noreorder
        END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *      csum_partial_copy_nocheck(src, dst, len, sum)
 *      __csum_partial_copy_user(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */

#define src a0
#define dst a1
#define len a2
#define psum a3
#define sum v0
#define odd t8
#define errptr t9

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *      not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */

#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
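/*
 * EXC() tags the load or store at local label 9: with an __ex_table
 * entry, so a fault on that instruction is redirected to the given
 * handler by the kernel's exception fixup code instead of being fatal.
 */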

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

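/*
 * FIRST()/REST() are the two addresses an unaligned LDFIRST/LDREST
 * (lwl/lwr or ldl/ldr) pair touches to assemble one NBYTES-sized unit;
 * which partial instruction goes first depends on endianness.
 */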
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        .set    noat
#else
        .set    at=v1
#endif

LEAF(__csum_partial_copy_user)
        PTR_ADDU        AT, src, len    /* See (1) above. */
#ifdef CONFIG_64BIT
        move    errptr, a4
#else
        lw      errptr, 16(sp)
#endif
FEXPORT(csum_partial_copy_nocheck)
        move    sum, zero
        move    odd, zero
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
        /*
         * The "issue break"s below are very approximate.
         * Issue delays for dcache fills will perturb the schedule, as will
         * load queue full replay traps, etc.
         *
         * If len < NBYTES use byte operations.
         */
        sltu    t2, len, NBYTES
        and     t1, dst, ADDRMASK
        bnez    t2, .Lcopy_bytes_checklen
         and    t0, src, ADDRMASK
        andi    odd, dst, 0x1                   /* odd buffer? */
        bnez    t1, .Ldst_unaligned
         nop
        bnez    t0, .Lsrc_unaligned_dst_aligned
        /*
         * use delay slot for fall-through
         * src and dst are aligned; need to compute rem
         */
.Lboth_aligned:
         SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
        beqz    t0, .Lcleanup_both_aligned # len < 8*NBYTES
         nop
        SUB     len, 8*NBYTES           # subtract here for bgez loop
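        /*
         * len is biased down by 8*NBYTES so the bgez at the bottom of
         * the loop means "at least one full block left"; the bias is
         * added back once the loop falls through.
         */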
        .align  4
1:
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
EXC(    LOAD    t5, UNIT(5)(src),       .Ll_exc_copy)
EXC(    LOAD    t6, UNIT(6)(src),       .Ll_exc_copy)
EXC(    LOAD    t7, UNIT(7)(src),       .Ll_exc_copy)
        SUB     len, len, 8*NBYTES
        ADD     src, src, 8*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc)
        ADDC(sum, t0)
EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc)
        ADDC(sum, t1)
EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc)
        ADDC(sum, t2)
EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc)
        ADDC(sum, t3)
EXC(    STORE   t4, UNIT(4)(dst),       .Ls_exc)
        ADDC(sum, t4)
EXC(    STORE   t5, UNIT(5)(dst),       .Ls_exc)
        ADDC(sum, t5)
EXC(    STORE   t6, UNIT(6)(dst),       .Ls_exc)
        ADDC(sum, t6)
EXC(    STORE   t7, UNIT(7)(dst),       .Ls_exc)
        ADDC(sum, t7)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 8*NBYTES
        bgez    len, 1b
        .set    noreorder
        ADD     len, 8*NBYTES           # revert len (see above)

        /*
         * len == the number of bytes left to copy < 8*NBYTES
         */
.Lcleanup_both_aligned:
#define rem t7
        beqz    len, .Ldone
         sltu   t0, len, 4*NBYTES
        bnez    t0, .Lless_than_4units
         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
        /*
         * len >= 4*NBYTES
         */
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc)
        ADDC(sum, t0)
EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc)
        ADDC(sum, t1)
EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc)
        ADDC(sum, t2)
EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc)
        ADDC(sum, t3)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        beqz    len, .Ldone
        .set    noreorder
.Lless_than_4units:
        /*
         * rem = len % NBYTES
         */
        beq     rem, len, .Lcopy_bytes
         nop
1:
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             .Ls_exc)
        ADDC(sum, t0)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     rem, len, 1b
        .set    noreorder

        /*
         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
         * A loop would do only a byte at a time with possible branch
         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
         * because can't assume read-access to dst.  Instead, use
         * STREST dst, which doesn't require read access to dst.
         *
         * This code should perform better than a simple loop on modern,
         * wide-issue mips processors because the code has fewer branches and
         * more instruction-level parallelism.
         */
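        /*
         * The loaded word is shifted (SHIFT_DISCARD) to drop the bytes
         * that will not be stored, STREST writes only the remaining rem
         * bytes, and SHIFT_DISCARD_REVERT puts the kept bytes back into
         * their original lanes so the checksum sees exactly what was
         * stored.
         */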
#define bits t2
        beqz    len, .Ldone
         ADD    t1, dst, len    # t1 is just past last byte of dst
        li      bits, 8*NBYTES
        SLL     rem, len, 3     # rem = number of bits to keep
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        SUB     bits, bits, rem # bits = number of bits to discard
        SHIFT_DISCARD t0, t0, bits
EXC(    STREST  t0, -1(t1),             .Ls_exc)
        SHIFT_DISCARD_REVERT t0, t0, bits
        .set reorder
        ADDC(sum, t0)
        b       .Ldone
        .set noreorder
.Ldst_unaligned:
        /*
         * dst is unaligned
         * t0 = src & ADDRMASK
         * t1 = dst & ADDRMASK; T1 > 0
         * len >= NBYTES
         *
         * Copy enough bytes to align dst
         * Set match = (src and dst have same alignment)
         */
#define match rem
EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
        ADD     t2, zero, NBYTES
EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
        SUB     t2, t2, t1      # t2 = number of bytes copied
        xor     match, t0, t1
EXC(    STFIRST t3, FIRST(0)(dst),      .Ls_exc)
        SLL     t4, t1, 3               # t4 = number of bits to discard
        SHIFT_DISCARD t3, t3, t4
        /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
        ADDC(sum, t3)
        beq     len, t2, .Ldone
         SUB    len, len, t2
        ADD     dst, dst, t2
        beqz    match, .Lboth_aligned
         ADD    src, src, t2

.Lsrc_unaligned_dst_aligned:
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        beqz    t0, .Lcleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
        ADD     src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
        nop                             # improves slotting
#endif
EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc)
        ADDC(sum, t0)
EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc)
        ADDC(sum, t1)
EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc)
        ADDC(sum, t2)
EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc)
        ADDC(sum, t3)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcleanup_src_unaligned:
        beqz    len, .Ldone
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, .Lcopy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             .Ls_exc)
        ADDC(sum, t0)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcopy_bytes_checklen:
        beqz    len, .Ldone
         nop
.Lcopy_bytes:
        /* 0 < len < NBYTES  */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
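        /*
         * SHIFT_START/SHIFT_INC place each copied byte at the position
         * it would occupy in a full-word load on this endianness, so
         * summing the assembled partial word t2 gives the same checksum
         * a word load would have.
         */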
        move    t2, zero        # partial word
        li      t3, SHIFT_START # shift
/* use .Ll_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)                    \
EXC(    lbu     t0, N(src), .Ll_exc_copy);      \
        SUB     len, len, 1;            \
EXC(    sb      t0, N(dst), .Ls_exc);   \
        SLLV    t0, t0, t3;             \
        addu    t3, SHIFT_INC;          \
        beqz    len, .Lcopy_bytes_done; \
         or     t2, t0

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
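        /*
         * len < NBYTES here, so the last byte that can remain is at
         * offset NBYTES-2; handle it inline without the extra branch
         * the macro would emit.
         */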
EXC(    lbu     t0, NBYTES-2(src), .Ll_exc_copy)
        SUB     len, len, 1
EXC(    sb      t0, NBYTES-2(dst), .Ls_exc)
        SLLV    t0, t0, t3
        or      t2, t0
.Lcopy_bytes_done:
        ADDC(sum, t2)
.Ldone:
        /* fold checksum */
#ifdef USE_DOUBLE
        dsll32  v1, sum, 0
        daddu   sum, v1
        sltu    v1, sum, v1
        dsra32  sum, sum, 0
        addu    sum, v1
#endif

#ifdef CONFIG_CPU_MIPSR2
        wsbh    v1, sum
        movn    sum, v1, odd
#else
        beqz    odd, 1f                 /* odd buffer alignment? */
         lui    v1, 0x00ff
        addu    v1, 0x00ff
        and     t0, sum, v1
        sll     t0, t0, 8
        srl     sum, sum, 8
        and     sum, sum, v1
        or      sum, sum, t0
1:
#endif
        .set reorder
        ADDC32(sum, psum)
        jr      ra
        .set noreorder

.Ll_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
         li     t2, SHIFT_START
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lbu     t1, 0(src),     .Ll_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        SLLV    t1, t1, t2
        addu    t2, SHIFT_INC
        ADDC(sum, t1)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 1
        bne     src, t0, 1b
        .set    noreorder
.Ll_exc:
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len = number of uncopied bytes
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         *   See (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address in a1
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
        .set    reorder                         /* DADDI_WAR */
        SUB     src, len, 1
        beqz    len, .Ldone
        .set    noreorder
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        .set    push
        .set    noat
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        bnez    src, 1b
         SUB    src, src, 1
#else
        li      v1, 1
        bnez    src, 1b
         SUB    src, src, v1
#endif
        li      v1, -EFAULT
        b       .Ldone
         sw     v1, (errptr)

.Ls_exc:
        li      v0, -1 /* invalid checksum */
        li      v1, -EFAULT
        jr      ra
         sw     v1, (errptr)
        .set    pop
        END(__csum_partial_copy_user)
