linux/arch/mips/lib/csum_partial.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007  Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

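/*
 * ADDC/ADDC32 add with end-around carry: the carry out of the (64- or
 * 32-bit) add is detected with sltu and folded back into the sum, as
 * required by the ones' complement Internet checksum.
 */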
#define ADDC(sum,reg)                                           \
        .set    push;                                           \
        .set    noat;                                           \
        ADD     sum, reg;                                       \
        sltu    v1, sum, reg;                                   \
        ADD     sum, v1;                                        \
        .set    pop

#define ADDC32(sum,reg)                                         \
        .set    push;                                           \
        .set    noat;                                           \
        addu    sum, reg;                                       \
        sltu    v1, sum, reg;                                   \
        addu    sum, v1;                                        \
        .set    pop

#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)    \
        LOAD    _t0, (offset + UNIT(0))(src);                   \
        LOAD    _t1, (offset + UNIT(1))(src);                   \
        LOAD    _t2, (offset + UNIT(2))(src);                   \
        LOAD    _t3, (offset + UNIT(3))(src);                   \
        ADDC(_t0, _t1);                                         \
        ADDC(_t2, _t3);                                         \
        ADDC(sum, _t0);                                         \
        ADDC(sum, _t2)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
        CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
        CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);   \
        CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */
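/*
 * C-level interface (see <asm/checksum.h>):
 *      __wsum csum_partial(const void *buff, int len, __wsum sum)
 * The result is a 32-bit partial checksum; folding to 16 bits and the
 * final complement are left to csum_fold().
 */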

#define src a0
#define sum v0

        .text
        .set    noreorder
        .align  5
LEAF(csum_partial)
        move    sum, zero
        move    t7, zero

        sltiu   t8, a1, 0x8
        bnez    t8, .Lsmall_csumcpy             /* < 8 bytes to copy */
         move   t2, a1

        andi    t7, src, 0x1                    /* odd buffer? */

.Lhword_align:
        beqz    t7, .Lword_align
         andi   t8, src, 0x2

        lbu     t0, (src)
        LONG_SUBU       a1, a1, 0x1
#ifdef __MIPSEL__
        sll     t0, t0, 8
#endif
        ADDC(sum, t0)
        PTR_ADDU        src, src, 0x1
        andi    t8, src, 0x2

.Lword_align:
        beqz    t8, .Ldword_align
         sltiu  t8, a1, 56

        lhu     t0, (src)
        LONG_SUBU       a1, a1, 0x2
        ADDC(sum, t0)
        sltiu   t8, a1, 56
        PTR_ADDU        src, src, 0x2

.Ldword_align:
        bnez    t8, .Ldo_end_words
         move   t8, a1

        andi    t8, src, 0x4
        beqz    t8, .Lqword_align
         andi   t8, src, 0x8

        LOAD32  t0, 0x00(src)
        LONG_SUBU       a1, a1, 0x4
        ADDC(sum, t0)
        PTR_ADDU        src, src, 0x4
        andi    t8, src, 0x8

.Lqword_align:
        beqz    t8, .Loword_align
         andi   t8, src, 0x10

#ifdef USE_DOUBLE
        ld      t0, 0x00(src)
        LONG_SUBU       a1, a1, 0x8
        ADDC(sum, t0)
#else
        lw      t0, 0x00(src)
        lw      t1, 0x04(src)
        LONG_SUBU       a1, a1, 0x8
        ADDC(sum, t0)
        ADDC(sum, t1)
#endif
        PTR_ADDU        src, src, 0x8
        andi    t8, src, 0x10

.Loword_align:
        beqz    t8, .Lbegin_movement
         LONG_SRL       t8, a1, 0x7

#ifdef USE_DOUBLE
        ld      t0, 0x00(src)
        ld      t1, 0x08(src)
        ADDC(sum, t0)
        ADDC(sum, t1)
#else
        CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
        LONG_SUBU       a1, a1, 0x10
        PTR_ADDU        src, src, 0x10
        LONG_SRL        t8, a1, 0x7

.Lbegin_movement:
        beqz    t8, 1f
         andi   t2, a1, 0x40

.Lmove_128bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
        LONG_SUBU       t8, t8, 0x01
        .set    reorder                         /* DADDI_WAR */
        PTR_ADDU        src, src, 0x80
        bnez    t8, .Lmove_128bytes
        .set    noreorder

1:
        beqz    t2, 1f
         andi   t2, a1, 0x20

.Lmove_64bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
        PTR_ADDU        src, src, 0x40

1:
        beqz    t2, .Ldo_end_words
         andi   t8, a1, 0x1c

.Lmove_32bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        andi    t8, a1, 0x1c
        PTR_ADDU        src, src, 0x20

.Ldo_end_words:
        beqz    t8, .Lsmall_csumcpy
         andi   t2, a1, 0x3
        LONG_SRL        t8, t8, 0x2

.Lend_words:
        LOAD32  t0, (src)
        LONG_SUBU       t8, t8, 0x1
        ADDC(sum, t0)
        .set    reorder                         /* DADDI_WAR */
        PTR_ADDU        src, src, 0x4
        bnez    t8, .Lend_words
        .set    noreorder

/* unknown src alignment and < 8 bytes to go  */
.Lsmall_csumcpy:
        move    a1, t2

        andi    t0, a1, 4
        beqz    t0, 1f
         andi   t0, a1, 2

        /* Still a full word to go  */
        ulw     t1, (src)
        PTR_ADDIU       src, 4
#ifdef USE_DOUBLE
        dsll    t1, t1, 32                      /* clear lower 32bit */
#endif
        ADDC(sum, t1)

1:      move    t1, zero
        beqz    t0, 1f
         andi   t0, a1, 1

        /* Still a halfword to go  */
        ulhu    t1, (src)
        PTR_ADDIU       src, 2

1:      beqz    t0, 1f
         sll    t1, t1, 16

        lbu     t2, (src)
         nop

#ifdef __MIPSEB__
        sll     t2, t2, 8
#endif
        or      t1, t2

1:      ADDC(sum, t1)

        /* fold checksum */
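        /*
         * With USE_DOUBLE the 64-bit accumulator is folded into 32 bits:
         * the upper and lower halves are added and the carry is fed back
         * in, preserving the ones' complement property.
         */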
#ifdef USE_DOUBLE
        dsll32  v1, sum, 0
        daddu   sum, v1
        sltu    v1, sum, v1
        dsra32  sum, sum, 0
        addu    sum, v1
#endif

        /* odd buffer alignment? */
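        /*
         * If the buffer started at an odd address, the accumulated sum is
         * byte-swapped within each halfword relative to an aligned run;
         * swap the bytes back (wsbh on R2+, shift/mask otherwise).
         */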
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
        .set    push
        .set    arch=mips32r2
        wsbh    v1, sum
        movn    sum, v1, t7
        .set    pop
#else
        beqz    t7, 1f                  /* odd buffer alignment? */
         lui    v1, 0x00ff
        addu    v1, 0x00ff
        and     t0, sum, v1
        sll     t0, t0, 8
        srl     sum, sum, 8
        and     sum, sum, v1
        or      sum, sum, t0
1:
#endif
        .set    reorder
        /* Add the passed partial csum.  */
        ADDC32(sum, a2)
        jr      ra
        .set    noreorder
        END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *      csum_partial_copy_nocheck(src, dst, len, sum)
 *      __csum_partial_copy_kernel(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */
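/*
 * Each entry point returns the partial checksum of the data it copied.
 * The variants that take errp report faults there: a faulting store makes
 * the routine return -1 and write -EFAULT to *errp; a faulting load
 * zero-fills the uncopied destination bytes and writes -EFAULT to *errp
 * while still returning the checksum accumulated so far.
 */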

#define src a0
#define dst a1
#define len a2
#define psum a3
#define sum v0
#define odd t8
#define errptr t9

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *      not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */

/* Instruction type */
#define LD_INSN 1
#define ST_INSN 2
#define LEGACY_MODE 1
#define EVA_MODE    2
#define USEROP   1
#define KERNELOP 2

/*
 * Wrapper to add an entry in the exception table
 * in case the insn causes a memory exception.
 * Arguments:
 * insn    : Load/store instruction
 * type    : Instruction type
 * reg     : Register
 * addr    : Address
 * handler : Exception handler
 */
#define EXC(insn, type, reg, addr, handler)     \
        .if \mode == LEGACY_MODE;               \
9:              insn reg, addr;                 \
                .section __ex_table,"a";        \
                PTR     9b, handler;            \
                .previous;                      \
        /* This is enabled in EVA mode */       \
        .else;                                  \
                /* If loading from user or storing to user */   \
                .if ((\from == USEROP) && (type == LD_INSN)) || \
                    ((\to == USEROP) && (type == ST_INSN));     \
9:                      __BUILD_EVA_INSN(insn##e, reg, addr);   \
                        .section __ex_table,"a";                \
                        PTR     9b, handler;                    \
                        .previous;                              \
                .else;                                          \
                        /* EVA without exception */             \
                        insn reg, addr;                         \
                .endif;                                         \
        .endif

#undef LOAD

#ifdef USE_DOUBLE

#define LOADK   ld /* No exception */
#define LOAD(reg, addr, handler)        EXC(ld, LD_INSN, reg, addr, handler)
#define LOADBU(reg, addr, handler)      EXC(lbu, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)       EXC(ldl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)       EXC(ldr, LD_INSN, reg, addr, handler)
#define STOREB(reg, addr, handler)      EXC(sb, ST_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)      EXC(sdl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)      EXC(sdr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)       EXC(sd, ST_INSN, reg, addr, handler)
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOADK   lw /* No exception */
#define LOAD(reg, addr, handler)        EXC(lw, LD_INSN, reg, addr, handler)
#define LOADBU(reg, addr, handler)      EXC(lbu, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)       EXC(lwl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)       EXC(lwr, LD_INSN, reg, addr, handler)
#define STOREB(reg, addr, handler)      EXC(sb, ST_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)      EXC(swl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)      EXC(swr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)       EXC(sw, ST_INSN, reg, addr, handler)
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        .set    noat
#else
        .set    at=v1
#endif

        .macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to, __nocheck

        PTR_ADDU        AT, src, len    /* See (1) above. */
        /* initialize __nocheck if this is the first time we execute this
         * macro
         */
#ifdef CONFIG_64BIT
        move    errptr, a4
#else
        lw      errptr, 16(sp)
#endif
        .if \__nocheck == 1
        FEXPORT(csum_partial_copy_nocheck)
        .endif
        move    sum, zero
        move    odd, zero
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
        /*
         * The "issue break"s below are very approximate.
         * Issue delays for dcache fills will perturb the schedule, as will
         * load queue full replay traps, etc.
         *
         * If len < NBYTES use byte operations.
         */
        sltu    t2, len, NBYTES
        and     t1, dst, ADDRMASK
        bnez    t2, .Lcopy_bytes_checklen\@
         and    t0, src, ADDRMASK
        andi    odd, dst, 0x1                   /* odd buffer? */
        bnez    t1, .Ldst_unaligned\@
         nop
        bnez    t0, .Lsrc_unaligned_dst_aligned\@
        /*
         * use delay slot for fall-through
         * src and dst are aligned; need to compute rem
         */
.Lboth_aligned\@:
         SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
        beqz    t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES
         nop
        SUB     len, 8*NBYTES           # subtract here for bgez loop
        .align  4
1:
        LOAD(t0, UNIT(0)(src), .Ll_exc\@)
        LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
        LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
        LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
        LOAD(t4, UNIT(4)(src), .Ll_exc_copy\@)
        LOAD(t5, UNIT(5)(src), .Ll_exc_copy\@)
        LOAD(t6, UNIT(6)(src), .Ll_exc_copy\@)
        LOAD(t7, UNIT(7)(src), .Ll_exc_copy\@)
        SUB     len, len, 8*NBYTES
        ADD     src, src, 8*NBYTES
        STORE(t0, UNIT(0)(dst), .Ls_exc\@)
        ADDC(t0, t1)
        STORE(t1, UNIT(1)(dst), .Ls_exc\@)
        ADDC(sum, t0)
        STORE(t2, UNIT(2)(dst), .Ls_exc\@)
        ADDC(t2, t3)
        STORE(t3, UNIT(3)(dst), .Ls_exc\@)
        ADDC(sum, t2)
        STORE(t4, UNIT(4)(dst), .Ls_exc\@)
        ADDC(t4, t5)
        STORE(t5, UNIT(5)(dst), .Ls_exc\@)
        ADDC(sum, t4)
        STORE(t6, UNIT(6)(dst), .Ls_exc\@)
        ADDC(t6, t7)
        STORE(t7, UNIT(7)(dst), .Ls_exc\@)
        ADDC(sum, t6)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 8*NBYTES
        bgez    len, 1b
        .set    noreorder
        ADD     len, 8*NBYTES           # revert len (see above)

        /*
         * len == the number of bytes left to copy < 8*NBYTES
         */
.Lcleanup_both_aligned\@:
#define rem t7
        beqz    len, .Ldone\@
         sltu   t0, len, 4*NBYTES
        bnez    t0, .Lless_than_4units\@
         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
        /*
         * len >= 4*NBYTES
         */
        LOAD(t0, UNIT(0)(src), .Ll_exc\@)
        LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
        LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
        LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
        SUB     len, len, 4*NBYTES
        ADD     src, src, 4*NBYTES
        STORE(t0, UNIT(0)(dst), .Ls_exc\@)
        ADDC(t0, t1)
        STORE(t1, UNIT(1)(dst), .Ls_exc\@)
        ADDC(sum, t0)
        STORE(t2, UNIT(2)(dst), .Ls_exc\@)
        ADDC(t2, t3)
        STORE(t3, UNIT(3)(dst), .Ls_exc\@)
        ADDC(sum, t2)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        beqz    len, .Ldone\@
        .set    noreorder
.Lless_than_4units\@:
        /*
         * rem = len % NBYTES
         */
        beq     rem, len, .Lcopy_bytes\@
         nop
1:
        LOAD(t0, 0(src), .Ll_exc\@)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
        STORE(t0, 0(dst), .Ls_exc\@)
        ADDC(sum, t0)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     rem, len, 1b
        .set    noreorder

        /*
         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
         * A loop would do only a byte at a time with possible branch
         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
         * because can't assume read-access to dst.  Instead, use
         * STREST dst, which doesn't require read access to dst.
         *
         * This code should perform better than a simple loop on modern,
         * wide-issue mips processors because the code has fewer branches and
         * more instruction-level parallelism.
         */
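        /*
         * For example, with NBYTES == 8 and len == 3, rem is 24 (bits to
         * keep) and bits becomes 40 (bits to discard) before the partial
         * STREST store.
         */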
#define bits t2
        beqz    len, .Ldone\@
         ADD    t1, dst, len    # t1 is just past last byte of dst
        li      bits, 8*NBYTES
        SLL     rem, len, 3     # rem = number of bits to keep
        LOAD(t0, 0(src), .Ll_exc\@)
        SUB     bits, bits, rem # bits = number of bits to discard
        SHIFT_DISCARD t0, t0, bits
        STREST(t0, -1(t1), .Ls_exc\@)
        SHIFT_DISCARD_REVERT t0, t0, bits
        .set reorder
        ADDC(sum, t0)
        b       .Ldone\@
        .set noreorder
.Ldst_unaligned\@:
        /*
         * dst is unaligned
         * t0 = src & ADDRMASK
         * t1 = dst & ADDRMASK; t1 > 0
         * len >= NBYTES
         *
         * Copy enough bytes to align dst
         * Set match = (src and dst have same alignment)
         */
#define match rem
        LDFIRST(t3, FIRST(0)(src), .Ll_exc\@)
        ADD     t2, zero, NBYTES
        LDREST(t3, REST(0)(src), .Ll_exc_copy\@)
        SUB     t2, t2, t1      # t2 = number of bytes copied
        xor     match, t0, t1
        STFIRST(t3, FIRST(0)(dst), .Ls_exc\@)
        SLL     t4, t1, 3               # t4 = number of bits to discard
        SHIFT_DISCARD t3, t3, t4
        /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
        ADDC(sum, t3)
        beq     len, t2, .Ldone\@
         SUB    len, len, t2
        ADD     dst, dst, t2
        beqz    match, .Lboth_aligned\@
         ADD    src, src, t2

.Lsrc_unaligned_dst_aligned\@:
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        beqz    t0, .Lcleanup_src_unaligned\@
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
        LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
        LDFIRST(t1, FIRST(1)(src), .Ll_exc_copy\@)
        SUB     len, len, 4*NBYTES
        LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
        LDREST(t1, REST(1)(src), .Ll_exc_copy\@)
        LDFIRST(t2, FIRST(2)(src), .Ll_exc_copy\@)
        LDFIRST(t3, FIRST(3)(src), .Ll_exc_copy\@)
        LDREST(t2, REST(2)(src), .Ll_exc_copy\@)
        LDREST(t3, REST(3)(src), .Ll_exc_copy\@)
        ADD     src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
        nop                             # improves slotting
#endif
        STORE(t0, UNIT(0)(dst), .Ls_exc\@)
        ADDC(t0, t1)
        STORE(t1, UNIT(1)(dst), .Ls_exc\@)
        ADDC(sum, t0)
        STORE(t2, UNIT(2)(dst), .Ls_exc\@)
        ADDC(t2, t3)
        STORE(t3, UNIT(3)(dst), .Ls_exc\@)
        ADDC(sum, t2)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcleanup_src_unaligned\@:
        beqz    len, .Ldone\@
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, .Lcopy_bytes\@
         nop
1:
        LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
        LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
        STORE(t0, 0(dst), .Ls_exc\@)
        ADDC(sum, t0)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     len, rem, 1b
        .set    noreorder

.Lcopy_bytes_checklen\@:
        beqz    len, .Ldone\@
         nop
.Lcopy_bytes\@:
        /* 0 < len < NBYTES  */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
        move    t2, zero        # partial word
        li      t3, SHIFT_START # shift
/* use .Ll_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)                    \
        LOADBU(t0, N(src), .Ll_exc_copy\@);     \
        SUB     len, len, 1;            \
        STOREB(t0, N(dst), .Ls_exc\@);  \
        SLLV    t0, t0, t3;             \
        addu    t3, SHIFT_INC;          \
        beqz    len, .Lcopy_bytes_done\@; \
         or     t2, t0

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
        LOADBU(t0, NBYTES-2(src), .Ll_exc_copy\@)
        SUB     len, len, 1
        STOREB(t0, NBYTES-2(dst), .Ls_exc\@)
        SLLV    t0, t0, t3
        or      t2, t0
.Lcopy_bytes_done\@:
        ADDC(sum, t2)
.Ldone\@:
        /* fold checksum */
        .set    push
        .set    noat
#ifdef USE_DOUBLE
        dsll32  v1, sum, 0
        daddu   sum, v1
        sltu    v1, sum, v1
        dsra32  sum, sum, 0
        addu    sum, v1
#endif

#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
        .set    push
        .set    arch=mips32r2
        wsbh    v1, sum
        movn    sum, v1, odd
        .set    pop
#else
        beqz    odd, 1f                 /* odd buffer alignment? */
         lui    v1, 0x00ff
        addu    v1, 0x00ff
        and     t0, sum, v1
        sll     t0, t0, 8
        srl     sum, sum, 8
        and     sum, sum, v1
        or      sum, sum, t0
1:
#endif
        .set    pop
        .set reorder
        ADDC32(sum, psum)
        jr      ra
        .set noreorder

.Ll_exc_copy\@:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOADK   t0, TI_TASK($28)
         li     t2, SHIFT_START
        LOADK   t0, THREAD_BUADDR(t0)
1:
        LOADBU(t1, 0(src), .Ll_exc\@)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        SLLV    t1, t1, t2
        addu    t2, SHIFT_INC
        ADDC(sum, t1)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 1
        bne     src, t0, 1b
        .set    noreorder
.Ll_exc\@:
        LOADK   t0, TI_TASK($28)
         nop
        LOADK   t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len number of uncopied bytes
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         *   See (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address in a1
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
        .set    reorder                         /* DADDI_WAR */
        SUB     src, len, 1
        beqz    len, .Ldone\@
        .set    noreorder
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        .set    push
        .set    noat
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        bnez    src, 1b
         SUB    src, src, 1
#else
        li      v1, 1
        bnez    src, 1b
         SUB    src, src, v1
#endif
        li      v1, -EFAULT
        b       .Ldone\@
         sw     v1, (errptr)

.Ls_exc\@:
        li      v0, -1 /* invalid checksum */
        li      v1, -EFAULT
        jr      ra
         sw     v1, (errptr)
        .set    pop
        .endm

LEAF(__csum_partial_copy_kernel)
#ifndef CONFIG_EVA
FEXPORT(__csum_partial_copy_to_user)
FEXPORT(__csum_partial_copy_from_user)
#endif
__BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP 1
END(__csum_partial_copy_kernel)

#ifdef CONFIG_EVA
LEAF(__csum_partial_copy_to_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP 0
END(__csum_partial_copy_to_user)

LEAF(__csum_partial_copy_from_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP 0
END(__csum_partial_copy_from_user)
#endif