linux/arch/sparc/lib/checksum_32.S
/* SPDX-License-Identifier: GPL-2.0 */
/* checksum.S: Sparc optimized checksum code.
 *
 *  Copyright(C) 1995 Linus Torvalds
 *  Copyright(C) 1995 Miguel de Icaza
 *  Copyright(C) 1996 David S. Miller
 *  Copyright(C) 1997 Jakub Jelinek
 *
 * derived from:
 *      Linux/Alpha checksum c-code
 *      Linux/ix86 inline checksum assembly
 *      RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
 *      David Mosberger-Tang for optimized reference c-code
 *      BSD4.4 portable checksum routine
 */

#include <asm/errno.h>
#include <asm/export.h>

#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \
        ldd     [buf + offset + 0x00], t0;                      \
        ldd     [buf + offset + 0x08], t2;                      \
        addxcc  t0, sum, sum;                                   \
        addxcc  t1, sum, sum;                                   \
        ldd     [buf + offset + 0x10], t4;                      \
        addxcc  t2, sum, sum;                                   \
        addxcc  t3, sum, sum;                                   \
        ldd     [buf + offset + 0x18], t0;                      \
        addxcc  t4, sum, sum;                                   \
        addxcc  t5, sum, sum;                                   \
        addxcc  t0, sum, sum;                                   \
        addxcc  t1, sum, sum;

#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3)        \
        ldd     [buf - offset - 0x08], t0;                      \
        ldd     [buf - offset - 0x00], t2;                      \
        addxcc  t0, sum, sum;                                   \
        addxcc  t1, sum, sum;                                   \
        addxcc  t2, sum, sum;                                   \
        addxcc  t3, sum, sum;

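/* Illustrative note, not part of the original code: each CSUM_BIGCHUNK folds
 * eight 32-bit words into sum with addxcc, so every add also absorbs the
 * carry left by the previous one, and a trailing addx in the callers folds
 * the last carry.  A rough C sketch of one 0x20-byte chunk (the helper name
 * is made up; the 32-bit intermediate may differ from the asm by where the
 * carries are folded, which does not change the final checksum):
 *
 *      static unsigned int csum_bigchunk(const unsigned int *p,
 *                                        unsigned int sum)
 *      {
 *              int i;
 *
 *              for (i = 0; i < 8; i++) {
 *                      sum += p[i];
 *                      if (sum < p[i])         // carry out of bit 31
 *                              sum++;          // end-around carry
 *              }
 *              return sum;
 *      }
 */
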
        /* Do end cruft out of band to get better cache patterns. */
csum_partial_end_cruft:
        be      1f                              ! caller asks %o1 & 0x8
         andcc  %o1, 4, %g0                     ! nope, check for word remaining
        ldd     [%o0], %g2                      ! load two
        addcc   %g2, %o2, %o2                   ! add first word to sum
        addxcc  %g3, %o2, %o2                   ! add second word as well
        add     %o0, 8, %o0                     ! advance buf ptr
        addx    %g0, %o2, %o2                   ! add in final carry
        andcc   %o1, 4, %g0                     ! check again for word remaining
1:      be      1f                              ! nope, skip this code
         andcc  %o1, 3, %o1                     ! check for trailing bytes
        ld      [%o0], %g2                      ! load it
        addcc   %g2, %o2, %o2                   ! add to sum
        add     %o0, 4, %o0                     ! advance buf ptr
        addx    %g0, %o2, %o2                   ! add in final carry
        andcc   %o1, 3, %g0                     ! check again for trailing bytes
1:      be      1f                              ! no trailing bytes, return
         addcc  %o1, -1, %g0                    ! only one byte remains?
        bne     2f                              ! at least two bytes more
         subcc  %o1, 2, %o1                     ! only two bytes more?
        b       4f                              ! only one byte remains
         or     %g0, %g0, %o4                   ! clear fake hword value
2:      lduh    [%o0], %o4                      ! get hword
        be      6f                              ! jmp if only hword remains
         add    %o0, 2, %o0                     ! advance buf ptr either way
        sll     %o4, 16, %o4                    ! create upper hword
4:      ldub    [%o0], %o5                      ! get final byte
        sll     %o5, 8, %o5                     ! put into place
        or      %o5, %o4, %o4                   ! coalesce with hword (if any)
6:      addcc   %o4, %o2, %o2                   ! add to sum
1:      retl                                    ! get outta here
         addx   %g0, %o2, %o0                   ! add final carry into retval

        /* Also do alignment out of band to get better cache patterns. */
csum_partial_fix_alignment:
        cmp     %o1, 6
        bl      cpte - 0x4
         andcc  %o0, 0x2, %g0
        be      1f
         andcc  %o0, 0x4, %g0
        lduh    [%o0 + 0x00], %g2
        sub     %o1, 2, %o1
        add     %o0, 2, %o0
        sll     %g2, 16, %g2
        addcc   %g2, %o2, %o2
        srl     %o2, 16, %g3
        addx    %g0, %g3, %g2
        sll     %o2, 16, %o2
        sll     %g2, 16, %g3
        srl     %o2, 16, %o2
        andcc   %o0, 0x4, %g0
        or      %g3, %o2, %o2
1:      be      cpa
         andcc  %o1, 0xffffff80, %o3
        ld      [%o0 + 0x00], %g2
        sub     %o1, 4, %o1
        addcc   %g2, %o2, %o2
        add     %o0, 4, %o0
        addx    %g0, %o2, %o2
        b       cpa
         andcc  %o1, 0xffffff80, %o3

        /* The common case is to get called with a nicely aligned
         * buffer of size 0x20.  Follow the code path for that case.
         */
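        /* For reference (this prototype comes from the generic checksum API,
         * not from this file):
         *
         *      __wsum csum_partial(const void *buf, int len, __wsum sum);
         *
         * buf arrives in %o0, len in %o1 and the 32-bit running sum in %o2,
         * as noted on the entry point below.
         */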
        .globl  csum_partial
        EXPORT_SYMBOL(csum_partial)
csum_partial:                   /* %o0=buf, %o1=len, %o2=sum */
        andcc   %o0, 0x7, %g0                           ! alignment problems?
        bne     csum_partial_fix_alignment              ! yep, handle it
         sethi  %hi(cpte - 8), %g7                      ! prepare table jmp ptr
        andcc   %o1, 0xffffff80, %o3                    ! num loop iterations
cpa:    be      3f                                      ! none to do
         andcc  %o1, 0x70, %g1                          ! clears carry flag too
5:      CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
        CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
        CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
        CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
        addx    %g0, %o2, %o2                           ! sink in final carry
        subcc   %o3, 128, %o3                           ! detract from loop iters
        bne     5b                                      ! more to do
         add    %o0, 128, %o0                           ! advance buf ptr
        andcc   %o1, 0x70, %g1                          ! clears carry flag too
3:      be      cpte                                    ! nope
         andcc  %o1, 0xf, %g0                           ! anything left at all?
        srl     %g1, 1, %o4                             ! compute offset
        sub     %g7, %g1, %g7                           ! adjust jmp ptr
        sub     %g7, %o4, %g7                           ! final jmp ptr adjust
        jmp     %g7 + %lo(cpte - 8)                     ! enter the table
         add    %o0, %g1, %o0                           ! advance buf ptr
cptbl:  CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5)
        CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5)
        CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5)
        CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5)
        CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5)
        CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5)
        CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5)
        addx    %g0, %o2, %o2                           ! fetch final carry
        andcc   %o1, 0xf, %g0                           ! anything left at all?
cpte:   bne     csum_partial_end_cruft                  ! yep, handle it
         andcc  %o1, 8, %g0                             ! check how much
cpout:  retl                                            ! get outta here
         mov    %o2, %o0                                ! return computed csum

        .globl __csum_partial_copy_start, __csum_partial_copy_end
__csum_partial_copy_start:

/* Work around cpp -rob */
#define ALLOC #alloc
#define EXECINSTR #execinstr
#define EX(x,y,a,b)                             \
98:     x,y;                                    \
        .section .fixup,ALLOC,EXECINSTR;        \
        .align  4;                              \
99:     ba 30f;                                 \
         a, b, %o3;                             \
        .section __ex_table,ALLOC;              \
        .align  4;                              \
        .word   98b, 99b;                       \
        .text;                                  \
        .align  4

#define EX2(x,y)                                \
98:     x,y;                                    \
        .section __ex_table,ALLOC;              \
        .align  4;                              \
        .word   98b, 30f;                       \
        .text;                                  \
        .align  4

#define EX3(x,y)                                \
98:     x,y;                                    \
        .section __ex_table,ALLOC;              \
        .align  4;                              \
        .word   98b, 96f;                       \
        .text;                                  \
        .align  4

#define EXT(start,end,handler)                  \
        .section __ex_table,ALLOC;              \
        .align  4;                              \
        .word   start, 0, end, handler;         \
        .text;                                  \
        .align  4
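
/* The EX/EX2/EX3 macros emit an ordinary two-word __ex_table entry
 * (faulting insn, fixup address).  EXT instead emits the range form
 * ".word start, 0, end, handler": the zero fixup marks the pair as a
 * range, and the fault path (cf. search_extables_range() in
 * arch/sparc/mm/extable.c) then hands the handler the faulting
 * instruction's index, (pc - start) / 4, in %g2.
 */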

        /* This aligned version executes typically in 8.5 superscalar cycles,
         * which is the best I can do.  I say 8.5 because the final add will
         * pair with the next ldd in the main unrolled loop.  Thus the pipe is
         * always full.  If you change these macros (including order of
         * instructions), please check the fixup code below as well.
         */
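        /* Worked out from the macros as written (re-check if you change
         * them): CSUMCOPY_BIGCHUNK_ALIGNED is 4 ldd + 4 std + 8 addxcc = 16
         * instructions per 0x20 bytes and CSUMCOPY_BIGCHUNK is 4 ldd + 8 st
         * + 8 addxcc = 20; the fixup handlers at 21: and 20: below divide
         * the faulting-instruction offset by exactly these counts.
         */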
#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)   \
        ldd     [src + off + 0x00], t0;                                                 \
        ldd     [src + off + 0x08], t2;                                                 \
        addxcc  t0, sum, sum;                                                           \
        ldd     [src + off + 0x10], t4;                                                 \
        addxcc  t1, sum, sum;                                                           \
        ldd     [src + off + 0x18], t6;                                                 \
        addxcc  t2, sum, sum;                                                           \
        std     t0, [dst + off + 0x00];                                                 \
        addxcc  t3, sum, sum;                                                           \
        std     t2, [dst + off + 0x08];                                                 \
        addxcc  t4, sum, sum;                                                           \
        std     t4, [dst + off + 0x10];                                                 \
        addxcc  t5, sum, sum;                                                           \
        std     t6, [dst + off + 0x18];                                                 \
        addxcc  t6, sum, sum;                                                           \
        addxcc  t7, sum, sum;

        /* 12 superscalar cycles seems to be the limit for this case, so we
         * do all the ldd's together to get the Viking MXCC into streaming
         * mode.  Ho hum...
         */
#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)   \
        ldd     [src + off + 0x00], t0;                                         \
        ldd     [src + off + 0x08], t2;                                         \
        ldd     [src + off + 0x10], t4;                                         \
        ldd     [src + off + 0x18], t6;                                         \
        st      t0, [dst + off + 0x00];                                         \
        addxcc  t0, sum, sum;                                                   \
        st      t1, [dst + off + 0x04];                                         \
        addxcc  t1, sum, sum;                                                   \
        st      t2, [dst + off + 0x08];                                         \
        addxcc  t2, sum, sum;                                                   \
        st      t3, [dst + off + 0x0c];                                         \
        addxcc  t3, sum, sum;                                                   \
        st      t4, [dst + off + 0x10];                                         \
        addxcc  t4, sum, sum;                                                   \
        st      t5, [dst + off + 0x14];                                         \
        addxcc  t5, sum, sum;                                                   \
        st      t6, [dst + off + 0x18];                                         \
        addxcc  t6, sum, sum;                                                   \
        st      t7, [dst + off + 0x1c];                                         \
        addxcc  t7, sum, sum;

        /* Yuck, 6 superscalar cycles... */
#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3)  \
        ldd     [src - off - 0x08], t0;                         \
        ldd     [src - off - 0x00], t2;                         \
        addxcc  t0, sum, sum;                                   \
        st      t0, [dst - off - 0x08];                         \
        addxcc  t1, sum, sum;                                   \
        st      t1, [dst - off - 0x04];                         \
        addxcc  t2, sum, sum;                                   \
        st      t2, [dst - off - 0x00];                         \
        addxcc  t3, sum, sum;                                   \
        st      t3, [dst - off + 0x04];

        /* Handle the end cruft code out of band for better cache patterns. */
cc_end_cruft:
        be      1f
         andcc  %o3, 4, %g0
        EX(ldd  [%o0 + 0x00], %g2, and %o3, 0xf)
        add     %o1, 8, %o1
        addcc   %g2, %g7, %g7
        add     %o0, 8, %o0
        addxcc  %g3, %g7, %g7
        EX2(st  %g2, [%o1 - 0x08])
        addx    %g0, %g7, %g7
        andcc   %o3, 4, %g0
        EX2(st  %g3, [%o1 - 0x04])
1:      be      1f
         andcc  %o3, 3, %o3
        EX(ld   [%o0 + 0x00], %g2, add %o3, 4)
        add     %o1, 4, %o1
        addcc   %g2, %g7, %g7
        EX2(st  %g2, [%o1 - 0x04])
        addx    %g0, %g7, %g7
        andcc   %o3, 3, %g0
        add     %o0, 4, %o0
1:      be      1f
         addcc  %o3, -1, %g0
        bne     2f
         subcc  %o3, 2, %o3
        b       4f
         or     %g0, %g0, %o4
2:      EX(lduh [%o0 + 0x00], %o4, add %o3, 2)
        add     %o0, 2, %o0
        EX2(sth %o4, [%o1 + 0x00])
        be      6f
         add    %o1, 2, %o1
        sll     %o4, 16, %o4
4:      EX(ldub [%o0 + 0x00], %o5, add %g0, 1)
        EX2(stb %o5, [%o1 + 0x00])
        sll     %o5, 8, %o5
        or      %o5, %o4, %o4
6:      addcc   %o4, %g7, %g7
1:      retl
         addx   %g0, %g7, %o0

        /* Also, handle the alignment code out of band. */
cc_dword_align:
        cmp     %g1, 16
        bge     1f
         srl    %g1, 1, %o3
2:      cmp     %o3, 0
        be,a    ccte
         andcc  %g1, 0xf, %o3
        andcc   %o3, %o0, %g0   ! Check %o0 only (%o1 has the same last 2 bits)
        be,a    2b
         srl    %o3, 1, %o3
1:      andcc   %o0, 0x1, %g0
        bne     ccslow
         andcc  %o0, 0x2, %g0
        be      1f
         andcc  %o0, 0x4, %g0
        EX(lduh [%o0 + 0x00], %g4, add %g1, 0)
        sub     %g1, 2, %g1
        EX2(sth %g4, [%o1 + 0x00])
        add     %o0, 2, %o0
        sll     %g4, 16, %g4
        addcc   %g4, %g7, %g7
        add     %o1, 2, %o1
        srl     %g7, 16, %g3
        addx    %g0, %g3, %g4
        sll     %g7, 16, %g7
        sll     %g4, 16, %g3
        srl     %g7, 16, %g7
        andcc   %o0, 0x4, %g0
        or      %g3, %g7, %g7
1:      be      3f
         andcc  %g1, 0xffffff80, %g0
        EX(ld   [%o0 + 0x00], %g4, add %g1, 0)
        sub     %g1, 4, %g1
        EX2(st  %g4, [%o1 + 0x00])
        add     %o0, 4, %o0
        addcc   %g4, %g7, %g7
        add     %o1, 4, %o1
        addx    %g0, %g7, %g7
        b       3f
         andcc  %g1, 0xffffff80, %g0

        /* Sun, you just can't beat me, you just can't.  Stop trying,
         * give up.  I'm serious, I am going to kick the living shit
         * out of you, game over, lights out.
         */
        .align  8
        .globl  __csum_partial_copy_sparc_generic
        EXPORT_SYMBOL(__csum_partial_copy_sparc_generic)
__csum_partial_copy_sparc_generic:
                                        /* %o0=src, %o1=dest, %g1=len, %g7=sum */
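        /* Note the non-standard calling convention: the length and the
         * running sum arrive in %g1 and %g7 rather than in %o-registers,
         * which is why the C wrappers (see asm/checksum_32.h) reach this
         * routine through inline asm instead of a plain call.
         */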
        xor     %o0, %o1, %o4           ! get changing bits
        andcc   %o4, 3, %g0             ! check for mismatched alignment
        bne     ccslow                  ! better this than unaligned/fixups
         andcc  %o0, 7, %g0             ! need to align things?
        bne     cc_dword_align          ! yes, we check for short lengths there
         andcc  %g1, 0xffffff80, %g0    ! can we use unrolled loop?
3:      be      3f                      ! nope, less than one loop remains
         andcc  %o1, 4, %g0             ! dest aligned on 4 or 8 byte boundary?
        be      ccdbl + 4               ! 8 byte aligned, kick ass
5:      CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
        CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
        CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
        CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
10:     EXT(5b, 10b, 20f)               ! note for exception handling
        sub     %g1, 128, %g1           ! detract from length
        addx    %g0, %g7, %g7           ! add in last carry bit
        andcc   %g1, 0xffffff80, %g0    ! more to csum?
        add     %o0, 128, %o0           ! advance src ptr
        bne     5b                      ! we did not go negative, continue looping
         add    %o1, 128, %o1           ! advance dest ptr
3:      andcc   %g1, 0x70, %o2          ! can use table?
ccmerge:be      ccte                    ! nope, go and check for end cruft
         andcc  %g1, 0xf, %o3           ! get low bits of length (clears carry btw)
        srl     %o2, 1, %o4             ! begin negative offset computation
        sethi   %hi(12f), %o5           ! set up table ptr end
        add     %o0, %o2, %o0           ! advance src ptr
        sub     %o5, %o4, %o5           ! continue table calculation
        sll     %o2, 1, %g2             ! constant multiplies are fun...
        sub     %o5, %g2, %o5           ! some more adjustments
        jmp     %o5 + %lo(12f)          ! jump into it, duff style, wheee...
         add    %o1, %o2, %o1           ! advance dest ptr (carry is clear btw)
cctbl:  CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
        CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5)
        CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5)
        CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5)
        CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5)
        CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5)
        CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5)
12:     EXT(cctbl, 12b, 22f)            ! note for exception table handling
        addx    %g0, %g7, %g7
        andcc   %o3, 0xf, %g0           ! check for low bits set
ccte:   bne     cc_end_cruft            ! something left, handle it out of band
         andcc  %o3, 8, %g0             ! begin checks for that code
        retl                            ! return
         mov    %g7, %o0                ! give em the computed checksum
ccdbl:  CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
        CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
        CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
        CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
11:     EXT(ccdbl, 11b, 21f)            ! note for exception table handling
        sub     %g1, 128, %g1           ! detract from length
        addx    %g0, %g7, %g7           ! add in last carry bit
        andcc   %g1, 0xffffff80, %g0    ! more to csum?
        add     %o0, 128, %o0           ! advance src ptr
        bne     ccdbl                   ! we did not go negative, continue looping
         add    %o1, 128, %o1           ! advance dest ptr
        b       ccmerge                 ! finish it off, above
         andcc  %g1, 0x70, %o2          ! can use table? (clears carry btw)

ccslow: cmp     %g1, 0
        mov     0, %g5
        bleu    4f
         andcc  %o0, 1, %o5
        be,a    1f
         srl    %g1, 1, %g4
        sub     %g1, 1, %g1
        EX(ldub [%o0], %g5, add %g1, 1)
        add     %o0, 1, %o0
        EX2(stb %g5, [%o1])
        srl     %g1, 1, %g4
        add     %o1, 1, %o1
1:      cmp     %g4, 0
        be,a    3f
         andcc  %g1, 1, %g0
        andcc   %o0, 2, %g0
        be,a    1f
         srl    %g4, 1, %g4
        EX(lduh [%o0], %o4, add %g1, 0)
        sub     %g1, 2, %g1
        srl     %o4, 8, %g2
        sub     %g4, 1, %g4
        EX2(stb %g2, [%o1])
        add     %o4, %g5, %g5
        EX2(stb %o4, [%o1 + 1])
        add     %o0, 2, %o0
        srl     %g4, 1, %g4
        add     %o1, 2, %o1
1:      cmp     %g4, 0
        be,a    2f
         andcc  %g1, 2, %g0
        EX3(ld  [%o0], %o4)
5:      srl     %o4, 24, %g2
        srl     %o4, 16, %g3
        EX2(stb %g2, [%o1])
        srl     %o4, 8, %g2
        EX2(stb %g3, [%o1 + 1])
        add     %o0, 4, %o0
        EX2(stb %g2, [%o1 + 2])
        addcc   %o4, %g5, %g5
        EX2(stb %o4, [%o1 + 3])
        addx    %g5, %g0, %g5   ! I am now too lazy to optimize this (question
        add     %o1, 4, %o1     ! if it is worth it). Maybe some day - with the
        subcc   %g4, 1, %g4     ! sll/srl tricks
        bne,a   5b
         EX3(ld [%o0], %o4)
        sll     %g5, 16, %g2
        srl     %g5, 16, %g5
        srl     %g2, 16, %g2
        andcc   %g1, 2, %g0
        add     %g2, %g5, %g5
2:      be,a    3f
         andcc  %g1, 1, %g0
        EX(lduh [%o0], %o4, and %g1, 3)
        andcc   %g1, 1, %g0
        srl     %o4, 8, %g2
        add     %o0, 2, %o0
        EX2(stb %g2, [%o1])
        add     %g5, %o4, %g5
        EX2(stb %o4, [%o1 + 1])
        add     %o1, 2, %o1
3:      be,a    1f
         sll    %g5, 16, %o4
        EX(ldub [%o0], %g2, add %g0, 1)
        sll     %g2, 8, %o4
        EX2(stb %g2, [%o1])
        add     %g5, %o4, %g5
        sll     %g5, 16, %o4
1:      addcc   %o4, %g5, %g5
        srl     %g5, 16, %o4
        addx    %g0, %o4, %g5
        orcc    %o5, %g0, %g0
        be      4f
         srl    %g5, 8, %o4
        and     %g5, 0xff, %g2
        and     %o4, 0xff, %o4
        sll     %g2, 8, %g2
        or      %g2, %o4, %g5
4:      addcc   %g7, %g5, %g7
        retl
         addx   %g0, %g7, %o0
__csum_partial_copy_end:

/* We do these strange calculations for the csum_*_from_user case only, i.e.
 * we only bother with faults on loads... */

/* o2 = ((g2%20)&3)*8
 * o3 = g1 - (g2/20)*32 - o2 */
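/* Worked example (assuming, per the EXT format above, that %g2 holds the
 * faulting-instruction index within the 5b..10b range): a fault on the third
 * ldd of the second CSUMCOPY_BIGCHUNK means g2 = 22, so
 * o2 = ((22 % 20) & 3) * 8 = 16 and o3 = g1 - 1*32 - 16. */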
20:
        cmp     %g2, 20
        blu,a   1f
         and    %g2, 3, %o2
        sub     %g1, 32, %g1
        b       20b
         sub    %g2, 20, %g2
1:
        sll     %o2, 3, %o2
        b       31f
         sub    %g1, %o2, %o3

/* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8)
 * o3 = g1 - (g2/16)*32 - o2 */
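/* Worked example (same %g2 convention): a fault on the first ldd of the
 * second CSUMCOPY_BIGCHUNK_ALIGNED means g2 = 16, so o2 = 0 and
 * o3 = g1 - 1*32 - 0. */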
21:
        andcc   %g2, 15, %o3
        srl     %g2, 4, %g2
        be,a    1f
         clr    %o2
        add     %o3, 1, %o3
        and     %o3, 14, %o3
        sll     %o3, 3, %o2
1:
        sll     %g2, 5, %g2
        sub     %g1, %g2, %o3
        b       31f
         sub    %o3, %o2, %o3

/* o0 += (g2/10)*16 - 0x70
 * o1 += (g2/10)*16 - 0x70
 * o2 = (g2 % 10) ? 8 : 0
 * o3 += 0x70 - (g2/10)*16 - o2 */
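/* Worked example (same %g2 convention): each CSUMCOPY_LASTCHUNK is 10
 * instructions, so a fault on the second ldd of the third table entry means
 * g2 = 21, hence o0/o1 += 2*16 - 0x70 = -0x50, o2 = 8 and
 * o3 += 0x70 - 0x20 - 8. */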
22:
        cmp     %g2, 10
        blu,a   1f
         sub    %o0, 0x70, %o0
        add     %o0, 16, %o0
        add     %o1, 16, %o1
        sub     %o3, 16, %o3
        b       22b
         sub    %g2, 10, %g2
1:
        sub     %o1, 0x70, %o1
        add     %o3, 0x70, %o3
        clr     %o2
        tst     %g2
        bne,a   1f
         mov    8, %o2
1:
        b       31f
         sub    %o3, %o2, %o3
96:
        and     %g1, 3, %g1
        sll     %g4, 2, %g4
        add     %g1, %g4, %o3
30:
/* %o1 is dst
 * %o3 is # bytes to zero out
 * %o4 is faulting address
 * %o5 is %pc where fault occurred */
        clr     %o2
31:
/* %o0 is src
 * %o1 is dst
 * %o2 is # of bytes to copy from src to dst
 * %o3 is # bytes to zero out
 * %o4 is faulting address
 * %o5 is %pc where fault occurred */
        save    %sp, -104, %sp
        mov     %i5, %o0
        mov     %i7, %o1
        mov     %i4, %o2
        call    lookup_fault
         mov    %g7, %i4
        cmp     %o0, 2
        bne     1f
         add    %g0, -EFAULT, %i5
        tst     %i2
        be      2f
         mov    %i0, %o1
        mov     %i1, %o0
5:
        call    memcpy
         mov    %i2, %o2
        tst     %o0
        bne,a   2f
         add    %i3, %i2, %i3
        add     %i1, %i2, %i1
2:
        mov     %i1, %o0
6:
        call    __bzero
         mov    %i3, %o1
1:
        ld      [%sp + 168], %o2                ! struct_ptr of parent
        st      %i5, [%o2]
        ret
         restore

        .section __ex_table,#alloc
        .align 4
        .word 5b,2
        .word 6b,2
