linux/arch/xtensa/lib/checksum.S
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>
#include <asm/asmmacro.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function is optimized for 2- and 4-byte aligned buffers.
 * An odd-aligned buf is handled too (see the odd-alignment path
 * below), but it is very slow.
 */

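/*
 * Note: the 32-bit value returned here is only a partial sum; the
 * generic checksum code later folds it down to the final 16-bit
 * Internet checksum (csum_fold).
 */
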
/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)        \
        add     sum, sum, val   ; \
        bgeu    sum, val, 99f   ; \
        addi    sum, sum, 1     ; \
99:                             ;

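/*
 * For reference, a minimal C model of ONES_ADD (a sketch only, with
 * a hypothetical helper name): add, then fold any carry out of bit
 * 31 back into bit 0, which turns the two's-complement add into a
 * one's-complement add.
 *
 *	static inline unsigned int ones_add(unsigned int sum, unsigned int val)
 *	{
 *		sum += val;
 *		if (sum < val)
 *			sum += 1;
 *		return sum;
 *	}
 */
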
.text
ENTRY(csum_partial)

        /*
         * Experiments with Ethernet and SLIP connections show that buf
         * is aligned on either a 2-byte or 4-byte boundary.
         */
        entry   sp, 32
        extui   a5, a2, 0, 2    /* alignment: low two bits of buf */
        bnez    a5, 8f          /* branch if not 4-byte aligned */
        /* Fall through on the common case, 4-byte alignment */
1:
        srli    a5, a3, 5       /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f          /* zero-overhead loop: a5 iterations, skip if 0 */
#else
        beqz    a5, 2f
        slli    a5, a5, 5
        add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
.Loop1:
#endif
        l32i    a6, a2, 0
        l32i    a7, a2, 4
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 8
        l32i    a7, a2, 12
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 16
        l32i    a7, a2, 20
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 24
        l32i    a7, a2, 28
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        addi    a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop1
#endif
2:
        extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 3f
#else
        beqz    a5, 3f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop2:
#endif
        l32i    a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop2
#endif
3:
        _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
        l16ui   a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 2
5:
        _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
6:      l8ui    a6, a2, 0
#ifdef __XTENSA_EB__
        slli    a6, a6, 8       /* load byte into bits 8..15 */
#endif
        ONES_ADD(a4, a6)
7:
        mov     a2, a4
        retw

        /* uncommon case, buf is 2-byte aligned */
8:
        beqz    a3, 7b          /* branch if len == 0 */
        beqi    a3, 1, 6b       /* branch if len == 1 */

        extui   a5, a2, 0, 1
        bnez    a5, 8f          /* branch if 1-byte aligned */

        l16ui   a6, a2, 0       /* common case, len >= 2 */
        ONES_ADD(a4, a6)
        addi    a2, a2, 2       /* adjust buf */
        addi    a3, a3, -2      /* adjust len */
        j       1b              /* now buf is 4-byte aligned */

        /* case: odd-byte aligned, len > 1
         * This case is dog slow, so don't give us an odd address.
         * (I don't think this ever happens, but just in case.)
         */
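/*
 * A little-endian C sketch of the trick used below (illustrative
 * only): since buf is odd, buf + 1 is 2-byte aligned, so each word
 * can be rebuilt from one 16-bit and two 8-bit loads:
 *
 *	w = p[0] | ((unsigned int)*(const unsigned short *)(p + 1) << 8)
 *	         | ((unsigned int)p[3] << 24);
 */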
8:
        srli    a5, a3, 2       /* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop3:
#endif
        l8ui    a6, a2, 0       /* bits 24..31 */
        l16ui   a7, a2, 1       /* bits  8..23 */
        l8ui    a8, a2, 3       /* bits  0.. 7 */
#ifdef  __XTENSA_EB__
        slli    a6, a6, 24
#else
        slli    a8, a8, 24
#endif
        slli    a7, a7, 8
        or      a7, a7, a6
        or      a7, a7, a8
        ONES_ADD(a4, a7)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop3
#endif
2:
        _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
        l8ui    a6, a2, 0
        l8ui    a7, a2, 1
#ifdef  __XTENSA_EB__
        slli    a6, a6, 8
#else
        slli    a7, a7, 8
#endif
        or      a7, a7, a6
        ONES_ADD(a4, a7)
        addi    a2, a2, 2
3:
        j       5b              /* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src while checksumming, otherwise like csum_partial
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
                                        int sum, int *src_err_ptr, int *dst_err_ptr)
        a2  = src
        a3  = dst
        a4  = len
        a5  = sum
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a8  = temp
        a9  = temp
        a10 = temp
        a11 = original len for exception handling
        a12 = original dst for exception handling

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.
 */
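/*
 * A rough C model of the 4-byte aligned fast path (a sketch that
 * reuses the hypothetical ones_add() above; it ignores the loop
 * unrolling, the unaligned cases, and the fault fixups implemented
 * below):
 *
 *	unsigned int csum_copy_words(const unsigned int *src,
 *				     unsigned int *dst,
 *				     int nwords, unsigned int sum)
 *	{
 *		while (nwords--) {
 *			unsigned int w = *src++;
 *			*dst++ = w;
 *			sum = ones_add(sum, w);
 *		}
 *		return sum;
 *	}
 */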

ENTRY(csum_partial_copy_generic)

        entry   sp, 32
        mov     a12, a3
        mov     a11, a4
        or      a10, a2, a3

        /* We optimize the following alignment tests for the 4-byte
        aligned case.  Two bbsi.l instructions might seem more optimal
        (commented out below).  However, both labels 5: and 3: are out
        of the imm8 range, so the assembler relaxes them into
        equivalent bbci.l, j combinations, which is actually
        slower. */

        extui   a9, a10, 0, 2
        beqz    a9, 1f          /* branch if both are 4-byte aligned */
        bbsi.l  a10, 0, 5f      /* branch if one address is odd */
        j       3f              /* one address is 2-byte aligned */

/*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
/*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */

1:
        /* src and dst are both 4-byte aligned */
        srli    a10, a4, 5      /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 2f
#else
        beqz    a10, 2f
        slli    a10, a10, 5
        add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f) l32i    a9, a2, 0
EX(10f) l32i    a8, a2, 4
EX(11f) s32i    a9, a3, 0
EX(11f) s32i    a8, a3, 4
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 8
EX(10f) l32i    a8, a2, 12
EX(11f) s32i    a9, a3, 8
EX(11f) s32i    a8, a3, 12
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 16
EX(10f) l32i    a8, a2, 20
EX(11f) s32i    a9, a3, 16
EX(11f) s32i    a8, a3, 20
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 24
EX(10f) l32i    a8, a2, 28
EX(11f) s32i    a9, a3, 24
EX(11f) s32i    a8, a3, 28
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
        addi    a2, a2, 32
        addi    a3, a3, 32
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop5
#endif
2:
        extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
        extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 3f
#else
        beqz    a10, 3f
        slli    a10, a10, 2
        add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f) l32i    a9, a2, 0
EX(11f) s32i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 4
        addi    a3, a3, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop6
#endif
3:
        /*
        Control comes to here in two cases: (1) It may fall through
        to here from the 4-byte alignment case to process, at most,
        one 2-byte chunk.  (2) It branches to here from above if
        either src or dst is 2-byte aligned, and we process all bytes
        here, except for perhaps a trailing odd byte.  It's
        inefficient, so align your addresses to 4-byte boundaries.

        a2 = src
        a3 = dst
        a4 = len
        a5 = sum
        */
        srli    a10, a4, 1      /* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 4f
#else
        beqz    a10, 4f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f) l16ui   a9, a2, 0
EX(11f) s16i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop7
#endif
4:
        /* This section processes a possible trailing odd byte. */
        _bbci.l a4, 0, 8f       /* 1-byte chunk */
EX(10f) l8ui    a9, a2, 0
EX(11f) s8i     a9, a3, 0
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* shift byte to bits 8..15 */
#endif
        ONES_ADD(a5, a9)
8:
        mov     a2, a5
        retw

5:
        /* Control branches to here when either src or dst is odd.
        We process all bytes using 8-bit accesses.  Grossly
        inefficient, so don't feed us an odd address. */

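/*
 * Little-endian sketch of the pairing below (illustrative only):
 * each pair of bytes is merged into one 16-bit value before the
 * one's-complement add, e.g. v = p[0] | ((unsigned int)p[1] << 8).
 */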
        srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 6f
#else
        beqz    a10, 6f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f) l8ui    a9, a2, 0
EX(10f) l8ui    a8, a2, 1
EX(11f) s8i     a9, a3, 0
EX(11f) s8i     a8, a3, 1
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* combine into a single 16-bit value */
#else                           /* for checksum computation */
        slli    a8, a8, 8
#endif
        or      a9, a9, a8
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop8
#endif
6:
        j       4b              /* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
.section .fixup, "ax"
/*
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a11 = original len for exception handling
        a12 = original dst for exception handling
*/
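/*
 * In effect (a sketch of the fixup semantics, not literal code):
 * a faulting source access stores -EFAULT to *src_err_ptr, zeroes
 * the entire destination buffer, and returns 0; a faulting
 * destination access stores -EFAULT to *dst_err_ptr and returns 0.
 */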

10:
        _movi   a2, -EFAULT
        s32i    a2, a6, 0       /* src_err_ptr */

        # clear the complete destination - computing the rest
        # is too much work
        movi    a2, 0
#if XCHAL_HAVE_LOOPS
        loopgtz a11, 2f
#else
        beqz    a11, 2f
        add     a11, a11, a12   /* a11 = ending address */
.Leloop:
#endif
        s8i     a2, a12, 0
        addi    a12, a12, 1
#if !XCHAL_HAVE_LOOPS
        blt     a12, a11, .Leloop
#endif
2:
        retw

11:
        movi    a2, -EFAULT
        s32i    a2, a7, 0       /* dst_err_ptr */
        movi    a2, 0
        retw

.previous