linux/arch/xtensa/lib/checksum.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)        \
        add     sum, sum, val   ; \
        bgeu    sum, val, 99f   ; \
        addi    sum, sum, 1     ; \
99:                             ;

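/*
 * ONES_ADD is an end-around-carry add: if the 32-bit addition wraps
 * around, the carry is folded back into the result.  A minimal C sketch
 * of the same step (the helper name is illustrative, not part of this
 * file):
 *
 *      static inline unsigned int ones_add(unsigned int sum, unsigned int val)
 *      {
 *              sum += val;
 *              if (sum < val)
 *                      sum++;
 *              return sum;
 *      }
 */
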
.text
ENTRY(csum_partial)

        /*
         * Experiments with Ethernet and SLIP connections show that buf
         * is aligned on either a 2-byte or 4-byte boundary.
         */
        abi_entry_default
        extui   a5, a2, 0, 2
        bnez    a5, 8f          /* branch if 2-byte aligned */
        /* Fall-through on common case, 4-byte alignment */
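        /*
         * The length is consumed in four stages below: 32-byte chunks,
         * then up to seven 4-byte chunks, then an optional 2-byte chunk
         * and an optional trailing byte.  In other words (a sketch of
         * the arithmetic, not code from this file):
         *
         *      len = 32*(len >> 5) + 4*((len >> 2) & 7)
         *            + 2*((len >> 1) & 1) + (len & 1)
         */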
1:
        srli    a5, a3, 5       /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 5
        add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
.Loop1:
#endif
        l32i    a6, a2, 0
        l32i    a7, a2, 4
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 8
        l32i    a7, a2, 12
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 16
        l32i    a7, a2, 20
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 24
        l32i    a7, a2, 28
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        addi    a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop1
#endif
2:
        extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 3f
#else
        beqz    a5, 3f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop2:
#endif
        l32i    a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop2
#endif
3:
        _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
        l16ui   a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 2
5:
        _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
6:      l8ui    a6, a2, 0
#ifdef __XTENSA_EB__
        slli    a6, a6, 8       /* load byte into bits 8..15 */
#endif
        ONES_ADD(a4, a6)
7:
        mov     a2, a4
        abi_ret_default

        /* uncommon case, buf is 2-byte aligned */
8:
        beqz    a3, 7b          /* branch if len == 0 */
        beqi    a3, 1, 6b       /* branch if len == 1 */

        extui   a5, a2, 0, 1
        bnez    a5, 8f          /* branch if 1-byte aligned */

        l16ui   a6, a2, 0       /* common case, len >= 2 */
        ONES_ADD(a4, a6)
        addi    a2, a2, 2       /* adjust buf */
        addi    a3, a3, -2      /* adjust len */
        j       1b              /* now buf is 4-byte aligned */

        /* case: odd-byte aligned, len > 1
         * This case is dog slow, so don't give us an odd address.
         * (I don't think this ever happens, but just in case.)
         */
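        /* In C terms, each iteration below reassembles one unaligned
         * word and adds it in; on little endian roughly (a sketch, not
         * code from this file):
         *
         *      word = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
         *      sum = ones_add(sum, word);
         *
         * with the shift amounts mirrored for big endian.
         */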
8:
        srli    a5, a3, 2       /* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop3:
#endif
        l8ui    a6, a2, 0       /* bits 24..31 */
        l16ui   a7, a2, 1       /* bits  8..23 */
        l8ui    a8, a2, 3       /* bits  0.. 7 */
#ifdef  __XTENSA_EB__
        slli    a6, a6, 24
#else
        slli    a8, a8, 24
#endif
        slli    a7, a7, 8
        or      a7, a7, a6
        or      a7, a7, a8
        ONES_ADD(a4, a7)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop3
#endif
2:
        _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
        l8ui    a6, a2, 0
        l8ui    a7, a2, 1
#ifdef  __XTENSA_EB__
        slli    a6, a6, 8
#else
        slli    a7, a7, 8
#endif
        or      a7, a7, a6
        ONES_ADD(a4, a7)
        addi    a2, a2, 2
3:
        j       5b              /* branch to handle the remaining byte */

ENDPROC(csum_partial)

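/*
 * For reference: what is accumulated above is an Internet-style ones'
 * complement sum (RFC 1071).  A minimal, portable C sketch of the
 * folded 16-bit semantics, little-endian byte order shown, ignoring
 * the alignment handling done above (illustrative only; the unfolded
 * 32-bit partial sum returned by the assembly need not match this
 * bit-for-bit until it is folded):
 *
 *      unsigned short csum16_ref(const unsigned char *buf, int len)
 *      {
 *              unsigned long sum = 0;
 *
 *              while (len > 1) {
 *                      sum += buf[0] | (buf[1] << 8);
 *                      buf += 2;
 *                      len -= 2;
 *              }
 *              if (len)
 *                      sum += buf[0];
 *              while (sum >> 16)
 *                      sum = (sum & 0xffff) + (sum >> 16);
 *              return sum;     (the final inversion is left to csum_fold)
 *      }
 */
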
/*
 * Copy from ds while checksumming, otherwise like csum_partial
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len)
        a2  = src
        a3  = dst
        a4  = len
        a5  = sum
        a8  = temp
        a9  = temp
        a10 = temp

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.
 */

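/*
 * Note on the return value: a5 (the running sum) is seeded with -1 (~0)
 * below, so a successful call never returns 0.  If one of the EX(10f)-
 * annotated accesses faults, the fixup handler at 10: returns 0 instead,
 * which lets callers (e.g. csum_and_copy_from_user()) treat a return of
 * 0 as "the copy faulted".
 */
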
ENTRY(csum_partial_copy_generic)

        abi_entry_default
        movi    a5, -1
        or      a10, a2, a3

        /* We optimize the following alignment tests for the 4-byte
        aligned case.  Two bbsi.l instructions might seem more optimal
        (commented out below).  However, both labels 5: and 3: are out
        of the imm8 range, so the assembler relaxes them into
        equivalent bbci.l, j combinations, which is actually
        slower. */

        extui   a9, a10, 0, 2
        beqz    a9, 1f          /* branch if both are 4-byte aligned */
        bbsi.l  a10, 0, 5f      /* branch if one address is odd */
        j       3f              /* one address is 2-byte aligned */

/*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
/*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */

1:
        /* src and dst are both 4-byte aligned */
        srli    a10, a4, 5      /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 2f
#else
        beqz    a10, 2f
        slli    a10, a10, 5
        add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f) l32i    a9, a2, 0
EX(10f) l32i    a8, a2, 4
EX(10f) s32i    a9, a3, 0
EX(10f) s32i    a8, a3, 4
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 8
EX(10f) l32i    a8, a2, 12
EX(10f) s32i    a9, a3, 8
EX(10f) s32i    a8, a3, 12
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 16
EX(10f) l32i    a8, a2, 20
EX(10f) s32i    a9, a3, 16
EX(10f) s32i    a8, a3, 20
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 24
EX(10f) l32i    a8, a2, 28
EX(10f) s32i    a9, a3, 24
EX(10f) s32i    a8, a3, 28
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
        addi    a2, a2, 32
        addi    a3, a3, 32
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop5
#endif
2:
        extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
        extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 3f
#else
        beqz    a10, 3f
        slli    a10, a10, 2
        add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f) l32i    a9, a2, 0
EX(10f) s32i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 4
        addi    a3, a3, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop6
#endif
3:
        /*
        Control comes to here in two cases: (1) It may fall through
        to here from the 4-byte alignment case to process, at most,
        one 2-byte chunk.  (2) It branches to here from above if
        either src or dst is 2-byte aligned, and we process all bytes
        here, except for perhaps a trailing odd byte.  It's
        inefficient, so align your addresses to 4-byte boundaries.

        a2 = src
        a3 = dst
        a4 = len
        a5 = sum
        */
        srli    a10, a4, 1      /* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 4f
#else
        beqz    a10, 4f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f) l16ui   a9, a2, 0
EX(10f) s16i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop7
#endif
4:
        /* This section processes a possible trailing odd byte. */
        _bbci.l a4, 0, 8f       /* 1-byte chunk */
EX(10f) l8ui    a9, a2, 0
EX(10f) s8i     a9, a3, 0
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* shift byte to bits 8..15 */
#endif
        ONES_ADD(a5, a9)
8:
        mov     a2, a5
        abi_ret_default

5:
        /* Control branches here when either src or dst is odd.  We
        process all bytes using 8-bit accesses.  Grossly inefficient,
        so don't feed us an odd address. */

        srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 6f
#else
        beqz    a10, 6f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f) l8ui    a9, a2, 0
EX(10f) l8ui    a8, a2, 1
EX(10f) s8i     a9, a3, 0
EX(10f) s8i     a8, a3, 1
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* combine into a single 16-bit value */
#else                           /* for checksum computation */
        slli    a8, a8, 8
#endif
        or      a9, a9, a8
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop8
#endif
6:
        j       4b              /* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
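/*
 * Each EX(10f)-annotated load/store above has an exception-table entry
 * pointing here: if one of those accesses faults, execution resumes at
 * 10: below, which abandons the copy and returns 0 (never a valid sum,
 * since the sum was seeded with -1).
 */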
.section .fixup, "ax"
10:
        movi    a2, 0
        abi_ret_default

.previous