linux/arch/x86/lib/checksum_32.S
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IP/TCP/UDP checksumming routines
 *
 * Authors:     Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Tom May, <ftom@netcom.com>
 *              Pentium Pro/II routines:
 *              Alexander Kjeldaas <astor@guardian.no>
 *              Finn Arne Gangstad <finnag@guardian.no>
 *              Lots of code moved from tcp.c and ip.c; see those files
 *              for more names.
 *
 * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 *                           handling.
 *              Andi Kleen,  add zeroing on error
 *                           converted to pure assembler
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/asm.h>
#include <asm/export.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
unsigned int csum_partial(const unsigned char *buff, int len, unsigned int sum)
 */
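
/*
 * Illustrative only (not part of this file): csum_partial() returns a 32-bit
 * partial sum; a caller still folds it into the final 16-bit Internet
 * checksum.  A minimal C sketch of such a fold, using a hypothetical helper
 * name fold_to_16():
 *
 *      static inline unsigned short fold_to_16(unsigned int sum)
 *      {
 *              sum = (sum & 0xffff) + (sum >> 16);     // add carries into low half
 *              sum = (sum & 0xffff) + (sum >> 16);     // at most one carry remains
 *              return (unsigned short)~sum;            // one's-complement result
 *      }
 */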

.text

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

/*
 * Experiments with Ethernet and SLIP connections show that buff
 * is aligned on either a 2-byte or 4-byte boundary.  We get at
 * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
 * Fortunately, it is easy to convert 2-byte alignment to 4-byte
 * alignment for the unrolled loop.
 */
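
/*
 * Rough C sketch of the 2-byte -> 4-byte alignment conversion done below
 * (illustrative only; the variable names are made up, not kernel code):
 *
 *      if (((unsigned long)buff & 3) == 2 && len >= 2) {
 *              sum += *(const unsigned short *)buff;   // checksum one 16-bit word
 *              buff += 2;                              // now 4-byte aligned
 *              len  -= 2;                              // unrolled loop sees aligned data
 *      }
 */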
ENTRY(csum_partial)
        pushl %esi
        pushl %ebx
        movl 20(%esp),%eax      # Function arg: unsigned int sum
        movl 16(%esp),%ecx      # Function arg: int len
        movl 12(%esp),%esi      # Function arg: unsigned char *buff
        testl $3, %esi          # Check alignment.
        jz 2f                   # Jump if alignment is ok.
        testl $1, %esi          # Check alignment.
        jz 10f                  # Jump if buff is 2-byte aligned.

        # buff is odd: checksum one byte and rotate the sum so the remaining,
        # now even-aligned, words land in the right byte lanes (rotated back
        # at the end if buff was odd)
        dec %ecx
        jl 8f
        movzbl (%esi), %ebx
        adcl %ebx, %eax
        roll $8, %eax
        inc %esi
        testl $2, %esi
        jz 2f
10:
        subl $2, %ecx           # Alignment uses up two bytes.
        jae 1f                  # Jump if we had at least two bytes.
        addl $2, %ecx           # ecx was < 2.  Deal with it.
        jmp 4f
1:      movw (%esi), %bx
        addl $2, %esi
        addw %bx, %ax
        adcl $0, %eax
2:
        movl %ecx, %edx
        shrl $5, %ecx           # ecx = number of 32-byte blocks
        jz 2f
        testl %esi, %esi        # clear CF before the adcl chain
1:      movl (%esi), %ebx
        adcl %ebx, %eax
        movl 4(%esi), %ebx
        adcl %ebx, %eax
        movl 8(%esi), %ebx
        adcl %ebx, %eax
        movl 12(%esi), %ebx
        adcl %ebx, %eax
        movl 16(%esi), %ebx
        adcl %ebx, %eax
        movl 20(%esi), %ebx
        adcl %ebx, %eax
        movl 24(%esi), %ebx
        adcl %ebx, %eax
        movl 28(%esi), %ebx
        adcl %ebx, %eax
        lea 32(%esi), %esi
        dec %ecx
        jne 1b
        adcl $0, %eax
2:      movl %edx, %ecx
        andl $0x1c, %edx
        je 4f
        shrl $2, %edx           # This clears CF
3:      adcl (%esi), %eax
        lea 4(%esi), %esi
        dec %edx
        jne 3b
        adcl $0, %eax
4:      andl $3, %ecx
        jz 7f
        cmpl $2, %ecx
        jb 5f
        movw (%esi),%cx
        leal 2(%esi),%esi
        je 6f
        shll $16,%ecx
5:      movb (%esi),%cl
6:      addl %ecx,%eax
        adcl $0, %eax
7:
        testb $1, 12(%esp)      # was buff odd?
        jz 8f
        roll $8, %eax           # rotate back: undoes the earlier byte-lane shift
8:
        popl %ebx
        popl %esi
        ret
ENDPROC(csum_partial)

#else

/* Version for PentiumII/PPro */

ENTRY(csum_partial)
        pushl %esi
        pushl %ebx
        movl 20(%esp),%eax      # Function arg: unsigned int sum
        movl 16(%esp),%ecx      # Function arg: int len
        movl 12(%esp),%esi      # Function arg: const unsigned char *buf

        testl $3, %esi          # Check alignment.
        jnz 25f                 # Not 4-byte aligned: fix that up first.
10:
        movl %ecx, %edx         # save len for the tail bytes
        movl %ecx, %ebx
        andl $0x7c, %ebx        # bytes in the partial (len % 128) block, rounded down to dwords
        shrl $7, %ecx           # number of full 128-byte blocks
        addl %ebx,%esi          # advance esi past the partial block (the loop uses negative offsets)
        shrl $2, %ebx           # dwords in the partial block
        negl %ebx
        lea 45f(%ebx,%ebx,2), %ebx      # each adcl below is 3 bytes: enter 3*dwords before 45:
        testl %esi, %esi        # clear CF before the adcl chain
        jmp *%ebx

        # Handle 2-byte-aligned regions
20:     addw (%esi), %ax
        lea 2(%esi), %esi
        adcl $0, %eax
        jmp 10b
25:
        testl $1, %esi          # Check alignment.
        jz 30f                  # 2-byte aligned.
        # buff is odd
        dec %ecx
        jl 90f
        movzbl (%esi), %ebx
        addl %ebx, %eax
        adcl $0, %eax
        roll $8, %eax
        inc %esi
        testl $2, %esi
        jz 10b

30:     subl $2, %ecx           # two or more bytes left?
        ja 20b                  # more than two: checksum a word and retry
        je 32f                  # exactly two
        addl $2, %ecx
        jz 80f                  # nothing left
        movzbl (%esi),%ebx      # csumming 1 byte, 2-aligned
        addl %ebx, %eax
        adcl $0, %eax
        jmp 80f
32:
        addw (%esi), %ax        # csumming 2 bytes, 2-aligned
        adcl $0, %eax
        jmp 80f

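        /*
         * 128-byte unrolled loop.  %esi was advanced past the partial
         * (len % 128) block above, so the adcl chain uses negative offsets;
         * the indirect jump enters the chain part-way through to consume
         * just that partial block on the first pass.
         */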
40:
        addl -128(%esi), %eax
        adcl -124(%esi), %eax
        adcl -120(%esi), %eax
        adcl -116(%esi), %eax
        adcl -112(%esi), %eax
        adcl -108(%esi), %eax
        adcl -104(%esi), %eax
        adcl -100(%esi), %eax
        adcl -96(%esi), %eax
        adcl -92(%esi), %eax
        adcl -88(%esi), %eax
        adcl -84(%esi), %eax
        adcl -80(%esi), %eax
        adcl -76(%esi), %eax
        adcl -72(%esi), %eax
        adcl -68(%esi), %eax
        adcl -64(%esi), %eax
        adcl -60(%esi), %eax
        adcl -56(%esi), %eax
        adcl -52(%esi), %eax
        adcl -48(%esi), %eax
        adcl -44(%esi), %eax
        adcl -40(%esi), %eax
        adcl -36(%esi), %eax
        adcl -32(%esi), %eax
        adcl -28(%esi), %eax
        adcl -24(%esi), %eax
        adcl -20(%esi), %eax
        adcl -16(%esi), %eax
        adcl -12(%esi), %eax
        adcl -8(%esi), %eax
        adcl -4(%esi), %eax
45:
        lea 128(%esi), %esi
        adcl $0, %eax
        dec %ecx
        jge 40b
        movl %edx, %ecx
50:     andl $3, %ecx
        jz 80f

        # Handle the last 1-3 bytes without jumping
        notl %ecx               # 1->2, 2->1, 3->0, higher bits are masked
        movl $0xffffff,%ebx     # by the shll and shrl instructions
        shll $3,%ecx            # e.g. 1 byte left: shift = 16, mask = 0x0000ff;
        shrl %cl,%ebx           #      3 bytes left: shift = 0,  mask = 0xffffff
        andl -128(%esi),%ebx    # esi is 4-aligned so should be ok
        addl %ebx,%eax
        adcl $0,%eax
80:
        testb $1, 12(%esp)      # was buff odd?
        jz 90f
        roll $8, %eax           # rotate back: undoes the earlier byte-lane shift
90:
        popl %ebx
        popl %esi
        ret
ENDPROC(csum_partial)

#endif
EXPORT_SYMBOL(csum_partial)

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
                                  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
 */
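
/*
 * Hedged usage sketch (illustrative only, not an actual caller in this file):
 * the two error pointers are how faults are reported back to the caller.
 *
 *      int src_err = 0, dst_err = 0;
 *      sum = csum_partial_copy_generic(src, dst, len, sum, &src_err, &dst_err);
 *      if (src_err || dst_err)
 *              return -EFAULT;         // the fixup code stored -EFAULT there
 */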

/*
 * Copy from ds while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so the fixup code can tell source faults from destination faults.
 *
 * FIXME: could someone double-check whether I haven't mixed up some SRC and
 *        DST definitions? It's damn hard to trigger all cases.  I hope I got
 *        them all but there's no guarantee.
 */

#define SRC(y...)                       \
        9999: y;                        \
        _ASM_EXTABLE(9999b, 6001f)

#define DST(y...)                       \
        9999: y;                        \
        _ASM_EXTABLE(9999b, 6002f)
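
/*
 * Roughly, SRC(movl (%esi), %ebx) expands to the tagged instruction plus an
 * exception-table entry (the exact record layout depends on _ASM_EXTABLE):
 *
 *      9999:   movl (%esi), %ebx
 *      # __ex_table entry: a fault at 9999b resumes at 6001f (source error);
 *      # DST() likewise resumes at 6002f (destination error).
 */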

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

#define ARGBASE 16
#define FP              12

ENTRY(csum_partial_copy_generic)
        subl  $4,%esp
        pushl %edi
        pushl %esi
        pushl %ebx
        movl ARGBASE+16(%esp),%eax      # sum
        movl ARGBASE+12(%esp),%ecx      # len
        movl ARGBASE+4(%esp),%esi       # src
        movl ARGBASE+8(%esp),%edi       # dst

        testl $2, %edi                  # Check alignment.
        jz 2f                           # Jump if alignment is ok.
        subl $2, %ecx                   # Alignment uses up two bytes.
        jae 1f                          # Jump if we had at least two bytes.
        addl $2, %ecx                   # ecx was < 2.  Deal with it.
        jmp 4f
SRC(1:  movw (%esi), %bx        )
        addl $2, %esi
DST(    movw %bx, (%edi)        )
        addl $2, %edi
        addw %bx, %ax
        adcl $0, %eax
2:
        movl %ecx, FP(%esp)     # save len for the remainder and tail
        shrl $5, %ecx           # ecx = number of 32-byte blocks
        jz 2f
        testl %esi, %esi        # clear CF before the adcl chain
SRC(1:  movl (%esi), %ebx       )
SRC(    movl 4(%esi), %edx      )
        adcl %ebx, %eax
DST(    movl %ebx, (%edi)       )
        adcl %edx, %eax
DST(    movl %edx, 4(%edi)      )

SRC(    movl 8(%esi), %ebx      )
SRC(    movl 12(%esi), %edx     )
        adcl %ebx, %eax
DST(    movl %ebx, 8(%edi)      )
        adcl %edx, %eax
DST(    movl %edx, 12(%edi)     )

SRC(    movl 16(%esi), %ebx     )
SRC(    movl 20(%esi), %edx     )
        adcl %ebx, %eax
DST(    movl %ebx, 16(%edi)     )
        adcl %edx, %eax
DST(    movl %edx, 20(%edi)     )

SRC(    movl 24(%esi), %ebx     )
SRC(    movl 28(%esi), %edx     )
        adcl %ebx, %eax
DST(    movl %ebx, 24(%edi)     )
        adcl %edx, %eax
DST(    movl %edx, 28(%edi)     )

        lea 32(%esi), %esi
        lea 32(%edi), %edi
        dec %ecx
        jne 1b
        adcl $0, %eax
2:      movl FP(%esp), %edx
        movl %edx, %ecx
        andl $0x1c, %edx
        je 4f
        shrl $2, %edx                   # This clears CF
SRC(3:  movl (%esi), %ebx       )
        adcl %ebx, %eax
DST(    movl %ebx, (%edi)       )
        lea 4(%esi), %esi
        lea 4(%edi), %edi
        dec %edx
        jne 3b
        adcl $0, %eax
4:      andl $3, %ecx
        jz 7f
        cmpl $2, %ecx
        jb 5f
SRC(    movw (%esi), %cx        )
        leal 2(%esi), %esi
DST(    movw %cx, (%edi)        )
        leal 2(%edi), %edi
        je 6f
        shll $16,%ecx
SRC(5:  movb (%esi), %cl        )
DST(    movb %cl, (%edi)        )
6:      addl %ecx, %eax
        adcl $0, %eax
7:
5000:

# Exception handler:
.section .fixup, "ax"

6001:
        movl ARGBASE+20(%esp), %ebx     # src_err_ptr
        movl $-EFAULT, (%ebx)

        # zero the complete destination - computing the rest
        # is too much work
        movl ARGBASE+8(%esp), %edi      # dst
        movl ARGBASE+12(%esp), %ecx     # len
        xorl %eax,%eax
        rep ; stosb

        jmp 5000b

6002:
        movl ARGBASE+24(%esp), %ebx     # dst_err_ptr
        movl $-EFAULT,(%ebx)
        jmp 5000b

.previous

        popl %ebx
        popl %esi
        popl %edi
        popl %ecx                       # equivalent to addl $4,%esp
        ret
ENDPROC(csum_partial_copy_generic)

#else

/* Version for PentiumII/PPro */

#define ROUND1(x) \
        SRC(movl x(%esi), %ebx  )       ;       \
        addl %ebx, %eax                 ;       \
        DST(movl %ebx, x(%edi)  )       ;

#define ROUND(x) \
        SRC(movl x(%esi), %ebx  )       ;       \
        adcl %ebx, %eax                 ;       \
        DST(movl %ebx, x(%edi)  )       ;

#define ARGBASE 12

ENTRY(csum_partial_copy_generic)
        pushl %ebx
        pushl %edi
        pushl %esi
        movl ARGBASE+4(%esp),%esi       #src
        movl ARGBASE+8(%esp),%edi       #dst
        movl ARGBASE+12(%esp),%ecx      #len
        movl ARGBASE+16(%esp),%eax      #sum
#       movl %ecx, %edx
        movl %ecx, %ebx
        movl %esi, %edx
        shrl $6, %ecx           # number of full 64-byte blocks
        andl $0x3c, %ebx        # bytes in the partial (len % 64) block, rounded down to dwords
        negl %ebx
        subl %ebx, %esi         # advance esi/edi past the partial block
        subl %ebx, %edi         # (the ROUNDs below use negative offsets)
        lea  -1(%esi),%edx
        andl $-32,%edx          # edx: 32-byte-aligned cursor for the source pre-reads
        lea 3f(%ebx,%ebx), %ebx # each ROUND is 8 bytes: enter one ROUND per dword before 3:
        testl %esi, %esi        # clear CF before the adcl chain
        jmp *%ebx
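        /*
         * 64-byte unrolled copy+checksum loop.  %esi/%edi were advanced past
         * the partial (len % 64) block above, so the ROUNDs use negative
         * offsets; the indirect jump enters the chain part-way through to
         * handle just that partial block on the first pass.
         */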
1:      addl $64,%esi
        addl $64,%edi
        SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)  # pre-read two source cache lines
        ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
        ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
        ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
        ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)
3:      adcl $0,%eax
        addl $64, %edx
        dec %ecx
        jge 1b
4:      movl ARGBASE+12(%esp),%edx      #len
        andl $3, %edx
        jz 7f
        cmpl $2, %edx
        jb 5f
SRC(    movw (%esi), %dx         )
        leal 2(%esi), %esi
DST(    movw %dx, (%edi)         )
        leal 2(%edi), %edi
        je 6f
        shll $16,%edx
5:
SRC(    movb (%esi), %dl         )
DST(    movb %dl, (%edi)         )
6:      addl %edx, %eax
        adcl $0, %eax
7:
.section .fixup, "ax"
6001:   movl ARGBASE+20(%esp), %ebx     # src_err_ptr
        movl $-EFAULT, (%ebx)
        # zero the complete destination (computing the rest is too much work)
        movl ARGBASE+8(%esp),%edi       # dst
        movl ARGBASE+12(%esp),%ecx      # len
        xorl %eax,%eax
        rep; stosb
        jmp 7b
6002:   movl ARGBASE+24(%esp), %ebx     # dst_err_ptr
        movl $-EFAULT, (%ebx)
        jmp  7b
.previous

        popl %esi
        popl %edi
        popl %ebx
        ret
ENDPROC(csum_partial_copy_generic)

#undef ROUND
#undef ROUND1

#endif
EXPORT_SYMBOL(csum_partial_copy_generic)