linux/arch/x86/um/checksum_32.S
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IP/TCP/UDP checksumming routines
 *
 * Authors:     Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Tom May, <ftom@netcom.com>
 *              Pentium Pro/II routines:
 *              Alexander Kjeldaas <astor@guardian.no>
 *              Finn Arne Gangstad <finnag@guardian.no>
 *              Lots of code moved from tcp.c and ip.c; see those files
 *              for more names.
 *
 * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 *                           handling.
 *              Andi Kleen,  add zeroing on error
 *                   converted to pure assembler
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>
#include <asm/asm.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 */
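
/*
 * For reference only: a rough C model of the value computed below
 * (an illustrative sketch, not kernel API; csum_partial_ref is a
 * hypothetical name).  Ones' complement addition is associative, so
 * summing 16-bit words with end-around carry matches the 32-bit adcl
 * chains below up to the final 16-bit fold done by the callers:
 *
 *	unsigned int csum_partial_ref(const unsigned char *buff, int len,
 *				      unsigned int sum)
 *	{
 *		unsigned long long acc = sum;
 *
 *		while (len >= 2) {
 *			acc += buff[0] | (buff[1] << 8);  // LE 16-bit word
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len > 0)
 *			acc += buff[0];		// trailing byte, low half
 *		while (acc >> 32)		// fold carries back in
 *			acc = (acc & 0xffffffffULL) + (acc >> 32);
 *		return (unsigned int)acc;
 *	}
 */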

.text
.align 4
.globl csum_partial

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

        /*
         * Experiments with Ethernet and SLIP connections show that buff
         * is aligned on either a 2-byte or 4-byte boundary.  We get at
         * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
         * Fortunately, it is easy to convert 2-byte alignment to 4-byte
         * alignment for the unrolled loop.
         */
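        /*
         * In C, that fixup amounts to (illustrative fragment, assuming
         * the 64-bit accumulator from the sketch above):
         *
         *	if (((unsigned long)buff & 2) && len >= 2) {
         *		acc += buff[0] | (buff[1] << 8);
         *		buff += 2;		// now 4-byte aligned
         *		len -= 2;
         *	}
         */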
csum_partial:
        pushl %esi
        pushl %ebx
        movl 20(%esp),%eax      # Function arg: unsigned int sum
        movl 16(%esp),%ecx      # Function arg: int len
        movl 12(%esp),%esi      # Function arg: unsigned char *buff
        testl $2, %esi          # Check alignment.
        jz 2f                   # Jump if alignment is ok.
        subl $2, %ecx           # Alignment uses up two bytes.
        jae 1f                  # Jump if we had at least two bytes.
        addl $2, %ecx           # ecx was < 2.  Deal with it.
        jmp 4f
1:      movw (%esi), %bx
        addl $2, %esi
        addw %bx, %ax
        adcl $0, %eax
2:
        movl %ecx, %edx
        shrl $5, %ecx
        jz 2f
        testl %esi, %esi        # set carry flag to 0 before the adcl chain
1:      movl (%esi), %ebx
        adcl %ebx, %eax
        movl 4(%esi), %ebx
        adcl %ebx, %eax
        movl 8(%esi), %ebx
        adcl %ebx, %eax
        movl 12(%esi), %ebx
        adcl %ebx, %eax
        movl 16(%esi), %ebx
        adcl %ebx, %eax
        movl 20(%esi), %ebx
        adcl %ebx, %eax
        movl 24(%esi), %ebx
        adcl %ebx, %eax
        movl 28(%esi), %ebx
        adcl %ebx, %eax
        lea 32(%esi), %esi
        dec %ecx
        jne 1b
        adcl $0, %eax
2:      movl %edx, %ecx
        andl $0x1c, %edx
        je 4f
        shrl $2, %edx           # This clears CF
3:      adcl (%esi), %eax
        lea 4(%esi), %esi
        dec %edx
        jne 3b
        adcl $0, %eax
4:      andl $3, %ecx
        jz 7f
        cmpl $2, %ecx
        jb 5f
        movw (%esi),%cx
        leal 2(%esi),%esi
        je 6f
        shll $16,%ecx
5:      movb (%esi),%cl
6:      addl %ecx,%eax
        adcl $0, %eax
7:
        popl %ebx
        popl %esi
        ret

#else

/* Version for PentiumII/PPro */

csum_partial:
        pushl %esi
        pushl %ebx
        movl 20(%esp),%eax      # Function arg: unsigned int sum
        movl 16(%esp),%ecx      # Function arg: int len
        movl 12(%esp),%esi      # Function arg: const unsigned char *buf

        testl $2, %esi
        jnz 30f
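        /*
         * The block below dispatches with a computed jump: %ecx gets
         * the number of whole 128-byte blocks, and the remaining
         * (len & 0x7c) bytes are summed by entering the adcl chain
         * part-way through.  Each "adcl off(%esi), %eax" with an
         * 8-bit displacement assembles to 3 bytes, so
         * lea 45f(%ebx,%ebx,2) with %ebx = -ndwords computes
         * 45f - 3*ndwords, the entry point with exactly ndwords
         * additions left; the testl clears CF before the first adcl.
         */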
10:
        movl %ecx, %edx
        movl %ecx, %ebx
        andl $0x7c, %ebx
        shrl $7, %ecx
        addl %ebx,%esi
        shrl $2, %ebx
        negl %ebx
        lea 45f(%ebx,%ebx,2), %ebx
        testl %esi, %esi        # set carry flag to 0
        jmp *%ebx

        # Handle 2-byte-aligned regions
20:     addw (%esi), %ax
        lea 2(%esi), %esi
        adcl $0, %eax
        jmp 10b

30:     subl $2, %ecx
        ja 20b
        je 32f
        movzbl (%esi),%ebx      # csumming 1 byte, 2-aligned
        addl %ebx, %eax
        adcl $0, %eax
        jmp 80f
32:
        addw (%esi), %ax        # csumming 2 bytes, 2-aligned
        adcl $0, %eax
        jmp 80f

40:
        addl -128(%esi), %eax
        adcl -124(%esi), %eax
        adcl -120(%esi), %eax
        adcl -116(%esi), %eax
        adcl -112(%esi), %eax
        adcl -108(%esi), %eax
        adcl -104(%esi), %eax
        adcl -100(%esi), %eax
        adcl -96(%esi), %eax
        adcl -92(%esi), %eax
        adcl -88(%esi), %eax
        adcl -84(%esi), %eax
        adcl -80(%esi), %eax
        adcl -76(%esi), %eax
        adcl -72(%esi), %eax
        adcl -68(%esi), %eax
        adcl -64(%esi), %eax
        adcl -60(%esi), %eax
        adcl -56(%esi), %eax
        adcl -52(%esi), %eax
        adcl -48(%esi), %eax
        adcl -44(%esi), %eax
        adcl -40(%esi), %eax
        adcl -36(%esi), %eax
        adcl -32(%esi), %eax
        adcl -28(%esi), %eax
        adcl -24(%esi), %eax
        adcl -20(%esi), %eax
        adcl -16(%esi), %eax
        adcl -12(%esi), %eax
        adcl -8(%esi), %eax
        adcl -4(%esi), %eax
45:
        lea 128(%esi), %esi
        adcl $0, %eax
        dec %ecx
        jge 40b
        movl %edx, %ecx
50:     andl $3, %ecx
        jz 80f

        # Handle the last 1-3 bytes without jumping
        notl %ecx               # 1->2, 2->1, 3->0, higher bits are masked
        movl $0xffffff,%ebx     # by the shll and shrl instructions
        shll $3,%ecx
        shrl %cl,%ebx
        andl -128(%esi),%ebx    # esi is 4-aligned so should be ok
        addl %ebx,%eax
        adcl $0,%eax
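        /*
         * Equivalent C for the masked tail load above (an illustrative
         * sketch; tail32 is a hypothetical name).  With n = len & 3 in
         * 1..3 and a 4-byte-aligned pointer p, (~n << 3) & 31 equals
         * 8*(3 - n), so the notl/shll/shrl sequence builds a mask that
         * keeps only the low n bytes of the last dword:
         *
         *	unsigned int tail32(const unsigned int *p, unsigned int n)
         *	{
         *		unsigned int mask = 0xffffffu >> (8 * (3 - n));
         *		return *p & mask;	// low n bytes only
         *	}
         */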
80:
        popl %ebx
        popl %esi
        ret

#endif

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
                                  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
 */
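
/*
 * An illustrative C model of the calling contract (a sketch only;
 * csum_copy_ref is a hypothetical name, the arguments are the ones
 * above): data is copied and checksummed in a single pass, a faulting
 * read reports through *src_err_ptr and zeroes the whole destination,
 * and a faulting write reports through *dst_err_ptr:
 *
 *	unsigned int csum_copy_ref(const char *src, char *dst, int len,
 *				   int sum, int *src_err_ptr, int *dst_err_ptr)
 *	{
 *		// on a src fault: *src_err_ptr = -EFAULT; memset(dst, 0, len);
 *		// on a dst fault: *dst_err_ptr = -EFAULT;
 *		memcpy(dst, src, len);
 *		return csum_partial(dst, len, sum);
 *	}
 */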

/*
 * Copy from ds while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so we can attach a custom exception handler to each access type.
 *
 * FIXME: could someone double-check whether I haven't mixed up some SRC and
 *        DST definitions? It's damn hard to trigger all cases.  I hope I got
 *        them all but there's no guarantee.
 */

#define SRC(y...)                       \
        9999: y;                        \
        _ASM_EXTABLE(9999b, 6001f)

#define DST(y...)                       \
        9999: y;                        \
        _ASM_EXTABLE(9999b, 6002f)
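
/*
 * _ASM_EXTABLE(9999b, 6001f) records an exception-table entry pairing
 * the possibly-faulting access at 9999 with the fixup code at
 * 6001/6002: if the access faults, the page-fault handler resumes
 * execution at the fixup in .fixup instead of oopsing.  Conceptually
 * each entry is a pair of addresses (a sketch, not the exact layout):
 *
 *	struct exception_table_entry {
 *		unsigned long insn;	// address that may fault
 *		unsigned long fixup;	// where to resume on a fault
 *	};
 */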

.align 4

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

#define ARGBASE 16
#define FP              12

csum_partial_copy_generic_i386:
        subl  $4,%esp
        pushl %edi
        pushl %esi
        pushl %ebx
        movl ARGBASE+16(%esp),%eax      # sum
        movl ARGBASE+12(%esp),%ecx      # len
        movl ARGBASE+4(%esp),%esi       # src
        movl ARGBASE+8(%esp),%edi       # dst

        testl $2, %edi                  # Check alignment.
        jz 2f                           # Jump if alignment is ok.
        subl $2, %ecx                   # Alignment uses up two bytes.
        jae 1f                          # Jump if we had at least two bytes.
        addl $2, %ecx                   # ecx was < 2.  Deal with it.
        jmp 4f
SRC(1:  movw (%esi), %bx        )
        addl $2, %esi
DST(    movw %bx, (%edi)        )
        addl $2, %edi
        addw %bx, %ax
        adcl $0, %eax
2:
        movl %ecx, FP(%esp)
        shrl $5, %ecx
        jz 2f
        testl %esi, %esi                # set carry flag to 0 before the adcl chain
SRC(1:  movl (%esi), %ebx       )
SRC(    movl 4(%esi), %edx      )
        adcl %ebx, %eax
DST(    movl %ebx, (%edi)       )
        adcl %edx, %eax
DST(    movl %edx, 4(%edi)      )

SRC(    movl 8(%esi), %ebx      )
SRC(    movl 12(%esi), %edx     )
        adcl %ebx, %eax
DST(    movl %ebx, 8(%edi)      )
        adcl %edx, %eax
DST(    movl %edx, 12(%edi)     )

SRC(    movl 16(%esi), %ebx     )
SRC(    movl 20(%esi), %edx     )
        adcl %ebx, %eax
DST(    movl %ebx, 16(%edi)     )
        adcl %edx, %eax
DST(    movl %edx, 20(%edi)     )

SRC(    movl 24(%esi), %ebx     )
SRC(    movl 28(%esi), %edx     )
        adcl %ebx, %eax
DST(    movl %ebx, 24(%edi)     )
        adcl %edx, %eax
DST(    movl %edx, 28(%edi)     )

        lea 32(%esi), %esi
        lea 32(%edi), %edi
        dec %ecx
        jne 1b
        adcl $0, %eax
2:      movl FP(%esp), %edx
        movl %edx, %ecx
        andl $0x1c, %edx
        je 4f
        shrl $2, %edx                   # This clears CF
SRC(3:  movl (%esi), %ebx       )
        adcl %ebx, %eax
DST(    movl %ebx, (%edi)       )
        lea 4(%esi), %esi
        lea 4(%edi), %edi
        dec %edx
        jne 3b
        adcl $0, %eax
4:      andl $3, %ecx
        jz 7f
        cmpl $2, %ecx
        jb 5f
SRC(    movw (%esi), %cx        )
        leal 2(%esi), %esi
DST(    movw %cx, (%edi)        )
        leal 2(%edi), %edi
        je 6f
        shll $16,%ecx
SRC(5:  movb (%esi), %cl        )
DST(    movb %cl, (%edi)        )
6:      addl %ecx, %eax
        adcl $0, %eax
7:
5000:

# Exception handler:
.section .fixup, "ax"

6001:
        movl ARGBASE+20(%esp), %ebx     # src_err_ptr
        movl $-EFAULT, (%ebx)

        # zero the complete destination - computing the rest
        # is too much work
        movl ARGBASE+8(%esp), %edi      # dst
        movl ARGBASE+12(%esp), %ecx     # len
        xorl %eax,%eax
        rep ; stosb

        jmp 5000b

6002:
        movl ARGBASE+24(%esp), %ebx     # dst_err_ptr
        movl $-EFAULT,(%ebx)
        jmp 5000b

.previous

        popl %ebx
        popl %esi
        popl %edi
        popl %ecx                       # equivalent to addl $4,%esp
        ret

#else

/* Version for PentiumII/PPro */

#define ROUND1(x) \
        SRC(movl x(%esi), %ebx  )       ;       \
        addl %ebx, %eax                 ;       \
        DST(movl %ebx, x(%edi)  )       ;

#define ROUND(x) \
        SRC(movl x(%esi), %ebx  )       ;       \
        adcl %ebx, %eax                 ;       \
        DST(movl %ebx, x(%edi)  )       ;

#define ARGBASE 12

csum_partial_copy_generic_i386:
        pushl %ebx
        pushl %edi
        pushl %esi
        movl ARGBASE+4(%esp),%esi       #src
        movl ARGBASE+8(%esp),%edi       #dst
        movl ARGBASE+12(%esp),%ecx      #len
        movl ARGBASE+16(%esp),%eax      #sum
#       movl %ecx, %edx
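        /*
         * Computed-jump dispatch, as in csum_partial above: %ecx gets
         * the number of whole 64-byte blocks, and the odd (len & 0x3c)
         * bytes are handled by entering the unrolled loop part-way
         * through.  Each ROUND expands to 8 bytes of code (3-byte
         * load, 2-byte adcl, 3-byte store) per 4 bytes of data, and
         * _ASM_EXTABLE emits into a separate section so it adds no
         * code bytes here; lea 3f(%ebx,%ebx) with %ebx = -(len & 0x3c)
         * thus computes 3f - 2*(len & 0x3c), the entry point with
         * exactly the right number of rounds left.
         */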
        movl %ecx, %ebx
        movl %esi, %edx
        shrl $6, %ecx
        andl $0x3c, %ebx
        negl %ebx
        subl %ebx, %esi
        subl %ebx, %edi
        lea  -1(%esi),%edx
        andl $-32,%edx
        lea 3f(%ebx,%ebx), %ebx
        testl %esi, %esi        # set carry flag to 0
        jmp *%ebx
1:      addl $64,%esi
        addl $64,%edi
        SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)  # read ahead: touch the next cache lines
        ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
        ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
        ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
        ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)
3:      adcl $0,%eax
        addl $64, %edx
        dec %ecx
        jge 1b
4:      movl ARGBASE+12(%esp),%edx      #len
        andl $3, %edx
        jz 7f
        cmpl $2, %edx
        jb 5f
SRC(    movw (%esi), %dx         )
        leal 2(%esi), %esi
DST(    movw %dx, (%edi)         )
        leal 2(%edi), %edi
        je 6f
        shll $16,%edx
5:
SRC(    movb (%esi), %dl         )
DST(    movb %dl, (%edi)         )
6:      addl %edx, %eax
        adcl $0, %eax
7:
.section .fixup, "ax"
6001:   movl    ARGBASE+20(%esp), %ebx  # src_err_ptr
        movl $-EFAULT, (%ebx)
        # zero the complete destination (computing the rest is too much work)
        movl ARGBASE+8(%esp),%edi       # dst
        movl ARGBASE+12(%esp),%ecx      # len
        xorl %eax,%eax
        rep; stosb
        jmp 7b
6002:   movl ARGBASE+24(%esp), %ebx     # dst_err_ptr
        movl $-EFAULT, (%ebx)
        jmp  7b
.previous

        popl %esi
        popl %edi
        popl %ebx
        ret

#undef ROUND
#undef ROUND1

#endif