linux/arch/powerpc/lib/checksum_64.S
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
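/*
 * For reference, a plain C sketch of the same computation (illustrative
 * only, not part of this file's interface; the helper name is invented
 * and it ignores the sub-halfword tail handled by the asm below):
 *
 *	static u32 csum_partial_ref(const u16 *buff, size_t len, u32 sum)
 *	{
 *		u64 s = sum;
 *		for (size_t i = 0; i < len / 2; i++)
 *			s += buff[i];			// carries pile up in the high bits
 *		s = (s & 0xffffffffULL) + (s >> 32);	// fold 64 -> 32
 *		s = (s & 0xffffffffULL) + (s >> 32);	// absorb any new carry
 *		return (u32)s;
 *	}
 *
 * The asm instead sums whole doublewords and uses the CA bit in the XER
 * (via adde/addze) for the end-around carry; the result is equivalent as
 * a ones' complement partial sum.
 */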
_GLOBAL(__csum_partial)
        addic   r0,r5,0                 /* clear carry */

        srdi.   r6,r4,3                 /* less than 8 bytes? */
        beq     .Lcsum_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd
         * aligned addresses should be rare and they would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         */
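        /*
         * The next three instructions compute how far we are into the
         * current doubleword: r6 = (r3 >> 1) & 0x3 is the halfword offset,
         * so 4 - r6 halfword loads bring r3 up to 8-byte alignment.
         */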
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcsum_aligned

        li      r7,4
        sub     r6,r7,r6
        mtctr   r6

1:
        lhz     r6,0(r3)                /* align to doubleword */
        subi    r4,r4,2
        addi    r3,r3,2
        adde    r0,r0,r6
        bdnz    1b

.Lcsum_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
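        /*
         * The count below is len / 64 minus one: the last 64-byte block is
         * handled by the exit limb after the loop rather than inside it.
         */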
        srdi.   r6,r4,7
        beq     .Lcsum_tail_doublewords         /* len < 128 */

        srdi    r6,r4,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        ld      r6,0(r3)
        ld      r9,8(r3)

        ld      r10,16(r3)
        ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back to back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
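        /* (8 dependent adde per 64-byte iteration x 2 cycles = 16 cycles,
         * i.e. roughly 4 bytes per cycle at best.) */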
        .align 5
2:
        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10

        adde    r0,r0,r11

        adde    r0,r0,r12

        adde    r0,r0,r14

        adde    r0,r0,r15
        ld      r6,0(r3)
        ld      r9,8(r3)

        adde    r0,r0,r16
        ld      r10,16(r3)
        ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
        adde    r0,r0,r11
        adde    r0,r0,r12
        adde    r0,r0,r14
        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r4,r4,63

.Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r4,3
        beq     .Lcsum_tail_word

        mtctr   r6
3:
        ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
        bdnz    3b

        andi.   r4,r4,7

.Lcsum_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r4,2
        beq     .Lcsum_tail_halfword

        lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
        subi    r4,r4,4

.Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r4,1
        beq     .Lcsum_tail_byte

        lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
        subi    r4,r4,2

.Lcsum_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r4,1
        beq     .Lcsum_finish

        lbz     r6,0(r3)
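        /*
         * The checksum is defined over 16-bit words, so a trailing lone
         * byte occupies the most significant byte of its halfword on
         * big-endian; on little-endian it already sits in the low byte
         * and can be added as is.
         */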
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif

.Lcsum_finish:
        addze   r0,r0                   /* add in final carry */
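        /*
         * r0 holds a 64-bit running total. Rotating it by 32 bits and
         * adding it to itself leaves (high + low + carry out of the low
         * half) in the upper 32 bits, so the srdi below yields the folded
         * 32-bit partial checksum, returned in r3.
         */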
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr
EXPORT_SYMBOL(__csum_partial)

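/*
 * Each macro below emits an exception table entry for the load or store
 * instruction that immediately follows its invocation, directing a fault
 * to the matching fixup label. The "nr" (no-restore) variants are used
 * outside the unrolled loop, where no non-volatile registers need to be
 * restored before reporting the error.
 */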
        .macro srcnr
100:
        EX_TABLE(100b,.Lsrc_error_nr)
        .endm

        .macro source
150:
        EX_TABLE(150b,.Lsrc_error)
        .endm

        .macro dstnr
200:
        EX_TABLE(200b,.Ldest_error_nr)
        .endm

        .macro dest
250:
        EX_TABLE(250b,.Ldest_error)
        .endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
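/*
 * A caller-side sketch of the error protocol (illustrative only; the
 * variable names are invented for this comment). Either error pointer may
 * be NULL to suppress reporting; when non-NULL it must be pre-initialised,
 * since it is only written on a fault:
 *
 *	int src_err = 0, dst_err = 0;
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err || dst_err)
 *		// the returned csum is not meaningful; zero or retry
 */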
_GLOBAL(csum_partial_copy_generic)
        addic   r0,r6,0                 /* clear carry */

        srdi.   r6,r5,3                 /* less than 8 bytes? */
        beq     .Lcopy_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd
         * aligned addresses should be rare and they would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         *
         * If the source and destination are relatively unaligned we only
         * align the source. This keeps things simple.
         */
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcopy_aligned

        li      r9,4
        sub     r6,r9,r6
        mtctr   r6

1:
srcnr;  lhz     r6,0(r3)                /* align to doubleword */
        subi    r5,r5,2
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        bdnz    1b

.Lcopy_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
        srdi.   r6,r5,7
        beq     .Lcopy_tail_doublewords         /* len < 128 */

        srdi    r6,r5,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

source; ld      r6,0(r3)
source; ld      r9,8(r3)

source; ld      r10,16(r3)
source; ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back to back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
        .align 5
2:
        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
source; ld      r6,0(r3)
source; ld      r9,8(r3)

        adde    r0,r0,r16
source; ld      r10,16(r3)
source; ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r5,r5,63

.Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r5,3
        beq     .Lcopy_tail_word

        mtctr   r6
3:
srcnr;  ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
dstnr;  std     r6,0(r4)
        addi    r4,r4,8
        bdnz    3b

        andi.   r5,r5,7

.Lcopy_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r5,2
        beq     .Lcopy_tail_halfword

srcnr;  lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
dstnr;  stw     r6,0(r4)
        addi    r4,r4,4
        subi    r5,r5,4

.Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r5,1
        beq     .Lcopy_tail_byte

srcnr;  lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        subi    r5,r5,2

.Lcopy_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r5,1
        beq     .Lcopy_finish

srcnr;  lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif
dstnr;  stb     r6,0(r4)

.Lcopy_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr

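/*
 * Faults taken inside the unrolled loop arrive here through the "source"
 * and "dest" macros with the stack frame still live, so the saved
 * non-volatile registers are restored before falling into the common
 * reporting code. A NULL error pointer suppresses the -EFAULT store.
 */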
.Lsrc_error:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
        cmpdi   0,r7,0
        beqlr
        li      r6,-EFAULT
        stw     r6,0(r7)
        blr

.Ldest_error:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
        cmpdi   0,r8,0
        beqlr
        li      r6,-EFAULT
        stw     r6,0(r8)
        blr
EXPORT_SYMBOL(csum_partial_copy_generic)