/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

        .text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 */
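/*
 * For reference, a minimal C sketch of the same computation (an
 * illustration for a big-endian machine, not kernel API: the function
 * name and the use of memcpy() for possibly-unaligned loads are
 * assumptions made here, and a real build would need <string.h>):
 *
 *	unsigned int csum_partial_ref(const unsigned char *p, int len,
 *	                              unsigned int sum)
 *	{
 *		unsigned long long s = sum;	// wide, so carries accumulate
 *
 *		for (; len >= 4; p += 4, len -= 4) {
 *			unsigned int w;
 *			memcpy(&w, p, 4);	// word loads, like lwz below
 *			s += w;
 *		}
 *		if (len >= 2) {
 *			unsigned short h;
 *			memcpy(&h, p, 2);	// trailing halfword, like lhz
 *			s += h;
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)
 *			s += (unsigned int)*p << 8;	// byte in upper lane
 *		s = (s & 0xffffffffu) + (s >> 32);	// fold carries back in
 *		return (unsigned int)((s & 0xffffffffu) + (s >> 32));
 *	}
 *
 * The asm below gets an equivalent one's-complement sum without the
 * 64-bit accumulator by feeding each carry straight into the next adde
 * and adding the last one with addze.
 */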
_GLOBAL(__csum_partial)
        /* args: r3 = buff, r4 = len, r5 = sum; result is returned in r3 */
        subi    r3,r3,4         /* all loads below use offset 4 (with update) */
        srawi.  r6,r4,2         /* Divide len by 4 and also clear carry */
        beq     3f              /* if we're doing < 4 bytes */
        andi.   r0,r3,2         /* Align buffer to longword boundary */
        beq+    1f
        lhz     r0,4(r3)        /* do 2 bytes to get aligned */
        subi    r4,r4,2
        addi    r3,r3,2
        srwi.   r6,r4,2         /* # words to do */
        adde    r5,r5,r0
        beq     3f
1:      andi.   r6,r6,3         /* Prepare to handle words 4 by 4 */
        beq     21f
        mtctr   r6
2:      lwzu    r0,4(r3)
        adde    r5,r5,r0
        bdnz    2b
21:     srwi.   r6,r4,4         /* # blocks of 4 words to do */
        beq     3f
        mtctr   r6
22:     lwz     r0,4(r3)
        lwz     r6,8(r3)
        lwz     r7,12(r3)
        lwzu    r8,16(r3)
        adde    r5,r5,r0
        adde    r5,r5,r6
        adde    r5,r5,r7
        adde    r5,r5,r8
        bdnz    22b
3:      andi.   r0,r4,2         /* trailing halfword? */
        beq+    4f
        lhz     r0,4(r3)
        addi    r3,r3,2
        adde    r5,r5,r0
4:      andi.   r0,r4,1         /* trailing byte? */
        beq+    5f
        lbz     r0,4(r3)
        slwi    r0,r0,8         /* Upper byte of word */
        adde    r5,r5,r0
5:      addze   r3,r5           /* add in final carry */
        blr
EXPORT_SYMBOL(__csum_partial)
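
/*
 * Worked example of the end-around-carry arithmetic used above (values
 * arbitrary, chosen to overflow; 32-bit registers assumed):
 *
 *	  0xffff0001
 *	+ 0x00020003
 *	= 0x00010004, carry out = 1
 *
 * adde feeds that carry into the following addition, and the final
 * addze folds the last carry back in: 0x00010004 + 1 = 0x00010005,
 * the one's-complement sum of the two words.
 */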

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
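/*
 * Illustrative call from C (a sketch, not taken from kernel sources;
 * the variable names are mine):
 *
 *	int src_err = 0, dst_err = 0;
 *	__wsum sum;
 *
 *	sum = csum_partial_copy_generic(src, dst, len, initial_sum,
 *	                                &src_err, &dst_err);
 *	if (src_err || dst_err)
 *		return -EFAULT;		// fault reported through a pointer
 */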
#define CSUM_COPY_16_BYTES_WITHEX(n)    \
8 ## n ## 0:                    \
        lwz     r7,4(r4);       \
8 ## n ## 1:                    \
        lwz     r8,8(r4);       \
8 ## n ## 2:                    \
        lwz     r9,12(r4);      \
8 ## n ## 3:                    \
        lwzu    r10,16(r4);     \
8 ## n ## 4:                    \
        stw     r7,4(r6);       \
        adde    r12,r12,r7;     \
8 ## n ## 5:                    \
        stw     r8,8(r6);       \
        adde    r12,r12,r8;     \
8 ## n ## 6:                    \
        stw     r9,12(r6);      \
        adde    r12,r12,r9;     \
8 ## n ## 7:                    \
        stwu    r10,16(r6);     \
        adde    r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)            \
        EX_TABLE(8 ## n ## 0b, src_error);      \
        EX_TABLE(8 ## n ## 1b, src_error);      \
        EX_TABLE(8 ## n ## 2b, src_error);      \
        EX_TABLE(8 ## n ## 3b, src_error);      \
        EX_TABLE(8 ## n ## 4b, dst_error);      \
        EX_TABLE(8 ## n ## 5b, dst_error);      \
        EX_TABLE(8 ## n ## 6b, dst_error);      \
        EX_TABLE(8 ## n ## 7b, dst_error);
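
/*
 * Each expansion of CSUM_COPY_16_BYTES_WITHEX(n) defines numeric labels
 * 8n0..8n7 (n = 0 gives 800..807): labels ending in 0-3 mark the four
 * source loads, 4-7 the four destination stores.  The matching
 * CSUM_COPY_16_BYTES_EXCODE(n) emits one exception-table entry per
 * label, routing load faults to src_error and store faults to dst_error.
 */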

        .text
        .stabs  "arch/powerpc/lib/",N_SO,0,0,0f
        .stabs  "checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
        /* args: r3 = src, r4 = dst, r5 = len, r6 = sum,
         *       r7 = src_err, r8 = dst_err */
        stwu    r1,-16(r1)      /* create a small stack frame */
        stw     r7,12(r1)       /* save the error pointers for the */
        stw     r8,8(r1)        /* fault handlers at the bottom */

        addic   r12,r6,0        /* sum -> r12, and clear the carry bit */
        addi    r6,r4,-4        /* adjust dst for 4(rx) addressing */
        neg     r0,r4
        addi    r4,r3,-4        /* adjust src likewise */
        andi.   r0,r0,CACHELINE_MASK    /* # bytes to start of cache line */
        crset   4*cr7+eq        /* assume an even destination for now */
        beq     58f

        cmplw   0,r5,r0                 /* is this more than total to do? */
        blt     63f                     /* if not much to do */
        rlwinm  r7,r6,3,0x8     /* r7 = 8 if dst is odd, else 0 */
        rlwnm   r12,r12,r7,0,31 /* odd destination address: rotate one byte */
        cmplwi  cr7,r7,0        /* is destination address even ? */
        andi.   r8,r0,3                 /* get it word-aligned first */
        mtctr   r8
        beq+    61f
        li      r3,0
70:     lbz     r9,4(r4)                /* do some bytes */
        addi    r4,r4,1
        slwi    r3,r3,8
        rlwimi  r3,r9,0,24,31
71:     stb     r9,4(r6)
        addi    r6,r6,1
        bdnz    70b
        adde    r12,r12,r3
61:     subf    r5,r0,r5
        srwi.   r0,r0,2
        mtctr   r0
        beq     58f
72:     lwzu    r9,4(r4)                /* do some words */
        adde    r12,r12,r9
73:     stwu    r9,4(r6)
        bdnz    72b

58:     srwi.   r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
        clrlwi  r5,r5,32-LG_CACHELINE_BYTES /* bytes left after those lines */
        li      r11,4           /* offset used by the dcbz below */
        beq     63f

        /* Here we decide how far ahead to prefetch the source */
        li      r3,4
        cmpwi   r0,1
        li      r7,0
        ble     114f
        li      r7,1
#if MAX_COPY_PREFETCH > 1
        /* Heuristically, for large transfers we prefetch
           MAX_COPY_PREFETCH cachelines ahead.  For small transfers
           we prefetch 1 cacheline ahead. */
        cmpwi   r0,MAX_COPY_PREFETCH
        ble     112f
        li      r7,MAX_COPY_PREFETCH
112:    mtctr   r7
111:    dcbt    r3,r4
        addi    r3,r3,CACHELINE_BYTES
        bdnz    111b
#else
        dcbt    r3,r4
        addi    r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
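
/*
 * The prefetch-distance choice above, as a C sketch (illustration only;
 * "lines" and "ahead" are names invented here):
 *
 *	int ahead;			// cachelines to run ahead of the copy
 *
 *	if (lines <= 1)
 *		ahead = 0;		// too short to be worth prefetching
 *	else if (lines > MAX_COPY_PREFETCH)
 *		ahead = MAX_COPY_PREFETCH;
 *	else
 *		ahead = 1;
 *
 * The code above then touches "ahead" lines before the copy starts, and
 * the main loop at 53: keeps prefetching that far ahead of the copy.
 */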

114:    subf    r8,r7,r0        /* copy all but the last "ahead" lines first */
        mr      r0,r7           /* ... and remember how many remain */
        mtctr   r8

53:     dcbt    r3,r4           /* prefetch a source line ahead */
54:     dcbz    r11,r6          /* pre-zero the dst line: no need to fetch it */
/* the main body of the cacheline loop */
        CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
        CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
        CSUM_COPY_16_BYTES_WITHEX(2)
        CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
        CSUM_COPY_16_BYTES_WITHEX(4)
        CSUM_COPY_16_BYTES_WITHEX(5)
        CSUM_COPY_16_BYTES_WITHEX(6)
        CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
        bdnz    53b
        cmpwi   r0,0            /* prefetched lines still to be copied? */
        li      r3,4
        li      r7,0
        bne     114b            /* if so, go round once more for them */

63:     srwi.   r0,r5,2         /* # leftover words */
        mtctr   r0
        beq     64f
30:     lwzu    r0,4(r4)
        adde    r12,r12,r0
31:     stwu    r0,4(r6)
        bdnz    30b

64:     andi.   r0,r5,2         /* trailing halfword? */
        beq+    65f
40:     lhz     r0,4(r4)
        addi    r4,r4,2
41:     sth     r0,4(r6)
        adde    r12,r12,r0
        addi    r6,r6,2
65:     andi.   r0,r5,1         /* trailing byte? */
        beq+    66f
50:     lbz     r0,4(r4)
51:     stb     r0,4(r6)
        slwi    r0,r0,8         /* Upper byte of word */
        adde    r12,r12,r0
66:     addze   r3,r12          /* add in final carry */
        addi    r1,r1,16        /* pop the stack frame */
        beqlr+  cr7             /* destination was even: done */
        rlwinm  r3,r3,8,0,31    /* odd destination address: rotate one byte */
        blr
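
/*
 * Note on the odd-destination rotates (entry and exit above): modulo
 * 0xffff -- the modulus of the final 16-bit fold -- rotating a sum left
 * one byte multiplies it by 2^8, and 2^8 * 2^8 = 2^16 == 1 (mod 0xffff),
 * so the one-byte rotate is its own inverse.  Rotating the incoming sum
 * on entry and the result on exit therefore cancels the byte-lane swap
 * caused by copying to an odd address.  (This is my reading of the
 * trick, not a comment from the original source.)
 */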

/* read fault */
src_error:
        lwz     r7,12(r1)       /* reload the saved src_err pointer */
        addi    r1,r1,16        /* pop the stack frame */
        cmpwi   cr0,r7,0
        beqlr                   /* no pointer supplied: just return */
        li      r0,-EFAULT
        stw     r0,0(r7)        /* report the fault to the caller */
        blr
/* write fault */
dst_error:
        lwz     r8,8(r1)        /* reload the saved dst_err pointer */
        addi    r1,r1,16
        cmpwi   cr0,r8,0
        beqlr
        li      r0,-EFAULT
        stw     r0,0(r8)
        blr

        EX_TABLE(70b, src_error);
        EX_TABLE(71b, dst_error);
        EX_TABLE(72b, src_error);
        EX_TABLE(73b, dst_error);
        EX_TABLE(54b, dst_error);

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 */
        CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
        CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
        CSUM_COPY_16_BYTES_EXCODE(2)
        CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
        CSUM_COPY_16_BYTES_EXCODE(4)
        CSUM_COPY_16_BYTES_EXCODE(5)
        CSUM_COPY_16_BYTES_EXCODE(6)
        CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

        EX_TABLE(30b, src_error);
        EX_TABLE(31b, dst_error);
        EX_TABLE(40b, src_error);
        EX_TABLE(41b, dst_error);
        EX_TABLE(50b, src_error);
        EX_TABLE(51b, dst_error);

EXPORT_SYMBOL(csum_partial_copy_generic)