linux/arch/powerpc/lib/copy_32.S
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

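/*
 * Copy 16 bytes from 4(r4) to 4(r6) using r7-r10 as scratch.  The callers
 * keep r4/r6 biased 4 bytes below the current position, so the final
 * lwzu/stwu advance both pointers by 16 for the next iteration.
 */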
#define COPY_16_BYTES           \
        lwz     r7,4(r4);       \
        lwz     r8,8(r4);       \
        lwz     r9,12(r4);      \
        lwzu    r10,16(r4);     \
        stw     r7,4(r6);       \
        stw     r8,8(r6);       \
        stw     r9,12(r6);      \
        stwu    r10,16(r6)

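/*
 * Same copy as COPY_16_BYTES, but every load and store carries a local
 * label 8<n>0 .. 8<n>7 so that COPY_16_BYTES_EXCODE(n) can attach an
 * exception-table fixup to each individual access.
 */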
#define COPY_16_BYTES_WITHEX(n) \
8 ## n ## 0:                    \
        lwz     r7,4(r4);       \
8 ## n ## 1:                    \
        lwz     r8,8(r4);       \
8 ## n ## 2:                    \
        lwz     r9,12(r4);      \
8 ## n ## 3:                    \
        lwzu    r10,16(r4);     \
8 ## n ## 4:                    \
        stw     r7,4(r6);       \
8 ## n ## 5:                    \
        stw     r8,8(r6);       \
8 ## n ## 6:                    \
        stw     r9,12(r6);      \
8 ## n ## 7:                    \
        stwu    r10,16(r6)

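/*
 * Out-of-line fixups for COPY_16_BYTES_WITHEX(n): a fault in one of the
 * loads lands at 9<n>0, a fault in one of the stores at 9<n>1.  Both
 * subtract the 16 * n bytes this block would have accounted for from r5
 * and branch to the common read (104) or write (105) fault handler.
 */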
#define COPY_16_BYTES_EXCODE(n)                 \
9 ## n ## 0:                                    \
        addi    r5,r5,-(16 * n);                \
        b       104f;                           \
9 ## n ## 1:                                    \
        addi    r5,r5,-(16 * n);                \
        b       105f;                           \
        EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);    \
        EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);    \
        EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);    \
        EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);    \
        EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);    \
        EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);    \
        EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);    \
        EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

        .text
        .stabs  "arch/powerpc/lib/",N_SO,0,0,0f
        .stabs  "copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

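/*
 * memset16(dst, v, count): fill "count" 16-bit cells starting at "dst"
 * with the halfword "v"; the original "dst" is returned unchanged in r3.
 * Rough C equivalent of the semantics (illustration only, not built):
 *
 *      void *memset16(u16 *dst, u16 v, size_t count)
 *      {
 *              u16 *p = dst;
 *
 *              while (count--)
 *                      *p++ = v;
 *              return dst;
 *      }
 *
 * The code below goes faster by splatting the halfword into a full word
 * (rlwimi) and storing count/2 words with stwu, then writing the odd
 * trailing halfword, if any, with sth.
 */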
_GLOBAL(memset16)
        rlwinm. r0, r5, 31, 1, 31
        addi    r6, r3, -4
        beq-    2f
        rlwimi  r4, r4, 16, 0, 15
        mtctr   r0
1:      stwu    r4, 4(r6)
        bdnz    1b
2:      andi.   r0, r5, 1
        beqlr
        sth     r4, 4(r6)
        blr
EXPORT_SYMBOL(memset16)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore skip the optimised block that uses dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memset)
        cmplwi  0,r5,4
        blt     7f

        rlwimi  r4,r4,8,16,23
        rlwimi  r4,r4,16,0,15
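        /* r4 now holds the fill byte replicated into all four byte lanes */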

        stw     r4,0(r3)
        beqlr
        andi.   r0,r3,3
        add     r5,r0,r5
        subf    r6,r0,r3
        cmplwi  0,r4,0
        /*
         * Skip optimised block until cache is enabled. Will be replaced
         * by 'bne' during boot to use normal procedure if r4 is not zero
         */
_GLOBAL(memset_nocache_branch)
        b       2f

        clrlwi  r7,r6,32-LG_CACHELINE_BYTES
        add     r8,r7,r5
        srwi    r9,r8,LG_CACHELINE_BYTES
        addic.  r9,r9,-1        /* total number of complete cachelines */
        ble     2f
        xori    r0,r7,CACHELINE_MASK & ~3
        srwi.   r0,r0,2
        beq     3f
        mtctr   r0
4:      stwu    r4,4(r6)
        bdnz    4b
3:      mtctr   r9
        li      r7,4
10:     dcbz    r7,r6
        addi    r6,r6,CACHELINE_BYTES
        bdnz    10b
        clrlwi  r5,r8,32-LG_CACHELINE_BYTES
        addi    r5,r5,4

2:      srwi    r0,r5,2
        mtctr   r0
        bdz     6f
1:      stwu    r4,4(r6)
        bdnz    1b
6:      andi.   r5,r5,3
        beqlr
        mtctr   r5
        addi    r6,r6,3
8:      stbu    r4,1(r6)
        bdnz    8b
        blr

7:      cmpwi   0,r5,0
        beqlr
        mtctr   r5
        addi    r6,r3,-1
9:      stbu    r4,1(r6)
        bdnz    9b
        blr
EXPORT_SYMBOL(memset)

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memmove)
        cmplw   0,r3,r4
        bgt     backwards_memcpy
        /* fall through */

_GLOBAL(memcpy)
        b       generic_memcpy
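        /*
         * Overlap test: cr0.lt = (src < dst + len), cr1.lt = (dst < src + len).
         * Both being true means the two regions overlap, in which case the
         * dcbz-based fast path below cannot be used.
         */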
        add     r7,r3,r5                /* test if the src & dst overlap */
        add     r8,r4,r5
        cmplw   0,r4,r7
        cmplw   1,r3,r8
        crand   0,0,4                   /* cr0.lt &= cr1.lt */
        blt     generic_memcpy          /* if regions overlap */

        addi    r4,r4,-4
        addi    r6,r3,-4
        neg     r0,r3
        andi.   r0,r0,CACHELINE_MASK    /* # bytes to start of cache line */
        beq     58f

        cmplw   0,r5,r0                 /* is this more than total to do? */
        blt     63f                     /* if not much to do */
        andi.   r8,r0,3                 /* get it word-aligned first */
        subf    r5,r0,r5
        mtctr   r8
        beq+    61f
70:     lbz     r9,4(r4)                /* do some bytes */
        addi    r4,r4,1
        addi    r6,r6,1
        stb     r9,3(r6)
        bdnz    70b
61:     srwi.   r0,r0,2
        mtctr   r0
        beq     58f
72:     lwzu    r9,4(r4)                /* do some words */
        stwu    r9,4(r6)
        bdnz    72b

58:     srwi.   r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
        clrlwi  r5,r5,32-LG_CACHELINE_BYTES
        li      r11,4
        mtctr   r0
        beq     63f
53:
        dcbz    r11,r6
        COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
        COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
        COPY_16_BYTES
        COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
        COPY_16_BYTES
        COPY_16_BYTES
        COPY_16_BYTES
        COPY_16_BYTES
#endif
#endif
#endif
        bdnz    53b

63:     srwi.   r0,r5,2
        mtctr   r0
        beq     64f
30:     lwzu    r0,4(r4)
        stwu    r0,4(r6)
        bdnz    30b

64:     andi.   r0,r5,3
        mtctr   r0
        beq+    65f
        addi    r4,r4,3
        addi    r6,r6,3
40:     lbzu    r0,1(r4)
        stbu    r0,1(r6)
        bdnz    40b
65:     blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

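/*
 * generic_memcpy: plain forward copy that never uses dcbz on the
 * destination.  It is used before the caches are enabled and whenever the
 * source and destination overlap.  The strategy: align the destination to
 * a word boundary with byte copies, move two words per loop iteration,
 * then mop up the remaining word and trailing bytes.
 */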
generic_memcpy:
        srwi.   r7,r5,3
        addi    r6,r3,-4
        addi    r4,r4,-4
        beq     2f                      /* if less than 8 bytes to do */
        andi.   r0,r6,3                 /* get dest word aligned */
        mtctr   r7
        bne     5f
1:      lwz     r7,4(r4)
        lwzu    r8,8(r4)
        stw     r7,4(r6)
        stwu    r8,8(r6)
        bdnz    1b
        andi.   r5,r5,7
2:      cmplwi  0,r5,4
        blt     3f
        lwzu    r0,4(r4)
        addi    r5,r5,-4
        stwu    r0,4(r6)
3:      cmpwi   0,r5,0
        beqlr
        mtctr   r5
        addi    r4,r4,3
        addi    r6,r6,3
4:      lbzu    r0,1(r4)
        stbu    r0,1(r6)
        bdnz    4b
        blr
5:      subfic  r0,r0,4
        mtctr   r0
6:      lbz     r7,4(r4)
        addi    r4,r4,1
        stb     r7,4(r6)
        addi    r6,r6,1
        bdnz    6b
        subf    r5,r0,r5
        rlwinm. r7,r5,32-3,3,31
        beq     2b
        mtctr   r7
        b       1b

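/*
 * backwards_memcpy: same structure as generic_memcpy, but copying from the
 * end of the buffers towards the start, so that overlapping moves with
 * dst > src (the memmove case handled above) remain correct.
 */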
_GLOBAL(backwards_memcpy)
        rlwinm. r7,r5,32-3,3,31         /* r7 = r5 >> 3 */
        add     r6,r3,r5
        add     r4,r4,r5
        beq     2f
        andi.   r0,r6,3
        mtctr   r7
        bne     5f
1:      lwz     r7,-4(r4)
        lwzu    r8,-8(r4)
        stw     r7,-4(r6)
        stwu    r8,-8(r6)
        bdnz    1b
        andi.   r5,r5,7
2:      cmplwi  0,r5,4
        blt     3f
        lwzu    r0,-4(r4)
        subi    r5,r5,4
        stwu    r0,-4(r6)
3:      cmpwi   0,r5,0
        beqlr
        mtctr   r5
4:      lbzu    r0,-1(r4)
        stbu    r0,-1(r6)
        bdnz    4b
        blr
5:      mtctr   r0
6:      lbzu    r7,-1(r4)
        stbu    r7,-1(r6)
        bdnz    6b
        subf    r5,r0,r5
        rlwinm. r7,r5,32-3,3,31
        beq     2b
        mtctr   r7
        b       1b

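/*
 * __copy_tofrom_user(to, from, size): copy "size" bytes where the source,
 * the destination, or both may be user-space addresses.  Every access that
 * may fault has an exception-table entry; on a fault the routine returns
 * the number of bytes that were NOT copied, and 0 on complete success.
 * A typical caller therefore looks roughly like this (illustrative sketch
 * only, not code from this file):
 *
 *      if (__copy_tofrom_user(to, from, size) != 0)
 *              return -EFAULT;         // some bytes were not copied
 */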
_GLOBAL(__copy_tofrom_user)
        addi    r4,r4,-4
        addi    r6,r3,-4
        neg     r0,r3
        andi.   r0,r0,CACHELINE_MASK    /* # bytes to start of cache line */
        beq     58f

        cmplw   0,r5,r0                 /* is this more than total to do? */
        blt     63f                     /* if not much to do */
        andi.   r8,r0,3                 /* get it word-aligned first */
        mtctr   r8
        beq+    61f
70:     lbz     r9,4(r4)                /* do some bytes */
71:     stb     r9,4(r6)
        addi    r4,r4,1
        addi    r6,r6,1
        bdnz    70b
61:     subf    r5,r0,r5
        srwi.   r0,r0,2
        mtctr   r0
        beq     58f
72:     lwzu    r9,4(r4)                /* do some words */
73:     stwu    r9,4(r6)
        bdnz    72b

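/*
 * Fixups for the alignment loops above: a faulting load (70b, 72b) is a
 * "read" fault and goes to 100/102, a faulting store (71b, 73b) is a
 * "write" fault and goes to 101/103.
 */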
        EX_TABLE(70b,100f)
        EX_TABLE(71b,101f)
        EX_TABLE(72b,102f)
        EX_TABLE(73b,103f)

58:     srwi.   r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
        clrlwi  r5,r5,32-LG_CACHELINE_BYTES
        li      r11,4
        beq     63f

        /* Here we decide how far ahead to prefetch the source */
        li      r3,4
        cmpwi   r0,1
        li      r7,0
        ble     114f
        li      r7,1
#if MAX_COPY_PREFETCH > 1
        /* Heuristically, for large transfers we prefetch
           MAX_COPY_PREFETCH cachelines ahead.  For small transfers
           we prefetch 1 cacheline ahead. */
        cmpwi   r0,MAX_COPY_PREFETCH
        ble     112f
        li      r7,MAX_COPY_PREFETCH
112:    mtctr   r7
111:    dcbt    r3,r4
        addi    r3,r3,CACHELINE_BYTES
        bdnz    111b
#else
        dcbt    r3,r4
        addi    r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:    subf    r8,r7,r0
        mr      r0,r7
        mtctr   r8

53:     dcbt    r3,r4
54:     dcbz    r11,r6
        EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
        COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
        COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
        COPY_16_BYTES_WITHEX(2)
        COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
        COPY_16_BYTES_WITHEX(4)
        COPY_16_BYTES_WITHEX(5)
        COPY_16_BYTES_WITHEX(6)
        COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
        bdnz    53b
        cmpwi   r0,0
        li      r3,4
        li      r7,0
        bne     114b

63:     srwi.   r0,r5,2
        mtctr   r0
        beq     64f
30:     lwzu    r0,4(r4)
31:     stwu    r0,4(r6)
        bdnz    30b

64:     andi.   r0,r5,3
        mtctr   r0
        beq+    65f
40:     lbz     r0,4(r4)
41:     stb     r0,4(r6)
        addi    r4,r4,1
        addi    r6,r6,1
        bdnz    40b
65:     li      r3,0
        blr

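/*
 * Fault fixups.  Each handler records r9 = 0 for a fault on a load
 * ("read") or 1 for a fault on a store ("write"), and sets r3 to the log2
 * of the unit size the loop was counting (0 for bytes, 2 for words,
 * LG_CACHELINE_BYTES for whole cache lines), so that the common code at
 * 99/106 can compute the number of bytes not copied as r5 + (count << r3).
 */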
/* read fault, initial single-byte copy */
100:    li      r9,0
        b       90f
/* write fault, initial single-byte copy */
101:    li      r9,1
90:     subf    r5,r8,r5
        li      r3,0
        b       99f
/* read fault, initial word copy */
102:    li      r9,0
        b       91f
/* write fault, initial word copy */
103:    li      r9,1
91:     li      r3,2
        b       99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
        COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
        COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
        COPY_16_BYTES_EXCODE(2)
        COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
        COPY_16_BYTES_EXCODE(4)
        COPY_16_BYTES_EXCODE(5)
        COPY_16_BYTES_EXCODE(6)
        COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:    li      r9,0
        b       92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:    li      r9,1
92:     li      r3,LG_CACHELINE_BYTES
        mfctr   r8
        add     r0,r0,r8
        b       106f
/* read fault in final word loop */
108:    li      r9,0
        b       93f
/* write fault in final word loop */
109:    li      r9,1
93:     andi.   r5,r5,3
        li      r3,2
        b       99f
/* read fault in final byte loop */
110:    li      r9,0
        b       94f
/* write fault in final byte loop */
111:    li      r9,1
94:     li      r5,0
        li      r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:     mfctr   r0
106:    slw     r3,r0,r3
        add.    r3,r3,r5
        beq     120f                    /* shouldn't happen */
        cmpwi   0,r9,0
        bne     120f
/* for a read fault, first try to continue the copy one byte at a time */
        mtctr   r3
130:    lbz     r0,4(r4)
131:    stb     r0,4(r6)
        addi    r4,r4,1
        addi    r6,r6,1
        bdnz    130b
/*
 * if the byte-at-a-time retry faults as well, ctr holds the number of
 * bytes still not copied; return that count
 */
132:    mfctr   r3
120:    blr

        EX_TABLE(30b,108b)
        EX_TABLE(31b,109b)
        EX_TABLE(40b,110b)
        EX_TABLE(41b,111b)
        EX_TABLE(130b,132b)
        EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)