linux/arch/sparc/lib/NGmemcpy.S
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE    %g7
#define RESTORE_ASI(TMP)        \
        ldub    [%g6 + TI_CURRENT_DS], TMP;  \
        wr      TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE    %g5
#define RESTORE_ASI(TMP)        \
        wr      %g0, ASI_PNF, %asi
#endif

#ifdef __sparc_v9__
#define SAVE_AMOUNT     128
#else
#define SAVE_AMOUNT     64
#endif

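/* STORE_ASI selects the ASI used by STORE_INIT below.  On Niagara,
 * stores through ASI_BLK_INIT_QUAD_LDD_P allocate the L2 cache line
 * directly instead of first fetching its old contents, which is why
 * the block loops must always write out entire 64-byte lines.
 */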
#ifndef STORE_ASI
#define STORE_ASI       ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x)        x
#endif

#ifndef EX_ST
#define EX_ST(x)        x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)    type [addr], dest
#else
#define LOAD(type,addr,dest)    type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1) \
        ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)    type src, [addr]
#endif

#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)    stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)    stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME       NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

        .register       %g2,#scratch
        .register       %g3,#scratch

        .text
        .align          64

        .globl  FUNC_NAME
        .type   FUNC_NAME,#function
FUNC_NAME:      /* %i0=dst, %i1=src, %i2=len */
        PREAMBLE
        save            %sp, -SAVE_AMOUNT, %sp
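        /* Trap (software trap 5) if the length has any bits set at
         * or above bit 31, i.e. if it is not a sane 31-bit value.
         */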
        srlx            %i2, 31, %g2
        cmp             %g2, 0
        tne             %xcc, 5
        mov             %i0, %o0
        cmp             %i2, 0
        be,pn           %XCC, 85f
         or             %o0, %i1, %i3
        cmp             %i2, 16
        blu,a,pn        %XCC, 80f
         or             %i3, %i2, %i3

        /* 2 blocks (128 bytes) is the minimum we can do the block
         * copy with.  We need to ensure that we'll iterate at least
         * once in the block copy loop.  At worst we'll need to align
         * the destination to a 64-byte boundary, which can consume
         * up to (64 - 1) bytes from the length before we perform the
         * block copy loop.
         */
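        /* Illustrative C sketch (not assembled), with dst/len standing
         * in for %o0/%i2: aligning dst consumes at most 63 bytes, so
         * with len >= 128 the 64-byte block loop still gets at least
         * one full iteration:
         *
         *      size_t align = (64 - ((uintptr_t)dst & 63)) & 63;
         *      if (len >= 2 * 64)
         *              assert(len - align >= 64);
         */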
        cmp             %i2, (2 * 64)
        blu,pt          %XCC, 70f
         andcc          %i3, 0x7, %g0

        /* %o0: dst
         * %i1: src
         * %i2: len  (known to be >= 128)
         *
         * The block copy loops will use %i4/%i5,%g2/%g3 as
         * temporaries while copying the data.
         */

        LOAD(prefetch, %i1, #one_read)
        wr              %g0, STORE_ASI, %asi

        /* Align destination on 64-byte boundary.  */
        andcc           %o0, (64 - 1), %i4
        be,pt           %XCC, 2f
         sub            %i4, 64, %i4
        sub             %g0, %i4, %i4   ! bytes to align dst
        sub             %i2, %i4, %i2
1:      subcc           %i4, 1, %i4
        EX_LD(LOAD(ldub, %i1, %g1))
        EX_ST(STORE(stb, %g1, %o0))
        add             %i1, 1, %i1
        bne,pt          %XCC, 1b
         add            %o0, 1, %o0

        /* If the source is on a 16-byte boundary we can do
         * the direct block copy loop.  If it is 8-byte aligned
         * we can do the 16-byte loads offset by -8 bytes and the
         * init stores offset by one register.
         *
         * If the source is not even 8-byte aligned, we need to do
         * shifting and masking (basically integer faligndata).
         *
         * The careful bit with init stores is that if we store to
         * any part of the cache line, we have to store the whole
         * cache line, else we can end up with corrupt L2 cache line
         * contents.  Since the loop works on 64 bytes of 64-byte
         * aligned store data at a time, this is easy to ensure.
         */
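        /* Illustrative sketch (not assembled) of the dispatch that
         * follows, where off = src & 15 once dst is 64-byte aligned:
         *
         *      if (off == 0)           // 16-byte aligned: loop at 50
         *      else if (off == 8)      // 8-byte aligned:  loop at 10
         *      else if (off < 8)       // shift-and-mask:  loop at 8
         *      else                    // shift-and-mask:  loop at 9
         */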
2:
        andcc           %i1, (16 - 1), %i4
        andn            %i2, (64 - 1), %g1      ! block copy loop iterator
        be,pt           %XCC, 50f
         sub            %i2, %g1, %i2           ! final sub-block copy bytes

        cmp             %i4, 8
        be,pt           %XCC, 10f
         sub            %i1, %i4, %i1

        /* Neither 8-byte nor 16-byte aligned, shift and mask.  */
        and             %i4, 0x7, GLOBAL_SPARE
        sll             GLOBAL_SPARE, 3, GLOBAL_SPARE
        mov             64, %i5
        EX_LD(LOAD_TWIN(%i1, %g2, %g3))
        sub             %i5, GLOBAL_SPARE, %i5
        mov             16, %o4
        mov             32, %o5
        mov             48, %o7
        mov             64, %i3

        bg,pn           %XCC, 9f
         nop

#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
        sllx            WORD1, POST_SHIFT, WORD1; \
        srlx            WORD2, PRE_SHIFT, TMP; \
        sllx            WORD2, POST_SHIFT, WORD2; \
        or              WORD1, TMP, WORD1; \
        srlx            WORD3, PRE_SHIFT, TMP; \
        or              WORD2, TMP, WORD2;

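        /* Illustrative C sketch (not assembled; w1..w3 stand for the
         * WORD1..WORD3 operands): MIX_THREE_WORDS is an integer
         * faligndata.  With post = (src & 7) * 8 in GLOBAL_SPARE and
         * pre = 64 - post in %i5 (src & 7 is nonzero on this path, so
         * neither shift count is 0 or 64), a pair of destination
         * doublewords is assembled from three adjacent big-endian
         * source doublewords:
         *
         *      w1 = (w1 << post) | (w2 >> pre);
         *      w2 = (w2 << post) | (w3 >> pre);
         */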
8:      EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
        MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
        LOAD(prefetch, %i1 + %i3, #one_read)

        EX_ST(STORE_INIT(%g2, %o0 + 0x00))
        EX_ST(STORE_INIT(%g3, %o0 + 0x08))

        EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
        MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%o2, %o0 + 0x10))
        EX_ST(STORE_INIT(%o3, %o0 + 0x18))

        EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
        MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%g2, %o0 + 0x20))
        EX_ST(STORE_INIT(%g3, %o0 + 0x28))

        EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
        add             %i1, 64, %i1
        MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%o2, %o0 + 0x30))
        EX_ST(STORE_INIT(%o3, %o0 + 0x38))

        subcc           %g1, 64, %g1
        bne,pt          %XCC, 8b
         add            %o0, 64, %o0

        ba,pt           %XCC, 60f
         add            %i1, %i4, %i1

9:      EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
        MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
        LOAD(prefetch, %i1 + %i3, #one_read)

        EX_ST(STORE_INIT(%g3, %o0 + 0x00))
        EX_ST(STORE_INIT(%o2, %o0 + 0x08))

        EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
        MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%o3, %o0 + 0x10))
        EX_ST(STORE_INIT(%g2, %o0 + 0x18))

        EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
        MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%g3, %o0 + 0x20))
        EX_ST(STORE_INIT(%o2, %o0 + 0x28))

        EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
        add             %i1, 64, %i1
        MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%o3, %o0 + 0x30))
        EX_ST(STORE_INIT(%g2, %o0 + 0x38))

        subcc           %g1, 64, %g1
        bne,pt          %XCC, 9b
         add            %o0, 64, %o0

        ba,pt           %XCC, 60f
         add            %i1, %i4, %i1

10:     /* Destination is 64-byte aligned, source was only 8-byte
         * aligned but 8 has already been subtracted from it and we
         * perform one twin load ahead, then add 8 back into source
         * when we finish the loop.
         */
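        /* Illustrative sketch (not assembled; load16/store are
         * pseudo-helpers, not real functions): with src now pointing
         * 8 bytes before the data, each iteration pairs the trailing
         * doubleword of the previous twin load with the leading
         * doubleword of the next:
         *
         *      load16(src, a0, a1);            // a1 = first wanted dword
         *      loop: load16(src + 16, b0, b1);
         *            store(dst + 0, a1); store(dst + 8, b0); ...
         */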
        EX_LD(LOAD_TWIN(%i1, %o4, %o5))
        mov     16, %o7
        mov     32, %g2
        mov     48, %g3
        mov     64, %o1
1:      EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
        LOAD(prefetch, %i1 + %o1, #one_read)
        EX_ST(STORE_INIT(%o5, %o0 + 0x00))      ! initializes cache line
        EX_ST(STORE_INIT(%o2, %o0 + 0x08))
        EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
        EX_ST(STORE_INIT(%o3, %o0 + 0x10))
        EX_ST(STORE_INIT(%o4, %o0 + 0x18))
        EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
        EX_ST(STORE_INIT(%o5, %o0 + 0x20))
        EX_ST(STORE_INIT(%o2, %o0 + 0x28))
        EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5))
        add             %i1, 64, %i1
        EX_ST(STORE_INIT(%o3, %o0 + 0x30))
        EX_ST(STORE_INIT(%o4, %o0 + 0x38))
        subcc           %g1, 64, %g1
        bne,pt          %XCC, 1b
         add            %o0, 64, %o0

        ba,pt           %XCC, 60f
         add            %i1, 0x8, %i1

50:     /* Destination is 64-byte aligned, and source is 16-byte
         * aligned.
         */
        mov     16, %o7
        mov     32, %g2
        mov     48, %g3
        mov     64, %o1
1:      EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5))
        EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
        LOAD(prefetch, %i1 + %o1, #one_read)
        EX_ST(STORE_INIT(%o4, %o0 + 0x00))      ! initializes cache line
        EX_ST(STORE_INIT(%o5, %o0 + 0x08))
        EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
        EX_ST(STORE_INIT(%o2, %o0 + 0x10))
        EX_ST(STORE_INIT(%o3, %o0 + 0x18))
        EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
        add     %i1, 64, %i1
        EX_ST(STORE_INIT(%o4, %o0 + 0x20))
        EX_ST(STORE_INIT(%o5, %o0 + 0x28))
        EX_ST(STORE_INIT(%o2, %o0 + 0x30))
        EX_ST(STORE_INIT(%o3, %o0 + 0x38))
        subcc   %g1, 64, %g1
        bne,pt  %XCC, 1b
         add    %o0, 64, %o0
        /* fall through */

60:
        membar          #Sync

        /* %i2 contains the number of final bytes that still need to
         * be copied over.  If anything is left, we copy it one byte
         * at a time.
         */
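        /* Illustrative C sketch (not assembled; dst/src/len stand for
         * %o0/%i1/%i2): the tail loops keep a single moving pointer.
         * With %i3 = dst - src, every store to src + %i3 lands exactly
         * on dst:
         *
         *      ptrdiff_t d = dst - src;
         *      while (len--) { *(src + d) = *src; src++; }
         */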
        RESTORE_ASI(%i3)
        brz,pt          %i2, 85f
         sub            %o0, %i1, %i3
        ba,a,pt         %XCC, 90f

        .align          64
70: /* 16 <= len < 128 */
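        /* The condition codes are still set by the
         * "andcc %i3, 0x7, %g0" in the delay slot of the branch that
         * brought us here; %i3 is (dst | src), so we only stay on the
         * fast path when both are 8-byte aligned.
         */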
        bne,pn          %XCC, 75f
         sub            %o0, %i1, %i3

72:
        andn            %i2, 0xf, %i4
        and             %i2, 0xf, %i2
1:      subcc           %i4, 0x10, %i4
        EX_LD(LOAD(ldx, %i1, %o4))
        add             %i1, 0x08, %i1
        EX_LD(LOAD(ldx, %i1, %g1))
        sub             %i1, 0x08, %i1
        EX_ST(STORE(stx, %o4, %i1 + %i3))
        add             %i1, 0x8, %i1
        EX_ST(STORE(stx, %g1, %i1 + %i3))
        bgu,pt          %XCC, 1b
         add            %i1, 0x8, %i1
73:     andcc           %i2, 0x8, %g0
        be,pt           %XCC, 1f
         nop
        sub             %i2, 0x8, %i2
        EX_LD(LOAD(ldx, %i1, %o4))
        EX_ST(STORE(stx, %o4, %i1 + %i3))
        add             %i1, 0x8, %i1
1:      andcc           %i2, 0x4, %g0
        be,pt           %XCC, 1f
         nop
        sub             %i2, 0x4, %i2
        EX_LD(LOAD(lduw, %i1, %i5))
        EX_ST(STORE(stw, %i5, %i1 + %i3))
        add             %i1, 0x4, %i1
1:      cmp             %i2, 0
        be,pt           %XCC, 85f
         nop
        ba,pt           %xcc, 90f
         nop

75:
        andcc           %o0, 0x7, %g1
        sub             %g1, 0x8, %g1
        be,pn           %icc, 2f
         sub            %g0, %g1, %g1
        sub             %i2, %g1, %i2

1:      subcc           %g1, 1, %g1
        EX_LD(LOAD(ldub, %i1, %i5))
        EX_ST(STORE(stb, %i5, %i1 + %i3))
        bgu,pt          %icc, 1b
         add            %i1, 1, %i1

2:      add             %i1, %i3, %o0
        andcc           %i1, 0x7, %g1
        bne,pt          %icc, 8f
         sll            %g1, 3, %g1

        cmp             %i2, 16
        bgeu,pt         %icc, 72b
         nop
        ba,a,pt         %xcc, 73b

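        /* Misaligned tail: the same shift-and-mask merge as
         * MIX_THREE_WORDS above, one doubleword at a time.  %g1 holds
         * (src & 7) * 8 and %i3 is set to 64 - %g1 below.
         */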
8:      mov             64, %i3
        andn            %i1, 0x7, %i1
        EX_LD(LOAD(ldx, %i1, %g2))
        sub             %i3, %g1, %i3
        andn            %i2, 0x7, %i4
        sllx            %g2, %g1, %g2
1:      add             %i1, 0x8, %i1
        EX_LD(LOAD(ldx, %i1, %g3))
        subcc           %i4, 0x8, %i4
        srlx            %g3, %i3, %i5
        or              %i5, %g2, %i5
        EX_ST(STORE(stx, %i5, %o0))
        add             %o0, 0x8, %o0
        bgu,pt          %icc, 1b
         sllx           %g3, %g1, %g2

        srl             %g1, 3, %g1
        andcc           %i2, 0x7, %i2
        be,pn           %icc, 85f
         add            %i1, %g1, %i1
        ba,pt           %xcc, 90f
         sub            %o0, %i1, %i3

        .align          64
80: /* 0 < len < 16 */
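        /* %i3 is (dst | src | len) here; if any of the three has
         * either of its low two bits set we cannot use the word loop
         * and must byte-copy at 90 instead.
         */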
        andcc           %i3, 0x3, %g0
        bne,pn          %XCC, 90f
         sub            %o0, %i1, %i3

1:
        subcc           %i2, 4, %i2
        EX_LD(LOAD(lduw, %i1, %g1))
        EX_ST(STORE(stw, %g1, %i1 + %i3))
        bgu,pt          %XCC, 1b
         add            %i1, 4, %i1

85:     ret
         restore        EX_RETVAL(%i0), %g0, %o0

        .align          32
90:
        subcc           %i2, 1, %i2
        EX_LD(LOAD(ldub, %i1, %g1))
        EX_ST(STORE(stb, %g1, %i1 + %i3))
        bgu,pt          %XCC, 90b
         add            %i1, 1, %i1
        ret
         restore        EX_RETVAL(%i0), %g0, %o0

        .size           FUNC_NAME, .-FUNC_NAME