linux/arch/sparc/lib/NG2memcpy.S
<<
>>
Prefs
   1/* NG2memcpy.S: Niagara-2 optimized memcpy.
   2 *
   3 * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
   4 */
   5
   6#ifdef __KERNEL__
    /* Kernel build: the two headers below presumably supply VISEntryHalf,
     * VISExitHalf, FPRS_FEF and the ASI_* constants used later in this
     * file (their contents are not visible here -- confirm).  %g7 is the
     * spare global register.
     */
   7#include <asm/visasm.h>
   8#include <asm/asi.h>
   9#define GLOBAL_SPARE    %g7
  10#else
    /* Non-kernel build: define the ASI numbers, the FPRS enable bit and
     * the VIS entry/exit helpers locally.  %g5 is the spare global.
     */
  11#define ASI_PNF 0x82
  12#define ASI_BLK_P 0xf0
  13#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
  14#define FPRS_FEF  0x04
  15#ifdef MEMCPY_DEBUG
    /* VISEntryHalf saves %fprs in %o5 and then writes FPRS_FEF to %fprs.
     * The debug flavour additionally zeroes %g1, %g2, %g3, %g5 and resets
     * the condition codes with subcc -- presumably to mimic the register
     * clobbers documented for the in-kernel VISEntryHalf (TODO confirm).
     * VISExitHalf masks the saved %o5 with FPRS_FEF and writes the result
     * back to %fprs.
     */
  16#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
  17                     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
  18#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  19#else
    /* Same save/enable and mask/restore sequence, without the extra
     * register clearing of the debug flavour.
     */
  20#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
  21#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  22#endif
  23#define GLOBAL_SPARE    %g5
  24#endif
  25
  26#ifndef STORE_ASI
    /* Every macro from here down to XCC is guarded by #ifndef, so a file
     * that defines it before this point overrides the default.
     *
     * STORE_ASI is the ASI used by STORE_INIT: the block-init ASI by
     * default, or plain 0x80 when SIMULATE_NIAGARA_ON_NON_NIAGARA is set.
     */
  27#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
  28#define STORE_ASI       ASI_BLK_INIT_QUAD_LDD_P
  29#else
  30#define STORE_ASI       0x80            /* ASI_P */
  31#endif
  32#endif
  33
    /* EX_LD / EX_ST wrap every load / store that touches the caller's
     * buffers and EX_RETVAL wraps the returned value.  By default all
     * three expand to their argument unchanged; presumably other variants
     * of this routine override them to add fault handling (TODO confirm).
     */
  34#ifndef EX_LD
  35#define EX_LD(x)        x
  36#endif
  37
  38#ifndef EX_ST
  39#define EX_ST(x)        x
  40#endif
  41
  42#ifndef EX_RETVAL
  43#define EX_RETVAL(x)    x
  44#endif
  45
    /* LOAD(type, addr, dest): emit "type [addr], dest".  Also used with
     * type = prefetch, in which case dest is the prefetch function.
     */
  46#ifndef LOAD
  47#define LOAD(type,addr,dest)    type [addr], dest
  48#endif
  49
    /* LOAD_BLK(addr, dest): ldda through ASI_BLK_P into the register group
     * starting at dest.
     */
  50#ifndef LOAD_BLK
  51#define LOAD_BLK(addr,dest)     ldda [addr] ASI_BLK_P, dest
  52#endif
  53
    /* STORE(type, src, addr): emit "type src, [addr]".  With MEMCPY_DEBUG
     * the alternate-space form of the instruction (type##a) is emitted
     * instead, with ASI 0x80.
     */
  54#ifndef STORE
  55#ifndef MEMCPY_DEBUG
  56#define STORE(type,src,addr)    type src, [addr]
  57#else
  58#define STORE(type,src,addr)    type##a src, [addr] 0x80
  59#endif
  60#endif
  61
    /* STORE_BLK(src, addr): stda through ASI_BLK_P from the register group
     * starting at src.
     */
  62#ifndef STORE_BLK
  63#define STORE_BLK(src,addr)     stda src, [addr] ASI_BLK_P
  64#endif
  65
    /* STORE_INIT(src, addr): 8-byte stxa through STORE_ASI. */
  66#ifndef STORE_INIT
  67#define STORE_INIT(src,addr)    stxa src, [addr] STORE_ASI
  68#endif
  69
    /* Symbol name under which the routine is emitted. */
  70#ifndef FUNC_NAME
  71#define FUNC_NAME       NG2memcpy
  72#endif
  73
    /* Optional code placed right after the length sanity check at function
     * entry; empty by default.
     */
  74#ifndef PREAMBLE
  75#define PREAMBLE
  76#endif
  77
    /* Condition-code set named by the %XCC branches; xcc (64-bit) by
     * default.
     */
  78#ifndef XCC
  79#define XCC xcc
  80#endif
  81
    /* FREG_FROB(x0..x8): build eight realigned doublewords in %f0..%f14
     * from nine consecutive source doublewords held in FP registers
     * x0..x8; output k is faligndata(x[k], x[k+1]).  Arguments are given
     * without the leading '%'.  Every use in this file passes registers
     * that are not overwritten before they are read, so the macro can
     * work in place on %f0 upwards.
     */
  82#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
  83        faligndata      %x0, %x1, %f0; \
  84        faligndata      %x1, %x2, %f2; \
  85        faligndata      %x2, %x3, %f4; \
  86        faligndata      %x3, %x4, %f6; \
  87        faligndata      %x4, %x5, %f8; \
  88        faligndata      %x5, %x6, %f10; \
  89        faligndata      %x6, %x7, %f12; \
  90        faligndata      %x7, %x8, %f14;
  91
    /* FREG_MOVE_n(x0..x(n-1)): copy n doubleword FP registers, in order,
     * into %f0, %f2, ... with fsrc2.  The block loops use this to carry
     * the not yet stored tail of one source block over to the next
     * iteration.
     */
  92#define FREG_MOVE_1(x0) \
  93        fsrc2           %x0, %f0;
  94#define FREG_MOVE_2(x0, x1) \
  95        fsrc2           %x0, %f0; \
  96        fsrc2           %x1, %f2;
  97#define FREG_MOVE_3(x0, x1, x2) \
  98        fsrc2           %x0, %f0; \
  99        fsrc2           %x1, %f2; \
 100        fsrc2           %x2, %f4;
 101#define FREG_MOVE_4(x0, x1, x2, x3) \
 102        fsrc2           %x0, %f0; \
 103        fsrc2           %x1, %f2; \
 104        fsrc2           %x2, %f4; \
 105        fsrc2           %x3, %f6;
 106#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
 107        fsrc2           %x0, %f0; \
 108        fsrc2           %x1, %f2; \
 109        fsrc2           %x2, %f4; \
 110        fsrc2           %x3, %f6; \
 111        fsrc2           %x4, %f8;
 112#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
 113        fsrc2           %x0, %f0; \
 114        fsrc2           %x1, %f2; \
 115        fsrc2           %x2, %f4; \
 116        fsrc2           %x3, %f6; \
 117        fsrc2           %x4, %f8; \
 118        fsrc2           %x5, %f10;
 119#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
 120        fsrc2           %x0, %f0; \
 121        fsrc2           %x1, %f2; \
 122        fsrc2           %x2, %f4; \
 123        fsrc2           %x3, %f6; \
 124        fsrc2           %x4, %f8; \
 125        fsrc2           %x5, %f10; \
 126        fsrc2           %x6, %f12;
 127#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
 128        fsrc2           %x0, %f0; \
 129        fsrc2           %x1, %f2; \
 130        fsrc2           %x2, %f4; \
 131        fsrc2           %x3, %f6; \
 132        fsrc2           %x4, %f8; \
 133        fsrc2           %x5, %f10; \
 134        fsrc2           %x6, %f12; \
 135        fsrc2           %x7, %f14;
    /* FREG_LOAD_n(base, x0..x(n-1)): load n consecutive 8-byte doublewords
     * starting at address base into the named FP registers, each load
     * wrapped in EX_LD(LOAD(ldd, ...)).  Used to prime the block loops
     * with the source doublewords that precede the first full 64-byte
     * block.
     */
 136#define FREG_LOAD_1(base, x0) \
 137        EX_LD(LOAD(ldd, base + 0x00, %x0))
 138#define FREG_LOAD_2(base, x0, x1) \
 139        EX_LD(LOAD(ldd, base + 0x00, %x0)); \
 140        EX_LD(LOAD(ldd, base + 0x08, %x1));
 141#define FREG_LOAD_3(base, x0, x1, x2) \
 142        EX_LD(LOAD(ldd, base + 0x00, %x0)); \
 143        EX_LD(LOAD(ldd, base + 0x08, %x1)); \
 144        EX_LD(LOAD(ldd, base + 0x10, %x2));
 145#define FREG_LOAD_4(base, x0, x1, x2, x3) \
 146        EX_LD(LOAD(ldd, base + 0x00, %x0)); \
 147        EX_LD(LOAD(ldd, base + 0x08, %x1)); \
 148        EX_LD(LOAD(ldd, base + 0x10, %x2)); \
 149        EX_LD(LOAD(ldd, base + 0x18, %x3));
 150#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
 151        EX_LD(LOAD(ldd, base + 0x00, %x0)); \
 152        EX_LD(LOAD(ldd, base + 0x08, %x1)); \
 153        EX_LD(LOAD(ldd, base + 0x10, %x2)); \
 154        EX_LD(LOAD(ldd, base + 0x18, %x3)); \
 155        EX_LD(LOAD(ldd, base + 0x20, %x4));
 156#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
 157        EX_LD(LOAD(ldd, base + 0x00, %x0)); \
 158        EX_LD(LOAD(ldd, base + 0x08, %x1)); \
 159        EX_LD(LOAD(ldd, base + 0x10, %x2)); \
 160        EX_LD(LOAD(ldd, base + 0x18, %x3)); \
 161        EX_LD(LOAD(ldd, base + 0x20, %x4)); \
 162        EX_LD(LOAD(ldd, base + 0x28, %x5));
 163#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
 164        EX_LD(LOAD(ldd, base + 0x00, %x0)); \
 165        EX_LD(LOAD(ldd, base + 0x08, %x1)); \
 166        EX_LD(LOAD(ldd, base + 0x10, %x2)); \
 167        EX_LD(LOAD(ldd, base + 0x18, %x3)); \
 168        EX_LD(LOAD(ldd, base + 0x20, %x4)); \
 169        EX_LD(LOAD(ldd, base + 0x28, %x5)); \
 170        EX_LD(LOAD(ldd, base + 0x30, %x6));
 171
 172        .register       %g2,#scratch
 173        .register       %g3,#scratch
 174
 175        .text
 176        .align          64
 177
            /* FUNC_NAME(dst, src, len): copy len bytes from src to dst.
             *
             * In:   %o0 = dst, %o1 = src, %o2 = len
             * Out:  %o0 = EX_RETVAL(original dst); the original dst is
             *       parked in %o3 for the whole routine
             * Traps (tne, software trap 5) when len >> 31 is non-zero.
             *
             * Paths, selected by len:
             *   len == 0          return at once                  (85)
             *   0 < len < 16      4-byte word loop when dst, src and len
             *                     are all multiples of 4, otherwise the
             *                     byte loop                       (80, 90)
             *   16 <= len < 256   integer-register copy, 8/16 bytes at a
             *                     time once dst is 8-byte aligned (75, 72, 8)
             *   len >= 256        64-byte block copy through the FP
             *                     registers                       (2 .. 195)
             *
             * Scratch: %o4, %o5, %g1, %g2, %g3, GLOBAL_SPARE, condition
             * codes, and %f0-%f30 on the block-copy path.  Returns with
             * retl; no register window is allocated.
             *
             * Invariant for labels 72, 73 and 90: GLOBAL_SPARE = dst - src,
             * so "%o1 + GLOBAL_SPARE" is the current dst address and only
             * %o1 has to be advanced.
             */
 178        .globl  FUNC_NAME
 179        .type   FUNC_NAME,#function
 180FUNC_NAME:      /* %o0=dst, %o1=src, %o2=len */
            /* Sanity check: trap if len has any bit set at or above bit 31. */
 181        srlx            %o2, 31, %g2
 182        cmp             %g2, 0
 183        tne             %xcc, 5
 184        PREAMBLE
            /* Remember the original dst for the return value. */
 185        mov             %o0, %o3
            /* len == 0: nothing to copy.  The delay slot starts building
             * dst|src in GLOBAL_SPARE for the alignment tests below.
             */
 186        cmp             %o2, 0
 187        be,pn           %XCC, 85f
 188         or             %o0, %o1, GLOBAL_SPARE
            /* len < 16: short path.  The annulled delay slot runs only when
             * the branch is taken, so len is folded into the alignment word
             * for that path alone.
             */
 189        cmp             %o2, 16
 190        blu,a,pn        %XCC, 80f
 191         or             GLOBAL_SPARE, %o2, GLOBAL_SPARE
 192
 193        /* 2 blocks (128 bytes) is the minimum we can do the block
 194         * copy with.  We need to ensure that we'll iterate at least
 195         * once in the block copy loop.  At worst we'll need to align
 196         * the destination to a 64-byte boundary which can chew up
 197         * to (64 - 1) bytes from the length before we perform the
 198         * block copy loop.
 199         *
 200         * However, the cut-off point, performance wise, is around
 201         * 4 64-byte blocks.
 202         */
            /* The delay slot sets the condition codes that the first
             * instruction at label 75 consumes: Z is set when dst and src
             * are both 8-byte aligned.
             */
 203        cmp             %o2, (4 * 64)
 204        blu,pt          %XCC, 75f
 205         andcc          GLOBAL_SPARE, 0x7, %g0
 206
 207        /* %o0: dst
 208         * %o1: src
 209         * %o2: len  (>= 256 here; the block loop needs >= 128)
 210         *
 211         * The block copy loops can use %o4, %g2, %g3 as
 212         * temporaries while copying the data.  %o5 must
 213         * be preserved between VISEntryHalf and VISExitHalf
 214         */
 215
 216        LOAD(prefetch, %o1 + 0x000, #one_read)
 217        LOAD(prefetch, %o1 + 0x040, #one_read)
 218        LOAD(prefetch, %o1 + 0x080, #one_read)
 219
 220        /* Align destination on 64-byte boundary.  */
            /* %o4 = dst & 63.  If that is zero skip the byte loop; the
             * delay-slot sub executes either way and, together with the
             * negation after it, yields %o4 = 64 - (dst & 63).
             */
 221        andcc           %o0, (64 - 1), %o4
 222        be,pt           %XCC, 2f
 223         sub            %o4, 64, %o4
 224        sub             %g0, %o4, %o4   ! bytes to align dst
 225        sub             %o2, %o4, %o2
 2261:      subcc           %o4, 1, %o4
 227        EX_LD(LOAD(ldub, %o1, %g1))
 228        EX_ST(STORE(stb, %g1, %o0))
 229        add             %o1, 1, %o1
 230        bne,pt          %XCC, 1b
            /* The next instruction is the delay slot of the bne above,
             * even though it is not indented like the other delay slots.
             */
 231        add             %o0, 1, %o0
 232
 2332:
 234        /* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
 235         * o5 from here until we hit VISExitHalf.
 236         */
 237        VISEntryHalf
 238
 239        membar          #Sync
            /* The result is discarded (%g0); alignaddr is executed for its
             * side effect of recording the low three bits of src as the
             * byte offset that faligndata applies (see the VIS manual).
             */
 240        alignaddr       %o1, %g0, %g0
 241
            /* Register set-up for the block loops:
             *   %o4 = src rounded up to a 64-byte boundary, i.e. the next
             *         source block to load
             *   %g1 = len & ~63, bytes moved by the block loop
             *   %o2 = len & 63, tail bytes left for the byte loop at 90
             *   %g2 = src & 63, selects the loop variant
             *   %o1 = src + %g1, the src position after the block loop
             *   %g3 = dst - %o4, so that %o4 + %g3 is the current dst block
             */
 242        add             %o1, (64 - 1), %o4
 243        andn            %o4, (64 - 1), %o4
 244        andn            %o2, (64 - 1), %g1
 245        sub             %o2, %g1, %o2
 246
 247        and             %o1, (64 - 1), %g2
 248        add             %o1, %g1, %o1
 249        sub             %o0, %o4, %g3
            /* Dispatch on %g2 in steps of 8 bytes.  %g2 == 0 (src 64-byte
             * aligned) goes to the plain block copy at 190.  Otherwise a
             * small decision tree picks one of 110..180; each annulled
             * delay slot issues the compare that the branch target tests.
             */
 250        brz,pt          %g2, 190f
 251         cmp            %g2, 32
 252        blu,a           5f
 253         cmp            %g2, 16
 254        cmp             %g2, 48
 255        blu,a           4f
 256         cmp            %g2, 40
 257        cmp             %g2, 56
 258        blu             170f
 259         nop
 260        ba,a,pt         %xcc, 180f
 261
 2624:      /* 32 <= low bits < 48 */
 263        blu             150f
 264         nop
 265        ba,a,pt         %xcc, 160f
 2665:      /* 0 < low bits < 32 */
 267        blu,a           6f
 268         cmp            %g2, 8
 269        cmp             %g2, 24
 270        blu             130f
 271         nop
 272        ba,a,pt         %xcc, 140f
 2736:      /* 0 < low bits < 16 */
 274        bgeu            120f
 275         nop
 276        /* fall through for 0 < low bits < 8 */
            /* Loop variants 110..180 share one shape.  Before the loop the
             * N = 8 - (%g2 / 8) source doublewords that precede %o4 are
             * loaded into %f0 upwards.  Each iteration then:
             *   - issues STORE_INIT of %g0 to the first doubleword of the
             *     dst block (with the default block-init STORE_ASI this is
             *     presumably what lets the line be written without being
             *     fetched first -- confirm against the CPU manual),
             *   - block-loads the next 64 source bytes into %f16-%f30,
             *   - realigns nine doublewords into eight with FREG_FROB,
             *   - block-stores %f0-%f14 to dst,
             *   - moves the N not yet consumed doublewords down to %f0..,
             *   - prefetches one block ahead in the branch delay slot.
             * %g1 counts the remaining bytes and is a multiple of 64.
             *
             * 110: 0 < (src & 63) < 8, eight doublewords carried over.
             */
 277110:    sub             %o4, 64, %g2
 278        EX_LD(LOAD_BLK(%g2, %f0))
 2791:      EX_ST(STORE_INIT(%g0, %o4 + %g3))
 280        EX_LD(LOAD_BLK(%o4, %f16))
 281        FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
 282        EX_ST(STORE_BLK(%f0, %o4 + %g3))
 283        FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
 284        subcc           %g1, 64, %g1
 285        add             %o4, 64, %o4
 286        bne,pt          %xcc, 1b
 287         LOAD(prefetch, %o4 + 64, #one_read)
 288        ba,pt           %xcc, 195f
 289         nop
 290
            /* 120: 8 <= (src & 63) < 16, seven doublewords carried over. */
 291120:    sub             %o4, 56, %g2
 292        FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
 2931:      EX_ST(STORE_INIT(%g0, %o4 + %g3))
 294        EX_LD(LOAD_BLK(%o4, %f16))
 295        FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
 296        EX_ST(STORE_BLK(%f0, %o4 + %g3))
 297        FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
 298        subcc           %g1, 64, %g1
 299        add             %o4, 64, %o4
 300        bne,pt          %xcc, 1b
 301         LOAD(prefetch, %o4 + 64, #one_read)
 302        ba,pt           %xcc, 195f
 303         nop
 304
            /* 130: 16 <= (src & 63) < 24, six doublewords carried over. */
 305130:    sub             %o4, 48, %g2
 306        FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
 3071:      EX_ST(STORE_INIT(%g0, %o4 + %g3))
 308        EX_LD(LOAD_BLK(%o4, %f16))
 309        FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
 310        EX_ST(STORE_BLK(%f0, %o4 + %g3))
 311        FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
 312        subcc           %g1, 64, %g1
 313        add             %o4, 64, %o4
 314        bne,pt          %xcc, 1b
 315         LOAD(prefetch, %o4 + 64, #one_read)
 316        ba,pt           %xcc, 195f
 317         nop
 318
            /* 140: 24 <= (src & 63) < 32, five doublewords carried over. */
 319140:    sub             %o4, 40, %g2
 320        FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
 3211:      EX_ST(STORE_INIT(%g0, %o4 + %g3))
 322        EX_LD(LOAD_BLK(%o4, %f16))
 323        FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
 324        EX_ST(STORE_BLK(%f0, %o4 + %g3))
 325        FREG_MOVE_5(f22, f24, f26, f28, f30)
 326        subcc           %g1, 64, %g1
 327        add             %o4, 64, %o4
 328        bne,pt          %xcc, 1b
 329         LOAD(prefetch, %o4 + 64, #one_read)
 330        ba,pt           %xcc, 195f
 331         nop
 332
            /* 150: 32 <= (src & 63) < 40, four doublewords carried over. */
 333150:    sub             %o4, 32, %g2
 334        FREG_LOAD_4(%g2, f0, f2, f4, f6)
 3351:      EX_ST(STORE_INIT(%g0, %o4 + %g3))
 336        EX_LD(LOAD_BLK(%o4, %f16))
 337        FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
 338        EX_ST(STORE_BLK(%f0, %o4 + %g3))
 339        FREG_MOVE_4(f24, f26, f28, f30)
 340        subcc           %g1, 64, %g1
 341        add             %o4, 64, %o4
 342        bne,pt          %xcc, 1b
 343         LOAD(prefetch, %o4 + 64, #one_read)
 344        ba,pt           %xcc, 195f
 345         nop
 346
            /* 160: 40 <= (src & 63) < 48, three doublewords carried over. */
 347160:    sub             %o4, 24, %g2
 348        FREG_LOAD_3(%g2, f0, f2, f4)
 3491:      EX_ST(STORE_INIT(%g0, %o4 + %g3))
 350        EX_LD(LOAD_BLK(%o4, %f16))
 351        FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
 352        EX_ST(STORE_BLK(%f0, %o4 + %g3))
 353        FREG_MOVE_3(f26, f28, f30)
 354        subcc           %g1, 64, %g1
 355        add             %o4, 64, %o4
 356        bne,pt          %xcc, 1b
 357         LOAD(prefetch, %o4 + 64, #one_read)
 358        ba,pt           %xcc, 195f
 359         nop
 360
            /* 170: 48 <= (src & 63) < 56, two doublewords carried over. */
 361170:    sub             %o4, 16, %g2
 362        FREG_LOAD_2(%g2, f0, f2)
 3631:      EX_ST(STORE_INIT(%g0, %o4 + %g3))
 364        EX_LD(LOAD_BLK(%o4, %f16))
 365        FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
 366        EX_ST(STORE_BLK(%f0, %o4 + %g3))
 367        FREG_MOVE_2(f28, f30)
 368        subcc           %g1, 64, %g1
 369        add             %o4, 64, %o4
 370        bne,pt          %xcc, 1b
 371         LOAD(prefetch, %o4 + 64, #one_read)
 372        ba,pt           %xcc, 195f
 373         nop
 374
            /* 180: 56 <= (src & 63) < 64, one doubleword carried over. */
 375180:    sub             %o4, 8, %g2
 376        FREG_LOAD_1(%g2, f0)
 3771:      EX_ST(STORE_INIT(%g0, %o4 + %g3))
 378        EX_LD(LOAD_BLK(%o4, %f16))
 379        FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
 380        EX_ST(STORE_BLK(%f0, %o4 + %g3))
 381        FREG_MOVE_1(f30)
 382        subcc           %g1, 64, %g1
 383        add             %o4, 64, %o4
 384        bne,pt          %xcc, 1b
 385         LOAD(prefetch, %o4 + 64, #one_read)
 386        ba,pt           %xcc, 195f
 387         nop
 388
            /* 190: src is 64-byte aligned; block-load and block-store
             * through %f0 with no realignment, then fall into 195.
             */
 389190:
 3901:      EX_ST(STORE_INIT(%g0, %o4 + %g3))
 391        subcc           %g1, 64, %g1
 392        EX_LD(LOAD_BLK(%o4, %f0))
 393        EX_ST(STORE_BLK(%f0, %o4 + %g3))
 394        add             %o4, 64, %o4
 395        bne,pt          %xcc, 1b
 396         LOAD(prefetch, %o4 + 64, #one_read)
 397
            /* Common exit of all block loops: rebuild %o0 as the dst
             * address just past the block-copied bytes.
             */
 398195:
 399        add             %o4, %g3, %o0
 400        membar          #Sync
 401
 402        VISExitHalf
 403
 404        /* %o2 contains any final bytes still needed to be copied
 405         * over. If anything is left, we copy it one byte at a time.
 406         */
            /* The delay slot establishes GLOBAL_SPARE = dst - src for the
             * byte loop at 90.
             */
 407        brz,pt          %o2, 85f
 408         sub            %o0, %o1, GLOBAL_SPARE
 409        ba,a,pt         %XCC, 90f
 410
 411        .align          64
 41275: /* 16 <= len < 256 */
            /* Entered with the condition codes of "andcc GLOBAL_SPARE, 0x7"
             * from the delay slot at the branch site.  If dst|src has any
             * of its low three bits set, go to the second 75 label below;
             * otherwise both pointers are 8-byte aligned and we fall into
             * 72.  Either way GLOBAL_SPARE becomes dst - src.
             */
 413        bne,pn          %XCC, 75f
 414         sub            %o0, %o1, GLOBAL_SPARE
 415
            /* 72: dst and src 8-byte aligned, len >= 16.
             * %o4 = len & ~15 is copied 16 bytes per iteration and
             * %o2 = len & 15 is left for the tail at 73.  %o1 is stepped by
             * 8 between the two loads and between the two stores so every
             * access uses a plain register or register+register address.
             */
 41672:
 417        andn            %o2, 0xf, %o4
 418        and             %o2, 0xf, %o2
 4191:      subcc           %o4, 0x10, %o4
 420        EX_LD(LOAD(ldx, %o1, %o5))
 421        add             %o1, 0x08, %o1
 422        EX_LD(LOAD(ldx, %o1, %g1))
 423        sub             %o1, 0x08, %o1
 424        EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE))
 425        add             %o1, 0x8, %o1
 426        EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE))
 427        bgu,pt          %XCC, 1b
 428         add            %o1, 0x8, %o1
            /* 73: tail of the aligned copy, %o2 < 16.  Move one 8-byte and
             * one 4-byte piece if the corresponding bit of %o2 is set, then
             * hand whatever remains to the byte loop at 90.
             */
 42973:     andcc           %o2, 0x8, %g0
 430        be,pt           %XCC, 1f
 431         nop
 432        sub             %o2, 0x8, %o2
 433        EX_LD(LOAD(ldx, %o1, %o5))
 434        EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE))
 435        add             %o1, 0x8, %o1
 4361:      andcc           %o2, 0x4, %g0
 437        be,pt           %XCC, 1f
 438         nop
 439        sub             %o2, 0x4, %o2
 440        EX_LD(LOAD(lduw, %o1, %o5))
 441        EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE))
 442        add             %o1, 0x4, %o1
 4431:      cmp             %o2, 0
 444        be,pt           %XCC, 85f
 445         nop
 446        ba,pt           %xcc, 90f
 447         nop
 448
            /* Second 75: 16 <= len < 256 with dst and/or src not 8-byte
             * aligned.  First bring dst to an 8-byte boundary by copying
             * %g1 = 8 - (dst & 7) single bytes.  The be tests the condition
             * codes of the andcc (the two subs leave them alone) and skips
             * the byte loop when dst is already aligned.
             */
 44975:
 450        andcc           %o0, 0x7, %g1
 451        sub             %g1, 0x8, %g1
 452        be,pn           %icc, 2f
 453         sub            %g0, %g1, %g1
 454        sub             %o2, %g1, %o2
 455
 4561:      subcc           %g1, 1, %g1
 457        EX_LD(LOAD(ldub, %o1, %o5))
 458        EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE))
 459        bgu,pt          %icc, 1b
 460         add            %o1, 1, %o1
 461
            /* dst is now 8-byte aligned; rebuild it in %o0.  %g1 becomes
             * (src & 7) * 8, the misalignment of src in bits.  If src is
             * aligned as well, reuse the aligned code: 72 when at least 16
             * bytes remain, 73 otherwise.
             */
 4622:      add             %o1, GLOBAL_SPARE, %o0
 463        andcc           %o1, 0x7, %g1
 464        bne,pt          %icc, 8f
 465         sll            %g1, 3, %g1
 466
 467        cmp             %o2, 16
 468        bgeu,pt         %icc, 72b
 469         nop
 470        ba,a,pt         %xcc, 73b
 471
            /* 8: src is misaligned by %g1 bits while dst is aligned.  Read
             * aligned doublewords from src rounded down and merge
             * neighbouring pairs with shifts:
             *   GLOBAL_SPARE = 64 - %g1  (right-shift count)
             *   %o4 = remaining len & ~7 (bytes handled by this loop)
             *   %g2 = previous doubleword << %g1, the bytes still owed
             * Each iteration stores (%g2 | next >> GLOBAL_SPARE) to dst.
             */
 4728:      mov             64, GLOBAL_SPARE
 473        andn            %o1, 0x7, %o1
 474        EX_LD(LOAD(ldx, %o1, %g2))
 475        sub             GLOBAL_SPARE, %g1, GLOBAL_SPARE
 476        andn            %o2, 0x7, %o4
 477        sllx            %g2, %g1, %g2
 4781:      add             %o1, 0x8, %o1
 479        EX_LD(LOAD(ldx, %o1, %g3))
 480        subcc           %o4, 0x8, %o4
 481        srlx            %g3, GLOBAL_SPARE, %o5
 482        or              %o5, %g2, %o5
 483        EX_ST(STORE(stx, %o5, %o0))
 484        add             %o0, 0x8, %o0
 485        bgu,pt          %icc, 1b
 486         sllx           %g3, %g1, %g2
 487
            /* Turn %g1 back into a byte count and add it to %o1 so that
             * %o1 is the true, unaligned src position again.  %o2 = len & 7
             * bytes remain; if any, restore GLOBAL_SPARE = dst - src in the
             * delay slot and finish in the byte loop at 90.
             */
 488        srl             %g1, 3, %g1
 489        andcc           %o2, 0x7, %o2
 490        be,pn           %icc, 85f
 491         add            %o1, %g1, %o1
 492        ba,pt           %xcc, 90f
 493         sub            %o0, %o1, GLOBAL_SPARE
 494
 495        .align          64
 49680: /* 0 < len < 16 */
            /* GLOBAL_SPARE holds dst|src|len here.  If any of its low two
             * bits is set, use the byte loop; otherwise dst, src and len
             * are all multiples of 4 and the word loop below copies
             * everything.  The delay slot sets GLOBAL_SPARE = dst - src for
             * both loops.
             */
 497        andcc           GLOBAL_SPARE, 0x3, %g0
 498        bne,pn          %XCC, 90f
 499         sub            %o0, %o1, GLOBAL_SPARE
 500
 5011:
 502        subcc           %o2, 4, %o2
 503        EX_LD(LOAD(lduw, %o1, %g1))
 504        EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE))
 505        bgu,pt          %XCC, 1b
 506         add            %o1, 4, %o1
 507
            /* 85: common return; hand back the dst saved in %o3 at entry. */
 50885:     retl
 509         mov            EX_RETVAL(%o3), %o0
 510
 511        .align          32
            /* 90: byte-at-a-time copy of the last %o2 bytes.  Every branch
             * to this label arrives with %o2 > 0 and GLOBAL_SPARE =
             * dst - src.
             */
 51290:
 513        subcc           %o2, 1, %o2
 514        EX_LD(LOAD(ldub, %o1, %g1))
 515        EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE))
 516        bgu,pt          %XCC, 90b
 517         add            %o1, 1, %o1
 518        retl
 519         mov            EX_RETVAL(%o3), %o0
 520
 521        .size           FUNC_NAME, .-FUNC_NAME
 522