/* linux/arch/sparc/lib/U3memcpy.S */
/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */
   5
   6#ifdef __KERNEL__
   7#include <asm/visasm.h>
   8#include <asm/asi.h>
   9#define GLOBAL_SPARE    %g7
  10#else
  11#define ASI_BLK_P 0xf0
  12#define FPRS_FEF  0x04
  13#ifdef MEMCPY_DEBUG
  14#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
  15                     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
  16#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  17#else
  18#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
  19#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  20#endif
  21#define GLOBAL_SPARE    %g5
  22#endif
  23
  24#ifndef EX_LD
  25#define EX_LD(x)        x
  26#endif
  27
  28#ifndef EX_ST
  29#define EX_ST(x)        x
  30#endif
  31
  32#ifndef EX_RETVAL
  33#define EX_RETVAL(x)    x
  34#endif
  35
  36#ifndef LOAD
  37#define LOAD(type,addr,dest)    type [addr], dest
  38#endif
  39
  40#ifndef STORE
  41#define STORE(type,src,addr)    type src, [addr]
  42#endif
  43
  44#ifndef STORE_BLK
  45#define STORE_BLK(src,addr)     stda src, [addr] ASI_BLK_P
  46#endif
  47
  48#ifndef FUNC_NAME
  49#define FUNC_NAME       U3memcpy
  50#endif
  51
  52#ifndef PREAMBLE
  53#define PREAMBLE
  54#endif
  55
  56#ifndef XCC
  57#define XCC xcc
  58#endif
  59
  60        .register       %g2,#scratch
  61        .register       %g3,#scratch
  62
  63        /* Special/non-trivial issues of this code:
  64         *
  65         * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
  66         * 2) Only low 32 FPU registers are used so that only the
  67         *    lower half of the FPU register set is dirtied by this
  68         *    code.  This is especially important in the kernel.
  69         * 3) This code never prefetches cachelines past the end
  70         *    of the source buffer.
  71         */
  72
  73        .text
  74        .align          64
  75
  76        /* The cheetah's flexible spine, oversized liver, enlarged heart,
  77         * slender muscular body, and claws make it the swiftest hunter
  78         * in Africa and the fastest animal on land.  Can reach speeds
  79         * of up to 2.4GB per second.
  80         */
  81
  82        .globl  FUNC_NAME
  83        .type   FUNC_NAME,#function
  84FUNC_NAME:      /* %o0=dst, %o1=src, %o2=len */
  85        srlx            %o2, 31, %g2
  86        cmp             %g2, 0
  87        tne             %xcc, 5
  88        PREAMBLE
  89        mov             %o0, %o4
  90        cmp             %o2, 0
  91        be,pn           %XCC, 85f
  92         or             %o0, %o1, %o3
  93        cmp             %o2, 16
  94        blu,a,pn        %XCC, 80f
  95         or             %o3, %o2, %o3
  96
  97        cmp             %o2, (3 * 64)
  98        blu,pt          %XCC, 70f
  99         andcc          %o3, 0x7, %g0
 100
 101        /* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
 102         * o5 from here until we hit VISExitHalf.
 103         */
 104        VISEntryHalf
 105
 106        /* Is 'dst' already aligned on an 64-byte boundary? */
 107        andcc           %o0, 0x3f, %g2
 108        be,pt           %XCC, 2f
 109
 110        /* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
 111         * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
 112         * subtract this from 'len'.
 113         */
 114         sub            %o0, %o1, GLOBAL_SPARE
 115        sub             %g2, 0x40, %g2
 116        sub             %g0, %g2, %g2
 117        sub             %o2, %g2, %o2
 118        andcc           %g2, 0x7, %g1
 119        be,pt           %icc, 2f
 120         and            %g2, 0x38, %g2
 121
 1221:      subcc           %g1, 0x1, %g1
 123        EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
 124        EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
 125        bgu,pt          %XCC, 1b
 126         add            %o1, 0x1, %o1
 127
 128        add             %o1, GLOBAL_SPARE, %o0
 129
 1302:      cmp             %g2, 0x0
 131        and             %o1, 0x7, %g1
 132        be,pt           %icc, 3f
 133         alignaddr      %o1, %g0, %o1
 134
 135        EX_LD(LOAD(ldd, %o1, %f4))
 1361:      EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
 137        add             %o1, 0x8, %o1
 138        subcc           %g2, 0x8, %g2
 139        faligndata      %f4, %f6, %f0
 140        EX_ST(STORE(std, %f0, %o0))
 141        be,pn           %icc, 3f
 142         add            %o0, 0x8, %o0
 143
 144        EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
 145        add             %o1, 0x8, %o1
 146        subcc           %g2, 0x8, %g2
 147        faligndata      %f6, %f4, %f2
 148        EX_ST(STORE(std, %f2, %o0))
 149        bne,pt          %icc, 1b
 150         add            %o0, 0x8, %o0
 151
 1523:      LOAD(prefetch, %o1 + 0x000, #one_read)
 153        LOAD(prefetch, %o1 + 0x040, #one_read)
 154        andn            %o2, (0x40 - 1), GLOBAL_SPARE
 155        LOAD(prefetch, %o1 + 0x080, #one_read)
 156        LOAD(prefetch, %o1 + 0x0c0, #one_read)
 157        LOAD(prefetch, %o1 + 0x100, #one_read)
 158        EX_LD(LOAD(ldd, %o1 + 0x000, %f0))
 159        LOAD(prefetch, %o1 + 0x140, #one_read)
 160        EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
 161        LOAD(prefetch, %o1 + 0x180, #one_read)
 162        EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
 163        LOAD(prefetch, %o1 + 0x1c0, #one_read)
 164        faligndata      %f0, %f2, %f16
 165        EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
 166        faligndata      %f2, %f4, %f18
 167        EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
 168        faligndata      %f4, %f6, %f20
 169        EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
 170        faligndata      %f6, %f8, %f22
 171
 172        EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
 173        faligndata      %f8, %f10, %f24
 174        EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
 175        faligndata      %f10, %f12, %f26
 176        EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
 177
 178        subcc           GLOBAL_SPARE, 0x80, GLOBAL_SPARE
 179        add             %o1, 0x40, %o1
 180        bgu,pt          %XCC, 1f
 181         srl            GLOBAL_SPARE, 6, %o3
 182        ba,pt           %xcc, 2f
 183         nop
 184
 185        .align          64
 1861:
 187        EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
 188        faligndata      %f12, %f14, %f28
 189        EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
 190        faligndata      %f14, %f0, %f30
 191        EX_ST(STORE_BLK(%f16, %o0))
 192        EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
 193        faligndata      %f0, %f2, %f16
 194        add             %o0, 0x40, %o0
 195
 196        EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
 197        faligndata      %f2, %f4, %f18
 198        EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
 199        faligndata      %f4, %f6, %f20
 200        EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
 201        subcc           %o3, 0x01, %o3
 202        faligndata      %f6, %f8, %f22
 203        EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
 204
 205        faligndata      %f8, %f10, %f24
 206        EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
 207        LOAD(prefetch, %o1 + 0x1c0, #one_read)
 208        faligndata      %f10, %f12, %f26
 209        bg,pt           %XCC, 1b
 210         add            %o1, 0x40, %o1
 211
 212        /* Finally we copy the last full 64-byte block. */
 2132:
 214        EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
 215        faligndata      %f12, %f14, %f28
 216        EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
 217        faligndata      %f14, %f0, %f30
 218        EX_ST(STORE_BLK(%f16, %o0))
 219        EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
 220        faligndata      %f0, %f2, %f16
 221        EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
 222        faligndata      %f2, %f4, %f18
 223        EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
 224        faligndata      %f4, %f6, %f20
 225        EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
 226        faligndata      %f6, %f8, %f22
 227        EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
 228        faligndata      %f8, %f10, %f24
 229        cmp             %g1, 0
 230        be,pt           %XCC, 1f
 231         add            %o0, 0x40, %o0
 232        EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
 2331:      faligndata      %f10, %f12, %f26
 234        faligndata      %f12, %f14, %f28
 235        faligndata      %f14, %f0, %f30
 236        EX_ST(STORE_BLK(%f16, %o0))
 237        add             %o0, 0x40, %o0
 238        add             %o1, 0x40, %o1
 239        membar          #Sync
 240
 241        /* Now we copy the (len modulo 64) bytes at the end.
 242         * Note how we borrow the %f0 loaded above.
 243         *
 244         * Also notice how this code is careful not to perform a
 245         * load past the end of the src buffer.
 246         */
 247        and             %o2, 0x3f, %o2
 248        andcc           %o2, 0x38, %g2
 249        be,pn           %XCC, 2f
 250         subcc          %g2, 0x8, %g2
 251        be,pn           %XCC, 2f
 252         cmp            %g1, 0
 253
 254        sub             %o2, %g2, %o2
 255        be,a,pt         %XCC, 1f
 256         EX_LD(LOAD(ldd, %o1 + 0x00, %f0))
 257
 2581:      EX_LD(LOAD(ldd, %o1 + 0x08, %f2))
 259        add             %o1, 0x8, %o1
 260        subcc           %g2, 0x8, %g2
 261        faligndata      %f0, %f2, %f8
 262        EX_ST(STORE(std, %f8, %o0))
 263        be,pn           %XCC, 2f
 264         add            %o0, 0x8, %o0
 265        EX_LD(LOAD(ldd, %o1 + 0x08, %f0))
 266        add             %o1, 0x8, %o1
 267        subcc           %g2, 0x8, %g2
 268        faligndata      %f2, %f0, %f8
 269        EX_ST(STORE(std, %f8, %o0))
 270        bne,pn          %XCC, 1b
 271         add            %o0, 0x8, %o0
 272
 273        /* If anything is left, we copy it one byte at a time.
 274         * Note that %g1 is (src & 0x3) saved above before the
 275         * alignaddr was performed.
 276         */
 2772:
 278        cmp             %o2, 0
 279        add             %o1, %g1, %o1
 280        VISExitHalf
 281        be,pn           %XCC, 85f
 282         sub            %o0, %o1, %o3
 283
 284        andcc           %g1, 0x7, %g0
 285        bne,pn          %icc, 90f
 286         andcc          %o2, 0x8, %g0
 287        be,pt           %icc, 1f
 288         nop
 289        EX_LD(LOAD(ldx, %o1, %o5))
 290        EX_ST(STORE(stx, %o5, %o1 + %o3))
 291        add             %o1, 0x8, %o1
 292
 2931:      andcc           %o2, 0x4, %g0
 294        be,pt           %icc, 1f
 295         nop
 296        EX_LD(LOAD(lduw, %o1, %o5))
 297        EX_ST(STORE(stw, %o5, %o1 + %o3))
 298        add             %o1, 0x4, %o1
 299
 3001:      andcc           %o2, 0x2, %g0
 301        be,pt           %icc, 1f
 302         nop
 303        EX_LD(LOAD(lduh, %o1, %o5))
 304        EX_ST(STORE(sth, %o5, %o1 + %o3))
 305        add             %o1, 0x2, %o1
 306
 3071:      andcc           %o2, 0x1, %g0
 308        be,pt           %icc, 85f
 309         nop
 310        EX_LD(LOAD(ldub, %o1, %o5))
 311        ba,pt           %xcc, 85f
 312         EX_ST(STORE(stb, %o5, %o1 + %o3))
 313
 314        .align          64
 31570: /* 16 < len <= 64 */
 316        bne,pn          %XCC, 75f
 317         sub            %o0, %o1, %o3
 318
 31972:
 320        andn            %o2, 0xf, GLOBAL_SPARE
 321        and             %o2, 0xf, %o2
 3221:      subcc           GLOBAL_SPARE, 0x10, GLOBAL_SPARE
 323        EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
 324        EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
 325        EX_ST(STORE(stx, %o5, %o1 + %o3))
 326        add             %o1, 0x8, %o1
 327        EX_ST(STORE(stx, %g1, %o1 + %o3))
 328        bgu,pt          %XCC, 1b
 329         add            %o1, 0x8, %o1
 33073:     andcc           %o2, 0x8, %g0
 331        be,pt           %XCC, 1f
 332         nop
 333        sub             %o2, 0x8, %o2
 334        EX_LD(LOAD(ldx, %o1, %o5))
 335        EX_ST(STORE(stx, %o5, %o1 + %o3))
 336        add             %o1, 0x8, %o1
 3371:      andcc           %o2, 0x4, %g0
 338        be,pt           %XCC, 1f
 339         nop
 340        sub             %o2, 0x4, %o2
 341        EX_LD(LOAD(lduw, %o1, %o5))
 342        EX_ST(STORE(stw, %o5, %o1 + %o3))
 343        add             %o1, 0x4, %o1
 3441:      cmp             %o2, 0
 345        be,pt           %XCC, 85f
 346         nop
 347        ba,pt           %xcc, 90f
 348         nop
 349
 35075:
 351        andcc           %o0, 0x7, %g1
 352        sub             %g1, 0x8, %g1
 353        be,pn           %icc, 2f
 354         sub            %g0, %g1, %g1
 355        sub             %o2, %g1, %o2
 356
 3571:      subcc           %g1, 1, %g1
 358        EX_LD(LOAD(ldub, %o1, %o5))
 359        EX_ST(STORE(stb, %o5, %o1 + %o3))
 360        bgu,pt          %icc, 1b
 361         add            %o1, 1, %o1
 362
 3632:      add             %o1, %o3, %o0
 364        andcc           %o1, 0x7, %g1
 365        bne,pt          %icc, 8f
 366         sll            %g1, 3, %g1
 367
 368        cmp             %o2, 16
 369        bgeu,pt         %icc, 72b
 370         nop
 371        ba,a,pt         %xcc, 73b
 372
 3738:      mov             64, %o3
 374        andn            %o1, 0x7, %o1
 375        EX_LD(LOAD(ldx, %o1, %g2))
 376        sub             %o3, %g1, %o3
 377        andn            %o2, 0x7, GLOBAL_SPARE
 378        sllx            %g2, %g1, %g2
 3791:      EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
 380        subcc           GLOBAL_SPARE, 0x8, GLOBAL_SPARE
 381        add             %o1, 0x8, %o1
 382        srlx            %g3, %o3, %o5
 383        or              %o5, %g2, %o5
 384        EX_ST(STORE(stx, %o5, %o0))
 385        add             %o0, 0x8, %o0
 386        bgu,pt          %icc, 1b
 387         sllx           %g3, %g1, %g2
 388
 389        srl             %g1, 3, %g1
 390        andcc           %o2, 0x7, %o2
 391        be,pn           %icc, 85f
 392         add            %o1, %g1, %o1
 393        ba,pt           %xcc, 90f
 394         sub            %o0, %o1, %o3
 395
 396        .align          64
 39780: /* 0 < len <= 16 */
 398        andcc           %o3, 0x3, %g0
 399        bne,pn          %XCC, 90f
 400         sub            %o0, %o1, %o3
 401
 4021:
 403        subcc           %o2, 4, %o2
 404        EX_LD(LOAD(lduw, %o1, %g1))
 405        EX_ST(STORE(stw, %g1, %o1 + %o3))
 406        bgu,pt          %XCC, 1b
 407         add            %o1, 4, %o1
 408
 40985:     retl
 410         mov            EX_RETVAL(%o4), %o0
 411
 412        .align          32
 41390:
 414        subcc           %o2, 1, %o2
 415        EX_LD(LOAD(ldub, %o1, %g1))
 416        EX_ST(STORE(stb, %g1, %o1 + %o3))
 417        bgu,pt          %XCC, 90b
 418         add            %o1, 1, %o1
 419        retl
 420         mov            EX_RETVAL(%o4), %o0
 421
 422        .size           FUNC_NAME, .-FUNC_NAME
 423