/* linux/arch/sparc/lib/NG2memcpy.S */
/* SPDX-License-Identifier: GPL-2.0 */
/* NG2memcpy.S: Niagara-2 optimized memcpy.
 *
 * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
 */
   6
/* Kernel build pulls in the real VIS/ASI definitions; the #else branch
 * supplies minimal stand-ins so the file can also be assembled as a
 * standalone userland test (MEMCPY_DEBUG additionally clears the scratch
 * globals on VIS entry so stale values are caught).
 */
#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE    %g7
#else
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
                     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE    %g5
#endif

/* ASI used by STORE_INIT to prime destination cache lines.  The
 * block-init quad ASI is Niagara-specific; when simulating on other
 * hardware fall back to the plain primary-space ASI (0x80 = ASI_P).
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI       ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI       0x80            /* ASI_P */
#endif
#endif
  35
/* EX_LD/EX_ST (and the _FP forms used inside the VIS region) are
 * overridden by the copy_{to,from}_user wrappers to emit an
 * exception-table entry pointing at the second argument (an
 * NG2_retl_* continuation).  For plain memcpy they expand to the bare
 * access and the handler argument is unused.  NOTE(review): the
 * overriding EX_*_FP macros appear to append an "_fp" suffix to the
 * handler name (the *_fp labels below exist but are never referenced
 * directly in this file) — confirm against NG2copy_{from,to}_user.S.
 */
#ifndef EX_LD
#define EX_LD(x,y)      x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)   x
#endif

#ifndef EX_ST
#define EX_ST(x,y)      x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)   x
#endif

/* Memory-access primitives, overridable so the user-copy variants can
 * substitute alternate-space (ASI-qualified) forms.
 */
#ifndef LOAD
#define LOAD(type,addr,dest)    type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)     ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)    type src, [addr]
#else
/* Debug build: force the ASI-form store to primary space (0x80). */
#define STORE(type,src,addr)    type##a src, [addr] 0x80
#endif
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)     stda src, [addr] ASI_BLK_P
#endif

/* 8-byte store through STORE_ASI; used to prime a destination line
 * before the 64-byte block store (see STORE_ASI above).
 */
#ifndef STORE_INIT
#define STORE_INIT(src,addr)    stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME       NG2memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif
  85
/* Produce one realigned 64-byte output block in %f0-%f14: each output
 * double is faligndata of two adjacent doubles of the 9-register
 * (72-byte) input window x0..x8, shifted by the byte offset previously
 * programmed with alignaddr.
 */
#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
        faligndata      %x0, %x1, %f0; \
        faligndata      %x1, %x2, %f2; \
        faligndata      %x2, %x3, %f4; \
        faligndata      %x3, %x4, %f6; \
        faligndata      %x4, %x5, %f8; \
        faligndata      %x5, %x6, %f10; \
        faligndata      %x6, %x7, %f12; \
        faligndata      %x7, %x8, %f14;
  95
/* FREG_MOVE_N: copy the trailing N doubles of the freshly loaded block
 * (ending at %f30) down to %f0.. so they become the head of the
 * faligndata window for the next loop iteration.
 */
#define FREG_MOVE_1(x0) \
        fsrc2           %x0, %f0;
#define FREG_MOVE_2(x0, x1) \
        fsrc2           %x0, %f0; \
        fsrc2           %x1, %f2;
#define FREG_MOVE_3(x0, x1, x2) \
        fsrc2           %x0, %f0; \
        fsrc2           %x1, %f2; \
        fsrc2           %x2, %f4;
#define FREG_MOVE_4(x0, x1, x2, x3) \
        fsrc2           %x0, %f0; \
        fsrc2           %x1, %f2; \
        fsrc2           %x2, %f4; \
        fsrc2           %x3, %f6;
#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
        fsrc2           %x0, %f0; \
        fsrc2           %x1, %f2; \
        fsrc2           %x2, %f4; \
        fsrc2           %x3, %f6; \
        fsrc2           %x4, %f8;
#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
        fsrc2           %x0, %f0; \
        fsrc2           %x1, %f2; \
        fsrc2           %x2, %f4; \
        fsrc2           %x3, %f6; \
        fsrc2           %x4, %f8; \
        fsrc2           %x5, %f10;
#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
        fsrc2           %x0, %f0; \
        fsrc2           %x1, %f2; \
        fsrc2           %x2, %f4; \
        fsrc2           %x3, %f6; \
        fsrc2           %x4, %f8; \
        fsrc2           %x5, %f10; \
        fsrc2           %x6, %f12;
#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
        fsrc2           %x0, %f0; \
        fsrc2           %x1, %f2; \
        fsrc2           %x2, %f4; \
        fsrc2           %x3, %f6; \
        fsrc2           %x4, %f8; \
        fsrc2           %x5, %f10; \
        fsrc2           %x6, %f12; \
        fsrc2           %x7, %f14;
/* FREG_LOAD_N: preload the N leading doublewords of the source window
 * with individual 8-byte FP loads (used when the source is not 64-byte
 * aligned, so a full block load cannot fetch them).  A faulting load
 * reports %o2 + %g1 bytes uncopied (%g1 = bytes remaining in the block
 * loop at that point).
 */
#define FREG_LOAD_1(base, x0) \
        EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1)
#define FREG_LOAD_2(base, x0, x1) \
        EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1);
#define FREG_LOAD_3(base, x0, x1, x2) \
        EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1);
#define FREG_LOAD_4(base, x0, x1, x2, x3) \
        EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1);
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
        EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1);
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
        EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1);
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
        EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1); \
        EX_LD_FP(LOAD(ldd, base + 0x30, %x6), NG2_retl_o2_plus_g1);
 175
        .register       %g2,#scratch
        .register       %g3,#scratch

        .text
#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
/* Exception continuations for the user-copy variants of this file
 * (unreachable in the plain memcpy build, where EX_LD/EX_ST discard the
 * handler argument).  Each handler computes the number of bytes NOT
 * copied into %o0 — the name encodes the expression, e.g.
 * NG2_retl_o2_plus_o4_plus_1 returns %o2 + %o4 + 1 — then returns via
 * __restore_asi (reset %asi to ASI_AIUS) or __restore_fp (additionally
 * pop the VIS half state first).  The *_fp-suffixed entries serve the
 * faults raised inside the VISEntryHalf/VISExitHalf region.
 */
__restore_fp:
        VISExitHalf
__restore_asi:
        retl
         wr     %g0, ASI_AIUS, %asi
ENTRY(NG2_retl_o2)
        ba,pt   %xcc, __restore_asi
         mov    %o2, %o0
ENDPROC(NG2_retl_o2)
ENTRY(NG2_retl_o2_plus_1)
        ba,pt   %xcc, __restore_asi
         add    %o2, 1, %o0
ENDPROC(NG2_retl_o2_plus_1)
ENTRY(NG2_retl_o2_plus_4)
        ba,pt   %xcc, __restore_asi
         add    %o2, 4, %o0
ENDPROC(NG2_retl_o2_plus_4)
ENTRY(NG2_retl_o2_plus_8)
        ba,pt   %xcc, __restore_asi
         add    %o2, 8, %o0
ENDPROC(NG2_retl_o2_plus_8)
ENTRY(NG2_retl_o2_plus_o4_plus_1)
        add     %o4, 1, %o4
        ba,pt   %xcc, __restore_asi
         add    %o2, %o4, %o0
ENDPROC(NG2_retl_o2_plus_o4_plus_1)
ENTRY(NG2_retl_o2_plus_o4_plus_8)
        add     %o4, 8, %o4
        ba,pt   %xcc, __restore_asi
         add    %o2, %o4, %o0
ENDPROC(NG2_retl_o2_plus_o4_plus_8)
ENTRY(NG2_retl_o2_plus_o4_plus_16)
        add     %o4, 16, %o4
        ba,pt   %xcc, __restore_asi
         add    %o2, %o4, %o0
ENDPROC(NG2_retl_o2_plus_o4_plus_16)
ENTRY(NG2_retl_o2_plus_g1_fp)
        ba,pt   %xcc, __restore_fp
         add    %o2, %g1, %o0
ENDPROC(NG2_retl_o2_plus_g1_fp)
ENTRY(NG2_retl_o2_plus_g1_plus_64_fp)
        add     %g1, 64, %g1
        ba,pt   %xcc, __restore_fp
         add    %o2, %g1, %o0
ENDPROC(NG2_retl_o2_plus_g1_plus_64_fp)
ENTRY(NG2_retl_o2_plus_g1_plus_1)
        add     %g1, 1, %g1
        ba,pt   %xcc, __restore_asi
         add    %o2, %g1, %o0
ENDPROC(NG2_retl_o2_plus_g1_plus_1)
ENTRY(NG2_retl_o2_and_7_plus_o4)
        and     %o2, 7, %o2
        ba,pt   %xcc, __restore_asi
         add    %o2, %o4, %o0
ENDPROC(NG2_retl_o2_and_7_plus_o4)
ENTRY(NG2_retl_o2_and_7_plus_o4_plus_8)
        and     %o2, 7, %o2
        add     %o4, 8, %o4
        ba,pt   %xcc, __restore_asi
         add    %o2, %o4, %o0
ENDPROC(NG2_retl_o2_and_7_plus_o4_plus_8)
#endif
 244
        .align          64

        .globl  FUNC_NAME
        .type   FUNC_NAME,#function
FUNC_NAME:      /* %o0=dst, %o1=src, %o2=len */
        /* Trap (software trap 5) if any bit at or above bit 31 of the
         * length is set, i.e. a negative/absurd length was passed.
         */
        srlx            %o2, 31, %g2
        cmp             %g2, 0
        tne             %xcc, 5
        PREAMBLE
        mov             %o0, %o3        ! %o3 = original dst, returned at 85/90
        cmp             %o2, 0
        be,pn           %XCC, 85f       ! len == 0: nothing to do
         or             %o0, %o1, GLOBAL_SPARE
        cmp             %o2, 16
        blu,a,pn        %XCC, 80f       ! len < 16: tiny copy
         or             GLOBAL_SPARE, %o2, GLOBAL_SPARE ! fold len into the alignment mask

        /* 2 blocks (128 bytes) is the minimum we can do the block
         * copy with.  We need to ensure that we'll iterate at least
         * once in the block copy loop.  At worst we'll need to align
         * the destination to a 64-byte boundary which can chew up
         * to (64 - 1) bytes from the length before we perform the
         * block copy loop.
         *
         * However, the cut-off point, performance wise, is around
         * 4 64-byte blocks.
         */
        cmp             %o2, (4 * 64)
        blu,pt          %XCC, 75f       ! medium copy: skip the VIS block loop
         andcc          GLOBAL_SPARE, 0x7, %g0  ! Z iff dst|src|len all 8-byte aligned

        /* %o0: dst
         * %o1: src
         * %o2: len  (known to be >= 128)
         *
         * The block copy loops can use %o4, %g2, %g3 as
         * temporaries while copying the data.  %o5 must
         * be preserved between VISEntryHalf and VISExitHalf
         */

        LOAD(prefetch, %o1 + 0x000, #one_read)
        LOAD(prefetch, %o1 + 0x040, #one_read)
        LOAD(prefetch, %o1 + 0x080, #one_read)

        /* Align destination on 64-byte boundary.  */
        andcc           %o0, (64 - 1), %o4
        be,pt           %XCC, 2f
         sub            %o4, 64, %o4
        sub             %g0, %o4, %o4   ! bytes to align dst
        sub             %o2, %o4, %o2
1:      subcc           %o4, 1, %o4
        EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_o4_plus_1)
        EX_ST(STORE(stb, %g1, %o0), NG2_retl_o2_plus_o4_plus_1)
        add             %o1, 1, %o1
        bne,pt          %XCC, 1b
        add             %o0, 1, %o0

2:
        /* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
         * o5 from here until we hit VISExitHalf.
         */
        VISEntryHalf

        membar          #Sync
        alignaddr       %o1, %g0, %g0   ! program GSR align = src & 7 for faligndata

        add             %o1, (64 - 1), %o4
        andn            %o4, (64 - 1), %o4      ! %o4 = src rounded up to 64 bytes
        andn            %o2, (64 - 1), %g1      ! %g1 = bytes handled by the block loop
        sub             %o2, %g1, %o2           ! %o2 = tail bytes for after the loop

        and             %o1, (64 - 1), %g2      ! %g2 = src offset within its 64-byte block
        add             %o1, %g1, %o1           ! advance src past the block region now
        sub             %o0, %o4, %g3           ! %g3 = dst - aligned_src (store bias)
        brz,pt          %g2, 190f               ! src 64-byte aligned: simplest loop
         cmp            %g2, 32
        /* Binary dispatch on %g2 (1..63) to the loop that preloads the
         * right number of leading doublewords: 110 handles [1,7],
         * 120 [8,15], 130 [16,23], ... 180 [56,63].
         */
        blu,a           5f
         cmp            %g2, 16
        cmp             %g2, 48
        blu,a           4f
         cmp            %g2, 40
        cmp             %g2, 56
        blu             170f
         nop
        ba,a,pt         %xcc, 180f
         nop

4:      /* 32 <= low bits < 48 */
        blu             150f
         nop
        ba,a,pt         %xcc, 160f
         nop
5:      /* 0 < low bits < 32 */
        blu,a           6f
         cmp            %g2, 8
        cmp             %g2, 24
        blu             130f
         nop
        ba,a,pt         %xcc, 140f
         nop
6:      /* 0 < low bits < 16 */
        bgeu            120f
         nop
        /* fall through for 0 < low bits < 8 */
        /* All loops below share the same per-iteration scheme:
         *   STORE_INIT   prime the dst line through STORE_ASI
         *   LOAD_BLK     next 64 src bytes -> %f16-%f30
         *   FREG_FROB    faligndata the 72-byte window -> %f0-%f14
         *   STORE_BLK    write 64 realigned bytes to dst
         *   FREG_MOVE_N  slide the window tail down for the next pass
         */
110:    sub             %o4, 64, %g2    ! low bits 1-7: whole previous block needed
        EX_LD_FP(LOAD_BLK(%g2, %f0), NG2_retl_o2_plus_g1)
1:      EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
        EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
        FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
        EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
        FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
        subcc           %g1, 64, %g1
        add             %o4, 64, %o4
        bne,pt          %xcc, 1b
         LOAD(prefetch, %o4 + 64, #one_read)
        ba,pt           %xcc, 195f
         nop

120:    sub             %o4, 56, %g2    ! low bits 8-15: preload 7 doubles
        FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
1:      EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
        EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
        FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
        EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
        FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
        subcc           %g1, 64, %g1
        add             %o4, 64, %o4
        bne,pt          %xcc, 1b
         LOAD(prefetch, %o4 + 64, #one_read)
        ba,pt           %xcc, 195f
         nop

130:    sub             %o4, 48, %g2    ! low bits 16-23: preload 6 doubles
        FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
1:      EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
        EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
        FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
        EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
        FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
        subcc           %g1, 64, %g1
        add             %o4, 64, %o4
        bne,pt          %xcc, 1b
         LOAD(prefetch, %o4 + 64, #one_read)
        ba,pt           %xcc, 195f
         nop

140:    sub             %o4, 40, %g2    ! low bits 24-31: preload 5 doubles
        FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
1:      EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
        EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
        FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
        EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
        FREG_MOVE_5(f22, f24, f26, f28, f30)
        subcc           %g1, 64, %g1
        add             %o4, 64, %o4
        bne,pt          %xcc, 1b
         LOAD(prefetch, %o4 + 64, #one_read)
        ba,pt           %xcc, 195f
         nop

150:    sub             %o4, 32, %g2    ! low bits 32-39: preload 4 doubles
        FREG_LOAD_4(%g2, f0, f2, f4, f6)
1:      EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
        EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
        FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
        EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
        FREG_MOVE_4(f24, f26, f28, f30)
        subcc           %g1, 64, %g1
        add             %o4, 64, %o4
        bne,pt          %xcc, 1b
         LOAD(prefetch, %o4 + 64, #one_read)
        ba,pt           %xcc, 195f
         nop

160:    sub             %o4, 24, %g2    ! low bits 40-47: preload 3 doubles
        FREG_LOAD_3(%g2, f0, f2, f4)
1:      EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
        EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
        FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
        EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
        FREG_MOVE_3(f26, f28, f30)
        subcc           %g1, 64, %g1
        add             %o4, 64, %o4
        bne,pt          %xcc, 1b
         LOAD(prefetch, %o4 + 64, #one_read)
        ba,pt           %xcc, 195f
         nop

170:    sub             %o4, 16, %g2    ! low bits 48-55: preload 2 doubles
        FREG_LOAD_2(%g2, f0, f2)
1:      EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
        EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
        FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
        EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
        FREG_MOVE_2(f28, f30)
        subcc           %g1, 64, %g1
        add             %o4, 64, %o4
        bne,pt          %xcc, 1b
         LOAD(prefetch, %o4 + 64, #one_read)
        ba,pt           %xcc, 195f
         nop

180:    sub             %o4, 8, %g2     ! low bits 56-63: preload 1 double
        FREG_LOAD_1(%g2, f0)
1:      EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
        EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
        FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
        EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
        FREG_MOVE_1(f30)
        subcc           %g1, 64, %g1
        add             %o4, 64, %o4
        bne,pt          %xcc, 1b
         LOAD(prefetch, %o4 + 64, #one_read)
        ba,pt           %xcc, 195f
         nop

190:    /* src 64-byte aligned: pure block copy, no realignment needed */
1:      EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
        subcc           %g1, 64, %g1
        EX_LD_FP(LOAD_BLK(%o4, %f0), NG2_retl_o2_plus_g1_plus_64)
        EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1_plus_64)
        add             %o4, 64, %o4
        bne,pt          %xcc, 1b
         LOAD(prefetch, %o4 + 64, #one_read)

195:
        add             %o4, %g3, %o0   ! recompute dst = aligned_src + bias
        membar          #Sync

        VISExitHalf

        /* %o2 contains any final bytes still needed to be copied
         * over. If anything is left, we copy it one byte at a time.
         */
        brz,pt          %o2, 85f
         sub            %o0, %o1, GLOBAL_SPARE
        ba,a,pt         %XCC, 90f
         nop

        .align          64
75: /* 16 <= len < 256; condition codes hold the dst|src|len 8-byte
     * alignment test.  NOTE: numeric label 75 is deliberately reused:
     * the 75f just below binds to the second definition (unaligned
     * path) further down, while the entry branch binds here.
     */
        bne,pn          %XCC, 75f
         sub            %o0, %o1, GLOBAL_SPARE ! GLOBAL_SPARE = dst - src

72:     /* Everything 8-byte aligned: 16 bytes per iteration.  Stores
         * address dst as src + (dst - src) so only %o1 is advanced.
         */
        andn            %o2, 0xf, %o4   ! %o4 = bytes moved by this loop
        and             %o2, 0xf, %o2   ! %o2 = trailing 0-15 bytes
1:      subcc           %o4, 0x10, %o4
        EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_o4_plus_16)
        add             %o1, 0x08, %o1
        EX_LD(LOAD(ldx, %o1, %g1), NG2_retl_o2_plus_o4_plus_16)
        sub             %o1, 0x08, %o1
        EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_16)
        add             %o1, 0x8, %o1
        EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_8)
        bgu,pt          %XCC, 1b
         add            %o1, 0x8, %o1
73:     andcc           %o2, 0x8, %g0   ! mop up 8-byte piece of the tail
        be,pt           %XCC, 1f
         nop
        sub             %o2, 0x8, %o2
        EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_8)
        EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_8)
        add             %o1, 0x8, %o1
1:      andcc           %o2, 0x4, %g0   ! mop up 4-byte piece
        be,pt           %XCC, 1f
         nop
        sub             %o2, 0x4, %o2
        EX_LD(LOAD(lduw, %o1, %o5), NG2_retl_o2_plus_4)
        EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
        add             %o1, 0x4, %o1
1:      cmp             %o2, 0
        be,pt           %XCC, 85f
         nop
        ba,pt           %xcc, 90f       ! final 1-3 bytes, one at a time
         nop

75:     /* Not everything is 8-byte aligned.  First byte-copy dst up to
         * an 8-byte boundary (%g1 = 8 - (dst & 7) bytes), then re-test
         * the source alignment.
         */
        andcc           %o0, 0x7, %g1
        sub             %g1, 0x8, %g1
        be,pn           %icc, 2f
         sub            %g0, %g1, %g1
        sub             %o2, %g1, %o2

1:      subcc           %g1, 1, %g1
        EX_LD(LOAD(ldub, %o1, %o5), NG2_retl_o2_plus_g1_plus_1)
        EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_g1_plus_1)
        bgu,pt          %icc, 1b
         add            %o1, 1, %o1

2:      add             %o1, GLOBAL_SPARE, %o0  ! resync dst = src + (dst - src)
        andcc           %o1, 0x7, %g1
        bne,pt          %icc, 8f        ! src still unaligned: shifting copy
         sll            %g1, 3, %g1     ! src misalignment in bits

        cmp             %o2, 16
        bgeu,pt         %icc, 72b       ! both aligned now: reuse bulk loop
         nop
        ba,a,pt         %xcc, 73b       ! < 16 bytes left: straight to tail

8:      /* dst 8-byte aligned, src not: read aligned doublewords around
         * src and merge with sllx/srlx.  %g1 = shift (bits),
         * GLOBAL_SPARE = 64 - %g1.
         */
        mov             64, GLOBAL_SPARE
        andn            %o1, 0x7, %o1   ! round src down to 8 bytes
        EX_LD(LOAD(ldx, %o1, %g2), NG2_retl_o2)
        sub             GLOBAL_SPARE, %g1, GLOBAL_SPARE
        andn            %o2, 0x7, %o4   ! %o4 = bytes handled by this loop
        sllx            %g2, %g1, %g2   ! high part of the first output word
1:      add             %o1, 0x8, %o1
        EX_LD(LOAD(ldx, %o1, %g3), NG2_retl_o2_and_7_plus_o4)
        subcc           %o4, 0x8, %o4
        srlx            %g3, GLOBAL_SPARE, %o5
        or              %o5, %g2, %o5   ! merge low part with carried high part
        EX_ST(STORE(stx, %o5, %o0), NG2_retl_o2_and_7_plus_o4_plus_8)
        add             %o0, 0x8, %o0
        bgu,pt          %icc, 1b
         sllx           %g3, %g1, %g2   ! becomes the high part next time

        srl             %g1, 3, %g1     ! shift count back to bytes
        andcc           %o2, 0x7, %o2
        be,pn           %icc, 85f
         add            %o1, %g1, %o1   ! restore true (unaligned) src
        ba,pt           %xcc, 90f       ! byte-copy the last 1-7 bytes
         sub            %o0, %o1, GLOBAL_SPARE

        .align          64
80: /* 0 < len < 16; GLOBAL_SPARE = dst | src | len from the entry path */
        andcc           GLOBAL_SPARE, 0x3, %g0
        bne,pn          %XCC, 90f       ! not all 4-byte aligned: byte loop
         sub            %o0, %o1, GLOBAL_SPARE

1:
        subcc           %o2, 4, %o2
        EX_LD(LOAD(lduw, %o1, %g1), NG2_retl_o2_plus_4)
        EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
        bgu,pt          %XCC, 1b
         add            %o1, 4, %o1

85:     retl
         mov            EX_RETVAL(%o3), %o0    ! return original dst

        .align          32
90:     /* Byte-at-a-time fallback; GLOBAL_SPARE = dst - src. */
        subcc           %o2, 1, %o2
        EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_1)
        EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_1)
        bgu,pt          %XCC, 90b
         add            %o1, 1, %o1
        retl
         mov            EX_RETVAL(%o3), %o0

        .size           FUNC_NAME, .-FUNC_NAME
 595