linux/arch/sparc/lib/M7memcpy.S
/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

        .file   "M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *      void *
 *      memcpy(void *s, const void *s0, size_t n)
 *      {
 *              if (n != 0) {
 *                  char *s1 = s;
 *                  const char *s2 = s0;
 *                  do {
 *                      *s1++ = *s2++;
 *                  } while (--n != 0);
 *              }
 *              return (s);
 *      }
 *
 *
 * SPARC T7/M7 Flow:
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY              (SHORTCOPY=3)
 *      copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if (src is on long word boundary) {
 *     if (count < MED_MAX) {
 * finish_long:                                    src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {                              src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */

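/*
 * Length-based dispatch, summarizing the entry-point checks below
 * (a reading aid derived from the code, not an authoritative spec):
 *
 *      len == 0               -> return dst immediately
 *      len <= 3               -> .Ltiny_cp   (byte-at-a-time copy)
 *      len <= 19              -> .Lsmall_cp  (word loop when src|dst is
 *                                             4-byte aligned)
 *      len <  SMALL_MAX (128) -> .Lmedium_cp (integer 32-byte loop)
 *      otherwise              -> .Lmedium    (aligned, unaligned, or
 *                                             block-init paths)
 */
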
#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)      x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)   x
#endif

#ifndef EX_ST
#define EX_ST(x,y)      x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)   x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)    type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)    type src, [addr]
#endif

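/*
 * Example of how these wrappers expand in the plain in-kernel memcpy
 * build, where EX_LD/EX_ST pass the access through unchanged:
 *
 *      EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
 *              ->  ldx [%o1], %o4
 *      EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2)
 *              ->  stx %o4, [%o0]
 *
 * The copy_{to,from}_user builds are expected to redefine EX_LD/EX_ST
 * so that a faulting access branches to the named fixup routine
 * (memcpy_retl_o2 and friends), which computes the count of bytes
 * left to copy.
 */
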
/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S mark the cache
 * line as "least recently used", which means that if many threads are
 * active, the line has a high probability of being pushed out of the
 * cache between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S, which mark
 * the cache line as "most recently used", for all but the last cache
 * line.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI       ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI       0x80            /* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI   ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI   0x80            /* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)    stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)        stxa src, [addr] STORE_MRU_ASI
#endif

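/*
 * Illustration of the defaults above (in-kernel build):
 *
 *      STORE_INIT_MRU(%o4, %o0) -> stxa %o4, [%o0] ASI_ST_BLKINIT_MRU_P
 *      STORE_INIT(%o4, %o0)     -> stxa %o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P
 *
 * A block-init store to the first longword of a 64-byte line allocates
 * the line in cache without fetching its old contents from memory,
 * which is why the block loop below stores the leading longword of
 * each line ahead of the rest.
 */
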
#ifndef FUNC_NAME
#define FUNC_NAME       M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define BLOCK_SIZE      64
#define SHORTCOPY       3
#define SHORTCHECK      14
#define SHORT_LONG      64      /* max copy for short longword-aligned case */
                                /* must be at least 64 */
#define SMALL_MAX       128
#define MED_UMAX        1024    /* max copy for medium un-aligned case */
#define MED_WMAX        1024    /* max copy for medium word-aligned case */
#define MED_MAX         1024    /* max copy for medium longword-aligned case */
#define ST_CHUNK        24      /* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE       24      /* distance for aligned prefetch loop */

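/*
 * Worked figures for the tuning constants above, for reference: one
 * BIS batch covers ST_CHUNK * BLOCK_SIZE = 24 * 64 = 1536 bytes, and
 * the aligned block loop prefetches ALIGN_PRE * BLOCK_SIZE = 1536
 * bytes ahead of the current source pointer.
 */
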
        .register       %g2,#scratch

        .section        ".text"
        .global         FUNC_NAME
        .type           FUNC_NAME, #function
        .align          16
FUNC_NAME:
        srlx            %o2, 31, %g2
        cmp             %g2, 0
        tne             %xcc, 5
        PREAMBLE
        mov             %o0, %g1        ! save %o0
        brz,pn          %o2, .Lsmallx
         cmp            %o2, 3
        ble,pn          %icc, .Ltiny_cp
         cmp            %o2, 19
        ble,pn          %icc, .Lsmall_cp
         or             %o0, %o1, %g2
        cmp             %o2, SMALL_MAX
        bl,pn           %icc, .Lmedium_cp
         nop

.Lmedium:
        neg     %o0, %o5
        andcc   %o5, 7, %o5             ! bytes till DST 8 byte aligned
        brz,pt  %o5, .Ldst_aligned_on_8

        ! %o5 has the bytes to be written in partial store.
         sub    %o2, %o5, %o2
        sub     %o1, %o0, %o1           ! %o1 gets the difference
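        ! With %o1 = src - dst, only %o0 needs to advance in the loop
        ! below; each load address is rebuilt as %o1 + %o0.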
7:                                      ! dst aligning loop
        add     %o1, %o0, %o4
        EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)     ! load one byte
        subcc   %o5, 1, %o5
        EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
        bgu,pt  %xcc, 7b
         add    %o0, 1, %o0             ! advance dst
        add     %o1, %o0, %o1           ! restore %o1
.Ldst_aligned_on_8:
        andcc   %o1, 7, %o5
        brnz,pt %o5, .Lsrc_dst_unaligned_on_8
         nop

.Lsrc_dst_aligned_on_8:
        ! check if we are copying MED_MAX or more bytes
        set MED_MAX, %o3
        cmp %o2, %o3                    ! limit to store buffer size
        bgu,pn  %xcc, .Llarge_align8_copy
         nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
        subcc   %o2, 63, %o2            ! adjust length to allow cc test
        ble,pn  %xcc, .Lmedl63          ! skip big loop if less than 64 bytes
         nop
.Lmedl64:
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)      ! load
        subcc   %o2, 64, %o2            ! decrement length count
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)  ! and store
        EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56) ! a block of 64
        EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
        EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
        EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
        EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
        EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
        EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
        EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
        EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
        add     %o1, 64, %o1            ! increase src ptr by 64
        EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
        EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
        add     %o0, 64, %o0            ! increase dst ptr by 64
        EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
        EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
        bgu,pt  %xcc, .Lmedl64          ! repeat if at least 64 bytes left
         EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
        addcc   %o2, 32, %o2            ! adjust remaining count
        ble,pt  %xcc, .Lmedl31          ! to skip if 31 or fewer bytes left
         nop
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)      ! load
        sub     %o2, 32, %o2            ! decrement length count
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)  ! and store
        EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24) ! a block of 32
        add     %o1, 32, %o1            ! increase src ptr by 32
        EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
        EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
        add     %o0, 32, %o0            ! increase dst ptr by 32
        EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
        EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
        EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
        addcc   %o2, 16, %o2            ! adjust remaining count
        ble,pt  %xcc, .Lmedl15          ! skip if 15 or fewer bytes left
         nop                            !
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
        add     %o1, 16, %o1            ! increase src ptr by 16
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
        sub     %o2, 16, %o2            ! decrease count by 16
        EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
        add     %o0, 16, %o0            ! increase dst ptr by 16
        EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
        addcc   %o2, 15, %o2            ! restore count
        bz,pt   %xcc, .Lsmallx          ! exit if finished
         cmp    %o2, 8
        blt,pt  %xcc, .Lmedw7           ! skip if 7 or fewer bytes left
         tst    %o2
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)      ! load 8 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        add     %o0, 8, %o0             ! increase dst ptr by 8
        subcc   %o2, 8, %o2             ! decrease count by 8
        bnz,pn  %xcc, .Lmedw7
         EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)   ! and store 8
        retl
         mov    EX_RETVAL(%g1), %o0     ! restore %o0

        .align 16
.Lsrc_dst_unaligned_on_8:
        ! DST is 8-byte aligned, src is not
2:
        andcc   %o1, 0x3, %o5           ! test word alignment
        bnz,pt  %xcc, .Lunalignsetup    ! branch to skip if not word aligned
         nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for .Lmedium
 * to short data moves.
 */
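/*
 * The loop below merges two 32-bit loads into one 64-bit store.  In C,
 * for this big-endian layout, each step is roughly (a sketch, not the
 * exact generated sequence):
 *
 *      u64 hi = (u64)((u32 *)src)[0] << 32;
 *      u64 lo = (u64)((u32 *)src)[1];
 *      *(u64 *)dst = hi | lo;
 */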
        set MED_WMAX, %o3
        cmp %o2, %o3                    ! limit to store buffer size
        bge,pt  %xcc, .Lunalignrejoin   ! otherwise rejoin main loop
         nop

        subcc   %o2, 31, %o2            ! adjust length to allow cc test
                                        ! for end of loop
        ble,pt  %xcc, .Lmedw31          ! skip big loop if 31 or fewer bytes
.Lmedw32:
        EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
        subcc   %o2, 32, %o2            ! decrement length count
        EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
        add     %o1, 32, %o1            ! increase src ptr by 32
        EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
        add     %o0, 32, %o0            ! increase dst ptr by 32
        EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
        or      %o4, %o5, %o5
        bgu,pt  %xcc, .Lmedw32          ! repeat if at least 32 bytes left
         EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
        addcc   %o2, 31, %o2            ! restore count

        bz,pt   %xcc, .Lsmallx          ! exit if finished
         nop
        cmp     %o2, 16
        blt,pt  %xcc, .Lmedw15
         nop
        EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
        sllx    %o4, 32, %o5
        subcc   %o2, 16, %o2            ! decrement length count
        EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
        add     %o1, 16, %o1            ! increase src ptr by 16
        EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
        add     %o0, 16, %o0            ! increase dst ptr by 16
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
        bz,pt   %xcc, .Lsmallx          ! exit if finished
         cmp    %o2, 8
        blt,pn  %xcc, .Lmedw7           ! skip if 7 or fewer bytes left
         tst    %o2
        EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)       ! load 4 bytes
        subcc   %o2, 8, %o2             ! decrease count by 8
        EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)      ! load 4 bytes
        add     %o0, 8, %o0             ! increase dst ptr by 8
        EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
        bz,pt   %xcc, .Lsmallx          ! exit if finished
.Lmedw7:                                ! count is ge 1, less than 8
        cmp     %o2, 4                  ! check for 4 bytes left
        blt,pn  %xcc, .Lsmallleft3      ! skip if 3 or fewer bytes left
         nop                            !
        EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)       ! load 4 bytes
        add     %o1, 4, %o1             ! increase src ptr by 4
        add     %o0, 4, %o0             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bnz     .Lsmallleft3
         EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
        retl
         mov    EX_RETVAL(%g1), %o0

        .align 16
.Llarge_align8_copy:                    ! Src and dst share 8 byte alignment
        ! align dst to 64 byte boundary
        andcc   %o0, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
        brz,pn  %o3, .Laligned_to_64
         andcc  %o0, 8, %o3             ! odd long words to move?
        brz,pt  %o3, .Laligned_to_16
         nop
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
        sub     %o2, 8, %o2
        add     %o1, 8, %o1             ! increment src ptr
        add     %o0, 8, %o0             ! increment dst ptr
        EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
        andcc   %o0, 16, %o3            ! pair of long words to move?
        brz,pt  %o3, .Laligned_to_32
         nop
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
        sub     %o2, 16, %o2
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
        add     %o1, 16, %o1            ! increment src ptr
        EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
        add     %o0, 16, %o0            ! increment dst ptr
        EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
        andcc   %o0, 32, %o3            ! four long words to move?
        brz,pt  %o3, .Laligned_to_64
         nop
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
        sub     %o2, 32, %o2
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
        EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
        EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
        EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
        EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
        add     %o1, 32, %o1            ! increment src ptr
        EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
        add     %o0, 32, %o0            ! increment dst ptr
        EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
!
!       Using block init store (BIS) instructions to avoid fetching cache
!       lines from memory. Use ST_CHUNK stores to first element of each cache
!       line (similar to prefetching) to avoid overfilling STQ or miss buffers.
!       Gives existing cache lines time to be moved out of L1/L2/L3 cache.
!       Initial stores using MRU version of BIS to keep cache line in
!       cache until we are ready to store final element of cache line.
!       Then store last element using the LRU version of BIS.
!
        andn    %o2, 0x3f, %o5          ! %o5 is multiple of block size
        and     %o2, 0x3f, %o2          ! residue bytes in %o2
!
!       We use STORE_MRU_ASI for the first seven stores to each cache line
!       followed by STORE_ASI (mark as LRU) for the last store. That
!       mixed approach reduces the probability that the cache line is removed
!       before we finish setting it, while minimizing the effects on
!       other cached values during a large memcpy.
!
!       ST_CHUNK batches up the initial BIS operations for several cache
!       lines so that multiple requests are not blocked by overflowing
!       the store miss buffer. Then the matching stores for all those
!       BIS operations are executed.
!

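!       Worked example of one batch with the defaults above: the first
!       inner loop issues ST_CHUNK (24) MRU block-init stores, one to the
!       leading longword of each of 24 consecutive 64-byte lines; the
!       second loop then revisits those 24 lines, filling longwords 2-7
!       with MRU stores and the last longword with a plain (LRU) BIS
!       store.
!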
        sub     %o0, 8, %o0             ! adjust %o0 for ASI alignment
.Lalign_loop:
        cmp     %o5, ST_CHUNK*64
        blu,pt  %xcc, .Lalign_loop_fin
         mov    ST_CHUNK,%o3
.Lalign_loop_start:
        prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
        subcc   %o3, 1, %o3
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
        add     %o1, 64, %o1
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        bgu     %xcc,.Lalign_loop_start
         add    %o0, 56, %o0

        mov     ST_CHUNK,%o3
        sllx    %o3, 6, %o4             ! ST_CHUNK*64
        sub     %o1, %o4, %o1           ! reset %o1
        sub     %o0, %o4, %o0           ! reset %o0

.Lalign_loop_rest:
        EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 16, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        subcc   %o3, 1, %o3
        EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
        add     %o1, 64, %o1
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
        sub     %o5, 64, %o5
        bgu     %xcc,.Lalign_loop_rest
        ! mark cache line as LRU
         EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

        cmp     %o5, ST_CHUNK*64
        bgu,pt  %xcc, .Lalign_loop_start
         mov    ST_CHUNK,%o3

        cmp     %o5, 0
        beq     .Lalign_done
         nop
.Lalign_loop_fin:
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
        EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
        EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
        EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
        subcc   %o5, 64, %o5
        EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
        EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
        EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
        EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
        EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
        EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
        EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
        add     %o1, 64, %o1
        EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
        add     %o0, 64, %o0
        EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
        bgu     %xcc,.Lalign_loop_fin
         EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
        add     %o0, 8, %o0             ! restore %o0 from ASI alignment
        membar  #StoreStore
        sub     %o2, 63, %o2            ! adjust length to allow cc test
        ba      .Lmedl63                ! in .Lmedl63
         nop

        .align 16
        ! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
        mov     %g1, %o3        ! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
        VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
        VISEntryHalf
#endif
        mov     %o3, %g1        ! restore %g1

        set MED_UMAX, %o3
        cmp %o2, %o3            ! check for .Lmedium unaligned limit
        bge,pt  %xcc, .Lunalign_large
         prefetch [%o1 + (4 * BLOCK_SIZE)], 20
        andn    %o2, 0x3f, %o5          ! %o5 is multiple of block size
        and     %o2, 0x3f, %o2          ! residue bytes in %o2
        cmp     %o2, 8                  ! Ensure we do not load beyond
        bgt     .Lunalign_adjust        ! end of source buffer
         andn   %o1, 0x7, %o4           ! %o4 has long word aligned src address
        add     %o2, 64, %o2            ! adjust to leave loop
        sub     %o5, 64, %o5            ! early if necessary
.Lunalign_adjust:
        alignaddr %o1, %g0, %g0         ! generate %gsr
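        ! alignaddr with a zero offset latches the low three bits of %o1
        ! into %gsr.align; each faligndata below then extracts 8 bytes
        ! from the concatenation of its two source registers at that
        ! byte offset, re-aligning the stream for 8-byte stores.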
        add     %o1, %o5, %o1           ! advance %o1 to after blocks
        EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
        EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
        faligndata %f0, %f2, %f16
        EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
        subcc   %o5, BLOCK_SIZE, %o5
        EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
        faligndata %f2, %f4, %f18
        EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
        EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
        faligndata %f4, %f6, %f20
        EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
        EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
        faligndata %f6, %f8, %f22
        EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
        EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
        faligndata %f8, %f10, %f24
        EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
        EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
        faligndata %f10, %f12, %f26
        EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
        add     %o4, BLOCK_SIZE, %o4
        EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
        faligndata %f12, %f14, %f28
        EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
        EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
        faligndata %f14, %f0, %f30
        EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
        add     %o0, BLOCK_SIZE, %o0
        bgu,pt  %xcc, .Lunalign_loop
         prefetch [%o4 + (5 * BLOCK_SIZE)], 20
        ba      .Lunalign_done
         nop

.Lunalign_large:
        andcc   %o0, 0x3f, %o3          ! is dst 64-byte block aligned?
        bz      %xcc, .Lunalignsrc
         sub    %o3, 64, %o3            ! %o3 will be multiple of 8
        neg     %o3                     ! bytes until dest is 64 byte aligned
        sub     %o2, %o3, %o2           ! update cnt with bytes to be moved
        ! Move bytes according to source alignment
        andcc   %o1, 0x1, %o5
        bnz     %xcc, .Lunalignbyte     ! check for byte alignment
         nop
        andcc   %o1, 2, %o5             ! check for half word alignment
        bnz     %xcc, .Lunalignhalf
         nop
        ! Src is word aligned
.Lunalignword:
        EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)    ! load 4 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)  ! and store 4
        subcc   %o3, 8, %o3             ! decrease count by 8
        EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
        add     %o0, 8, %o0             ! increase dst ptr by 8
        bnz     %xcc, .Lunalignword
         EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
        ba      .Lunalignsrc
         nop

        ! Src is half-word aligned
.Lunalignhalf:
        EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)  ! load 2 bytes
        sllx    %o4, 32, %o5            ! shift left
        EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
        or      %o4, %o5, %o5
        sllx    %o5, 16, %o5
        EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
        or      %o4, %o5, %o5
        EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
        add     %o1, 8, %o1
        subcc   %o3, 8, %o3
        bnz     %xcc, .Lunalignhalf
         add    %o0, 8, %o0
        ba      .Lunalignsrc
         nop

        ! Src is byte aligned
.Lunalignbyte:
        sub     %o0, %o1, %o0           ! share pointer advance
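        ! Build each 64-bit word from the byte-aligned source as
        !   (b0 << 56) | (h1 << 40) | (h2 << 24) | (h3 << 8) | b7
        ! one leading byte, three halfwords, one trailing byte, so that
        ! every load is naturally aligned for its size.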
.Lunalignbyte_loop:
        EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
        sllx    %o4, 56, %o5
        EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
        sllx    %o4, 40, %o4
        or      %o4, %o5, %o5
        EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
        sllx    %o4, 24, %o4
        or      %o4, %o5, %o5
        EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
        sllx    %o4,  8, %o4
        or      %o4, %o5, %o5
        EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
        or      %o4, %o5, %o5
        add     %o0, %o1, %o0
        EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
        sub     %o0, %o1, %o0
        subcc   %o3, 8, %o3
        bnz     %xcc, .Lunalignbyte_loop
         add    %o1, 8, %o1
        add     %o0, %o1, %o0           ! restore pointer

        ! Destination is now block (64 byte) aligned
.Lunalignsrc:
        andn    %o2, 0x3f, %o5          ! %o5 is multiple of block size
        and     %o2, 0x3f, %o2          ! residue bytes in %o2
        add     %o2, 64, %o2            ! Ensure we do not load beyond
        sub     %o5, 64, %o5            ! end of source buffer

        andn    %o1, 0x7, %o4           ! %o4 has long word aligned src address
        alignaddr %o1, %g0, %g0         ! generate %gsr
        add     %o1, %o5, %o1           ! advance %o1 to after blocks

        EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
        add     %o4, 8, %o4
.Lunalign_sloop:
        EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
        faligndata %f14, %f16, %f0
        EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
        faligndata %f16, %f18, %f2
        EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
        faligndata %f18, %f20, %f4
        EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
        subcc   %o5, 64, %o5
        EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
        faligndata %f20, %f22, %f6
        EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
        EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
        faligndata %f22, %f24, %f8
        EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
        EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
        faligndata %f24, %f26, %f10
        EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
        EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
        faligndata %f26, %f28, %f12
        EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
        add     %o4, 64, %o4
        EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
        faligndata %f28, %f30, %f14
        EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
        EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
        add     %o0, 64, %o0
        EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
        fsrc2   %f30, %f14
        bgu,pt  %xcc, .Lunalign_sloop
         prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
        ! Handle trailing bytes, 64 to 127
        ! Dest long word aligned, Src not long word aligned
        cmp     %o2, 15
        bleu    %xcc, .Lunalign_short

         andn   %o2, 0x7, %o5           ! %o5 is multiple of 8
        and     %o2, 0x7, %o2           ! residue bytes in %o2
        add     %o2, 8, %o2
        sub     %o5, 8, %o5             ! ensure we do not load past end of src
        andn    %o1, 0x7, %o4           ! %o4 has long word aligned src address
        add     %o1, %o5, %o1           ! advance %o1 to after multiple of 8
        EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
.Lunalign_by8:
        EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
        add     %o4, 8, %o4
        faligndata %f0, %f2, %f16
        subcc   %o5, 8, %o5
        EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
        fsrc2   %f2, %f0
        bgu,pt  %xcc, .Lunalign_by8
         add    %o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
        VISExitHalfFast
#else
        VISExitHalf
#endif
        ba      .Lsmallrest
         nop

/*
 * This is a special case of nested memcpy. It can happen when the
 * kernel calls unaligned memcpy back to back without saving FP
 * registers. We need traps (context switches) to save/restore FP
 * registers. If the kernel calls memcpy without this trap sequence we
 * will hit FP corruption. Let's use the normal integer load/store
 * method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
        or      %o0, %o1, %g2
#endif
.Lmedium_cp:
        LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
        andcc   %g2, 0x7, %g0
        bne,pn  %xcc, .Lmedium_unaligned_cp
         nop

.Lmedium_noprefetch_cp:
        andncc  %o2, 0x20 - 1, %o5
        be,pn   %xcc, 2f
         sub    %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
        add     %o1, 0x20, %o1
        subcc   %o5, 0x20, %o5
        EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
        EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
        EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
        EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
        bne,pt  %xcc, 1b
         add    %o0, 0x20, %o0
2:      andcc   %o2, 0x18, %o5
        be,pt   %xcc, 3f
         sub    %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
        add     %o1, 0x08, %o1
        add     %o0, 0x08, %o0
        subcc   %o5, 0x08, %o5
        bne,pt  %xcc, 1b
         EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:      brz,pt  %o2, .Lexit_cp
         cmp    %o2, 0x04
        bl,pn   %xcc, .Ltiny_cp
         nop
        EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
        add     %o1, 0x04, %o1
        add     %o0, 0x04, %o0
        subcc   %o2, 0x04, %o2
        bne,pn  %xcc, .Ltiny_cp
         EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
        ba,a,pt %xcc, .Lexit_cp

.Lmedium_unaligned_cp:
        /* First get dest 8 byte aligned.  */
        sub     %g0, %o0, %o3
        and     %o3, 0x7, %o3
        brz,pt  %o3, 2f
         sub    %o2, %o3, %o2

1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
        add     %o1, 1, %o1
        subcc   %o3, 1, %o3
        add     %o0, 1, %o0
        bne,pt  %xcc, 1b
         EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
        and     %o1, 0x7, %o3
        brz,pn  %o3, .Lmedium_noprefetch_cp
         sll    %o3, 3, %o3
        mov     64, %g2
        sub     %g2, %o3, %g2
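        ! Classic two-shift merge for a misaligned source: %o3 is the
        ! source misalignment in bits (left-shift count) and %g2 is
        ! 64 - %o3 (right-shift count), so each stored doubleword is
        !   (prev << %o3) | (next >> %g2)
        ! built from two adjacent aligned 8-byte loads.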
        andn    %o1, 0x7, %o1
        EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
        sllx    %o4, %o3, %o4
        andn    %o2, 0x08 - 1, %o5
        sub     %o2, %o5, %o2

1:      EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
        add     %o1, 0x08, %o1
        subcc   %o5, 0x08, %o5
        srlx    %g3, %g2, %g7
        or      %g7, %o4, %g7
        EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
        add     %o0, 0x08, %o0
        bne,pt  %xcc, 1b
         sllx   %g3, %o3, %o4
        srl     %o3, 3, %o3
        add     %o1, %o3, %o1
        brz,pn  %o2, .Lexit_cp
         nop
        ba,pt   %xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
        EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
        subcc   %o2, 1, %o2
        be,pn   %xcc, .Lexit_cp
         EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
        EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
        subcc   %o2, 1, %o2
        be,pn   %xcc, .Lexit_cp
         EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
        EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
        ba,pt   %xcc, .Lexit_cp
         EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
        andcc   %g2, 0x3, %g0
        bne,pn  %xcc, .Lsmall_unaligned_cp
         andn   %o2, 0x4 - 1, %o5
        sub     %o2, %o5, %o2
1:
        EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
        add     %o1, 0x04, %o1
        subcc   %o5, 0x04, %o5
        add     %o0, 0x04, %o0
        bne,pt  %xcc, 1b
         EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
        brz,pt  %o2, .Lexit_cp
         nop
        ba,a,pt %xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:      EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
        add     %o1, 1, %o1
        add     %o0, 1, %o0
        subcc   %o2, 1, %o2
        bne,pt  %xcc, 1b
         EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
        ba,a,pt %xcc, .Lexit_cp

.Lsmallrest:
        tst     %o2
        bz,pt   %xcc, .Lsmallx
         cmp    %o2, 4
        blt,pn  %xcc, .Lsmallleft3
         nop
        sub     %o2, 3, %o2
.Lsmallnotalign4:
        EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
        EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
        add     %o1, 4, %o1             ! advance SRC by 4
        EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
        EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
        add     %o0, 4, %o0             ! advance DST by 4
        EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
        EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
        bgu,pt  %xcc, .Lsmallnotalign4  ! loop until 3 or fewer bytes remain
        EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
        addcc   %o2, 3, %o2             ! restore count
        bz,pt   %xcc, .Lsmallx
.Lsmallleft3:                           ! 1, 2, or 3 bytes remain
        subcc   %o2, 1, %o2
        EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)      ! load one byte
        bz,pt   %xcc, .Lsmallx
        EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)      ! store one byte
        EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)   ! load second byte
        subcc   %o2, 1, %o2
        bz,pt   %xcc, .Lsmallx
        EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
        EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)   ! load third byte
        EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)   ! store third byte
.Lsmallx:
        retl
         mov    EX_RETVAL(%g1), %o0
.Lsmallfin:
        tst     %o2
        bnz,pn  %xcc, .Lsmallleft3
         nop
        retl
         mov    EX_RETVAL(%g1), %o0     ! restore %o0
.Lexit_cp:
        retl
         mov    EX_RETVAL(%g1), %o0
        .size  FUNC_NAME, .-FUNC_NAME