/* SPDX-License-Identifier: GPL-2.0 */
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE    %g7
#define RESTORE_ASI(TMP)        \
        ldub    [%g6 + TI_CURRENT_DS], TMP;  \
        wr      TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE    %g5
#define RESTORE_ASI(TMP)        \
        wr      %g0, ASI_PNF, %asi
#endif

#ifdef __sparc_v9__
#define SAVE_AMOUNT     128
#else
#define SAVE_AMOUNT     64
#endif

#ifndef STORE_ASI
#define STORE_ASI       ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x,y)      x
#endif

#ifndef EX_ST
#define EX_ST(x,y)      x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)    type [addr], dest
#else
#define LOAD(type,addr,dest)    type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1) \
        ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)    type src, [addr]
#endif

#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)    stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)    stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME       NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

        .register       %g2,#scratch
        .register       %g3,#scratch

        .text
#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
__restore_asi:
        ret
        wr      %g0, ASI_AIUS, %asi
         restore
ENTRY(NG_ret_i2_plus_i4_plus_1)
        ba,pt   %xcc, __restore_asi
         add    %i2, %i5, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
ENTRY(NG_ret_i2_plus_g1)
        ba,pt   %xcc, __restore_asi
         add    %i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)
ENTRY(NG_ret_i2_plus_g1_minus_8)
        sub     %g1, 8, %g1
        ba,pt   %xcc, __restore_asi
         add    %i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)
ENTRY(NG_ret_i2_plus_g1_minus_16)
        sub     %g1, 16, %g1
        ba,pt   %xcc, __restore_asi
         add    %i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)
ENTRY(NG_ret_i2_plus_g1_minus_24)
        sub     %g1, 24, %g1
        ba,pt   %xcc, __restore_asi
         add    %i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)
ENTRY(NG_ret_i2_plus_g1_minus_32)
        sub     %g1, 32, %g1
        ba,pt   %xcc, __restore_asi
         add    %i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)
ENTRY(NG_ret_i2_plus_g1_minus_40)
        sub     %g1, 40, %g1
        ba,pt   %xcc, __restore_asi
         add    %i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)
ENTRY(NG_ret_i2_plus_g1_minus_48)
        sub     %g1, 48, %g1
        ba,pt   %xcc, __restore_asi
         add    %i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)
ENTRY(NG_ret_i2_plus_g1_minus_56)
        sub     %g1, 56, %g1
        ba,pt   %xcc, __restore_asi
         add    %i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)
ENTRY(NG_ret_i2_plus_i4)
        ba,pt   %xcc, __restore_asi
         add    %i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4)
ENTRY(NG_ret_i2_plus_i4_minus_8)
        sub     %i4, 8, %i4
        ba,pt   %xcc, __restore_asi
         add    %i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_minus_8)
ENTRY(NG_ret_i2_plus_8)
        ba,pt   %xcc, __restore_asi
         add    %i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)
ENTRY(NG_ret_i2_plus_4)
        ba,pt   %xcc, __restore_asi
         add    %i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)
ENTRY(NG_ret_i2_plus_1)
        ba,pt   %xcc, __restore_asi
         add    %i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)
ENTRY(NG_ret_i2_plus_g1_plus_1)
        add     %g1, 1, %g1
        ba,pt   %xcc, __restore_asi
         add    %i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)
ENTRY(NG_ret_i2)
        ba,pt   %xcc, __restore_asi
         mov    %i2, %i0
ENDPROC(NG_ret_i2)
ENTRY(NG_ret_i2_and_7_plus_i4)
        and     %i2, 7, %i2
        ba,pt   %xcc, __restore_asi
         add    %i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
#endif

        .align          64

        .globl  FUNC_NAME
        .type   FUNC_NAME,#function
FUNC_NAME:      /* %i0=dst, %i1=src, %i2=len */
        PREAMBLE
        save            %sp, -SAVE_AMOUNT, %sp
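        /* Sanity check the length: trap if any of bits 63..31 of
         * %i2 are set, i.e. the length is 2GB or more (for example
         * a negative 32-bit value that was sign-extended).
         */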
        srlx            %i2, 31, %g2
        cmp             %g2, 0
        tne             %xcc, 5
        mov             %i0, %o0
        cmp             %i2, 0
        be,pn           %XCC, 85f
         or             %o0, %i1, %i3
        cmp             %i2, 16
        blu,a,pn        %XCC, 80f
         or             %i3, %i2, %i3

        /* 2 blocks (128 bytes) is the minimum we can do the block
         * copy with.  We need to ensure that we'll iterate at least
         * once in the block copy loop.  At worst we'll need to align
         * the destination to a 64-byte boundary which can chew up
         * to (64 - 1) bytes from the length before we perform the
         * block copy loop.
         */
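        /* Worked check: with len >= 128 and alignment consuming at
         * most 63 bytes, at least 128 - 63 = 65 bytes remain, so the
         * 64-byte block copy loop below runs at least once.
         */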
        cmp             %i2, (2 * 64)
        blu,pt          %XCC, 70f
         andcc          %i3, 0x7, %g0

        /* %o0: dst
         * %i1: src
         * %i2: len  (known to be >= 128)
         *
         * The block copy loops will use %i4/%i5,%g2/%g3 as
         * temporaries while copying the data.
         */

        LOAD(prefetch, %i1, #one_read)
        wr              %g0, STORE_ASI, %asi

        /* Align destination on 64-byte boundary.  */
        andcc           %o0, (64 - 1), %i4
        be,pt           %XCC, 2f
         sub            %i4, 64, %i4
        sub             %g0, %i4, %i4   ! bytes to align dst
        sub             %i2, %i4, %i2
1:      subcc           %i4, 1, %i4
        EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
        EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
        add             %i1, 1, %i1
        bne,pt          %XCC, 1b
         add            %o0, 1, %o0
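        /* E.g. a dst that starts 40 bytes into its cache line is
         * aligned by the loop above copying %i4 = 64 - 40 = 24
         * bytes.
         */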

        /* If the source is on a 16-byte boundary we can do
         * the direct block copy loop.  If it is 8-byte aligned
         * we can do the 16-byte loads offset by -8 bytes and the
         * init stores offset by one register.
         *
         * If the source is not even 8-byte aligned, we need to do
         * shifting and masking (basically integer faligndata).
         *
         * The careful bit with init stores is that if we store
         * to any part of the cache line we have to store the whole
         * cacheline else we can end up with corrupt L2 cache line
         * contents.  Since the loop works on 64-bytes of 64-byte
         * aligned store data at a time, this is easy to ensure.
         */
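        /* Dispatch: branch to 50f if src is 16-byte aligned, to 10f
         * if it is 8 bytes off, and otherwise fall through to the
         * shift-and-mask loops (8: for a misalignment below 8 bytes,
         * 9: for one above).
         */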
2:
        andcc           %i1, (16 - 1), %i4
        andn            %i2, (64 - 1), %g1      ! block copy loop iterator
        be,pt           %XCC, 50f
         sub            %i2, %g1, %i2           ! final sub-block copy bytes

        cmp             %i4, 8
        be,pt           %XCC, 10f
         sub            %i1, %i4, %i1

        /* Neither 8-byte nor 16-byte aligned, shift and mask.  */
        and             %i4, 0x7, GLOBAL_SPARE
        sll             GLOBAL_SPARE, 3, GLOBAL_SPARE
        mov             64, %i5
        EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
        sub             %i5, GLOBAL_SPARE, %i5
        mov             16, %o4
        mov             32, %o5
        mov             48, %o7
        mov             64, %i3

        bg,pn           %XCC, 9f
         nop

#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
        sllx            WORD1, POST_SHIFT, WORD1; \
        srlx            WORD2, PRE_SHIFT, TMP; \
        sllx            WORD2, POST_SHIFT, WORD2; \
        or              WORD1, TMP, WORD1; \
        srlx            WORD3, PRE_SHIFT, TMP; \
        or              WORD2, TMP, WORD2;
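
/* MIX_THREE_WORDS is the integer faligndata mentioned above: with
 * the source misaligned by off = (src & 7) bytes, POST_SHIFT is
 * off * 8 and PRE_SHIFT is 64 - off * 8, so roughly, in C terms:
 *
 *      WORD1 = (WORD1 << POST_SHIFT) | (WORD2 >> PRE_SHIFT);
 *      WORD2 = (WORD2 << POST_SHIFT) | (WORD3 >> PRE_SHIFT);
 *
 * producing two aligned destination doublewords from three aligned
 * source doublewords.
 */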

8:      EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
        MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
        LOAD(prefetch, %i1 + %i3, #one_read)

        EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
        EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

        EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
        MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
        EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

        EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
        MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
        EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

        EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
        add             %i1, 64, %i1
        MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
        EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

        subcc           %g1, 64, %g1
        bne,pt          %XCC, 8b
         add            %o0, 64, %o0

        ba,pt           %XCC, 60f
         add            %i1, %i4, %i1

9:      EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
        MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
        LOAD(prefetch, %i1 + %i3, #one_read)

        EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
        EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

        EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
        MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
        EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

        EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
        MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
        EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

        EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
        add             %i1, 64, %i1
        MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

        EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
        EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

        subcc           %g1, 64, %g1
        bne,pt          %XCC, 9b
         add            %o0, 64, %o0

        ba,pt           %XCC, 60f
         add            %i1, %i4, %i1

10:     /* Destination is 64-byte aligned, source was only 8-byte
         * aligned but it has been subtracted by 8 and we perform
         * one twin load ahead, then add 8 back into source when
         * we finish the loop.
         */
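        /* E.g. with src % 16 == 8, %i1 now points 8 bytes before
         * the data: the twin load below fetches one doubleword that
         * precedes the source (discarded) plus the first real
         * doubleword in %o5, which is why the stores in the loop
         * run one register out of phase with the loads.
         */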
        EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
        mov     16, %o7
        mov     32, %g2
        mov     48, %g3
        mov     64, %o1
1:      EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
        LOAD(prefetch, %i1 + %o1, #one_read)
        EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)   ! initializes cache line
        EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
        EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
        EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
        EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
        EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
        EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
        EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
        EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
        add             %i1, 64, %i1
        EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
        EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
        subcc           %g1, 64, %g1
        bne,pt          %XCC, 1b
         add            %o0, 64, %o0

        ba,pt           %XCC, 60f
         add            %i1, 0x8, %i1

50:     /* Destination is 64-byte aligned, and source is 16-byte
         * aligned.
         */
        mov     16, %o7
        mov     32, %g2
        mov     48, %g3
        mov     64, %o1
1:      EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
        EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
        LOAD(prefetch, %i1 + %o1, #one_read)
        EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)   ! initializes cache line
        EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
        EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
        EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
        EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
        EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
        add     %i1, 64, %i1
        EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
        EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
        EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
        EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
        subcc   %g1, 64, %g1
        bne,pt  %XCC, 1b
         add    %o0, 64, %o0
        /* fall through */

60:
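        /* Wait for all of the block-init stores above to complete
         * before the ASI is restored and normal stores resume.
         */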
        membar          #Sync

        /* %i2 contains any final bytes still needed to be copied
         * over. If anything is left, we copy it one byte at a time.
         */
        RESTORE_ASI(%i3)
        brz,pt          %i2, 85f
         sub            %o0, %i1, %i3
        ba,a,pt         %XCC, 90f
         nop

        .align          64
70: /* 16 <= len < 128 */
        bne,pn          %XCC, 75f
         sub            %o0, %i1, %i3

72:
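        /* dst and src are both 8-byte aligned: copy 16 bytes per
         * iteration with two 8-byte loads and stores; %i3 holds
         * dst - src, so stores go to %i1 + %i3.
         */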
        andn            %i2, 0xf, %i4
        and             %i2, 0xf, %i2
1:      subcc           %i4, 0x10, %i4
        EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
        add             %i1, 0x08, %i1
        EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
        sub             %i1, 0x08, %i1
        EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
        add             %i1, 0x8, %i1
        EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
        bgu,pt          %XCC, 1b
         add            %i1, 0x8, %i1
73:     andcc           %i2, 0x8, %g0
        be,pt           %XCC, 1f
         nop
        sub             %i2, 0x8, %i2
        EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
        EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
        add             %i1, 0x8, %i1
1:      andcc           %i2, 0x4, %g0
        be,pt           %XCC, 1f
         nop
        sub             %i2, 0x4, %i2
        EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
        EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
        add             %i1, 0x4, %i1
1:      cmp             %i2, 0
        be,pt           %XCC, 85f
         nop
        ba,pt           %xcc, 90f
         nop

75:
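        /* dst and/or src is not 8-byte aligned.  Copy single bytes
         * until dst is 8-byte aligned, then check whether src ended
         * up aligned as well.
         */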
        andcc           %o0, 0x7, %g1
        sub             %g1, 0x8, %g1
        be,pn           %icc, 2f
         sub            %g0, %g1, %g1
        sub             %i2, %g1, %i2

1:      subcc           %g1, 1, %g1
        EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
        EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
        bgu,pt          %icc, 1b
         add            %i1, 1, %i1

2:      add             %i1, %i3, %o0
        andcc           %i1, 0x7, %g1
        bne,pt          %icc, 8f
         sll            %g1, 3, %g1

        cmp             %i2, 16
        bgeu,pt         %icc, 72b
         nop
        ba,a,pt         %xcc, 73b

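        /* src is still not 8-byte aligned: load the aligned
         * doublewords enclosing each destination doubleword and
         * merge them with shifts (%g1 = misalignment in bits,
         * %i3 = 64 - %g1), an integer faligndata for the tail.
         */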
8:      mov             64, %i3
        andn            %i1, 0x7, %i1
        EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
        sub             %i3, %g1, %i3
        andn            %i2, 0x7, %i4
        sllx            %g2, %g1, %g2
1:      add             %i1, 0x8, %i1
        EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
        subcc           %i4, 0x8, %i4
        srlx            %g3, %i3, %i5
        or              %i5, %g2, %i5
        EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
        add             %o0, 0x8, %o0
        bgu,pt          %icc, 1b
         sllx           %g3, %g1, %g2

        srl             %g1, 3, %g1
        andcc           %i2, 0x7, %i2
        be,pn           %icc, 85f
         add            %i1, %g1, %i1
        ba,pt           %xcc, 90f
         sub            %o0, %i1, %i3

        .align          64
80: /* 0 < len < 16 */
        andcc           %i3, 0x3, %g0
        bne,pn          %XCC, 90f
         sub            %o0, %i1, %i3

1:
        subcc           %i2, 4, %i2
        EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
        EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
        bgu,pt          %XCC, 1b
         add            %i1, 4, %i1

85:     ret
         restore        EX_RETVAL(%i0), %g0, %o0

        .align          32
90:
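        /* Byte-at-a-time tail copy; %i3 holds dst - src on entry. */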
        subcc           %i2, 1, %i2
        EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
        EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
        bgu,pt          %XCC, 90b
         add            %i1, 1, %i1
        ret
         restore        EX_RETVAL(%i0), %g0, %o0

        .size           FUNC_NAME, .-FUNC_NAME