linux/arch/xtensa/lib/memcopy.S
/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>

        .macro  src_b   r, w0, w1
#ifdef __XTENSA_EB__
        src     \r, \w0, \w1
#else
        src     \r, \w1, \w0
#endif
        .endm

        .macro  ssa8    r
#ifdef __XTENSA_EB__
        ssa8b   \r
#else
        ssa8l   \r
#endif
        .endm
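
/*
 * Note on the macros above: ssa8/src_b form an endian-neutral funnel
 * shift for unaligned data.  SSA8L sets SAR = 8 * (addr & 3) and SSA8B
 * sets SAR = 32 - 8 * (addr & 3); SRC then shifts the 64-bit
 * concatenation of its two register operands right by SAR, extracting
 * one unaligned 32-bit word from two adjacent aligned words.  For
 * example, on a little-endian core with source offset 1, SAR = 8 and
 * src_b r, w0, w1 yields bytes 1..3 of w0 in the low positions of r
 * with byte 0 of w1 on top.
 */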

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *      a0/ return address
 *      a1/ stack pointer
 *      a2/ return value
 *      a3/ src
 *      a4/ length
 *      a5/ dst
 *      a6/ tmp
 *      a7/ tmp
 *      a8/ tmp
 *      a9/ tmp
 *      a10/ tmp
 *      a11/ tmp
 */
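
/*
 * For orientation, a rough C equivalent of the general case above.
 * This is an illustrative sketch only, not the exact instruction
 * sequence:
 *
 *      void *memcpy(void *dst, const void *src, size_t len)
 *      {
 *              unsigned char *d = dst;
 *              const unsigned char *s = src;
 *
 *              if ((unsigned long)d & 1) {
 *                      if (len < 7)
 *                              goto bytecopy;
 *                      *d++ = *s++; len--;
 *              }
 *              if ((unsigned long)d & 2) {
 *                      if (len < 6)
 *                              goto bytecopy;
 *                      *d++ = *s++; *d++ = *s++; len -= 2;
 *              }
 *              // 16 bytes per iteration: plain word loads/stores if
 *              // src is word-aligned, funnel-shifted (SRC) loads
 *              // otherwise; then 8/4/2/1-byte tails keyed off the
 *              // low bits of len.
 *              return dst;
 *      }
 */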

        .text

/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbytecopydone
        add     a7, a3, a4      # a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        s8i     a6, a5, 0
        addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
        bne     a3, a7, .Lnextbyte # continue loop if a3:src != a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Ldst1mod2:     # dst is only byte aligned
        _bltui  a4, 7, .Lbytecopy       # do short copies byte by byte

        # copy 1 byte
        l8ui    a6, a3,  0
        addi    a3, a3,  1
        addi    a4, a4, -1
        s8i     a6, a5,  0
        addi    a5, a5,  1
        _bbci.l a5, 1, .Ldstaligned     # if dst is now aligned, then
                                        # return to main algorithm
.Ldst2mod4:     # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbytecopy       # do short copies byte by byte
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a3, a3,  2
        addi    a4, a4, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a5, a5,  2
        j       .Ldstaligned    # dst is now aligned, return to main algorithm

        .align  4
        .global memcpy
        .type   memcpy,@function
memcpy:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lcommon:
        _bbsi.l a2, 0, .Ldst1mod2       # if dst is 1 mod 2
        _bbsi.l a2, 1, .Ldst2mod4       # if dst is 2 mod 4
.Ldstaligned:   # return here from .Ldst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lsrcunaligned  # then use shifting copy
        /*
         * Destination and source are word-aligned, use word copy.
         */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop1done
        slli    a8, a7, 4
        add     a8, a8, a3      # a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        s32i    a6, a5,  0
        l32i    a6, a3,  8
        s32i    a7, a5,  4
        l32i    a7, a3, 12
        s32i    a6, a5,  8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bne     a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
        bbci.l  a4, 3, .L2
        # copy 8 bytes
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        addi    a3, a3,  8
        s32i    a6, a5,  0
        s32i    a7, a5,  4
        addi    a5, a5,  8
.L2:
        bbsi.l  a4, 2, .L3
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L3:
        # copy 4 bytes
        l32i    a6, a3,  0
        addi    a3, a3,  4
        s32i    a6, a5,  0
        addi    a5, a5,  4
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L4:
        # copy 2 bytes
        l16ui   a6, a3,  0
        addi    a3, a3,  2
        s16i    a6, a5,  0
        addi    a5, a5,  2
        bbsi.l  a4, 0, .L5
        retw
.L5:
        # copy 1 byte
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw

/*
 * Destination is aligned, Source is unaligned
 */

        .align  4
.Lsrcunaligned:
        _beqz   a4, .Ldone      # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT    1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop2done
        slli    a10, a7, 4
        add     a10, a10, a3    # a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
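/*
 * The loop below is software-pipelined: a6 always carries the most
 * recently loaded aligned source word over from the previous step, so
 * each src_b can merge two adjacent aligned words into one aligned
 * store, needing only four loads for four stores per iteration.
 */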
.Loop2:
        l32i    a7, a3,  4
        l32i    a8, a3,  8
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5,  4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5,  8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bne     a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
        bbci.l  a4, 3, .L12
        # copy 8 bytes
        l32i    a7, a3,  4
        l32i    a8, a3,  8
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        addi    a3, a3,  8
        src_b   a7, a7, a8
        s32i    a7, a5,  4
        addi    a5, a5,  8
        mov     a6, a8
.L12:
        bbci.l  a4, 2, .L13
        # copy 4 bytes
        l32i    a7, a3,  4
        addi    a3, a3,  4
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        addi    a5, a5,  4
        mov     a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .L14
        bbsi.l  a4, 0, .L15
.Ldone: retw
.L14:
        # copy 2 bytes
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a3, a3,  2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a5, a5,  2
        bbsi.l  a4, 0, .L15
        retw
.L15:
        # copy 1 byte
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw


/*
 * void bcopy(const void *src, void *dest, size_t n);
 */
        .align  4
        .global bcopy
        .type   bcopy,@function
bcopy:
        entry   sp, 16          # minimal stack frame
        # a2=src, a3=dst, a4=len
        mov     a5, a3
        mov     a3, a2
        mov     a2, a5
        j       .Lmovecommon    # go to common code for memmove+bcopy

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the destination does not lie within the source region
 *     (i.e. (dst - src) mod 2^32 >= len), then forward copy is safe:
 *     use memcpy.
 *   Otherwise do the copy backwards, from the highest address down.
 *
 * Register use:
 *      a0/ return address
 *      a1/ stack pointer
 *      a2/ return value
 *      a3/ src
 *      a4/ length
 *      a5/ dst
 *      a6/ tmp
 *      a7/ tmp
 *      a8/ tmp
 *      a9/ tmp
 *      a10/ tmp
 *      a11/ tmp
 */
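
/*
 * A minimal C sketch of that decision (illustrative only):
 *
 *      void *memmove(void *dst, const void *src, size_t len)
 *      {
 *              if ((unsigned long)dst - (unsigned long)src >= len)
 *                      return memcpy(dst, src, len);
 *              // otherwise copy backwards, highest address first
 *              ...
 *      }
 *
 * The single unsigned compare covers both dst < src (the subtraction
 * wraps around to a value >= len) and fully disjoint regions.
 */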

/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbackbytecopydone
        sub     a7, a3, a4      # a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
        addi    a3, a3, -1
        l8ui    a6, a3, 0
        addi    a5, a5, -1
        s8i     a6, a5, 0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a7, .Lbacknextbyte # continue loop if
                                       # a3:src != a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Lbackdst1mod2: # dst is only byte aligned
        _bltui  a4, 7, .Lbackbytecopy   # do short copies byte by byte

        # copy 1 byte
        addi    a3, a3, -1
        l8ui    a6, a3,  0
        addi    a5, a5, -1
        s8i     a6, a5,  0
        addi    a4, a4, -1
        _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
                                        # return to main algorithm
.Lbackdst2mod4: # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbackbytecopy   # do short copies byte by byte
        addi    a3, a3, -2
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a5, a5, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a4, a4, -2
        j       .Lbackdstaligned        # dst is now aligned,
                                        # return to main algorithm

        .align  4
        .global memmove
        .type   memmove,@function
memmove:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lmovecommon:
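        # Forward copy is safe iff dst does not lie inside
        # [src, src+len).  The unsigned compare below also covers
        # dst < src, where the subtraction wraps to a value >= len.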
        sub     a6, a5, a3
        bgeu    a6, a4, .Lcommon

        add     a5, a5, a4
        add     a3, a3, a4

        _bbsi.l a5, 0, .Lbackdst1mod2   # if dst is 1 mod 2
        _bbsi.l a5, 1, .Lbackdst2mod4   # if dst is 2 mod 4
.Lbackdstaligned:       # return here from .Lbackdst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lbacksrcunaligned      # then use shifting copy
        /*
         * Destination and source are word-aligned, use word copy.
         */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
        loopnez a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .backLoop1done
        slli    a8, a7, 4
        sub     a8, a3, a8      # a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
        addi    a3, a3, -16
        l32i    a7, a3, 12
        l32i    a6, a3,  8
        addi    a5, a5, -16
        s32i    a7, a5, 12
        l32i    a7, a3,  4
        s32i    a6, a5,  8
        l32i    a6, a3,  0
        s32i    a7, a5,  4
        s32i    a6, a5,  0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
        bbci.l  a4, 3, .Lback2
        # copy 8 bytes
        addi    a3, a3, -8
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        addi    a5, a5, -8
        s32i    a6, a5,  0
        s32i    a7, a5,  4
.Lback2:
        bbsi.l  a4, 2, .Lback3
        bbsi.l  a4, 1, .Lback4
        bbsi.l  a4, 0, .Lback5
        retw
.Lback3:
        # copy 4 bytes
        addi    a3, a3, -4
        l32i    a6, a3,  0
        addi    a5, a5, -4
        s32i    a6, a5,  0
        bbsi.l  a4, 1, .Lback4
        bbsi.l  a4, 0, .Lback5
        retw
.Lback4:
        # copy 2 bytes
        addi    a3, a3, -2
        l16ui   a6, a3,  0
        addi    a5, a5, -2
        s16i    a6, a5,  0
        bbsi.l  a4, 0, .Lback5
        retw
.Lback5:
        # copy 1 byte
        addi    a3, a3, -1
        l8ui    a6, a3,  0
        addi    a5, a5, -1
        s8i     a6, a5,  0
        retw

/*
 * Destination is aligned, Source is unaligned
 */

        .align  4
.Lbacksrcunaligned:
        _beqz   a4, .Lbackdone  # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS with
                                         * the lint or ferret client, or 0
                                         * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .backLoop2done
        slli    a10, a7, 4
        sub     a10, a3, a10    # a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
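/*
 * Mirror image of .Loop2: a6 carries the lowest-address word loaded so
 * far, and the src_b operand order is swapped relative to the forward
 * loop because the carried word is now the high half of each funnel
 * shift.
 */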
.backLoop2:
        addi    a3, a3, -16
        l32i    a7, a3, 12
        l32i    a8, a3,  8
        addi    a5, a5, -16
        src_b   a6, a7, a6
        s32i    a6, a5, 12
        l32i    a9, a3,  4
        src_b   a7, a8, a7
        s32i    a7, a5,  8
        l32i    a6, a3,  0
        src_b   a8, a9, a8
        s32i    a8, a5,  4
        src_b   a9, a6, a9
        s32i    a9, a5,  0
#if !XCHAL_HAVE_LOOPS
        bne     a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
        bbci.l  a4, 3, .Lback12
        # copy 8 bytes
        addi    a3, a3, -8
        l32i    a7, a3,  4
        l32i    a8, a3,  0
        addi    a5, a5, -8
        src_b   a6, a7, a6
        s32i    a6, a5,  4
        src_b   a7, a8, a7
        s32i    a7, a5,  0
        mov     a6, a8
.Lback12:
        bbci.l  a4, 2, .Lback13
        # copy 4 bytes
        addi    a3, a3, -4
        l32i    a7, a3,  0
        addi    a5, a5, -4
        src_b   a6, a7, a6
        s32i    a6, a5,  0
        mov     a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .Lback14
        bbsi.l  a4, 0, .Lback15
.Lbackdone:
        retw
.Lback14:
        # copy 2 bytes
        addi    a3, a3, -2
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a5, a5, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        bbsi.l  a4, 0, .Lback15
        retw
.Lback15:
        # copy 1 byte
        addi    a3, a3, -1
        addi    a5, a5, -1
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw


/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */