linux/arch/xtensa/lib/memcopy.S
/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2005 Tensilica Inc.
 */

#include <variant/core.h>

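/*
 * Helper macros for the unaligned-source path below.  ssa8 loads SAR from
 * the low two bits of the source address (picking ssa8l or ssa8b to match
 * the core's endianness), and src_b then funnel-shifts a pair of adjacent
 * source words by that amount so the bytes straddling the word boundary
 * come out as one aligned 32-bit value.  The operand order is swapped
 * between big- and little-endian builds so callers can ignore endianness.
 */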
        .macro  src_b   r, w0, w1
#ifdef __XTENSA_EB__
        src     \r, \w0, \w1
#else
        src     \r, \w1, \w0
#endif
        .endm

        .macro  ssa8    r
#ifdef __XTENSA_EB__
        ssa8b   \r
#else
        ssa8l   \r
#endif
        .endm


/*
 * void *memcpy(void *dst, const void *src, size_t len);
 * void *memmove(void *dst, const void *src, size_t len);
 * void *bcopy(const void *src, void *dst, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() (or bcopy()) in most cases.
 * However, where the source and/or destination references
 * instruction RAM/ROM or data RAM/ROM, that source and/or
 * destination is always accessed with 32-bit load and store
 * instructions (as required for these types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The bcopy version is provided here to avoid the overhead
 * of an extra call, for callers that require this convention.
 *
 * The (general case) algorithm is as follows:
 *   If the destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If the source is aligned,
 *     copy 16 bytes per loop iteration, then finish up with
 *     8-, 4-, 2-, and 1-byte copies conditional on the length;
 *   else (if the source is unaligned),
 *     do the same, but use SRC to realign the source data.
 *   This code tries to use fall-through branches for the common
 *     case of an aligned source and destination and a length
 *     that is a multiple of 4 (or 8).
 *   (A C sketch of this strategy follows this comment block.)
 *
 * Register use:
 *      a0/ return address
 *      a1/ stack pointer
 *      a2/ return value
 *      a3/ src
 *      a4/ length
 *      a5/ dst
 *      a6/ tmp
 *      a7/ tmp
 *      a8/ tmp
 *      a9/ tmp
 *      a10/ tmp
 *      a11/ tmp
 */
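
/*
 * Illustrative sketch (not part of the build): the general-case strategy
 * described above, expressed in C for readability.  memcpy_sketch is a
 * made-up name, and the plain byte loop it falls back to for short copies
 * and for an unaligned source stands in for the ssa8/src_b funnel-shift
 * path and the zero-overhead loops used by the real code below.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      void *memcpy_sketch(void *dst, const void *src, size_t len)
 *      {
 *              unsigned char *d = dst;
 *              const unsigned char *s = src;
 *
 *              // Destination only byte aligned: copy 1 byte, unless the
 *              // copy is short enough to do byte by byte anyway.
 *              if ((uintptr_t)d & 1) {
 *                      if (len < 7)
 *                              goto bytewise;
 *                      *d++ = *s++;
 *                      len--;
 *              }
 *              // Destination 2-byte aligned: copy 2 bytes to reach
 *              // 4-byte alignment, with the same short-copy escape.
 *              if ((uintptr_t)d & 2) {
 *                      if (len < 6)
 *                              goto bytewise;
 *                      *d++ = *s++;
 *                      *d++ = *s++;
 *                      len -= 2;
 *              }
 *              if (((uintptr_t)s & 3) == 0) {
 *                      // Aligned source: 16 bytes per iteration (the
 *                      // word copies stand in for the four 32-bit moves
 *                      // of .Loop1).  The assembly handles the tail with
 *                      // 8/4/2/1-byte pieces; the sketch just falls
 *                      // through to the byte loop.
 *                      uint32_t *dw = (uint32_t *)d;
 *                      const uint32_t *sw = (const uint32_t *)s;
 *                      for (; len >= 16; len -= 16) {
 *                              *dw++ = *sw++; *dw++ = *sw++;
 *                              *dw++ = *sw++; *dw++ = *sw++;
 *                      }
 *                      d = (unsigned char *)dw;
 *                      s = (const unsigned char *)sw;
 *              }
 *      bytewise:
 *              // Unaligned source or leftover tail: byte copy.  The
 *              // assembly instead keeps issuing aligned 32-bit loads and
 *              // merges neighbouring words with ssa8/src_b.
 *              while (len--)
 *                      *d++ = *s++;
 *              return dst;
 *      }
 */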

        .text
        .align  4
        .global bcopy
        .type   bcopy,@function
bcopy:
        entry   sp, 16          # minimal stack frame
        # a2=src, a3=dst, a4=len
        mov     a5, a3          # copy dst so that a2 is return value
        mov     a3, a2
        mov     a2, a5
        j       .Lcommon        # go to common code for memcpy+bcopy


/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbytecopydone
        add     a7, a3, a4      # a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        s8i     a6, a5, 0
        addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
        blt     a3, a7, .Lnextbyte
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Ldst1mod2:     # dst is only byte aligned
        _bltui  a4, 7, .Lbytecopy       # do short copies byte by byte

        # copy 1 byte
        l8ui    a6, a3,  0
        addi    a3, a3,  1
        addi    a4, a4, -1
        s8i     a6, a5,  0
        addi    a5, a5,  1
        _bbci.l a5, 1, .Ldstaligned     # if dst is now aligned, then
                                        # return to main algorithm
.Ldst2mod4:     # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbytecopy       # do short copies byte by byte
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a3, a3,  2
        addi    a4, a4, -2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a5, a5,  2
        j       .Ldstaligned    # dst is now aligned, return to main algorithm

        .align  4
        .global memcpy
        .type   memcpy,@function
memcpy:
        .global memmove
        .type   memmove,@function
memmove:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lcommon:
        _bbsi.l a2, 0, .Ldst1mod2       # if dst is 1 mod 2
        _bbsi.l a2, 1, .Ldst2mod4       # if dst is 2 mod 4
.Ldstaligned:   # return here from .Ldst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lsrcunaligned  # then use shifting copy
        /*
         * Destination and source are word-aligned, use word copy.
         */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop1done
        slli    a8, a7, 4
        add     a8, a8, a3      # a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        s32i    a6, a5,  0
        l32i    a6, a3,  8
        s32i    a7, a5,  4
        l32i    a7, a3, 12
        s32i    a6, a5,  8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        blt     a3, a8, .Loop1
#endif /* !XCHAL_HAVE_LOOPS */
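        # After the 16-byte loop at most 15 bytes remain; the low four bits
        # of the length in a4 select the remaining 8-, 4-, 2- and 1-byte
        # copies, falling through to the return where possible.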
.Loop1done:
        bbci.l  a4, 3, .L2
        # copy 8 bytes
        l32i    a6, a3,  0
        l32i    a7, a3,  4
        addi    a3, a3,  8
        s32i    a6, a5,  0
        s32i    a7, a5,  4
        addi    a5, a5,  8
.L2:
        bbsi.l  a4, 2, .L3
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L3:
        # copy 4 bytes
        l32i    a6, a3,  0
        addi    a3, a3,  4
        s32i    a6, a5,  0
        addi    a5, a5,  4
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L4:
        # copy 2 bytes
        l16ui   a6, a3,  0
        addi    a3, a3,  2
        s16i    a6, a5,  0
        addi    a5, a5,  2
        bbsi.l  a4, 0, .L5
        retw
.L5:
        # copy 1 byte
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw

/*
 * Destination is aligned, Source is unaligned
 */

        .align  4
.Lsrcunaligned:
        _beqz   a4, .Ldone      # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS (simulator) with the
                                           lint or ferret client, or 0 to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop2done
        slli    a10, a7, 4
        add     a10, a10, a3    # a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
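        # Shifting copy: each iteration carries the previously loaded source
        # word in a6, loads the next four words, and uses src_b to merge
        # adjacent words so every store writes a full aligned destination
        # word while all loads stay word aligned.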
.Loop2:
        l32i    a7, a3,  4
        l32i    a8, a3,  8
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5,  4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5,  8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        blt     a3, a10, .Loop2
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
        bbci.l  a4, 3, .L12
        # copy 8 bytes
        l32i    a7, a3,  4
        l32i    a8, a3,  8
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        addi    a3, a3,  8
        src_b   a7, a7, a8
        s32i    a7, a5,  4
        addi    a5, a5,  8
        mov     a6, a8
.L12:
        bbci.l  a4, 2, .L13
        # copy 4 bytes
        l32i    a7, a3,  4
        addi    a3, a3,  4
        src_b   a6, a6, a7
        s32i    a6, a5,  0
        addi    a5, a5,  4
        mov     a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .L14
        bbsi.l  a4, 0, .L15
.Ldone: retw
.L14:
        # copy 2 bytes
        l8ui    a6, a3,  0
        l8ui    a7, a3,  1
        addi    a3, a3,  2
        s8i     a6, a5,  0
        s8i     a7, a5,  1
        addi    a5, a5,  2
        bbsi.l  a4, 0, .L15
        retw
.L15:
        # copy 1 byte
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw

/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */