linux/arch/arm64/lib/copy_template.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex strings work originally authored by
 * Linaro and can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *      x0 - dest
 *      x1 - src
 *      x2 - n
 * Returns:
 *      x0 - dest
 */
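
/*
 * The ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 operations used below
 * are not defined here; they are macros supplied by whichever file includes
 * this template (for example the plain memcpy and the user-space copy
 * routines), so one copy body can serve several variants.
 */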
dstin   .req    x0
src     .req    x1
count   .req    x2
tmp1    .req    x3
tmp1w   .req    w3
tmp2    .req    x4
tmp2w   .req    w4
dst     .req    x6

A_l     .req    x7
A_h     .req    x8
B_l     .req    x9
B_h     .req    x10
C_l     .req    x11
C_h     .req    x12
D_l     .req    x13
D_h     .req    x14
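
/*
 * A/B/C/D each name a pair of 64-bit registers holding 16 bytes; together
 * the four pairs carry one 64-byte block through the main copy loop.
 */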

        mov     dst, dstin
        cmp     count, #16
        /* When the copy length is less than 16, the accesses may be unaligned. */
        b.lo    .Ltiny15

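        /*
        * (-src) & 15 is the number of bytes needed to advance src to the
        * next 16-byte boundary (zero if src is already 16-byte aligned).
        */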
        neg     tmp2, src
        ands    tmp2, tmp2, #15 /* Bytes to reach alignment. */
        b.eq    .LSrcAligned
        sub     count, count, tmp2
        /*
        * Copy the leading data from src to dst in increasing address
        * order. This way, the risk of overwriting the source data is
        * eliminated when the distance between src and dst is less than
        * 16. The memory accesses here are aligned.
        */
        tbz     tmp2, #0, 1f
        ldrb1   tmp1w, src, #1
        strb1   tmp1w, dst, #1
1:
        tbz     tmp2, #1, 2f
        ldrh1   tmp1w, src, #2
        strh1   tmp1w, dst, #2
2:
        tbz     tmp2, #2, 3f
        ldr1    tmp1w, src, #4
        str1    tmp1w, dst, #4
3:
        tbz     tmp2, #3, .LSrcAligned
        ldr1    tmp1, src, #8
        str1    tmp1, dst, #8

.LSrcAligned:
        cmp     count, #64
        b.ge    .Lcpy_over64
        /*
        * Deal with small copies quickly by dropping straight into the
        * exit block.
        */
.Ltail63:
        /*
        * Copy up to 48 bytes of data. At this point we only need the
        * bottom 6 bits of count to be accurate.
        */
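        /*
        * Bits 5:4 of count hold the number of whole 16-byte blocks left
        * (0, 16, 32 or 48 bytes). The compare below falls through three,
        * two or one ldp/stp pairs, copying 48, 32 or 16 bytes.
        */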
        ands    tmp1, count, #0x30
        b.eq    .Ltiny15
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp1    A_l, A_h, src, #16
        stp1    A_l, A_h, dst, #16
1:
        ldp1    A_l, A_h, src, #16
        stp1    A_l, A_h, dst, #16
2:
        ldp1    A_l, A_h, src, #16
        stp1    A_l, A_h, dst, #16
.Ltiny15:
        /*
        * Prefer to break one ldp/stp into several loads/stores that access
        * memory in increasing address order, rather than loading/storing 16
        * bytes from (src-16) to (dst-16) and winding src back to an aligned
        * address, as the original cortex-strings memcpy does. If the
        * original scheme were kept here, memmove would have to guarantee
        * that src is at least 16 bytes above dst, otherwise some source
        * data would be overwritten when memmove calls memcpy directly. To
        * keep memmove simple and avoid coupling between memcpy and memmove,
        * that scheme was dropped.
        */
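        /*
        * Copy the remaining 0-15 bytes: 8, then 4, then 2, then 1 byte,
        * keyed off bits 3:0 of count, keeping the accesses in increasing
        * address order.
        */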
        tbz     count, #3, 1f
        ldr1    tmp1, src, #8
        str1    tmp1, dst, #8
1:
        tbz     count, #2, 2f
        ldr1    tmp1w, src, #4
        str1    tmp1w, dst, #4
2:
        tbz     count, #1, 3f
        ldrh1   tmp1w, src, #2
        strh1   tmp1w, dst, #2
3:
        tbz     count, #0, .Lexitfunc
        ldrb1   tmp1w, src, #1
        strb1   tmp1w, dst, #1

        b       .Lexitfunc

.Lcpy_over64:
        subs    count, count, #128
        b.ge    .Lcpy_body_large
        /*
        * Less than 128 bytes to copy, so handle 64 here and then jump
        * to the tail.
        */
        ldp1    A_l, A_h, src, #16
        stp1    A_l, A_h, dst, #16
        ldp1    B_l, B_h, src, #16
        ldp1    C_l, C_h, src, #16
        stp1    B_l, B_h, dst, #16
        stp1    C_l, C_h, dst, #16
        ldp1    D_l, D_h, src, #16
        stp1    D_l, D_h, dst, #16

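        /* Up to 63 bytes may still remain; let the shared tail finish them. */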
        tst     count, #0x3f
        b.ne    .Ltail63
        b       .Lexitfunc

        /*
        * Critical loop.  Start at a new cache line boundary.  Assuming
        * 64 bytes per line this ensures the entire loop is in one line.
        */
        .p2align        L1_CACHE_SHIFT
.Lcpy_body_large:
        /* Pre-load the first 64 bytes of data. */
 155        ldp1    A_l, A_h, src, #16
 156        ldp1    B_l, B_h, src, #16
 157        ldp1    C_l, C_h, src, #16
 158        ldp1    D_l, D_h, src, #16
1:
        /*
        * Interleave the loads of the next 64-byte block with the stores of
        * the 64 bytes loaded in the previous iteration.
        */
        stp1    A_l, A_h, dst, #16
        ldp1    A_l, A_h, src, #16
        stp1    B_l, B_h, dst, #16
        ldp1    B_l, B_h, src, #16
        stp1    C_l, C_h, dst, #16
        ldp1    C_l, C_h, src, #16
        stp1    D_l, D_h, dst, #16
        ldp1    D_l, D_h, src, #16
        subs    count, count, #64
        b.ge    1b
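        /* Store the last 64 bytes loaded by the final pass of the loop. */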
        stp1    A_l, A_h, dst, #16
        stp1    B_l, B_h, dst, #16
        stp1    C_l, C_h, dst, #16
        stp1    D_l, D_h, dst, #16

        tst     count, #0x3f
        b.ne    .Ltail63
.Lexitfunc:

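        /*
        * No return instruction here: the file that includes this template
        * supplies its own epilogue (e.g. a ret) after .Lexitfunc.
        */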