/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define L(label) .L ## label

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into three main cases: small copies of up to 32 bytes,
   medium copies of up to 128 bytes, and large copies.  The overhead of the
   overlap check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop that processes 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.
*/
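
/* A rough C-like sketch of the dispatch below (illustrative only; the names
   refer to the assembly labels in this file and nothing here is built):

       if (count > 128)
               goto copy_long;        (overlap check happens there)
       else if (count > 32)
               goto copy32_128;
       else
               copy 0..32 bytes inline and via copy16/copy8/copy4;
*/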

SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_WEAK_PI(memcpy)
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
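        /* 16..32 bytes: copy the first and last 16 bytes; the two ranges
           overlap or coincide for counts below 32, which is harmless.  */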
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes.  */
L(copy16):
        tbz     count, 3, L(copy8)
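        /* For counts of 8..15 the two doubleword copies below overlap or
           coincide, covering the whole range without further branches.  */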
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
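        /* Likewise, for counts of 4..7 the two word copies below overlap or
           coincide, covering the whole range.  */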
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
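        /* tmp1 = count / 2 picks the middle byte: for a count of 1 all three
           byte copies below hit the same location, for 2 the middle and last
           coincide, and for 3 all three are distinct.  */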
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
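        /* Load the first 32 and the last 32 bytes up front; for counts below
           64 these two ranges overlap, which the stores below tolerate.  */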
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        ldp     D_l, D_h, [srcend, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
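        /* Bytes 32..63 are copied via E/F.  For counts above 96, G/H cover
           the 32 bytes ending 32 bytes before the end; the C/D stores always
           cover the final 32 bytes.  */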
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend, -64]
        ldp     H_l, H_h, [srcend, -48]
        stp     G_l, G_h, [dstend, -64]
        stp     H_l, H_h, [dstend, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Use backwards copy if there is an overlap.  */
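        /* With unsigned wrap-around a single compare decides the direction,
           roughly:
               if ((unsigned long)(dstin - src) < count)
                       copy backwards;
           A destination below the source always takes the forward path, and
           src == dst returns immediately via copy0.  */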
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */

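        /* src is biased down by the same offset as dst so both pointers
           advance in step; count grows by that offset to compensate.  The
           first 16 (possibly unaligned) bytes are copied via D.  */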
        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)

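        /* Software-pipelined: each iteration stores the 64 bytes loaded on
           the previous iteration while loading the next 64.  */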
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
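        /* The tail is loaded relative to srcend and stored relative to
           dstend, so no remainder loop is needed; these stores may overlap
           bytes written by the last loop iteration.  */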
L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.  */
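        /* Mirror of the forward path: dstend is aligned down to 16 bytes,
           srcend is biased by the same offset, and the last 16 (possibly
           unaligned) bytes are copied via D.  */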
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend, -16]
        and     tmp1, dstend, 15
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

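        /* Software-pipelined as above, with both pointers walking downwards.  */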
L(loop64_backwards):
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
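        /* The head is loaded from src and stored to dstin, covering whatever
           remains below the aligned region; these stores may overlap bytes
           written by the last loop iteration.  */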
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret

SYM_FUNC_END_PI(memcpy)
EXPORT_SYMBOL(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_END_ALIAS_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)
