uboot/arch/arm/lib/memcpy-arm64.S
/* SPDX-License-Identifier: MIT */
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14
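/* G_l/G_h, H_l/H_h and tmp1 reuse registers that already have names above
   (count, dst, src, srcend and E_l).  This is safe: by the time the G/H
   pairs are loaded and whenever tmp1 is written, the aliased values are
   either dead or not yet live.  */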

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
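
/* Rough C-level outline of the dispatch below (illustrative only, not part
   of the generated code):

       if (count <= 32)               small copies (copy0 .. copy16)
       else if (count <= 128)         medium copies (copy32_128, copy128)
       else if (dstin == src)         nothing to do
       else if (dstin - src < count)  copy_long_backwards  (unsigned compare)
       else                           copy_long forward loop
*/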

ENTRY_ALIAS (memmove)
ENTRY (memcpy)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
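        /* 16..32 bytes: one 16-byte copy from the start and one from the
           end; the two accesses overlap by 32 - count bytes.  */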
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes.  */
L(copy16):
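        /* count is 0..15 here; bit 3 distinguishes 8..15 from 0..7.  */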
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
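        /* count is 1..3: copy bytes at offsets 0, count/2 and count - 1.
           Together these cover every length (some bytes are written twice).  */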
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        ldp     D_l, D_h, [srcend, -16]
        cmp     count, 64
        b.hi    L(copy128)
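        /* 33..64 bytes: the 32 bytes loaded from the start and the 32 bytes
           loaded from the end overlap as needed to cover the whole buffer.  */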
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
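        /* 97..128 bytes need an extra 32-byte block ending 32 bytes before
           the end; 65..96 bytes are already covered by the 64 bytes from the
           start plus the 32 bytes from the end.  */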
        ldp     G_l, G_h, [srcend, -64]
        ldp     H_l, H_h, [srcend, -48]
        stp     G_l, G_h, [dstend, -64]
        stp     H_l, H_h, [dstend, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Use backwards copy if there is an overlap.  */
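        /* (dstin - src) < count as an unsigned compare is true exactly when
           dstin lies inside [src, src + count): a forward copy would then
           overwrite source bytes before they are read.  The dstin == src
           case simply returns.  */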
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
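        /* dst and src were both moved back by tmp1 bytes, so count is now
           dstend - dst.  The first 16 bytes (stored below at dstin) already
           cover everything up to dst + 16, which is why count is 16 larger
           than the work that remains; the subs below corrects for this.  */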
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)

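        /* Software-pipelined loop: each iteration stores the four register
           pairs loaded by the previous iteration while loading the next 64
           bytes, keeping loads and dependent stores well separated.  */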
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
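        /* The last 64 bytes are copied unconditionally from the end of the
           buffer; their stores may overlap the dst-relative stores above, in
           which case the overlapping bytes are just written twice.  */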
L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dstend to 16-byte alignment.  */
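        /* The copy proceeds from high addresses to low so that, when the
           destination overlaps the top of the source region, every byte is
           read before it can be overwritten.  */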
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend, -16]
        and     tmp1, dstend, 15
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
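        /* As in the forward path, the tail is an unconditional 64-byte copy,
           here taken from the start of the buffer and stored last.  */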
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret

END (memcpy)