linux/arch/arm64/lib/strlen.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/mte-def.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#define L(label) .L ## label

/* Arguments and results.  */
#define srcin           x0
#define len             x0

/* Locals and temporaries.  */
#define src             x1
#define data1           x2
#define data2           x3
#define has_nul1        x4
#define has_nul2        x5
#define tmp1            x4
#define tmp2            x5
#define tmp3            x6
#define tmp4            x7
#define zeroones        x8

        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
           can be done in parallel across the entire word. A faster check
           (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
           false hits for characters 129..255.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
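
        /* Worked example (an illustrative addition, little-endian): for
           X = 0x4141414141004141, i.e. a NUL in byte 2,

               X - REP8_01    = 0x404040403fff4040
               ~(X | REP8_7f) = 0x8080808080808080
               syndrome       = 0x0000000000800000

           Only byte 2 of the syndrome is 0x80, so rev + clz recover bit
           index 16 and "lsr 3" turns that into byte index 2.  */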

/*
 * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE
 * (16-byte) granularity, and we must ensure that no access straddles this
 * alignment boundary.
 */
#ifdef CONFIG_KASAN_HW_TAGS
#define MIN_PAGE_SIZE MTE_GRANULE_SIZE
#else
#define MIN_PAGE_SIZE 4096
#endif
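
        /* An illustrative note (not in the original source): with
           CONFIG_KASAN_HW_TAGS, MIN_PAGE_SIZE is 16, so the entry check
           below reduces to "(srcin & 15) > 0".  Every pointer that is not
           16-byte aligned therefore takes the L(page_cross) path, whose
           loads are 16-byte aligned and never straddle a tag granule.  */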

        /* Since strings are short on average, we check the first 16 bytes
           of the string for a NUL character.  In order to do an unaligned ldp
           safely we have to do a page cross check first.  If there is a NUL
           byte we calculate the length from the 2 8-byte words using
           conditional select to reduce branch mispredictions (it is unlikely
           strlen will be repeatedly called on strings with the same length).
           If the string is longer than 16 bytes, we align src so we don't
           need further page cross checks, and process 32 bytes per iteration
           using the fast NUL check.  If we encounter non-ASCII characters,
           we fall back to a second loop using the full NUL check.
  68
  69           If the page cross check fails, we read 16 bytes from an aligned
  70           address, remove any characters before the string, and continue
  71           in the main loop using aligned loads.  Since strings crossing a
  72           page in the first 16 bytes are rare (probability of
  73           16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
  74
  75           AArch64 systems have a minimum page size of 4k.  We don't bother
  76           checking for larger page sizes - the cost of setting up the correct
  77           page size is just not worth the extra gain from a small reduction in
  78           the cases taking the slow path.  Note that we only care about
  79           whether the first fetch, which may be misaligned, crosses a page
  80           boundary.  */
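
        /* A rough C model of the first-16-bytes fast path (an illustrative
           sketch only, little-endian, not part of the original file;
           strlen_model is a hypothetical name):

               #include <stdint.h>
               #include <string.h>

               #define REP8_01 0x0101010101010101ULL
               #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

               static size_t strlen_model(const char *s)
               {
                       uint64_t d1, d2, nul1, nul2;

                       // Assumes the 16-byte window does not cross a page.
                       memcpy(&d1, s, 8);
                       memcpy(&d2, s + 8, 8);
                       nul1 = (d1 - REP8_01) & ~(d1 | REP8_7f);
                       nul2 = (d2 - REP8_01) & ~(d2 | REP8_7f);
                       if (nul1)
                               return __builtin_ctzll(nul1) >> 3;
                       if (nul2)
                               return 8 + (__builtin_ctzll(nul2) >> 3);
                       // Otherwise the real code falls through to the
                       // 32-bytes-per-iteration main loop; modelled here
                       // with the library call.
                       return 16 + strlen(s + 16);
               }
        */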

SYM_FUNC_START_WEAK_PI(strlen)
        and     tmp1, srcin, MIN_PAGE_SIZE - 1
        mov     zeroones, REP8_01
        cmp     tmp1, MIN_PAGE_SIZE - 16
        b.gt    L(page_cross)
        ldp     data1, data2, [srcin]
#ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul1/2 directly.
           Since we expect strings to be small and early-exit,
           byte-swap the data now so has_nul1/2 will be correct.  */
        rev     data1, data1
        rev     data2, data2
#endif
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
        beq     L(main_loop_entry)

        /* Enter with C = has_nul1 == 0.  */
        csel    has_nul1, has_nul1, has_nul2, cc
        mov     len, 8
        rev     has_nul1, has_nul1
        clz     tmp1, has_nul1
        csel    len, xzr, len, cc
        add     len, len, tmp1, lsr 3
        ret
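
        /* Illustrative trace (an added example): with the first NUL at
           byte 1 of data2, C is set, so the csels select has_nul2 and
           keep len = 8; the syndrome 0x0000000000008000 becomes
           0x0080000000000000 after rev, clz returns 8, and
           len = 8 + (8 >> 3) = 9.  */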

        /* The inner loop processes 32 bytes per iteration and uses the fast
           NUL check.  If we encounter non-ASCII characters, use a second
           loop with the accurate NUL check.  */
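
        /* Example of a false hit (an illustrative addition): for the UTF-8
           lead byte 0xc3, (0xc3 - 0x01) & 0x80 = 0x80, so the fast check
           (tst against zeroones, lsl 7, i.e. REP8_80) fires with no NUL
           present; the accurate check at 1: then finds no NUL and control
           moves to L(nonascii_loop).  */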
        .p2align 4
L(main_loop_entry):
        bic     src, srcin, 15
        sub     src, src, 16
L(main_loop):
        ldp     data1, data2, [src, 32]!
L(page_cross_entry):
        sub     tmp1, data1, zeroones
        sub     tmp3, data2, zeroones
        orr     tmp2, tmp1, tmp3
        tst     tmp2, zeroones, lsl 7
        bne     1f
        ldp     data1, data2, [src, 16]
        sub     tmp1, data1, zeroones
        sub     tmp3, data2, zeroones
        orr     tmp2, tmp1, tmp3
        tst     tmp2, zeroones, lsl 7
        beq     L(main_loop)
        add     src, src, 16
1:
        /* The fast check failed, so do the slower, accurate NUL check.  */
        orr     tmp2, data1, REP8_7f
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
        beq     L(nonascii_loop)

        /* Enter with C = has_nul1 == 0.  */
L(tail):
#ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul1/2 directly.  The
           easiest way to get the correct byte is to byte-swap the data
           and calculate the syndrome a second time.  */
        csel    data1, data1, data2, cc
        rev     data1, data1
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, REP8_7f
        bic     has_nul1, tmp1, tmp2
#else
        csel    has_nul1, has_nul1, has_nul2, cc
#endif
        sub     len, src, srcin
        rev     has_nul1, has_nul1
        add     tmp2, len, 8
        clz     tmp1, has_nul1
        csel    len, len, tmp2, cc
        add     len, len, tmp1, lsr 3
        ret
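
        /* Illustrative trace (an added example): on entry to L(tail), src
           holds the address of data1.  With src - srcin = 48 and the first
           NUL at byte 5 of data2, C is set, so csel picks tmp2 = len + 8
           and the result is 48 + 8 + 5 = 61.  */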

L(nonascii_loop):
        ldp     data1, data2, [src, 16]!
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
        bne     L(tail)
        ldp     data1, data2, [src, 16]!
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
        beq     L(nonascii_loop)
        b       L(tail)

        /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
           srcin to 0x7f, so we ignore any NUL bytes before the string.
           Then continue in the aligned loop.  */
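
        /* Worked example (an illustrative addition, little-endian): for
           srcin & 15 == 3, the shift gives tmp1 = 0xffffffffff000000 and,
           after the orr with REP8_80, ~tmp1 = 0x00000000007f7f7f.  The orn
           therefore forces bytes 0..2 of data1 to 0x7f (or 0xff), which can
           never look like NUL, and leaves bytes 3..7 intact; since
           srcin & 8 == 0, the csels keep the masked data1 and the unmasked
           data2.  */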
L(page_cross):
        bic     src, srcin, 15
        ldp     data1, data2, [src]
        lsl     tmp1, srcin, 3
        mov     tmp4, -1
#ifdef __AARCH64EB__
        /* Big-endian.  Early bytes are at MSB.  */
        lsr     tmp1, tmp4, tmp1        /* Shift (tmp1 & 63).  */
#else
        /* Little-endian.  Early bytes are at LSB.  */
        lsl     tmp1, tmp4, tmp1        /* Shift (tmp1 & 63).  */
#endif
        orr     tmp1, tmp1, REP8_80
        orn     data1, data1, tmp1
        orn     tmp2, data2, tmp1
        tst     srcin, 8
        csel    data1, data1, tmp4, eq
        csel    data2, data2, tmp2, eq
        b       L(page_cross_entry)

SYM_FUNC_END_PI(strlen)
EXPORT_SYMBOL_NOKASAN(strlen)