linux/arch/arm64/lib/strncmp.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-only */
   2/*
   3 * Copyright (c) 2013-2021, Arm Limited.
   4 *
   5 * Adapted from the original at:
   6 * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/strncmp.S
   7 */
   8
   9#include <linux/linkage.h>
  10#include <asm/assembler.h>
  11
  12/* Assumptions:
  13 *
  14 * ARMv8-a, AArch64
  15 */
  16
  17#define L(label) .L ## label
  18
  19#define REP8_01 0x0101010101010101
  20#define REP8_7f 0x7f7f7f7f7f7f7f7f
  21#define REP8_80 0x8080808080808080
  22
  23/* Parameters and result.  */
  24#define src1            x0
  25#define src2            x1
  26#define limit           x2
  27#define result          x0
  28
  29/* Internal variables.  */
  30#define data1           x3
  31#define data1w          w3
  32#define data2           x4
  33#define data2w          w4
  34#define has_nul         x5
  35#define diff            x6
  36#define syndrome        x7
  37#define tmp1            x8
  38#define tmp2            x9
  39#define tmp3            x10
  40#define zeroones        x11
  41#define pos             x12
  42#define limit_wd        x13
  43#define mask            x14
  44#define endloop         x15
  45#define count           mask
  46
  47SYM_FUNC_START_WEAK_PI(strncmp)
  48        cbz     limit, L(ret0)
  49        eor     tmp1, src1, src2
  50        mov     zeroones, #REP8_01
  51        tst     tmp1, #7
  52        and     count, src1, #7
  53        b.ne    L(misaligned8)
  54        cbnz    count, L(mutual_align)
  55        /* Calculate the number of full and partial words -1.  */
  56        sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
  57        lsr     limit_wd, limit_wd, #3  /* Convert to Dwords.  */
  58
  59        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  60           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  61           can be done in parallel across the entire word.  */
  62        .p2align 4
  63L(loop_aligned):
  64        ldr     data1, [src1], #8
  65        ldr     data2, [src2], #8
  66L(start_realigned):
  67        subs    limit_wd, limit_wd, #1
  68        sub     tmp1, data1, zeroones
  69        orr     tmp2, data1, #REP8_7f
  70        eor     diff, data1, data2      /* Non-zero if differences found.  */
  71        csinv   endloop, diff, xzr, pl  /* Last Dword or differences.  */
  72        bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
  73        ccmp    endloop, #0, #0, eq
  74        b.eq    L(loop_aligned)
  75        /* End of main loop */
  76
  77        /* Not reached the limit, must have found the end or a diff.  */
  78        tbz     limit_wd, #63, L(not_limit)
  79
  80        /* Limit % 8 == 0 => all bytes significant.  */
  81        ands    limit, limit, #7
  82        b.eq    L(not_limit)
  83
  84        lsl     limit, limit, #3        /* Bits -> bytes.  */
  85        mov     mask, #~0
  86#ifdef __AARCH64EB__
  87        lsr     mask, mask, limit
  88#else
  89        lsl     mask, mask, limit
  90#endif
  91        bic     data1, data1, mask
  92        bic     data2, data2, mask
  93
  94        /* Make sure that the NUL byte is marked in the syndrome.  */
  95        orr     has_nul, has_nul, mask
  96
  97L(not_limit):
  98        orr     syndrome, diff, has_nul
  99
 100#ifndef __AARCH64EB__
 101        rev     syndrome, syndrome
 102        rev     data1, data1
 103        /* The MS-non-zero bit of the syndrome marks either the first bit
 104           that is different, or the top bit of the first zero byte.
 105           Shifting left now will bring the critical information into the
 106           top bits.  */
 107        clz     pos, syndrome
 108        rev     data2, data2
 109        lsl     data1, data1, pos
 110        lsl     data2, data2, pos
 111        /* But we need to zero-extend (char is unsigned) the value and then
 112           perform a signed 32-bit subtraction.  */
 113        lsr     data1, data1, #56
 114        sub     result, data1, data2, lsr #56
 115        ret
 116#else
 117        /* For big-endian we cannot use the trick with the syndrome value
 118           as carry-propagation can corrupt the upper bits if the trailing
 119           bytes in the string contain 0x01.  */
 120        /* However, if there is no NUL byte in the dword, we can generate
 121           the result directly.  We can't just subtract the bytes as the
 122           MSB might be significant.  */
 123        cbnz    has_nul, 1f
 124        cmp     data1, data2
 125        cset    result, ne
 126        cneg    result, result, lo
 127        ret
 1281:
 129        /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
 130        rev     tmp3, data1
 131        sub     tmp1, tmp3, zeroones
 132        orr     tmp2, tmp3, #REP8_7f
 133        bic     has_nul, tmp1, tmp2
 134        rev     has_nul, has_nul
 135        orr     syndrome, diff, has_nul
 136        clz     pos, syndrome
 137        /* The MS-non-zero bit of the syndrome marks either the first bit
 138           that is different, or the top bit of the first zero byte.
 139           Shifting left now will bring the critical information into the
 140           top bits.  */
 141        lsl     data1, data1, pos
 142        lsl     data2, data2, pos
 143        /* But we need to zero-extend (char is unsigned) the value and then
 144           perform a signed 32-bit subtraction.  */
 145        lsr     data1, data1, #56
 146        sub     result, data1, data2, lsr #56
 147        ret
 148#endif
 149
 150L(mutual_align):
 151        /* Sources are mutually aligned, but are not currently at an
 152           alignment boundary.  Round down the addresses and then mask off
 153           the bytes that precede the start point.
 154           We also need to adjust the limit calculations, but without
 155           overflowing if the limit is near ULONG_MAX.  */
 156        bic     src1, src1, #7
 157        bic     src2, src2, #7
 158        ldr     data1, [src1], #8
 159        neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
 160        ldr     data2, [src2], #8
 161        mov     tmp2, #~0
 162        sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
 163#ifdef __AARCH64EB__
 164        /* Big-endian.  Early bytes are at MSB.  */
 165        lsl     tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 166#else
 167        /* Little-endian.  Early bytes are at LSB.  */
 168        lsr     tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 169#endif
 170        and     tmp3, limit_wd, #7
 171        lsr     limit_wd, limit_wd, #3
 172        /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
 173        add     limit, limit, count
 174        add     tmp3, tmp3, count
 175        orr     data1, data1, tmp2
 176        orr     data2, data2, tmp2
 177        add     limit_wd, limit_wd, tmp3, lsr #3
 178        b       L(start_realigned)
 179
 180        .p2align 4
 181        /* Don't bother with dwords for up to 16 bytes.  */
 182L(misaligned8):
 183        cmp     limit, #16
 184        b.hs    L(try_misaligned_words)
 185
 186L(byte_loop):
 187        /* Perhaps we can do better than this.  */
 188        ldrb    data1w, [src1], #1
 189        ldrb    data2w, [src2], #1
 190        subs    limit, limit, #1
 191        ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000.  */
 192        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 193        b.eq    L(byte_loop)
 194L(done):
 195        sub     result, data1, data2
 196        ret
 197        /* Align the SRC1 to a dword by doing a bytewise compare and then do
 198           the dword loop.  */
 199L(try_misaligned_words):
 200        lsr     limit_wd, limit, #3
 201        cbz     count, L(do_misaligned)
 202
 203        neg     count, count
 204        and     count, count, #7
 205        sub     limit, limit, count
 206        lsr     limit_wd, limit, #3
 207
 208L(page_end_loop):
 209        ldrb    data1w, [src1], #1
 210        ldrb    data2w, [src2], #1
 211        cmp     data1w, #1
 212        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 213        b.ne    L(done)
 214        subs    count, count, #1
 215        b.hi    L(page_end_loop)
 216
 217L(do_misaligned):
 218        /* Prepare ourselves for the next page crossing.  Unlike the aligned
 219           loop, we fetch 1 less dword because we risk crossing bounds on
 220           SRC2.  */
 221        mov     count, #8
 222        subs    limit_wd, limit_wd, #1
 223        b.lo    L(done_loop)
 224L(loop_misaligned):
 225        and     tmp2, src2, #0xff8
 226        eor     tmp2, tmp2, #0xff8
 227        cbz     tmp2, L(page_end_loop)
 228
 229        ldr     data1, [src1], #8
 230        ldr     data2, [src2], #8
 231        sub     tmp1, data1, zeroones
 232        orr     tmp2, data1, #REP8_7f
 233        eor     diff, data1, data2      /* Non-zero if differences found.  */
 234        bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
 235        ccmp    diff, #0, #0, eq
 236        b.ne    L(not_limit)
 237        subs    limit_wd, limit_wd, #1
 238        b.pl    L(loop_misaligned)
 239
 240L(done_loop):
 241        /* We found a difference or a NULL before the limit was reached.  */
 242        and     limit, limit, #7
 243        cbz     limit, L(not_limit)
 244        /* Read the last word.  */
 245        sub     src1, src1, 8
 246        sub     src2, src2, 8
 247        ldr     data1, [src1, limit]
 248        ldr     data2, [src2, limit]
 249        sub     tmp1, data1, zeroones
 250        orr     tmp2, data1, #REP8_7f
 251        eor     diff, data1, data2      /* Non-zero if differences found.  */
 252        bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
 253        ccmp    diff, #0, #0, eq
 254        b.ne    L(not_limit)
 255
 256L(ret0):
 257        mov     result, #0
 258        ret
 259
 260SYM_FUNC_END_PI(strncmp)
 261EXPORT_SYMBOL_NOHWKASAN(strncmp)
 262