linux/arch/arm64/lib/strncmp.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-only */
   2/*
   3 * Copyright (c) 2013-2022, Arm Limited.
   4 *
   5 * Adapted from the original at:
   6 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
   7 */
   8
   9#include <linux/linkage.h>
  10#include <asm/assembler.h>
  11
  12/* Assumptions:
  13 *
  14 * ARMv8-a, AArch64.
  15 * MTE compatible.
  16 */
  17
  18#define L(label) .L ## label
  19
  20#define REP8_01 0x0101010101010101
  21#define REP8_7f 0x7f7f7f7f7f7f7f7f
  22
  23/* Parameters and result.  */
  24#define src1            x0
  25#define src2            x1
  26#define limit           x2
  27#define result          x0
  28
  29/* Internal variables.  */
  30#define data1           x3
  31#define data1w          w3
  32#define data2           x4
  33#define data2w          w4
  34#define has_nul         x5
  35#define diff            x6
  36#define syndrome        x7
  37#define tmp1            x8
  38#define tmp2            x9
  39#define tmp3            x10
  40#define zeroones        x11
  41#define pos             x12
  42#define mask            x13
  43#define endloop         x14
  44#define count           mask
  45#define offset          pos
  46#define neg_offset      x15
  47
  48/* Define endian dependent shift operations.
  49   On big-endian early bytes are at MSB and on little-endian LSB.
  50   LS_FW means shifting towards early bytes.
  51   LS_BK means shifting towards later bytes.
  52   */
  53#ifdef __AARCH64EB__
  54#define LS_FW lsl
  55#define LS_BK lsr
  56#else
  57#define LS_FW lsr
  58#define LS_BK lsl
  59#endif
  60
  61SYM_FUNC_START(__pi_strncmp)
  62        cbz     limit, L(ret0)
  63        eor     tmp1, src1, src2
  64        mov     zeroones, #REP8_01
  65        tst     tmp1, #7
  66        and     count, src1, #7
  67        b.ne    L(misaligned8)
  68        cbnz    count, L(mutual_align)
  69
  70        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  71           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  72           can be done in parallel across the entire word.  */
  73        .p2align 4
  74L(loop_aligned):
  75        ldr     data1, [src1], #8
  76        ldr     data2, [src2], #8
  77L(start_realigned):
  78        subs    limit, limit, #8
  79        sub     tmp1, data1, zeroones
  80        orr     tmp2, data1, #REP8_7f
  81        eor     diff, data1, data2      /* Non-zero if differences found.  */
  82        csinv   endloop, diff, xzr, hi  /* Last Dword or differences.  */
  83        bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
  84        ccmp    endloop, #0, #0, eq
  85        b.eq    L(loop_aligned)
  86        /* End of main loop */
  87
  88L(full_check):
  89#ifndef __AARCH64EB__
  90        orr     syndrome, diff, has_nul
  91        add     limit, limit, 8 /* Rewind limit to before last subs. */
  92L(syndrome_check):
  93        /* Limit was reached. Check if the NUL byte or the difference
  94           is before the limit. */
  95        rev     syndrome, syndrome
  96        rev     data1, data1
  97        clz     pos, syndrome
  98        rev     data2, data2
  99        lsl     data1, data1, pos
 100        cmp     limit, pos, lsr #3
 101        lsl     data2, data2, pos
 102        /* But we need to zero-extend (char is unsigned) the value and then
 103           perform a signed 32-bit subtraction.  */
 104        lsr     data1, data1, #56
 105        sub     result, data1, data2, lsr #56
 106        csel result, result, xzr, hi
 107        ret
 108#else
 109        /* Not reached the limit, must have found the end or a diff.  */
 110        tbz     limit, #63, L(not_limit)
 111        add     tmp1, limit, 8
 112        cbz     limit, L(not_limit)
 113
 114        lsl     limit, tmp1, #3 /* Bits -> bytes.  */
 115        mov     mask, #~0
 116        lsr     mask, mask, limit
 117        bic     data1, data1, mask
 118        bic     data2, data2, mask
 119
 120        /* Make sure that the NUL byte is marked in the syndrome.  */
 121        orr     has_nul, has_nul, mask
 122
 123L(not_limit):
 124        /* For big-endian we cannot use the trick with the syndrome value
 125           as carry-propagation can corrupt the upper bits if the trailing
 126           bytes in the string contain 0x01.  */
 127        /* However, if there is no NUL byte in the dword, we can generate
 128           the result directly.  We can't just subtract the bytes as the
 129           MSB might be significant.  */
 130        cbnz    has_nul, 1f
 131        cmp     data1, data2
 132        cset    result, ne
 133        cneg    result, result, lo
 134        ret
 1351:
 136        /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
 137        rev     tmp3, data1
 138        sub     tmp1, tmp3, zeroones
 139        orr     tmp2, tmp3, #REP8_7f
 140        bic     has_nul, tmp1, tmp2
 141        rev     has_nul, has_nul
 142        orr     syndrome, diff, has_nul
 143        clz     pos, syndrome
 144        /* The most-significant-non-zero bit of the syndrome marks either the
 145           first bit that is different, or the top bit of the first zero byte.
 146           Shifting left now will bring the critical information into the
 147           top bits.  */
 148L(end_quick):
 149        lsl     data1, data1, pos
 150        lsl     data2, data2, pos
 151        /* But we need to zero-extend (char is unsigned) the value and then
 152           perform a signed 32-bit subtraction.  */
 153        lsr     data1, data1, #56
 154        sub     result, data1, data2, lsr #56
 155        ret
 156#endif
 157
 158L(mutual_align):
 159        /* Sources are mutually aligned, but are not currently at an
 160           alignment boundary.  Round down the addresses and then mask off
 161           the bytes that precede the start point.
 162           We also need to adjust the limit calculations, but without
 163           overflowing if the limit is near ULONG_MAX.  */
 164        bic     src1, src1, #7
 165        bic     src2, src2, #7
 166        ldr     data1, [src1], #8
 167        neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
 168        ldr     data2, [src2], #8
 169        mov     tmp2, #~0
 170        LS_FW   tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 171        /* Adjust the limit and ensure it doesn't overflow.  */
 172        adds    limit, limit, count
 173        csinv   limit, limit, xzr, lo
 174        orr     data1, data1, tmp2
 175        orr     data2, data2, tmp2
 176        b       L(start_realigned)
 177
 178        .p2align 4
 179        /* Don't bother with dwords for up to 16 bytes.  */
 180L(misaligned8):
 181        cmp     limit, #16
 182        b.hs    L(try_misaligned_words)
 183
 184L(byte_loop):
 185        /* Perhaps we can do better than this.  */
 186        ldrb    data1w, [src1], #1
 187        ldrb    data2w, [src2], #1
 188        subs    limit, limit, #1
 189        ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000.  */
 190        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 191        b.eq    L(byte_loop)
 192L(done):
 193        sub     result, data1, data2
 194        ret
 195        /* Align the SRC1 to a dword by doing a bytewise compare and then do
 196           the dword loop.  */
 197L(try_misaligned_words):
 198        cbz     count, L(src1_aligned)
 199
 200        neg     count, count
 201        and     count, count, #7
 202        sub     limit, limit, count
 203
 204L(page_end_loop):
 205        ldrb    data1w, [src1], #1
 206        ldrb    data2w, [src2], #1
 207        cmp     data1w, #1
 208        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 209        b.ne    L(done)
 210        subs    count, count, #1
 211        b.hi    L(page_end_loop)
 212
 213        /* The following diagram explains the comparison of misaligned strings.
 214           The bytes are shown in natural order. For little-endian, it is
 215           reversed in the registers. The "x" bytes are before the string.
 216           The "|" separates data that is loaded at one time.
 217           src1     | a a a a a a a a | b b b c c c c c | . . .
 218           src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
 219
 220           After shifting in each step, the data looks like this:
 221                        STEP_A              STEP_B              STEP_C
 222           data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
 223           data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
 224
 225           The bytes with "0" are eliminated from the syndrome via mask.
 226
 227           Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
 228           time from SRC2. The comparison happens in 3 steps. After each step
 229           the loop can exit, or read from SRC1 or SRC2. */
 230L(src1_aligned):
 231        /* Calculate offset from 8 byte alignment to string start in bits. No
 232           need to mask offset since shifts are ignoring upper bits. */
 233        lsl     offset, src2, #3
 234        bic     src2, src2, #0xf
 235        mov     mask, -1
 236        neg     neg_offset, offset
 237        ldr     data1, [src1], #8
 238        ldp     tmp1, tmp2, [src2], #16
 239        LS_BK   mask, mask, neg_offset
 240        and     neg_offset, neg_offset, #63     /* Need actual value for cmp later. */
 241        /* Skip the first compare if data in tmp1 is irrelevant. */
 242        tbnz    offset, 6, L(misaligned_mid_loop)
 243
 244L(loop_misaligned):
 245        /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
 246        LS_FW   data2, tmp1, offset
 247        LS_BK   tmp1, tmp2, neg_offset
 248        subs    limit, limit, #8
 249        orr     data2, data2, tmp1      /* 8 bytes from SRC2 combined from two regs.*/
 250        sub     has_nul, data1, zeroones
 251        eor     diff, data1, data2      /* Non-zero if differences found.  */
 252        orr     tmp3, data1, #REP8_7f
 253        csinv   endloop, diff, xzr, hi  /* If limit, set to all ones. */
 254        bic     has_nul, has_nul, tmp3  /* Non-zero if NUL byte found in SRC1. */
 255        orr     tmp3, endloop, has_nul
 256        cbnz    tmp3, L(full_check)
 257
 258        ldr     data1, [src1], #8
 259L(misaligned_mid_loop):
 260        /* STEP_B: Compare first part of data1 to second part of tmp2. */
 261        LS_FW   data2, tmp2, offset
 262#ifdef __AARCH64EB__
 263        /* For big-endian we do a byte reverse to avoid carry-propagation
 264        problem described above. This way we can reuse the has_nul in the
 265        next step and also use syndrome value trick at the end. */
 266        rev     tmp3, data1
 267        #define data1_fixed tmp3
 268#else
 269        #define data1_fixed data1
 270#endif
 271        sub     has_nul, data1_fixed, zeroones
 272        orr     tmp3, data1_fixed, #REP8_7f
 273        eor     diff, data2, data1      /* Non-zero if differences found.  */
 274        bic     has_nul, has_nul, tmp3  /* Non-zero if NUL terminator.  */
 275#ifdef __AARCH64EB__
 276        rev     has_nul, has_nul
 277#endif
 278        cmp     limit, neg_offset, lsr #3
 279        orr     syndrome, diff, has_nul
 280        bic     syndrome, syndrome, mask        /* Ignore later bytes. */
 281        csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
 282        cbnz    tmp3, L(syndrome_check)
 283
 284        /* STEP_C: Compare second part of data1 to first part of tmp1. */
 285        ldp     tmp1, tmp2, [src2], #16
 286        cmp     limit, #8
 287        LS_BK   data2, tmp1, neg_offset
 288        eor     diff, data2, data1      /* Non-zero if differences found.  */
 289        orr     syndrome, diff, has_nul
 290        and     syndrome, syndrome, mask        /* Ignore earlier bytes. */
 291        csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
 292        cbnz    tmp3, L(syndrome_check)
 293
 294        ldr     data1, [src1], #8
 295        sub     limit, limit, #8
 296        b       L(loop_misaligned)
 297
 298#ifdef  __AARCH64EB__
 299L(syndrome_check):
 300        clz     pos, syndrome
 301        cmp     pos, limit, lsl #3
 302        b.lo    L(end_quick)
 303#endif
 304
 305L(ret0):
 306        mov     result, #0
 307        ret
 308SYM_FUNC_END(__pi_strncmp)
 309SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
 310EXPORT_SYMBOL_NOKASAN(strncmp)
 311