linux/arch/arm64/lib/strcmp.S
<<
>>
Prefs
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/strcmp.S
 */

   9#include <linux/linkage.h>
  10#include <asm/assembler.h>
  11
  12/* Assumptions:
  13 *
  14 * ARMv8-a, AArch64
  15 */
  16
  17#define L(label) .L ## label
  18
  19#define REP8_01 0x0101010101010101
  20#define REP8_7f 0x7f7f7f7f7f7f7f7f
  21#define REP8_80 0x8080808080808080
  22
  23/* Parameters and result.  */
  24#define src1            x0
  25#define src2            x1
  26#define result          x0
  27
  28/* Internal variables.  */
  29#define data1           x2
  30#define data1w          w2
  31#define data2           x3
  32#define data2w          w3
  33#define has_nul         x4
  34#define diff            x5
  35#define syndrome        x6
  36#define tmp1            x7
  37#define tmp2            x8
  38#define tmp3            x9
  39#define zeroones        x10
  40#define pos             x11
  41
  42        /* Start of performance-critical section  -- one 64B cache line.  */
  43        .align 6
  44SYM_FUNC_START_WEAK_PI(strcmp)
  45        eor     tmp1, src1, src2
  46        mov     zeroones, #REP8_01
  47        tst     tmp1, #7
  48        b.ne    L(misaligned8)
  49        ands    tmp1, src1, #7
  50        b.ne    L(mutual_align)
  51        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  52           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  53           can be done in parallel across the entire word.  */
  54L(loop_aligned):
  55        ldr     data1, [src1], #8
  56        ldr     data2, [src2], #8
  57L(start_realigned):
  58        sub     tmp1, data1, zeroones
  59        orr     tmp2, data1, #REP8_7f
  60        eor     diff, data1, data2      /* Non-zero if differences found.  */
  61        bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
  62        orr     syndrome, diff, has_nul
  63        cbz     syndrome, L(loop_aligned)
  64        /* End of performance-critical section  -- one 64B cache line.  */
  65
  66L(end):
  67#ifndef __AARCH64EB__
  68        rev     syndrome, syndrome
  69        rev     data1, data1
  70        /* The MS-non-zero bit of the syndrome marks either the first bit
  71           that is different, or the top bit of the first zero byte.
  72           Shifting left now will bring the critical information into the
  73           top bits.  */
  74        clz     pos, syndrome
  75        rev     data2, data2
  76        lsl     data1, data1, pos
  77        lsl     data2, data2, pos
  78        /* But we need to zero-extend (char is unsigned) the value and then
  79           perform a signed 32-bit subtraction.  */
  80        lsr     data1, data1, #56
  81        sub     result, data1, data2, lsr #56
  82        ret
  83#else
  84        /* For big-endian we cannot use the trick with the syndrome value
  85           as carry-propagation can corrupt the upper bits if the trailing
  86           bytes in the string contain 0x01.  */
  87        /* However, if there is no NUL byte in the dword, we can generate
  88           the result directly.  We can't just subtract the bytes as the
  89           MSB might be significant.  */
  90        cbnz    has_nul, 1f
  91        cmp     data1, data2
  92        cset    result, ne
  93        cneg    result, result, lo
  94        ret
  951:
  96        /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
  97        rev     tmp3, data1
  98        sub     tmp1, tmp3, zeroones
  99        orr     tmp2, tmp3, #REP8_7f
 100        bic     has_nul, tmp1, tmp2
 101        rev     has_nul, has_nul
 102        orr     syndrome, diff, has_nul
 103        clz     pos, syndrome
 104        /* The MS-non-zero bit of the syndrome marks either the first bit
 105           that is different, or the top bit of the first zero byte.
 106           Shifting left now will bring the critical information into the
 107           top bits.  */
 108        lsl     data1, data1, pos
 109        lsl     data2, data2, pos
 110        /* But we need to zero-extend (char is unsigned) the value and then
 111           perform a signed 32-bit subtraction.  */
 112        lsr     data1, data1, #56
 113        sub     result, data1, data2, lsr #56
 114        ret
 115#endif
 116
 117L(mutual_align):
 118        /* Sources are mutually aligned, but are not currently at an
 119           alignment boundary.  Round down the addresses and then mask off
 120           the bytes that preceed the start point.  */
 121        bic     src1, src1, #7
 122        bic     src2, src2, #7
 123        lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
 124        ldr     data1, [src1], #8
 125        neg     tmp1, tmp1              /* Bits to alignment -64.  */
 126        ldr     data2, [src2], #8
 127        mov     tmp2, #~0
 128#ifdef __AARCH64EB__
 129        /* Big-endian.  Early bytes are at MSB.  */
 130        lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
 131#else
 132        /* Little-endian.  Early bytes are at LSB.  */
 133        lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
 134#endif
 135        orr     data1, data1, tmp2
 136        orr     data2, data2, tmp2
 137        b       L(start_realigned)
 138
 139L(misaligned8):
 140        /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
 141           checking to make sure that we don't access beyond page boundary in
 142           SRC2.  */
 143        tst     src1, #7
 144        b.eq    L(loop_misaligned)
 145L(do_misaligned):
 146        ldrb    data1w, [src1], #1
 147        ldrb    data2w, [src2], #1
 148        cmp     data1w, #1
 149        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 150        b.ne    L(done)
 151        tst     src1, #7
 152        b.ne    L(do_misaligned)
 153
 154L(loop_misaligned):
 155        /* Test if we are within the last dword of the end of a 4K page.  If
 156           yes then jump back to the misaligned loop to copy a byte at a time.  */
 157        and     tmp1, src2, #0xff8
 158        eor     tmp1, tmp1, #0xff8
 159        cbz     tmp1, L(do_misaligned)
 160        ldr     data1, [src1], #8
 161        ldr     data2, [src2], #8
 162
 163        sub     tmp1, data1, zeroones
 164        orr     tmp2, data1, #REP8_7f
 165        eor     diff, data1, data2      /* Non-zero if differences found.  */
 166        bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
 167        orr     syndrome, diff, has_nul
 168        cbz     syndrome, L(loop_misaligned)
 169        b       L(end)
 170
 171L(done):
 172        sub     result, data1, data2
 173        ret
 174
 175SYM_FUNC_END_PI(strcmp)
 176EXPORT_SYMBOL_NOHWKASAN(strcmp)
 177