linux/tools/perf/util/demangle-rust.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2#include <string.h>
   3#include "debug.h"
   4
   5#include "demangle-rust.h"
   6
/*
 * Mangled Rust symbols look like this:
 *
 *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
 *
 * The original symbol is:
 *
 *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
 *
 * The last component of the path is a 64-bit hash in lowercase hex, prefixed
 * with "h". Rust does not have a global namespace between crates, an illusion
 * which Rust maintains by using the hash to distinguish things that would
 * otherwise have the same symbol.
 *
 * Any path component not starting with an XID_Start character is prefixed
 * with "_".
 *
 * The following escape sequences are used:
 *
 *     ","  =>  $C$
 *     "@"  =>  $SP$
 *     "*"  =>  $BP$
 *     "&"  =>  $RF$
 *     "<"  =>  $LT$
 *     ">"  =>  $GT$
 *     "("  =>  $LP$
 *     ")"  =>  $RP$
 *     " "  =>  $u20$
 *     "'"  =>  $u27$
 *     "["  =>  $u5b$
 *     "]"  =>  $u5d$
 *     "~"  =>  $u7e$
 *
 * A double ".." means "::" and a single "." means "-".
 *
 * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
 */
  44
  45static const char *hash_prefix = "::h";
  46static const size_t hash_prefix_len = 3;
  47static const size_t hash_len = 16;
  48
  49static bool is_prefixed_hash(const char *start);
  50static bool looks_like_rust(const char *sym, size_t len);
  51static bool unescape(const char **in, char **out, const char *seq, char value);
  52
  53/*
  54 * INPUT:
  55 *     sym: symbol that has been through BFD-demangling
  56 *
  57 * This function looks for the following indicators:
  58 *
  59 *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
  60 *
  61 *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
  62 *     hex digits. This is true of 99.9998% of hashes so once in your life you
  63 *     may see a false negative. The point is to notice path components that
  64 *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
  65 *     this case a false positive (non-Rust symbol has an important path
  66 *     component removed because it looks like a Rust hash) is worse than a
  67 *     false negative (the rare Rust symbol is not demangled) so this sets the
  68 *     balance in favor of false negatives.
  69 *
  70 *  3. There must be no characters other than a-zA-Z0-9 and _.:$
  71 *
  72 *  4. There must be no unrecognized $-sign sequences.
  73 *
  74 *  5. There must be no sequence of three or more dots in a row ("...").
  75 */
  76bool
  77rust_is_mangled(const char *sym)
  78{
  79        size_t len, len_without_hash;
  80
  81        if (!sym)
  82                return false;
  83
  84        len = strlen(sym);
  85        if (len <= hash_prefix_len + hash_len)
  86                /* Not long enough to contain "::h" + hash + something else */
  87                return false;
  88
  89        len_without_hash = len - (hash_prefix_len + hash_len);
  90        if (!is_prefixed_hash(sym + len_without_hash))
  91                return false;
  92
  93        return looks_like_rust(sym, len_without_hash);
  94}
  95
  96/*
  97 * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
  98 * digits must comprise between 5 and 15 (inclusive) distinct digits.
  99 */
 100static bool is_prefixed_hash(const char *str)
 101{
 102        const char *end;
 103        bool seen[16];
 104        size_t i;
 105        int count;
 106
 107        if (strncmp(str, hash_prefix, hash_prefix_len))
 108                return false;
 109        str += hash_prefix_len;
 110
 111        memset(seen, false, sizeof(seen));
 112        for (end = str + hash_len; str < end; str++)
 113                if (*str >= '0' && *str <= '9')
 114                        seen[*str - '0'] = true;
 115                else if (*str >= 'a' && *str <= 'f')
 116                        seen[*str - 'a' + 10] = true;
 117                else
 118                        return false;
 119
 120        /* Count how many distinct digits seen */
 121        count = 0;
 122        for (i = 0; i < 16; i++)
 123                if (seen[i])
 124                        count++;
 125
 126        return count >= 5 && count <= 15;
 127}
 128
 129static bool looks_like_rust(const char *str, size_t len)
 130{
 131        const char *end = str + len;
 132
 133        while (str < end)
 134                switch (*str) {
 135                case '$':
 136                        if (!strncmp(str, "$C$", 3))
 137                                str += 3;
 138                        else if (!strncmp(str, "$SP$", 4)
 139                                        || !strncmp(str, "$BP$", 4)
 140                                        || !strncmp(str, "$RF$", 4)
 141                                        || !strncmp(str, "$LT$", 4)
 142                                        || !strncmp(str, "$GT$", 4)
 143                                        || !strncmp(str, "$LP$", 4)
 144                                        || !strncmp(str, "$RP$", 4))
 145                                str += 4;
 146                        else if (!strncmp(str, "$u20$", 5)
 147                                        || !strncmp(str, "$u27$", 5)
 148                                        || !strncmp(str, "$u5b$", 5)
 149                                        || !strncmp(str, "$u5d$", 5)
 150                                        || !strncmp(str, "$u7e$", 5))
 151                                str += 5;
 152                        else
 153                                return false;
 154                        break;
 155                case '.':
 156                        /* Do not allow three or more consecutive dots */
 157                        if (!strncmp(str, "...", 3))
 158                                return false;
 159                        /* Fall through */
 160                case 'a' ... 'z':
 161                case 'A' ... 'Z':
 162                case '0' ... '9':
 163                case '_':
 164                case ':':
 165                        str++;
 166                        break;
 167                default:
 168                        return false;
 169                }
 170
 171        return true;
 172}
 173
 174/*
 175 * INPUT:
 176 *     sym: symbol for which rust_is_mangled(sym) returns true
 177 *
 178 * The input is demangled in-place because the mangled name is always longer
 179 * than the demangled one.
 180 */
 181void
 182rust_demangle_sym(char *sym)
 183{
 184        const char *in;
 185        char *out;
 186        const char *end;
 187
 188        if (!sym)
 189                return;
 190
 191        in = sym;
 192        out = sym;
 193        end = sym + strlen(sym) - (hash_prefix_len + hash_len);
 194
 195        while (in < end)
 196                switch (*in) {
 197                case '$':
 198                        if (!(unescape(&in, &out, "$C$", ',')
 199                                        || unescape(&in, &out, "$SP$", '@')
 200                                        || unescape(&in, &out, "$BP$", '*')
 201                                        || unescape(&in, &out, "$RF$", '&')
 202                                        || unescape(&in, &out, "$LT$", '<')
 203                                        || unescape(&in, &out, "$GT$", '>')
 204                                        || unescape(&in, &out, "$LP$", '(')
 205                                        || unescape(&in, &out, "$RP$", ')')
 206                                        || unescape(&in, &out, "$u20$", ' ')
 207                                        || unescape(&in, &out, "$u27$", '\'')
 208                                        || unescape(&in, &out, "$u5b$", '[')
 209                                        || unescape(&in, &out, "$u5d$", ']')
 210                                        || unescape(&in, &out, "$u7e$", '~'))) {
 211                                pr_err("demangle-rust: unexpected escape sequence");
 212                                goto done;
 213                        }
 214                        break;
 215                case '_':
 216                        /*
 217                         * If this is the start of a path component and the next
 218                         * character is an escape sequence, ignore the
 219                         * underscore. The mangler inserts an underscore to make
 220                         * sure the path component begins with a XID_Start
 221                         * character.
 222                         */
 223                        if ((in == sym || in[-1] == ':') && in[1] == '$')
 224                                in++;
 225                        else
 226                                *out++ = *in++;
 227                        break;
 228                case '.':
 229                        if (in[1] == '.') {
 230                                /* ".." becomes "::" */
 231                                *out++ = ':';
 232                                *out++ = ':';
 233                                in += 2;
 234                        } else {
 235                                /* "." becomes "-" */
 236                                *out++ = '-';
 237                                in++;
 238                        }
 239                        break;
 240                case 'a' ... 'z':
 241                case 'A' ... 'Z':
 242                case '0' ... '9':
 243                case ':':
 244                        *out++ = *in++;
 245                        break;
 246                default:
 247                        pr_err("demangle-rust: unexpected character '%c' in symbol\n",
 248                                *in);
 249                        goto done;
 250                }
 251
 252done:
 253        *out = '\0';
 254}
 255
 256static bool unescape(const char **in, char **out, const char *seq, char value)
 257{
 258        size_t len = strlen(seq);
 259
 260        if (strncmp(*in, seq, len))
 261                return false;
 262
 263        **out = value;
 264
 265        *in += len;
 266        *out += 1;
 267
 268        return true;
 269}
 270