linux/tools/perf/util/demangle-rust.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2#include <string.h>
   3#include "util.h"
   4#include "debug.h"
   5
   6#include "demangle-rust.h"
   7
/*
 * Mangled Rust symbols look like this:
 *
 *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
 *
 * The original symbol is:
 *
 *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
 *
 * The last component of the path is a 64-bit hash in lowercase hex, prefixed
 * with "h". Rust does not have a global namespace between crates, an illusion
 * which Rust maintains by using the hash to distinguish things that would
 * otherwise have the same symbol.
 *
 * Any path component not starting with an XID_Start character is prefixed with
 * "_".
 *
 * The following escape sequences are used:
 *
 *     ","  =>  $C$
 *     "@"  =>  $SP$
 *     "*"  =>  $BP$
 *     "&"  =>  $RF$
 *     "<"  =>  $LT$
 *     ">"  =>  $GT$
 *     "("  =>  $LP$
 *     ")"  =>  $RP$
 *     " "  =>  $u20$
 *     "'"  =>  $u27$
 *     "["  =>  $u5b$
 *     "]"  =>  $u5d$
 *     "~"  =>  $u7e$
 *
 * A double ".." means "::" and a single "." means "-".
 *
 * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
 */
  45
  46static const char *hash_prefix = "::h";
  47static const size_t hash_prefix_len = 3;
  48static const size_t hash_len = 16;
  49
  50static bool is_prefixed_hash(const char *start);
  51static bool looks_like_rust(const char *sym, size_t len);
  52static bool unescape(const char **in, char **out, const char *seq, char value);
  53
  54/*
  55 * INPUT:
  56 *     sym: symbol that has been through BFD-demangling
  57 *
  58 * This function looks for the following indicators:
  59 *
  60 *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
  61 *
  62 *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
  63 *     hex digits. This is true of 99.9998% of hashes so once in your life you
  64 *     may see a false negative. The point is to notice path components that
  65 *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
  66 *     this case a false positive (non-Rust symbol has an important path
  67 *     component removed because it looks like a Rust hash) is worse than a
  68 *     false negative (the rare Rust symbol is not demangled) so this sets the
  69 *     balance in favor of false negatives.
  70 *
  71 *  3. There must be no characters other than a-zA-Z0-9 and _.:$
  72 *
  73 *  4. There must be no unrecognized $-sign sequences.
  74 *
  75 *  5. There must be no sequence of three or more dots in a row ("...").
  76 */
  77bool
  78rust_is_mangled(const char *sym)
  79{
  80        size_t len, len_without_hash;
  81
  82        if (!sym)
  83                return false;
  84
  85        len = strlen(sym);
  86        if (len <= hash_prefix_len + hash_len)
  87                /* Not long enough to contain "::h" + hash + something else */
  88                return false;
  89
  90        len_without_hash = len - (hash_prefix_len + hash_len);
  91        if (!is_prefixed_hash(sym + len_without_hash))
  92                return false;
  93
  94        return looks_like_rust(sym, len_without_hash);
  95}
  96
  97/*
  98 * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
  99 * digits must comprise between 5 and 15 (inclusive) distinct digits.
 100 */
 101static bool is_prefixed_hash(const char *str)
 102{
 103        const char *end;
 104        bool seen[16];
 105        size_t i;
 106        int count;
 107
 108        if (strncmp(str, hash_prefix, hash_prefix_len))
 109                return false;
 110        str += hash_prefix_len;
 111
 112        memset(seen, false, sizeof(seen));
 113        for (end = str + hash_len; str < end; str++)
 114                if (*str >= '0' && *str <= '9')
 115                        seen[*str - '0'] = true;
 116                else if (*str >= 'a' && *str <= 'f')
 117                        seen[*str - 'a' + 10] = true;
 118                else
 119                        return false;
 120
 121        /* Count how many distinct digits seen */
 122        count = 0;
 123        for (i = 0; i < 16; i++)
 124                if (seen[i])
 125                        count++;
 126
 127        return count >= 5 && count <= 15;
 128}
 129
 130static bool looks_like_rust(const char *str, size_t len)
 131{
 132        const char *end = str + len;
 133
 134        while (str < end)
 135                switch (*str) {
 136                case '$':
 137                        if (!strncmp(str, "$C$", 3))
 138                                str += 3;
 139                        else if (!strncmp(str, "$SP$", 4)
 140                                        || !strncmp(str, "$BP$", 4)
 141                                        || !strncmp(str, "$RF$", 4)
 142                                        || !strncmp(str, "$LT$", 4)
 143                                        || !strncmp(str, "$GT$", 4)
 144                                        || !strncmp(str, "$LP$", 4)
 145                                        || !strncmp(str, "$RP$", 4))
 146                                str += 4;
 147                        else if (!strncmp(str, "$u20$", 5)
 148                                        || !strncmp(str, "$u27$", 5)
 149                                        || !strncmp(str, "$u5b$", 5)
 150                                        || !strncmp(str, "$u5d$", 5)
 151                                        || !strncmp(str, "$u7e$", 5))
 152                                str += 5;
 153                        else
 154                                return false;
 155                        break;
 156                case '.':
 157                        /* Do not allow three or more consecutive dots */
 158                        if (!strncmp(str, "...", 3))
 159                                return false;
 160                        /* Fall through */
 161                case 'a' ... 'z':
 162                case 'A' ... 'Z':
 163                case '0' ... '9':
 164                case '_':
 165                case ':':
 166                        str++;
 167                        break;
 168                default:
 169                        return false;
 170                }
 171
 172        return true;
 173}
 174
 175/*
 176 * INPUT:
 177 *     sym: symbol for which rust_is_mangled(sym) returns true
 178 *
 179 * The input is demangled in-place because the mangled name is always longer
 180 * than the demangled one.
 181 */
 182void
 183rust_demangle_sym(char *sym)
 184{
 185        const char *in;
 186        char *out;
 187        const char *end;
 188
 189        if (!sym)
 190                return;
 191
 192        in = sym;
 193        out = sym;
 194        end = sym + strlen(sym) - (hash_prefix_len + hash_len);
 195
 196        while (in < end)
 197                switch (*in) {
 198                case '$':
 199                        if (!(unescape(&in, &out, "$C$", ',')
 200                                        || unescape(&in, &out, "$SP$", '@')
 201                                        || unescape(&in, &out, "$BP$", '*')
 202                                        || unescape(&in, &out, "$RF$", '&')
 203                                        || unescape(&in, &out, "$LT$", '<')
 204                                        || unescape(&in, &out, "$GT$", '>')
 205                                        || unescape(&in, &out, "$LP$", '(')
 206                                        || unescape(&in, &out, "$RP$", ')')
 207                                        || unescape(&in, &out, "$u20$", ' ')
 208                                        || unescape(&in, &out, "$u27$", '\'')
 209                                        || unescape(&in, &out, "$u5b$", '[')
 210                                        || unescape(&in, &out, "$u5d$", ']')
 211                                        || unescape(&in, &out, "$u7e$", '~'))) {
 212                                pr_err("demangle-rust: unexpected escape sequence");
 213                                goto done;
 214                        }
 215                        break;
 216                case '_':
 217                        /*
 218                         * If this is the start of a path component and the next
 219                         * character is an escape sequence, ignore the
 220                         * underscore. The mangler inserts an underscore to make
 221                         * sure the path component begins with a XID_Start
 222                         * character.
 223                         */
 224                        if ((in == sym || in[-1] == ':') && in[1] == '$')
 225                                in++;
 226                        else
 227                                *out++ = *in++;
 228                        break;
 229                case '.':
 230                        if (in[1] == '.') {
 231                                /* ".." becomes "::" */
 232                                *out++ = ':';
 233                                *out++ = ':';
 234                                in += 2;
 235                        } else {
 236                                /* "." becomes "-" */
 237                                *out++ = '-';
 238                                in++;
 239                        }
 240                        break;
 241                case 'a' ... 'z':
 242                case 'A' ... 'Z':
 243                case '0' ... '9':
 244                case ':':
 245                        *out++ = *in++;
 246                        break;
 247                default:
 248                        pr_err("demangle-rust: unexpected character '%c' in symbol\n",
 249                                *in);
 250                        goto done;
 251                }
 252
 253done:
 254        *out = '\0';
 255}
 256
 257static bool unescape(const char **in, char **out, const char *seq, char value)
 258{
 259        size_t len = strlen(seq);
 260
 261        if (strncmp(*in, seq, len))
 262                return false;
 263
 264        **out = value;
 265
 266        *in += len;
 267        *out += 1;
 268
 269        return true;
 270}
 271