linux/tools/testing/selftests/bpf/progs/strobemeta.h
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2// Copyright (c) 2019 Facebook
   3
   4#include <stdint.h>
   5#include <stddef.h>
   6#include <stdbool.h>
   7#include <linux/bpf.h>
   8#include <linux/ptrace.h>
   9#include <linux/sched.h>
  10#include <linux/types.h>
  11#include <bpf/bpf_helpers.h>
  12
  13typedef uint32_t pid_t;
  14struct task_struct {};
  15
  16#define TASK_COMM_LEN 16
  17#define PERF_MAX_STACK_DEPTH 127
  18
  19#define STROBE_TYPE_INVALID 0
  20#define STROBE_TYPE_INT 1
  21#define STROBE_TYPE_STR 2
  22#define STROBE_TYPE_MAP 3
  23
  24#define STACK_TABLE_EPOCH_SHIFT 20
  25#define STROBE_MAX_STR_LEN 1
  26#define STROBE_MAX_CFGS 32
  27#define STROBE_MAX_PAYLOAD                                              \
  28        (STROBE_MAX_STRS * STROBE_MAX_STR_LEN +                         \
  29        STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
  30
  31struct strobe_value_header {
  32        /*
  33         * meaning depends on type:
  34         * 1. int: 0, if value not set, 1 otherwise
  35         * 2. str: 1 always, whether value is set or not is determined by ptr
  36         * 3. map: 1 always, pointer points to additional struct with number
  37         *    of entries (up to STROBE_MAX_MAP_ENTRIES)
  38         */
  39        uint16_t len;
  40        /*
  41         * _reserved might be used for some future fields/flags, but we always
  42         * want to keep strobe_value_header to be 8 bytes, so BPF can read 16
  43         * bytes in one go and get both header and value
  44         */
  45        uint8_t _reserved[6];
  46};
  47
  48/*
  49 * strobe_value_generic is used from BPF probe only, but needs to be a union
  50 * of strobe_value_int/strobe_value_str/strobe_value_map
  51 */
  52struct strobe_value_generic {
  53        struct strobe_value_header header;
  54        union {
  55                int64_t val;
  56                void *ptr;
  57        };
  58};
  59
  60struct strobe_value_int {
  61        struct strobe_value_header header;
  62        int64_t value;
  63};
  64
  65struct strobe_value_str {
  66        struct strobe_value_header header;
  67        const char* value;
  68};
  69
  70struct strobe_value_map {
  71        struct strobe_value_header header;
  72        const struct strobe_map_raw* value;
  73};
  74
  75struct strobe_map_entry {
  76        const char* key;
  77        const char* val;
  78};
  79
  80/*
  81 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
  82 * corresponding int64 ID, which application can use (or ignore) in whatever
  83 * way appropriate. Map is "write-only", there is no way to get data out of
  84 * map. Map is intended to be used to provide metadata for profilers and is
  85 * not to be used for internal in-app communication. All methods are
  86 * thread-safe.
  87 */
  88struct strobe_map_raw {
  89        /*
  90         * general purpose unique ID that's up to application to decide
  91         * whether and how to use; for request metadata use case id is unique
  92         * request ID that's used to match metadata with stack traces on
  93         * Strobelight backend side
  94         */
  95        int64_t id;
  96        /* number of used entries in map */
  97        int64_t cnt;
  98        /*
  99         * having volatile doesn't change anything on BPF side, but clang
 100         * emits warnings for passing `volatile const char *` into
 101         * bpf_probe_read_user_str that expects just `const char *`
 102         */
 103        const char* tag;
 104        /*
 105         * key/value entries, each consisting of 2 pointers to key and value
 106         * C strings
 107         */
 108        struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
 109};
 110
 111/* Following values define supported values of TLS mode */
 112#define TLS_NOT_SET -1
 113#define TLS_LOCAL_EXEC 0
 114#define TLS_IMM_EXEC 1
 115#define TLS_GENERAL_DYN 2
 116
 117/*
 118 * structure that universally represents TLS location (both for static
 119 * executables and shared libraries)
 120 */
 121struct strobe_value_loc {
 122        /*
 123         * tls_mode defines what TLS mode was used for particular metavariable:
 124         * - -1 (TLS_NOT_SET) - no metavariable;
 125         * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
 126         * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
 127         * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
 128         * Local Dynamic mode is not yet supported, because never seen in
 129         * practice.  Mode defines how offset field is interpreted. See
 130         * calc_location() in below for details.
 131         */
 132        int64_t tls_mode;
 133        /*
 134         * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64,
 135         * tpidr_el0 for aarch64).
 136         * TLS_IMM_EXEC: absolute address of GOT entry containing offset
 137         * from thread pointer;
 138         * TLS_GENERAL_DYN: absolute addres of double GOT entry
 139         * containing tls_index_t struct;
 140         */
 141        int64_t offset;
 142};
 143
 144struct strobemeta_cfg {
 145        int64_t req_meta_idx;
 146        struct strobe_value_loc int_locs[STROBE_MAX_INTS];
 147        struct strobe_value_loc str_locs[STROBE_MAX_STRS];
 148        struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
 149};
 150
 151struct strobe_map_descr {
 152        uint64_t id;
 153        int16_t tag_len;
 154        /*
 155         * cnt <0 - map value isn't set;
 156         * 0 - map has id set, but no key/value entries
 157         */
 158        int16_t cnt;
 159        /*
 160         * both key_lens[i] and val_lens[i] should be >0 for present key/value
 161         * entry
 162         */
 163        uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
 164        uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
 165};
 166
 167struct strobemeta_payload {
 168        /* req_id has valid request ID, if req_meta_valid == 1 */
 169        int64_t req_id;
 170        uint8_t req_meta_valid;
 171        /*
 172         * mask has Nth bit set to 1, if Nth metavar was present and
 173         * successfully read
 174         */
 175        uint64_t int_vals_set_mask;
 176        int64_t int_vals[STROBE_MAX_INTS];
 177        /* len is >0 for present values */
 178        uint16_t str_lens[STROBE_MAX_STRS];
 179        /* if map_descrs[i].cnt == -1, metavar is not present/set */
 180        struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
 181        /*
 182         * payload has compactly packed values of str and map variables in the
 183         * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
 184         * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
 185         * value length
 186         */
 187        char payload[STROBE_MAX_PAYLOAD];
 188};
 189
 190struct strobelight_bpf_sample {
 191        uint64_t ktime;
 192        char comm[TASK_COMM_LEN];
 193        pid_t pid;
 194        int user_stack_id;
 195        int kernel_stack_id;
 196        int has_meta;
 197        struct strobemeta_payload metadata;
 198        /*
 199         * makes it possible to pass (<real payload size> + 1) as data size to
 200         * perf_submit() to avoid perf_submit's paranoia about passing zero as
 201         * size, as it deduces that <real payload size> might be
 202         * **theoretically** zero
 203         */
 204        char dummy_safeguard;
 205};
 206
 207struct {
 208        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
 209        __uint(max_entries, 32);
 210        __uint(key_size, sizeof(int));
 211        __uint(value_size, sizeof(int));
 212} samples SEC(".maps");
 213
 214struct {
 215        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
 216        __uint(max_entries, 16);
 217        __uint(key_size, sizeof(uint32_t));
 218        __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
 219} stacks_0 SEC(".maps");
 220
 221struct {
 222        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
 223        __uint(max_entries, 16);
 224        __uint(key_size, sizeof(uint32_t));
 225        __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
 226} stacks_1 SEC(".maps");
 227
 228struct {
 229        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 230        __uint(max_entries, 1);
 231        __type(key, uint32_t);
 232        __type(value, struct strobelight_bpf_sample);
 233} sample_heap SEC(".maps");
 234
 235struct {
 236        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 237        __uint(max_entries, STROBE_MAX_CFGS);
 238        __type(key, pid_t);
 239        __type(value, struct strobemeta_cfg);
 240} strobemeta_cfgs SEC(".maps");
 241
 242/* Type for the dtv.  */
 243/* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */
 244typedef union dtv {
 245        size_t counter;
 246        struct {
 247                void* val;
 248                bool is_static;
 249        } pointer;
 250} dtv_t;
 251
 252/* Partial definition for tcbhead_t */
 253/* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
 254struct tcbhead {
 255        void* tcb;
 256        dtv_t* dtv;
 257};
 258
 259/*
 260 * TLS module/offset information for shared library case.
 261 * For x86-64, this is mapped onto two entries in GOT.
 262 * For aarch64, this is pointed to by second GOT entry.
 263 */
 264struct tls_index {
 265        uint64_t module;
 266        uint64_t offset;
 267};
 268
 269#ifdef SUBPROGS
 270__noinline
 271#else
 272__always_inline
 273#endif
 274static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
 275{
 276        /*
 277         * tls_mode value is:
 278         * - -1 (TLS_NOT_SET), if no metavar is present;
 279         * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
 280         * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
 281         * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
 282         * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
 283         * This schema allows to use something like:
 284         * (tls_mode + 1) * (tls_base + offset)
 285         * to get NULL for "no metavar" location, or correct pointer for local
 286         * executable mode without doing extra ifs.
 287         */
 288        if (loc->tls_mode <= TLS_LOCAL_EXEC) {
 289                /* static executable is simple, we just have offset from
 290                 * tls_base */
 291                void *addr = tls_base + loc->offset;
 292                /* multiply by (tls_mode + 1) to get NULL, if we have no
 293                 * metavar in this slot */
 294                return (void *)((loc->tls_mode + 1) * (int64_t)addr);
 295        }
 296        /*
 297         * Other modes are more complicated, we need to jump through few hoops.
 298         *
 299         * For immediate executable mode (currently supported only for aarch64):
 300         *  - loc->offset is pointing to a GOT entry containing fixed offset
 301         *  relative to tls_base;
 302         *
 303         * For general dynamic mode:
 304         *  - loc->offset is pointing to a beginning of double GOT entries;
 305         *  - (for aarch64 only) second entry points to tls_index_t struct;
 306         *  - (for x86-64 only) two GOT entries are already tls_index_t;
 307         *  - tls_index_t->module is used to find start of TLS section in
 308         *  which variable resides;
 309         *  - tls_index_t->offset provides offset within that TLS section,
 310         *  pointing to value of variable.
 311         */
 312        struct tls_index tls_index;
 313        dtv_t *dtv;
 314        void *tls_ptr;
 315
 316        bpf_probe_read_user(&tls_index, sizeof(struct tls_index),
 317                            (void *)loc->offset);
 318        /* valid module index is always positive */
 319        if (tls_index.module > 0) {
 320                /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
 321                bpf_probe_read_user(&dtv, sizeof(dtv),
 322                                    &((struct tcbhead *)tls_base)->dtv);
 323                dtv += tls_index.module;
 324        } else {
 325                dtv = NULL;
 326        }
 327        bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
 328        /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
 329        return tls_ptr && tls_ptr != (void *)-1
 330                ? tls_ptr + tls_index.offset
 331                : NULL;
 332}
 333
 334#ifdef SUBPROGS
 335__noinline
 336#else
 337__always_inline
 338#endif
 339static void read_int_var(struct strobemeta_cfg *cfg,
 340                         size_t idx, void *tls_base,
 341                         struct strobe_value_generic *value,
 342                         struct strobemeta_payload *data)
 343{
 344        void *location = calc_location(&cfg->int_locs[idx], tls_base);
 345        if (!location)
 346                return;
 347
 348        bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
 349        data->int_vals[idx] = value->val;
 350        if (value->header.len)
 351                data->int_vals_set_mask |= (1 << idx);
 352}
 353
 354static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
 355                                             size_t idx, void *tls_base,
 356                                             struct strobe_value_generic *value,
 357                                             struct strobemeta_payload *data,
 358                                             void *payload)
 359{
 360        void *location;
 361        uint32_t len;
 362
 363        data->str_lens[idx] = 0;
 364        location = calc_location(&cfg->str_locs[idx], tls_base);
 365        if (!location)
 366                return 0;
 367
 368        bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
 369        len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr);
 370        /*
 371         * if bpf_probe_read_user_str returns error (<0), due to casting to
 372         * unsinged int, it will become big number, so next check is
 373         * sufficient to check for errors AND prove to BPF verifier, that
 374         * bpf_probe_read_user_str won't return anything bigger than
 375         * STROBE_MAX_STR_LEN
 376         */
 377        if (len > STROBE_MAX_STR_LEN)
 378                return 0;
 379
 380        data->str_lens[idx] = len;
 381        return len;
 382}
 383
 384static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
 385                                          size_t idx, void *tls_base,
 386                                          struct strobe_value_generic *value,
 387                                          struct strobemeta_payload *data,
 388                                          void *payload)
 389{
 390        struct strobe_map_descr* descr = &data->map_descrs[idx];
 391        struct strobe_map_raw map;
 392        void *location;
 393        uint32_t len;
 394        int i;
 395
 396        descr->tag_len = 0; /* presume no tag is set */
 397        descr->cnt = -1; /* presume no value is set */
 398
 399        location = calc_location(&cfg->map_locs[idx], tls_base);
 400        if (!location)
 401                return payload;
 402
 403        bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
 404        if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
 405                return payload;
 406
 407        descr->id = map.id;
 408        descr->cnt = map.cnt;
 409        if (cfg->req_meta_idx == idx) {
 410                data->req_id = map.id;
 411                data->req_meta_valid = 1;
 412        }
 413
 414        len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag);
 415        if (len <= STROBE_MAX_STR_LEN) {
 416                descr->tag_len = len;
 417                payload += len;
 418        }
 419
 420#ifdef NO_UNROLL
 421#pragma clang loop unroll(disable)
 422#else
 423#pragma unroll
 424#endif
 425        for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
 426                if (i >= map.cnt)
 427                        break;
 428
 429                descr->key_lens[i] = 0;
 430                len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
 431                                              map.entries[i].key);
 432                if (len <= STROBE_MAX_STR_LEN) {
 433                        descr->key_lens[i] = len;
 434                        payload += len;
 435                }
 436                descr->val_lens[i] = 0;
 437                len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
 438                                              map.entries[i].val);
 439                if (len <= STROBE_MAX_STR_LEN) {
 440                        descr->val_lens[i] = len;
 441                        payload += len;
 442                }
 443        }
 444
 445        return payload;
 446}
 447
 448/*
 449 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
 450 * pointer to *right after* payload ends
 451 */
 452#ifdef SUBPROGS
 453__noinline
 454#else
 455__always_inline
 456#endif
 457static void *read_strobe_meta(struct task_struct *task,
 458                              struct strobemeta_payload *data)
 459{
 460        pid_t pid = bpf_get_current_pid_tgid() >> 32;
 461        struct strobe_value_generic value = {0};
 462        struct strobemeta_cfg *cfg;
 463        void *tls_base, *payload;
 464
 465        cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
 466        if (!cfg)
 467                return NULL;
 468
 469        data->int_vals_set_mask = 0;
 470        data->req_meta_valid = 0;
 471        payload = data->payload;
 472        /*
 473         * we don't have struct task_struct definition, it should be:
 474         * tls_base = (void *)task->thread.fsbase;
 475         */
 476        tls_base = (void *)task;
 477
 478#ifdef NO_UNROLL
 479#pragma clang loop unroll(disable)
 480#else
 481#pragma unroll
 482#endif
 483        for (int i = 0; i < STROBE_MAX_INTS; ++i) {
 484                read_int_var(cfg, i, tls_base, &value, data);
 485        }
 486#ifdef NO_UNROLL
 487#pragma clang loop unroll(disable)
 488#else
 489#pragma unroll
 490#endif
 491        for (int i = 0; i < STROBE_MAX_STRS; ++i) {
 492                payload += read_str_var(cfg, i, tls_base, &value, data, payload);
 493        }
 494#ifdef NO_UNROLL
 495#pragma clang loop unroll(disable)
 496#else
 497#pragma unroll
 498#endif
 499        for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
 500                payload = read_map_var(cfg, i, tls_base, &value, data, payload);
 501        }
 502        /*
 503         * return pointer right after end of payload, so it's possible to
 504         * calculate exact amount of useful data that needs to be sent
 505         */
 506        return payload;
 507}
 508
 509SEC("raw_tracepoint/kfree_skb")
 510int on_event(struct pt_regs *ctx) {
 511        pid_t pid =  bpf_get_current_pid_tgid() >> 32;
 512        struct strobelight_bpf_sample* sample;
 513        struct task_struct *task;
 514        uint32_t zero = 0;
 515        uint64_t ktime_ns;
 516        void *sample_end;
 517
 518        sample = bpf_map_lookup_elem(&sample_heap, &zero);
 519        if (!sample)
 520                return 0; /* this will never happen */
 521
 522        sample->pid = pid;
 523        bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
 524        ktime_ns = bpf_ktime_get_ns();
 525        sample->ktime = ktime_ns;
 526
 527        task = (struct task_struct *)bpf_get_current_task();
 528        sample_end = read_strobe_meta(task, &sample->metadata);
 529        sample->has_meta = sample_end != NULL;
 530        sample_end = sample_end ? : &sample->metadata;
 531
 532        if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
 533                sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
 534                sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
 535        } else {
 536                sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
 537                sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
 538        }
 539
 540        uint64_t sample_size = sample_end - (void *)sample;
 541        /* should always be true */
 542        if (sample_size < sizeof(struct strobelight_bpf_sample))
 543                bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
 544        return 0;
 545}
 546
 547char _license[] SEC("license") = "GPL";
 548