linux/tools/perf/builtin-record.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * builtin-record.c
   4 *
   5 * Builtin record command: Record the profile of a workload
   6 * (or a CPU, or a PID) into the perf.data output file - for
   7 * later analysis via perf report.
   8 */
   9#include "builtin.h"
  10
  11#include "perf.h"
  12
  13#include "util/build-id.h"
  14#include <subcmd/parse-options.h>
  15#include "util/parse-events.h"
  16#include "util/config.h"
  17
  18#include "util/callchain.h"
  19#include "util/cgroup.h"
  20#include "util/header.h"
  21#include "util/event.h"
  22#include "util/evlist.h"
  23#include "util/evsel.h"
  24#include "util/debug.h"
  25#include "util/session.h"
  26#include "util/tool.h"
  27#include "util/symbol.h"
  28#include "util/cpumap.h"
  29#include "util/thread_map.h"
  30#include "util/data.h"
  31#include "util/perf_regs.h"
  32#include "util/auxtrace.h"
  33#include "util/tsc.h"
  34#include "util/parse-branch-options.h"
  35#include "util/parse-regs-options.h"
  36#include "util/llvm-utils.h"
  37#include "util/bpf-loader.h"
  38#include "util/trigger.h"
  39#include "util/perf-hooks.h"
  40#include "util/cpu-set-sched.h"
  41#include "util/time-utils.h"
  42#include "util/units.h"
  43#include "util/bpf-event.h"
  44#include "asm/bug.h"
  45
  46#include <errno.h>
  47#include <inttypes.h>
  48#include <locale.h>
  49#include <poll.h>
  50#include <unistd.h>
  51#include <sched.h>
  52#include <signal.h>
  53#include <sys/mman.h>
  54#include <sys/wait.h>
  55#include <linux/time64.h>
  56#include <linux/zalloc.h>
  57
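     /*
      * State for switching ('rotating') the perf.data output: a switch can be
      * requested by signal, after a given amount of written data (size) or
      * after a given time; filenames/num_files/cur_file implement a ring of
      * kept output files when a maximum number of files is configured.
      */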
  58struct switch_output {
  59        bool             enabled;
  60        bool             signal;
  61        unsigned long    size;
  62        unsigned long    time;
  63        const char      *str;
  64        bool             set;
  65        char             **filenames;
  66        int              num_files;
  67        int              cur_file;
  68};
  69
  70struct record {
  71        struct perf_tool        tool;
  72        struct record_opts      opts;
  73        u64                     bytes_written;
  74        struct perf_data        data;
  75        struct auxtrace_record  *itr;
  76        struct perf_evlist      *evlist;
  77        struct perf_session     *session;
  78        int                     realtime_prio;
  79        bool                    no_buildid;
  80        bool                    no_buildid_set;
  81        bool                    no_buildid_cache;
  82        bool                    no_buildid_cache_set;
  83        bool                    buildid_all;
  84        bool                    timestamp_filename;
  85        bool                    timestamp_boundary;
  86        struct switch_output    switch_output;
  87        unsigned long long      samples;
  88        cpu_set_t               affinity_mask;
  89};
  90
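     /*
      * Triggers set from signal context and consumed by the main recording
      * loop: one requests an AUX area tracing snapshot, the other a switch
      * of the output file.
      */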
  91static volatile int auxtrace_record__snapshot_started;
  92static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
  93static DEFINE_TRIGGER(switch_output_trigger);
  94
  95static const char *affinity_tags[PERF_AFFINITY_MAX] = {
  96        "SYS", "NODE", "CPU"
  97};
  98
  99static bool switch_output_signal(struct record *rec)
 100{
 101        return rec->switch_output.signal &&
 102               trigger_is_ready(&switch_output_trigger);
 103}
 104
 105static bool switch_output_size(struct record *rec)
 106{
 107        return rec->switch_output.size &&
 108               trigger_is_ready(&switch_output_trigger) &&
 109               (rec->bytes_written >= rec->switch_output.size);
 110}
 111
 112static bool switch_output_time(struct record *rec)
 113{
 114        return rec->switch_output.time &&
 115               trigger_is_ready(&switch_output_trigger);
 116}
 117
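     /*
      * Write a block of bytes to the perf.data file, account for it in
      * rec->bytes_written and fire the switch-output trigger once the
      * configured size threshold has been crossed.
      */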
 118static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
 119                         void *bf, size_t size)
 120{
 121        struct perf_data_file *file = &rec->session->data->file;
 122
 123        if (perf_data_file__write(file, bf, size) < 0) {
 124                pr_err("failed to write perf data, error: %m\n");
 125                return -1;
 126        }
 127
 128        rec->bytes_written += size;
 129
 130        if (switch_output_size(rec))
 131                trigger_hit(&switch_output_trigger);
 132
 133        return 0;
 134}
 135
 136static int record__aio_enabled(struct record *rec);
 137static int record__comp_enabled(struct record *rec);
 138static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
 139                            void *src, size_t src_size);
 140
 141#ifdef HAVE_AIO_SUPPORT
 142static int record__aio_write(struct aiocb *cblock, int trace_fd,
 143                void *buf, size_t size, off_t off)
 144{
 145        int rc;
 146
 147        cblock->aio_fildes = trace_fd;
 148        cblock->aio_buf    = buf;
 149        cblock->aio_nbytes = size;
 150        cblock->aio_offset = off;
 151        cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
 152
 153        do {
 154                rc = aio_write(cblock);
 155                if (rc == 0) {
 156                        break;
 157                } else if (errno != EAGAIN) {
 158                        cblock->aio_fildes = -1;
 159                        pr_err("failed to queue perf data, error: %m\n");
 160                        break;
 161                }
 162        } while (1);
 163
 164        return rc;
 165}
 166
 167static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
 168{
 169        void *rem_buf;
 170        off_t rem_off;
 171        size_t rem_size;
 172        int rc, aio_errno;
 173        ssize_t aio_ret, written;
 174
 175        aio_errno = aio_error(cblock);
 176        if (aio_errno == EINPROGRESS)
 177                return 0;
 178
 179        written = aio_ret = aio_return(cblock);
 180        if (aio_ret < 0) {
 181                if (aio_errno != EINTR)
 182                        pr_err("failed to write perf data, error: %m\n");
 183                written = 0;
 184        }
 185
 186        rem_size = cblock->aio_nbytes - written;
 187
 188        if (rem_size == 0) {
 189                cblock->aio_fildes = -1;
 190                /*
 191                 * md->refcount is incremented in record__aio_pushfn() for
 192                 * every aio write request started in record__aio_push() so
 193                 * decrement it because the request is now complete.
 194                 */
 195                perf_mmap__put(md);
 196                rc = 1;
 197        } else {
 198                /*
  199                 * The aio write request may require a restart with the
  200                 * remainder if the kernel didn't write the whole
  201                 * chunk at once.
 202                 */
 203                rem_off = cblock->aio_offset + written;
 204                rem_buf = (void *)(cblock->aio_buf + written);
 205                record__aio_write(cblock, cblock->aio_fildes,
 206                                rem_buf, rem_size, rem_off);
 207                rc = 0;
 208        }
 209
 210        return rc;
 211}
 212
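     /*
      * Reap completed aio writes: with sync_all, wait until every control
      * block is idle; otherwise return the index of a free control block to
      * reuse for the next write, suspending if none is available yet.
      */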
 213static int record__aio_sync(struct perf_mmap *md, bool sync_all)
 214{
 215        struct aiocb **aiocb = md->aio.aiocb;
 216        struct aiocb *cblocks = md->aio.cblocks;
 217        struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
 218        int i, do_suspend;
 219
 220        do {
 221                do_suspend = 0;
 222                for (i = 0; i < md->aio.nr_cblocks; ++i) {
 223                        if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
 224                                if (sync_all)
 225                                        aiocb[i] = NULL;
 226                                else
 227                                        return i;
 228                        } else {
 229                                /*
  230                                 * The started aio write is not complete
  231                                 * yet, so it has to be waited on before
  232                                 * the next allocation.
 233                                 */
 234                                aiocb[i] = &cblocks[i];
 235                                do_suspend = 1;
 236                        }
 237                }
 238                if (!do_suspend)
 239                        return -1;
 240
 241                while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
 242                        if (!(errno == EAGAIN || errno == EINTR))
 243                                pr_err("failed to sync perf data, error: %m\n");
 244                }
 245        } while (1);
 246}
 247
 248struct record_aio {
 249        struct record   *rec;
 250        void            *data;
 251        size_t          size;
 252};
 253
 254static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
 255{
 256        struct record_aio *aio = to;
 257
 258        /*
  259         * map->base data pointed to by buf is copied into a free map->aio.data[]
  260         * buffer to release space in the kernel buffer as fast as possible, via
  261         * perf_mmap__consume() called from the perf_mmap__push() function.
  262         *
  263         * That lets the kernel proceed with storing more profiling data into
  264         * the kernel buffer before the other per-cpu kernel buffers are handled.
  265         *
  266         * Copying can be done in two steps in case the chunk of profiling data
  267         * crosses the upper bound of the kernel buffer. In this case we first move
  268         * the part of the data from map->start up to the upper bound and then the
  269         * remainder from the beginning of the kernel buffer to the end of the chunk.
 270         */
 271
 272        if (record__comp_enabled(aio->rec)) {
 273                size = zstd_compress(aio->rec->session, aio->data + aio->size,
 274                                     perf_mmap__mmap_len(map) - aio->size,
 275                                     buf, size);
 276        } else {
 277                memcpy(aio->data + aio->size, buf, size);
 278        }
 279
 280        if (!aio->size) {
 281                /*
  282                 * Increment map->refcount to guard the map->aio.data[] buffer
  283                 * from premature deallocation, because the map object can be
  284                 * released before the aio write request started on the
  285                 * map->aio.data[] buffer is complete.
  286                 *
  287                 * perf_mmap__put() is done at record__aio_complete()
  288                 * after the started aio request completes, or at record__aio_push()
  289                 * if the request failed to start.
 290                 */
 291                perf_mmap__get(map);
 292        }
 293
 294        aio->size += size;
 295
 296        return size;
 297}
 298
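     /*
      * Queue the contents of one mmap'ed ring buffer as an asynchronous write
      * at file offset *off, advancing *off on success so the caller can
      * interleave writes coming from several maps.
      */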
 299static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
 300{
 301        int ret, idx;
 302        int trace_fd = rec->session->data->file.fd;
 303        struct record_aio aio = { .rec = rec, .size = 0 };
 304
 305        /*
  306         * Call record__aio_sync() to wait until a map->aio.data[] buffer
  307         * becomes available after the previous aio write operation.
 308         */
 309
 310        idx = record__aio_sync(map, false);
 311        aio.data = map->aio.data[idx];
 312        ret = perf_mmap__push(map, &aio, record__aio_pushfn);
 313        if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
 314                return ret;
 315
 316        rec->samples++;
 317        ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
 318        if (!ret) {
 319                *off += aio.size;
 320                rec->bytes_written += aio.size;
 321                if (switch_output_size(rec))
 322                        trigger_hit(&switch_output_trigger);
 323        } else {
 324                /*
  325                 * Decrement map->refcount (incremented in record__aio_pushfn())
  326                 * if the record__aio_write() operation failed to start; otherwise
  327                 * map->refcount is decremented in record__aio_complete() after
  328                 * the aio write operation finishes successfully.
 329                 */
 330                perf_mmap__put(map);
 331        }
 332
 333        return ret;
 334}
 335
 336static off_t record__aio_get_pos(int trace_fd)
 337{
 338        return lseek(trace_fd, 0, SEEK_CUR);
 339}
 340
 341static void record__aio_set_pos(int trace_fd, off_t pos)
 342{
 343        lseek(trace_fd, pos, SEEK_SET);
 344}
 345
 346static void record__aio_mmap_read_sync(struct record *rec)
 347{
 348        int i;
 349        struct perf_evlist *evlist = rec->evlist;
 350        struct perf_mmap *maps = evlist->mmap;
 351
 352        if (!record__aio_enabled(rec))
 353                return;
 354
 355        for (i = 0; i < evlist->nr_mmaps; i++) {
 356                struct perf_mmap *map = &maps[i];
 357
 358                if (map->base)
 359                        record__aio_sync(map, true);
 360        }
 361}
 362
 363static int nr_cblocks_default = 1;
 364static int nr_cblocks_max = 4;
 365
 366static int record__aio_parse(const struct option *opt,
 367                             const char *str,
 368                             int unset)
 369{
 370        struct record_opts *opts = (struct record_opts *)opt->value;
 371
 372        if (unset) {
 373                opts->nr_cblocks = 0;
 374        } else {
 375                if (str)
 376                        opts->nr_cblocks = strtol(str, NULL, 0);
 377                if (!opts->nr_cblocks)
 378                        opts->nr_cblocks = nr_cblocks_default;
 379        }
 380
 381        return 0;
 382}
 383#else /* HAVE_AIO_SUPPORT */
 384static int nr_cblocks_max = 0;
 385
 386static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
 387                            off_t *off __maybe_unused)
 388{
 389        return -1;
 390}
 391
 392static off_t record__aio_get_pos(int trace_fd __maybe_unused)
 393{
 394        return -1;
 395}
 396
 397static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
 398{
 399}
 400
 401static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
 402{
 403}
 404#endif
 405
 406static int record__aio_enabled(struct record *rec)
 407{
 408        return rec->opts.nr_cblocks > 0;
 409}
 410
 411#define MMAP_FLUSH_DEFAULT 1
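     /*
      * Parse the mmap flush threshold: accept a plain number or a B/K/M/G
      * suffixed size, default to a single byte and cap the value at a quarter
      * of the mmap buffer size.
      */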
 412static int record__mmap_flush_parse(const struct option *opt,
 413                                    const char *str,
 414                                    int unset)
 415{
 416        int flush_max;
 417        struct record_opts *opts = (struct record_opts *)opt->value;
 418        static struct parse_tag tags[] = {
 419                        { .tag  = 'B', .mult = 1       },
 420                        { .tag  = 'K', .mult = 1 << 10 },
 421                        { .tag  = 'M', .mult = 1 << 20 },
 422                        { .tag  = 'G', .mult = 1 << 30 },
 423                        { .tag  = 0 },
 424        };
 425
 426        if (unset)
 427                return 0;
 428
 429        if (str) {
 430                opts->mmap_flush = parse_tag_value(str, tags);
 431                if (opts->mmap_flush == (int)-1)
 432                        opts->mmap_flush = strtol(str, NULL, 0);
 433        }
 434
 435        if (!opts->mmap_flush)
 436                opts->mmap_flush = MMAP_FLUSH_DEFAULT;
 437
 438        flush_max = perf_evlist__mmap_size(opts->mmap_pages);
 439        flush_max /= 4;
 440        if (opts->mmap_flush > flush_max)
 441                opts->mmap_flush = flush_max;
 442
 443        return 0;
 444}
 445
 446#ifdef HAVE_ZSTD_SUPPORT
 447static unsigned int comp_level_default = 1;
 448
 449static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
 450{
 451        struct record_opts *opts = opt->value;
 452
 453        if (unset) {
 454                opts->comp_level = 0;
 455        } else {
 456                if (str)
 457                        opts->comp_level = strtol(str, NULL, 0);
 458                if (!opts->comp_level)
 459                        opts->comp_level = comp_level_default;
 460        }
 461
 462        return 0;
 463}
 464#endif
 465static unsigned int comp_level_max = 22;
 466
 467static int record__comp_enabled(struct record *rec)
 468{
 469        return rec->opts.comp_level > 0;
 470}
 471
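     /* Write synthesized side-band events straight into the output file. */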
 472static int process_synthesized_event(struct perf_tool *tool,
 473                                     union perf_event *event,
 474                                     struct perf_sample *sample __maybe_unused,
 475                                     struct machine *machine __maybe_unused)
 476{
 477        struct record *rec = container_of(tool, struct record, tool);
 478        return record__write(rec, NULL, event, event->header.size);
 479}
 480
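     /*
      * perf_mmap__push() callback for the synchronous (non-AIO) path:
      * optionally compress the chunk into map->data, then write it out.
      */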
 481static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
 482{
 483        struct record *rec = to;
 484
 485        if (record__comp_enabled(rec)) {
 486                size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
 487                bf   = map->data;
 488        }
 489
 490        rec->samples++;
 491        return record__write(rec, map, bf, size);
 492}
 493
 494static volatile int done;
 495static volatile int signr = -1;
 496static volatile int child_finished;
 497
 498static void sig_handler(int sig)
 499{
 500        if (sig == SIGCHLD)
 501                child_finished = 1;
 502        else
 503                signr = sig;
 504
 505        done = 1;
 506}
 507
 508static void sigsegv_handler(int sig)
 509{
 510        perf_hooks__recover();
 511        sighandler_dump_stack(sig);
 512}
 513
 514static void record__sig_exit(void)
 515{
 516        if (signr == -1)
 517                return;
 518
 519        signal(signr, SIG_DFL);
 520        raise(signr);
 521}
 522
 523#ifdef HAVE_AUXTRACE_SUPPORT
 524
 525static int record__process_auxtrace(struct perf_tool *tool,
 526                                    struct perf_mmap *map,
 527                                    union perf_event *event, void *data1,
 528                                    size_t len1, void *data2, size_t len2)
 529{
 530        struct record *rec = container_of(tool, struct record, tool);
 531        struct perf_data *data = &rec->data;
 532        size_t padding;
 533        u8 pad[8] = {0};
 534
 535        if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
 536                off_t file_offset;
 537                int fd = perf_data__fd(data);
 538                int err;
 539
 540                file_offset = lseek(fd, 0, SEEK_CUR);
 541                if (file_offset == -1)
 542                        return -1;
 543                err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
 544                                                     event, file_offset);
 545                if (err)
 546                        return err;
 547        }
 548
 549        /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
 550        padding = (len1 + len2) & 7;
 551        if (padding)
 552                padding = 8 - padding;
 553
 554        record__write(rec, map, event, event->header.size);
 555        record__write(rec, map, data1, len1);
 556        if (len2)
 557                record__write(rec, map, data2, len2);
 558        record__write(rec, map, &pad, padding);
 559
 560        return 0;
 561}
 562
 563static int record__auxtrace_mmap_read(struct record *rec,
 564                                      struct perf_mmap *map)
 565{
 566        int ret;
 567
 568        ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
 569                                  record__process_auxtrace);
 570        if (ret < 0)
 571                return ret;
 572
 573        if (ret)
 574                rec->samples++;
 575
 576        return 0;
 577}
 578
 579static int record__auxtrace_mmap_read_snapshot(struct record *rec,
 580                                               struct perf_mmap *map)
 581{
 582        int ret;
 583
 584        ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
 585                                           record__process_auxtrace,
 586                                           rec->opts.auxtrace_snapshot_size);
 587        if (ret < 0)
 588                return ret;
 589
 590        if (ret)
 591                rec->samples++;
 592
 593        return 0;
 594}
 595
 596static int record__auxtrace_read_snapshot_all(struct record *rec)
 597{
 598        int i;
 599        int rc = 0;
 600
 601        for (i = 0; i < rec->evlist->nr_mmaps; i++) {
 602                struct perf_mmap *map = &rec->evlist->mmap[i];
 603
 604                if (!map->auxtrace_mmap.base)
 605                        continue;
 606
 607                if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
 608                        rc = -1;
 609                        goto out;
 610                }
 611        }
 612out:
 613        return rc;
 614}
 615
 616static void record__read_auxtrace_snapshot(struct record *rec)
 617{
 618        pr_debug("Recording AUX area tracing snapshot\n");
 619        if (record__auxtrace_read_snapshot_all(rec) < 0) {
 620                trigger_error(&auxtrace_snapshot_trigger);
 621        } else {
 622                if (auxtrace_record__snapshot_finish(rec->itr))
 623                        trigger_error(&auxtrace_snapshot_trigger);
 624                else
 625                        trigger_ready(&auxtrace_snapshot_trigger);
 626        }
 627}
 628
 629static int record__auxtrace_init(struct record *rec)
 630{
 631        int err;
 632
 633        if (!rec->itr) {
 634                rec->itr = auxtrace_record__init(rec->evlist, &err);
 635                if (err)
 636                        return err;
 637        }
 638
 639        err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
 640                                              rec->opts.auxtrace_snapshot_opts);
 641        if (err)
 642                return err;
 643
 644        return auxtrace_parse_filters(rec->evlist);
 645}
 646
 647#else
 648
 649static inline
 650int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
 651                               struct perf_mmap *map __maybe_unused)
 652{
 653        return 0;
 654}
 655
 656static inline
 657void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
 658{
 659}
 660
 661static inline
 662int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
 663{
 664        return 0;
 665}
 666
 667static int record__auxtrace_init(struct record *rec __maybe_unused)
 668{
 669        return 0;
 670}
 671
 672#endif
 673
 674static int record__mmap_evlist(struct record *rec,
 675                               struct perf_evlist *evlist)
 676{
 677        struct record_opts *opts = &rec->opts;
 678        char msg[512];
 679
 680        if (opts->affinity != PERF_AFFINITY_SYS)
 681                cpu__setup_cpunode_map();
 682
 683        if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
 684                                 opts->auxtrace_mmap_pages,
 685                                 opts->auxtrace_snapshot_mode,
 686                                 opts->nr_cblocks, opts->affinity,
 687                                 opts->mmap_flush, opts->comp_level) < 0) {
 688                if (errno == EPERM) {
 689                        pr_err("Permission error mapping pages.\n"
 690                               "Consider increasing "
 691                               "/proc/sys/kernel/perf_event_mlock_kb,\n"
 692                               "or try again with a smaller value of -m/--mmap_pages.\n"
 693                               "(current value: %u,%u)\n",
 694                               opts->mmap_pages, opts->auxtrace_mmap_pages);
 695                        return -errno;
 696                } else {
 697                        pr_err("failed to mmap with %d (%s)\n", errno,
 698                                str_error_r(errno, msg, sizeof(msg)));
 699                        if (errno)
 700                                return -errno;
 701                        else
 702                                return -EINVAL;
 703                }
 704        }
 705        return 0;
 706}
 707
 708static int record__mmap(struct record *rec)
 709{
 710        return record__mmap_evlist(rec, rec->evlist);
 711}
 712
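     /*
      * Open all events in the evlist, using perf_evsel__fallback() or breaking
      * up weak groups when an open fails, then apply the event filters and
      * mmap the ring buffers.
      */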
 713static int record__open(struct record *rec)
 714{
 715        char msg[BUFSIZ];
 716        struct perf_evsel *pos;
 717        struct perf_evlist *evlist = rec->evlist;
 718        struct perf_session *session = rec->session;
 719        struct record_opts *opts = &rec->opts;
 720        int rc = 0;
 721
 722        /*
 723         * For initial_delay we need to add a dummy event so that we can track
 724         * PERF_RECORD_MMAP while we wait for the initial delay to enable the
  725         * real events, the ones requested by the user.
 726         */
 727        if (opts->initial_delay) {
 728                if (perf_evlist__add_dummy(evlist))
 729                        return -ENOMEM;
 730
 731                pos = perf_evlist__first(evlist);
 732                pos->tracking = 0;
 733                pos = perf_evlist__last(evlist);
 734                pos->tracking = 1;
 735                pos->attr.enable_on_exec = 1;
 736        }
 737
 738        perf_evlist__config(evlist, opts, &callchain_param);
 739
 740        evlist__for_each_entry(evlist, pos) {
 741try_again:
 742                if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
 743                        if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
 744                                if (verbose > 0)
 745                                        ui__warning("%s\n", msg);
 746                                goto try_again;
 747                        }
 748                        if ((errno == EINVAL || errno == EBADF) &&
 749                            pos->leader != pos &&
 750                            pos->weak_group) {
 751                                pos = perf_evlist__reset_weak_group(evlist, pos);
 752                                goto try_again;
 753                        }
 754                        rc = -errno;
 755                        perf_evsel__open_strerror(pos, &opts->target,
 756                                                  errno, msg, sizeof(msg));
 757                        ui__error("%s\n", msg);
 758                        goto out;
 759                }
 760
 761                pos->supported = true;
 762        }
 763
 764        if (perf_evlist__apply_filters(evlist, &pos)) {
 765                pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
 766                        pos->filter, perf_evsel__name(pos), errno,
 767                        str_error_r(errno, msg, sizeof(msg)));
 768                rc = -1;
 769                goto out;
 770        }
 771
 772        rc = record__mmap(rec);
 773        if (rc)
 774                goto out;
 775
 776        session->evlist = evlist;
 777        perf_session__set_id_hdr_size(session);
 778out:
 779        return rc;
 780}
 781
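     /*
      * Sample callback used while post-processing the recorded data for
      * build-ids: track the first/last sample times and mark the DSOs that
      * were actually hit.
      */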
 782static int process_sample_event(struct perf_tool *tool,
 783                                union perf_event *event,
 784                                struct perf_sample *sample,
 785                                struct perf_evsel *evsel,
 786                                struct machine *machine)
 787{
 788        struct record *rec = container_of(tool, struct record, tool);
 789
 790        if (rec->evlist->first_sample_time == 0)
 791                rec->evlist->first_sample_time = sample->time;
 792
 793        rec->evlist->last_sample_time = sample->time;
 794
 795        if (rec->buildid_all)
 796                return 0;
 797
 798        rec->samples++;
 799        return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
 800}
 801
 802static int process_buildids(struct record *rec)
 803{
 804        struct perf_session *session = rec->session;
 805
 806        if (perf_data__size(&rec->data) == 0)
 807                return 0;
 808
 809        /*
  810         * During this process, it'll load the kernel map and replace
  811         * dso->long_name with the real pathname it found.  In this case
  812         * we prefer the vmlinux path like
  813         *   /lib/modules/3.16.4/build/vmlinux
  814         *
  815         * rather than the build-id path (in the debug directory), e.g.
  816         *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
 817         */
 818        symbol_conf.ignore_vmlinux_buildid = true;
 819
 820        /*
  821         * If --buildid-all is given, it marks all DSOs regardless of hits,
  822         * so there is no need to process samples. But if timestamp_boundary is
  823         * enabled, it still needs to walk all samples to get the timestamps of
  824         * the first/last samples.
 825         */
 826        if (rec->buildid_all && !rec->timestamp_boundary)
 827                rec->tool.sample = NULL;
 828
 829        return perf_session__process_events(session);
 830}
 831
 832static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
 833{
 834        int err;
 835        struct perf_tool *tool = data;
 836        /*
  837         * As for the guest kernel, when processing the record & report
  838         * subcommands we arrange the module mmap prior to the guest kernel
  839         * mmap and trigger a dso preload, because by default guest module
  840         * symbols are loaded from guest kallsyms instead of
  841         * /lib/modules/XXX/XXX. This method is used to avoid missing symbols
  842         * when the first address is in a module instead of in the guest kernel.
 843         */
 844        err = perf_event__synthesize_modules(tool, process_synthesized_event,
 845                                             machine);
 846        if (err < 0)
 847                pr_err("Couldn't record guest kernel [%d]'s reference"
 848                       " relocation symbol.\n", machine->pid);
 849
 850        /*
  851         * We use _stext for the guest kernel because the guest kernel's
  852         * /proc/kallsyms sometimes has no _text.
 853         */
 854        err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
 855                                                 machine);
 856        if (err < 0)
 857                pr_err("Couldn't record guest kernel [%d]'s reference"
 858                       " relocation symbol.\n", machine->pid);
 859}
 860
 861static struct perf_event_header finished_round_event = {
 862        .size = sizeof(struct perf_event_header),
 863        .type = PERF_RECORD_FINISHED_ROUND,
 864};
 865
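     /*
      * When the affinity mode is not the default (sys), migrate the recording
      * thread onto the CPU mask of the map being read before copying its data.
      */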
 866static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
 867{
 868        if (rec->opts.affinity != PERF_AFFINITY_SYS &&
 869            !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
 870                CPU_ZERO(&rec->affinity_mask);
 871                CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
 872                sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
 873        }
 874}
 875
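     /*
      * Callback used by the Zstd streaming compression to lay down and grow a
      * PERF_RECORD_COMPRESSED header in front of each compressed record.
      */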
 876static size_t process_comp_header(void *record, size_t increment)
 877{
 878        struct compressed_event *event = record;
 879        size_t size = sizeof(*event);
 880
 881        if (increment) {
 882                event->header.size += increment;
 883                return increment;
 884        }
 885
 886        event->header.type = PERF_RECORD_COMPRESSED;
 887        event->header.size = size;
 888
 889        return size;
 890}
 891
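     /*
      * Compress a chunk of trace data into one or more PERF_RECORD_COMPRESSED
      * records and account the raw vs. compressed byte counts in the session.
      */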
 892static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
 893                            void *src, size_t src_size)
 894{
 895        size_t compressed;
 896        size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct compressed_event) - 1;
 897
 898        compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
 899                                                     max_record_size, process_comp_header);
 900
 901        session->bytes_transferred += src_size;
 902        session->bytes_compressed  += compressed;
 903
 904        return compressed;
 905}
 906
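     /*
      * Drain every mmap'ed ring buffer of the evlist (regular or overwrite),
      * through AIO or synchronous writes, and emit a PERF_RECORD_FINISHED_ROUND
      * event if anything was written during this pass.
      */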
 907static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
 908                                    bool overwrite, bool synch)
 909{
 910        u64 bytes_written = rec->bytes_written;
 911        int i;
 912        int rc = 0;
 913        struct perf_mmap *maps;
 914        int trace_fd = rec->data.file.fd;
 915        off_t off = 0;
 916
 917        if (!evlist)
 918                return 0;
 919
 920        maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
 921        if (!maps)
 922                return 0;
 923
 924        if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
 925                return 0;
 926
 927        if (record__aio_enabled(rec))
 928                off = record__aio_get_pos(trace_fd);
 929
 930        for (i = 0; i < evlist->nr_mmaps; i++) {
 931                u64 flush = 0;
 932                struct perf_mmap *map = &maps[i];
 933
 934                if (map->base) {
 935                        record__adjust_affinity(rec, map);
 936                        if (synch) {
 937                                flush = map->flush;
 938                                map->flush = 1;
 939                        }
 940                        if (!record__aio_enabled(rec)) {
 941                                if (perf_mmap__push(map, rec, record__pushfn) < 0) {
 942                                        if (synch)
 943                                                map->flush = flush;
 944                                        rc = -1;
 945                                        goto out;
 946                                }
 947                        } else {
 948                                if (record__aio_push(rec, map, &off) < 0) {
 949                                        record__aio_set_pos(trace_fd, off);
 950                                        if (synch)
 951                                                map->flush = flush;
 952                                        rc = -1;
 953                                        goto out;
 954                                }
 955                        }
 956                        if (synch)
 957                                map->flush = flush;
 958                }
 959
 960                if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
 961                    record__auxtrace_mmap_read(rec, map) != 0) {
 962                        rc = -1;
 963                        goto out;
 964                }
 965        }
 966
 967        if (record__aio_enabled(rec))
 968                record__aio_set_pos(trace_fd, off);
 969
 970        /*
  971         * Mark the round finished if we wrote
 972         * at least one event.
 973         */
 974        if (bytes_written != rec->bytes_written)
 975                rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
 976
 977        if (overwrite)
 978                perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
 979out:
 980        return rc;
 981}
 982
 983static int record__mmap_read_all(struct record *rec, bool synch)
 984{
 985        int err;
 986
 987        err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
 988        if (err)
 989                return err;
 990
 991        return record__mmap_read_evlist(rec, rec->evlist, true, synch);
 992}
 993
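     /*
      * Start with all header features set, then clear the ones that do not
      * apply to this session (no build ids, no tracepoints, no aux trace, ...).
      */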
 994static void record__init_features(struct record *rec)
 995{
 996        struct perf_session *session = rec->session;
 997        int feat;
 998
 999        for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1000                perf_header__set_feat(&session->header, feat);
1001
1002        if (rec->no_buildid)
1003                perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1004
1005        if (!have_tracepoints(&rec->evlist->entries))
1006                perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1007
1008        if (!rec->opts.branch_stack)
1009                perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1010
1011        if (!rec->opts.full_auxtrace)
1012                perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1013
1014        if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1015                perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1016
1017        perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1018        if (!record__comp_enabled(rec))
1019                perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1020
1021        perf_header__clear_feat(&session->header, HEADER_STAT);
1022}
1023
1024static void
1025record__finish_output(struct record *rec)
1026{
1027        struct perf_data *data = &rec->data;
1028        int fd = perf_data__fd(data);
1029
1030        if (data->is_pipe)
1031                return;
1032
1033        rec->session->header.data_size += rec->bytes_written;
1034        data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1035
1036        if (!rec->no_buildid) {
1037                process_buildids(rec);
1038
1039                if (rec->buildid_all)
1040                        dsos__hit_all(rec->session);
1041        }
1042        perf_session__write_header(rec->session, rec->evlist, fd, true);
1043
1044        return;
1045}
1046
1047static int record__synthesize_workload(struct record *rec, bool tail)
1048{
1049        int err;
1050        struct thread_map *thread_map;
1051
1052        if (rec->opts.tail_synthesize != tail)
1053                return 0;
1054
1055        thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1056        if (thread_map == NULL)
1057                return -1;
1058
1059        err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1060                                                 process_synthesized_event,
1061                                                 &rec->session->machines.host,
1062                                                 rec->opts.sample_address);
1063        thread_map__put(thread_map);
1064        return err;
1065}
1066
1067static int record__synthesize(struct record *rec, bool tail);
1068
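     /*
      * Switch the output: flush what has been collected so far, finish the
      * current perf.data, and continue recording into a new timestamped file,
      * recycling old files when a maximum number of output files is set.
      */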
1069static int
1070record__switch_output(struct record *rec, bool at_exit)
1071{
1072        struct perf_data *data = &rec->data;
1073        int fd, err;
1074        char *new_filename;
1075
 1076        /* Same size as a real timestamp, e.g. "2015122520103046" */
1077        char timestamp[] = "InvalidTimestamp";
1078
1079        record__aio_mmap_read_sync(rec);
1080
1081        record__synthesize(rec, true);
1082        if (target__none(&rec->opts.target))
1083                record__synthesize_workload(rec, true);
1084
1085        rec->samples = 0;
1086        record__finish_output(rec);
1087        err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1088        if (err) {
1089                pr_err("Failed to get current timestamp\n");
1090                return -EINVAL;
1091        }
1092
1093        fd = perf_data__switch(data, timestamp,
1094                                    rec->session->header.data_offset,
1095                                    at_exit, &new_filename);
1096        if (fd >= 0 && !at_exit) {
1097                rec->bytes_written = 0;
1098                rec->session->header.data_size = 0;
1099        }
1100
1101        if (!quiet)
1102                fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1103                        data->path, timestamp);
1104
1105        if (rec->switch_output.num_files) {
1106                int n = rec->switch_output.cur_file + 1;
1107
1108                if (n >= rec->switch_output.num_files)
1109                        n = 0;
1110                rec->switch_output.cur_file = n;
1111                if (rec->switch_output.filenames[n]) {
1112                        remove(rec->switch_output.filenames[n]);
1113                        zfree(&rec->switch_output.filenames[n]);
1114                }
1115                rec->switch_output.filenames[n] = new_filename;
1116        } else {
1117                free(new_filename);
1118        }
1119
1120        /* Output tracking events */
1121        if (!at_exit) {
1122                record__synthesize(rec, false);
1123
1124                /*
1125                 * In 'perf record --switch-output' without -a,
1126                 * record__synthesize() in record__switch_output() won't
1127                 * generate tracking events because there's no thread_map
 1128                 * in the evlist, so the newly created perf.data doesn't
1129                 * contain map and comm information.
1130                 * Create a fake thread_map and directly call
1131                 * perf_event__synthesize_thread_map() for those events.
1132                 */
1133                if (target__none(&rec->opts.target))
1134                        record__synthesize_workload(rec, false);
1135        }
1136        return fd;
1137}
1138
1139static volatile int workload_exec_errno;
1140
1141/*
1142 * perf_evlist__prepare_workload will send a SIGUSR1
 1143 * if the fork fails, since we asked for it by setting its
1144 * want_signal to true.
1145 */
1146static void workload_exec_failed_signal(int signo __maybe_unused,
1147                                        siginfo_t *info,
1148                                        void *ucontext __maybe_unused)
1149{
1150        workload_exec_errno = info->si_value.sival_int;
1151        done = 1;
1152        child_finished = 1;
1153}
1154
1155static void snapshot_sig_handler(int sig);
1156static void alarm_sig_handler(int sig);
1157
1158int __weak
1159perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
1160                            struct perf_tool *tool __maybe_unused,
1161                            perf_event__handler_t process __maybe_unused,
1162                            struct machine *machine __maybe_unused)
1163{
1164        return 0;
1165}
1166
1167static const struct perf_event_mmap_page *
1168perf_evlist__pick_pc(struct perf_evlist *evlist)
1169{
1170        if (evlist) {
1171                if (evlist->mmap && evlist->mmap[0].base)
1172                        return evlist->mmap[0].base;
1173                if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1174                        return evlist->overwrite_mmap[0].base;
1175        }
1176        return NULL;
1177}
1178
1179static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1180{
1181        const struct perf_event_mmap_page *pc;
1182
1183        pc = perf_evlist__pick_pc(rec->evlist);
1184        if (pc)
1185                return pc;
1186        return NULL;
1187}
1188
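     /*
      * Emit the synthesized metadata events (attributes, tracing data, kernel
      * and module maps, thread/cpu maps, bpf events, ...) that perf.data
      * consumers need in order to make sense of the recorded samples.
      */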
1189static int record__synthesize(struct record *rec, bool tail)
1190{
1191        struct perf_session *session = rec->session;
1192        struct machine *machine = &session->machines.host;
1193        struct perf_data *data = &rec->data;
1194        struct record_opts *opts = &rec->opts;
1195        struct perf_tool *tool = &rec->tool;
1196        int fd = perf_data__fd(data);
1197        int err = 0;
1198
1199        if (rec->opts.tail_synthesize != tail)
1200                return 0;
1201
1202        if (data->is_pipe) {
1203                /*
1204                 * We need to synthesize events first, because some
 1205                 * features work on top of them (on the report side).
1206                 */
1207                err = perf_event__synthesize_attrs(tool, rec->evlist,
1208                                                   process_synthesized_event);
1209                if (err < 0) {
1210                        pr_err("Couldn't synthesize attrs.\n");
1211                        goto out;
1212                }
1213
1214                err = perf_event__synthesize_features(tool, session, rec->evlist,
1215                                                      process_synthesized_event);
1216                if (err < 0) {
1217                        pr_err("Couldn't synthesize features.\n");
1218                        return err;
1219                }
1220
1221                if (have_tracepoints(&rec->evlist->entries)) {
1222                        /*
1223                         * FIXME err <= 0 here actually means that
 1224                         * there were no tracepoints, so it's not really
1225                         * an error, just that we don't need to
1226                         * synthesize anything.  We really have to
1227                         * return this more properly and also
 1228                         * propagate the errors that currently call die()
1229                         */
1230                        err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1231                                                                  process_synthesized_event);
1232                        if (err <= 0) {
1233                                pr_err("Couldn't record tracing data.\n");
1234                                goto out;
1235                        }
1236                        rec->bytes_written += err;
1237                }
1238        }
1239
1240        err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1241                                          process_synthesized_event, machine);
1242        if (err)
1243                goto out;
1244
1245        if (rec->opts.full_auxtrace) {
1246                err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1247                                        session, process_synthesized_event);
1248                if (err)
1249                        goto out;
1250        }
1251
1252        if (!perf_evlist__exclude_kernel(rec->evlist)) {
1253                err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1254                                                         machine);
1255                WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1256                                   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1257                                   "Check /proc/kallsyms permission or run as root.\n");
1258
1259                err = perf_event__synthesize_modules(tool, process_synthesized_event,
1260                                                     machine);
1261                WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1262                                   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1263                                   "Check /proc/modules permission or run as root.\n");
1264        }
1265
1266        if (perf_guest) {
1267                machines__process_guests(&session->machines,
1268                                         perf_event__synthesize_guest_os, tool);
1269        }
1270
1271        err = perf_event__synthesize_extra_attr(&rec->tool,
1272                                                rec->evlist,
1273                                                process_synthesized_event,
1274                                                data->is_pipe);
1275        if (err)
1276                goto out;
1277
1278        err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
1279                                                 process_synthesized_event,
1280                                                NULL);
1281        if (err < 0) {
1282                pr_err("Couldn't synthesize thread map.\n");
1283                return err;
1284        }
1285
1286        err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
1287                                             process_synthesized_event, NULL);
1288        if (err < 0) {
1289                pr_err("Couldn't synthesize cpu map.\n");
1290                return err;
1291        }
1292
1293        err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1294                                                machine, opts);
1295        if (err < 0)
1296                pr_warning("Couldn't synthesize bpf events.\n");
1297
1298        err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
1299                                            process_synthesized_event, opts->sample_address,
1300                                            1);
1301out:
1302        return err;
1303}
1304
1305static int __cmd_record(struct record *rec, int argc, const char **argv)
1306{
1307        int err;
1308        int status = 0;
1309        unsigned long waking = 0;
1310        const bool forks = argc > 0;
1311        struct perf_tool *tool = &rec->tool;
1312        struct record_opts *opts = &rec->opts;
1313        struct perf_data *data = &rec->data;
1314        struct perf_session *session;
1315        bool disabled = false, draining = false;
1316        struct perf_evlist *sb_evlist = NULL;
1317        int fd;
1318        float ratio = 0;
1319
1320        atexit(record__sig_exit);
1321        signal(SIGCHLD, sig_handler);
1322        signal(SIGINT, sig_handler);
1323        signal(SIGTERM, sig_handler);
1324        signal(SIGSEGV, sigsegv_handler);
1325
1326        if (rec->opts.record_namespaces)
1327                tool->namespace_events = true;
1328
1329        if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1330                signal(SIGUSR2, snapshot_sig_handler);
1331                if (rec->opts.auxtrace_snapshot_mode)
1332                        trigger_on(&auxtrace_snapshot_trigger);
1333                if (rec->switch_output.enabled)
1334                        trigger_on(&switch_output_trigger);
1335        } else {
1336                signal(SIGUSR2, SIG_IGN);
1337        }
1338
1339        session = perf_session__new(data, false, tool);
1340        if (session == NULL) {
1341                pr_err("Perf session creation failed.\n");
1342                return -1;
1343        }
1344
1345        fd = perf_data__fd(data);
1346        rec->session = session;
1347
1348        if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1349                pr_err("Compression initialization failed.\n");
1350                return -1;
1351        }
1352
1353        session->header.env.comp_type  = PERF_COMP_ZSTD;
1354        session->header.env.comp_level = rec->opts.comp_level;
1355
1356        record__init_features(rec);
1357
1358        if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1359                session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1360
1361        if (forks) {
1362                err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1363                                                    argv, data->is_pipe,
1364                                                    workload_exec_failed_signal);
1365                if (err < 0) {
1366                        pr_err("Couldn't run the workload!\n");
1367                        status = err;
1368                        goto out_delete_session;
1369                }
1370        }
1371
1372        /*
 1373         * If we have just a single event and are sending data
 1374         * through a pipe, we need to force id allocation,
 1375         * because we synthesize the event name through the pipe
1376         * and need the id for that.
1377         */
1378        if (data->is_pipe && rec->evlist->nr_entries == 1)
1379                rec->opts.sample_id = true;
1380
1381        if (record__open(rec) != 0) {
1382                err = -1;
1383                goto out_child;
1384        }
1385        session->header.env.comp_mmap_len = session->evlist->mmap_len;
1386
1387        err = bpf__apply_obj_config();
1388        if (err) {
1389                char errbuf[BUFSIZ];
1390
1391                bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1392                pr_err("ERROR: Apply config to BPF failed: %s\n",
1393                         errbuf);
1394                goto out_child;
1395        }
1396
1397        /*
1398         * Normally perf_session__new would do this, but it doesn't have the
1399         * evlist.
1400         */
1401        if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1402                pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1403                rec->tool.ordered_events = false;
1404        }
1405
1406        if (!rec->evlist->nr_groups)
1407                perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1408
1409        if (data->is_pipe) {
1410                err = perf_header__write_pipe(fd);
1411                if (err < 0)
1412                        goto out_child;
1413        } else {
1414                err = perf_session__write_header(session, rec->evlist, fd, false);
1415                if (err < 0)
1416                        goto out_child;
1417        }
1418
1419        if (!rec->no_buildid
1420            && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1421                pr_err("Couldn't generate buildids. "
1422                       "Use --no-buildid to profile anyway.\n");
1423                err = -1;
1424                goto out_child;
1425        }
1426
1427        if (!opts->no_bpf_event)
1428                bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1429
1430        if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1431                pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1432                opts->no_bpf_event = true;
1433        }
1434
1435        err = record__synthesize(rec, false);
1436        if (err < 0)
1437                goto out_child;
1438
1439        if (rec->realtime_prio) {
1440                struct sched_param param;
1441
1442                param.sched_priority = rec->realtime_prio;
1443                if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1444                        pr_err("Could not set realtime priority.\n");
1445                        err = -1;
1446                        goto out_child;
1447                }
1448        }
1449
1450        /*
1451         * When perf is starting the traced process, all the events
1452         * (apart from group members) have enable_on_exec=1 set,
1453         * so don't spoil it by prematurely enabling them.
1454         */
1455        if (!target__none(&opts->target) && !opts->initial_delay)
1456                perf_evlist__enable(rec->evlist);
1457
1458        /*
1459         * Let the child rip
1460         */
1461        if (forks) {
1462                struct machine *machine = &session->machines.host;
1463                union perf_event *event;
1464                pid_t tgid;
1465
1466                event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1467                if (event == NULL) {
1468                        err = -ENOMEM;
1469                        goto out_child;
1470                }
1471
1472                /*
 1473                 * Some H/W events are generated before the COMM event,
 1474                 * which is emitted during exec(), so perf script
 1475                 * cannot see a correct process name for those events.
 1476                 * Synthesize a COMM event to prevent that.
1477                 */
1478                tgid = perf_event__synthesize_comm(tool, event,
1479                                                   rec->evlist->workload.pid,
1480                                                   process_synthesized_event,
1481                                                   machine);
1482                free(event);
1483
1484                if (tgid == -1)
1485                        goto out_child;
1486
1487                event = malloc(sizeof(event->namespaces) +
1488                               (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1489                               machine->id_hdr_size);
1490                if (event == NULL) {
1491                        err = -ENOMEM;
1492                        goto out_child;
1493                }
1494
1495                /*
1496                 * Synthesize NAMESPACES event for the command specified.
1497                 */
1498                perf_event__synthesize_namespaces(tool, event,
1499                                                  rec->evlist->workload.pid,
1500                                                  tgid, process_synthesized_event,
1501                                                  machine);
1502                free(event);
1503
1504                perf_evlist__start_workload(rec->evlist);
1505        }
1506
1507        if (opts->initial_delay) {
1508                usleep(opts->initial_delay * USEC_PER_MSEC);
1509                perf_evlist__enable(rec->evlist);
1510        }
1511
1512        trigger_ready(&auxtrace_snapshot_trigger);
1513        trigger_ready(&switch_output_trigger);
1514        perf_hooks__invoke_record_start();
1515        for (;;) {
1516                unsigned long long hits = rec->samples;
1517
1518                /*
1519                 * rec->evlist->bkw_mmap_state can be BKW_MMAP_EMPTY
1520                 * here: when done == true and hits != rec->samples
1521                 * in the previous round.
1522                 *
1523                 * perf_evlist__toggle_bkw_mmap ensures we never
1524                 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1525                 */
1526                if (trigger_is_hit(&switch_output_trigger) || done || draining)
1527                        perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1528
1529                if (record__mmap_read_all(rec, false) < 0) {
1530                        trigger_error(&auxtrace_snapshot_trigger);
1531                        trigger_error(&switch_output_trigger);
1532                        err = -1;
1533                        goto out_child;
1534                }
1535
1536                if (auxtrace_record__snapshot_started) {
1537                        auxtrace_record__snapshot_started = 0;
1538                        if (!trigger_is_error(&auxtrace_snapshot_trigger))
1539                                record__read_auxtrace_snapshot(rec);
1540                        if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1541                                pr_err("AUX area tracing snapshot failed\n");
1542                                err = -1;
1543                                goto out_child;
1544                        }
1545                }
1546
1547                if (trigger_is_hit(&switch_output_trigger)) {
1548                        /*
1549                         * If switch_output_trigger is hit, the data in the
1550                         * overwritable ring buffer should have been collected,
1551                         * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1552                         *
1553                         * If SIGUSR2 was raised after or during record__mmap_read_all(),
1554                         * record__mmap_read_all() didn't collect data from the
1555                         * overwritable ring buffer. Read again.
1556                         */
1557                        if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1558                                continue;
1559                        trigger_ready(&switch_output_trigger);
1560
1561                        /*
1562                         * Reenable events in overwrite ring buffer after
1563                         * record__mmap_read_all(): we should have collected
1564                         * data from it.
1565                         */
1566                        perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1567
1568                        if (!quiet)
1569                                fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1570                                        waking);
1571                        waking = 0;
1572                        fd = record__switch_output(rec, false);
1573                        if (fd < 0) {
1574                                pr_err("Failed to switch to new file\n");
1575                                trigger_error(&switch_output_trigger);
1576                                err = fd;
1577                                goto out_child;
1578                        }
1579
1580                        /* re-arm the alarm */
1581                        if (rec->switch_output.time)
1582                                alarm(rec->switch_output.time);
1583                }
1584
1585                if (hits == rec->samples) {
1586                        if (done || draining)
1587                                break;
1588                        err = perf_evlist__poll(rec->evlist, -1);
1589                        /*
1590                         * Propagate the error only if there is one. Ignore a positive
1591                         * number of returned events and interrupt errors.
1592                         */
1593                        if (err > 0 || (err < 0 && errno == EINTR))
1594                                err = 0;
1595                        waking++;
1596
1597                        if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1598                                draining = true;
1599                }
1600
1601                /*
1602                 * When perf is starting the traced process, at the end events
1603                 * die with the process and we wait for that. Thus no need to
1604                 * disable events in this case.
1605                 */
1606                if (done && !disabled && !target__none(&opts->target)) {
1607                        trigger_off(&auxtrace_snapshot_trigger);
1608                        perf_evlist__disable(rec->evlist);
1609                        disabled = true;
1610                }
1611        }
1612        trigger_off(&auxtrace_snapshot_trigger);
1613        trigger_off(&switch_output_trigger);
1614
1615        if (forks && workload_exec_errno) {
1616                char msg[STRERR_BUFSIZE];
1617                const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1618                pr_err("Workload failed: %s\n", emsg);
1619                err = -1;
1620                goto out_child;
1621        }
1622
1623        if (!quiet)
1624                fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1625
1626        if (target__none(&rec->opts.target))
1627                record__synthesize_workload(rec, true);
1628
1629out_child:
1630        record__mmap_read_all(rec, true);
1631        record__aio_mmap_read_sync(rec);
1632
1633        if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1634                ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1635                session->header.env.comp_ratio = ratio + 0.5;
1636        }
1637
1638        if (forks) {
1639                int exit_status;
1640
1641                if (!child_finished)
1642                        kill(rec->evlist->workload.pid, SIGTERM);
1643
1644                wait(&exit_status);
1645
1646                if (err < 0)
1647                        status = err;
1648                else if (WIFEXITED(exit_status))
1649                        status = WEXITSTATUS(exit_status);
1650                else if (WIFSIGNALED(exit_status))
1651                        signr = WTERMSIG(exit_status);
1652        } else
1653                status = err;
1654
1655        record__synthesize(rec, true);
1656        /* this will be recalculated during process_buildids() */
1657        rec->samples = 0;
1658
1659        if (!err) {
1660                if (!rec->timestamp_filename) {
1661                        record__finish_output(rec);
1662                } else {
1663                        fd = record__switch_output(rec, true);
1664                        if (fd < 0) {
1665                                status = fd;
1666                                goto out_delete_session;
1667                        }
1668                }
1669        }
1670
1671        perf_hooks__invoke_record_end();
1672
1673        if (!err && !quiet) {
1674                char samples[128];
1675                const char *postfix = rec->timestamp_filename ?
1676                                        ".<timestamp>" : "";
1677
1678                if (rec->samples && !rec->opts.full_auxtrace)
1679                        scnprintf(samples, sizeof(samples),
1680                                  " (%" PRIu64 " samples)", rec->samples);
1681                else
1682                        samples[0] = '\0';
1683
1684                fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1685                        perf_data__size(data) / 1024.0 / 1024.0,
1686                        data->path, postfix, samples);
1687                if (ratio) {
1688                        fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1689                                        rec->session->bytes_transferred / 1024.0 / 1024.0,
1690                                        ratio);
1691                }
1692                fprintf(stderr, " ]\n");
1693        }
1694
1695out_delete_session:
1696        zstd_fini(&session->zstd_data);
1697        perf_session__delete(session);
1698
1699        if (!opts->no_bpf_event)
1700                perf_evlist__stop_sb_thread(sb_evlist);
1701        return status;
1702}
1703
1704static void callchain_debug(struct callchain_param *callchain)
1705{
1706        static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1707
1708        pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1709
1710        if (callchain->record_mode == CALLCHAIN_DWARF)
1711                pr_debug("callchain: stack dump size %d\n",
1712                         callchain->dump_size);
1713}
1714
1715int record_opts__parse_callchain(struct record_opts *record,
1716                                 struct callchain_param *callchain,
1717                                 const char *arg, bool unset)
1718{
1719        int ret;
1720        callchain->enabled = !unset;
1721
1722        /* --no-call-graph */
1723        if (unset) {
1724                callchain->record_mode = CALLCHAIN_NONE;
1725                pr_debug("callchain: disabled\n");
1726                return 0;
1727        }
1728
1729        ret = parse_callchain_record_opt(arg, callchain);
1730        if (!ret) {
1731                /* Enable data address sampling for DWARF unwind. */
1732                if (callchain->record_mode == CALLCHAIN_DWARF)
1733                        record->sample_address = true;
1734                callchain_debug(callchain);
1735        }
1736
1737        return ret;
1738}
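
    /*
     * Illustrative command lines for the call-graph handling above (a
     * sketch based on the "record_mode[,record_size]" option syntax; the
     * workload name is hypothetical):
     *
     *   perf record -g ./workload                      # default mode (fp)
     *   perf record --call-graph dwarf,8192 ./workload # DWARF unwind, 8kB stack
     *                                                  # dumps, enables sample_address
     *   perf record --call-graph lbr ./workload        # LBR-assisted call graphs
     */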
1739
1740int record_parse_callchain_opt(const struct option *opt,
1741                               const char *arg,
1742                               int unset)
1743{
1744        return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1745}
1746
1747int record_callchain_opt(const struct option *opt,
1748                         const char *arg __maybe_unused,
1749                         int unset __maybe_unused)
1750{
1751        struct callchain_param *callchain = opt->value;
1752
1753        callchain->enabled = true;
1754
1755        if (callchain->record_mode == CALLCHAIN_NONE)
1756                callchain->record_mode = CALLCHAIN_FP;
1757
1758        callchain_debug(callchain);
1759        return 0;
1760}
1761
1762static int perf_record_config(const char *var, const char *value, void *cb)
1763{
1764        struct record *rec = cb;
1765
1766        if (!strcmp(var, "record.build-id")) {
1767                if (!strcmp(value, "cache"))
1768                        rec->no_buildid_cache = false;
1769                else if (!strcmp(value, "no-cache"))
1770                        rec->no_buildid_cache = true;
1771                else if (!strcmp(value, "skip"))
1772                        rec->no_buildid = true;
1773                else
1774                        return -1;
1775                return 0;
1776        }
1777        if (!strcmp(var, "record.call-graph")) {
1778                var = "call-graph.record-mode";
1779                return perf_default_config(var, value, cb);
1780        }
1781#ifdef HAVE_AIO_SUPPORT
1782        if (!strcmp(var, "record.aio")) {
1783                rec->opts.nr_cblocks = strtol(value, NULL, 0);
1784                if (!rec->opts.nr_cblocks)
1785                        rec->opts.nr_cblocks = nr_cblocks_default;
1786        }
1787#endif
1788
1789        return 0;
1790}
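
    /*
     * perf_record_config() above is fed from the perf config files (e.g.
     * ~/.perfconfig, see 'perf config'). A minimal sketch of the keys it
     * understands:
     *
     *   [record]
     *           build-id   = cache    # or: no-cache, skip
     *           call-graph = dwarf    # forwarded as call-graph.record-mode
     *           aio        = 4        # nr of AIO control blocks (with HAVE_AIO_SUPPORT)
     */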
1791
1792struct clockid_map {
1793        const char *name;
1794        int clockid;
1795};
1796
1797#define CLOCKID_MAP(n, c)       \
1798        { .name = n, .clockid = (c), }
1799
1800#define CLOCKID_END     { .name = NULL, }
1801
1802
1803/*
1804 * Add the missing ones; we need to build on many distros...
1805 */
1806#ifndef CLOCK_MONOTONIC_RAW
1807#define CLOCK_MONOTONIC_RAW 4
1808#endif
1809#ifndef CLOCK_BOOTTIME
1810#define CLOCK_BOOTTIME 7
1811#endif
1812#ifndef CLOCK_TAI
1813#define CLOCK_TAI 11
1814#endif
1815
1816static const struct clockid_map clockids[] = {
1817        /* available for all events, NMI safe */
1818        CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1819        CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1820
1821        /* available for some events */
1822        CLOCKID_MAP("realtime", CLOCK_REALTIME),
1823        CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1824        CLOCKID_MAP("tai", CLOCK_TAI),
1825
1826        /* available for the lazy */
1827        CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1828        CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1829        CLOCKID_MAP("real", CLOCK_REALTIME),
1830        CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1831
1832        CLOCKID_END,
1833};
1834
1835static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1836{
1837        struct timespec res;
1838
1839        *res_ns = 0;
1840        if (!clock_getres(clk_id, &res))
1841                *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1842        else
1843                pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1844
1845        return 0;
1846}
1847
1848static int parse_clockid(const struct option *opt, const char *str, int unset)
1849{
1850        struct record_opts *opts = (struct record_opts *)opt->value;
1851        const struct clockid_map *cm;
1852        const char *ostr = str;
1853
1854        if (unset) {
1855                opts->use_clockid = 0;
1856                return 0;
1857        }
1858
1859        /* no arg passed */
1860        if (!str)
1861                return 0;
1862
1863        /* no setting it twice */
1864        if (opts->use_clockid)
1865                return -1;
1866
1867        opts->use_clockid = true;
1868
1869        /* if it's a number, we're done */
1870        if (sscanf(str, "%d", &opts->clockid) == 1)
1871                return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1872
1873        /* allow a "CLOCK_" prefix to the name */
1874        if (!strncasecmp(str, "CLOCK_", 6))
1875                str += 6;
1876
1877        for (cm = clockids; cm->name; cm++) {
1878                if (!strcasecmp(str, cm->name)) {
1879                        opts->clockid = cm->clockid;
1880                        return get_clockid_res(opts->clockid,
1881                                               &opts->clockid_res_ns);
1882                }
1883        }
1884
1885        opts->use_clockid = false;
1886        ui__warning("unknown clockid %s, check man page\n", ostr);
1887        return -1;
1888}
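
    /*
     * Illustrative -k/--clockid arguments accepted by parse_clockid() above:
     * a raw clockid number, a name from the clockids[] table, or the same
     * name with a "CLOCK_" prefix (workload name is hypothetical):
     *
     *   perf record -k mono ./workload
     *   perf record -k CLOCK_MONOTONIC_RAW ./workload
     *   perf record --clockid=4 ./workload    # 4 == CLOCK_MONOTONIC_RAW
     */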
1889
1890static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1891{
1892        struct record_opts *opts = (struct record_opts *)opt->value;
1893
1894        if (unset || !str)
1895                return 0;
1896
1897        if (!strcasecmp(str, "node"))
1898                opts->affinity = PERF_AFFINITY_NODE;
1899        else if (!strcasecmp(str, "cpu"))
1900                opts->affinity = PERF_AFFINITY_CPU;
1901
1902        return 0;
1903}
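
    /*
     * Illustrative usage of the --affinity option parsed above; the default
     * (PERF_AFFINITY_SYS) is kept unless "node" or "cpu" is given:
     *
     *   perf record --affinity=node -a sleep 10
     *   perf record --affinity=cpu  -a sleep 10
     */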
1904
1905static int record__parse_mmap_pages(const struct option *opt,
1906                                    const char *str,
1907                                    int unset __maybe_unused)
1908{
1909        struct record_opts *opts = opt->value;
1910        char *s, *p;
1911        unsigned int mmap_pages;
1912        int ret;
1913
1914        if (!str)
1915                return -EINVAL;
1916
1917        s = strdup(str);
1918        if (!s)
1919                return -ENOMEM;
1920
1921        p = strchr(s, ',');
1922        if (p)
1923                *p = '\0';
1924
1925        if (*s) {
1926                ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1927                if (ret)
1928                        goto out_free;
1929                opts->mmap_pages = mmap_pages;
1930        }
1931
1932        if (!p) {
1933                ret = 0;
1934                goto out_free;
1935        }
1936
1937        ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1938        if (ret)
1939                goto out_free;
1940
1941        opts->auxtrace_mmap_pages = mmap_pages;
1942
1943out_free:
1944        free(s);
1945        return ret;
1946}
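
    /*
     * Illustrative -m/--mmap-pages arguments handled above: the first value
     * sets the data mmap size, an optional second value after the comma sets
     * the AUX area tracing mmap size, and an empty first value leaves the
     * data mmap size at its default:
     *
     *   perf record -m 512 ...        # 512 data pages
     *   perf record -m 512,128 ...    # 512 data pages, 128 AUX pages
     *   perf record -m ,128 ...       # default data pages, 128 AUX pages
     */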
1947
1948static void switch_output_size_warn(struct record *rec)
1949{
1950        u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1951        struct switch_output *s = &rec->switch_output;
1952
1953        wakeup_size /= 2;
1954
1955        if (s->size < wakeup_size) {
1956                char buf[100];
1957
1958                unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1959                pr_warning("WARNING: switch-output data size is lower than "
1960                           "the wakeup kernel buffer size (%s), "
1961                           "expect bigger perf.data sizes\n", buf);
1962        }
1963}
1964
1965static int switch_output_setup(struct record *rec)
1966{
1967        struct switch_output *s = &rec->switch_output;
1968        static struct parse_tag tags_size[] = {
1969                { .tag  = 'B', .mult = 1       },
1970                { .tag  = 'K', .mult = 1 << 10 },
1971                { .tag  = 'M', .mult = 1 << 20 },
1972                { .tag  = 'G', .mult = 1 << 30 },
1973                { .tag  = 0 },
1974        };
1975        static struct parse_tag tags_time[] = {
1976                { .tag  = 's', .mult = 1        },
1977                { .tag  = 'm', .mult = 60       },
1978                { .tag  = 'h', .mult = 60*60    },
1979                { .tag  = 'd', .mult = 60*60*24 },
1980                { .tag  = 0 },
1981        };
1982        unsigned long val;
1983
1984        if (!s->set)
1985                return 0;
1986
1987        if (!strcmp(s->str, "signal")) {
1988                s->signal = true;
1989                pr_debug("switch-output with SIGUSR2 signal\n");
1990                goto enabled;
1991        }
1992
1993        val = parse_tag_value(s->str, tags_size);
1994        if (val != (unsigned long) -1) {
1995                s->size = val;
1996                pr_debug("switch-output with %s size threshold\n", s->str);
1997                goto enabled;
1998        }
1999
2000        val = parse_tag_value(s->str, tags_time);
2001        if (val != (unsigned long) -1) {
2002                s->time = val;
2003                pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2004                         s->str, s->time);
2005                goto enabled;
2006        }
2007
2008        return -1;
2009
2010enabled:
2011        rec->timestamp_filename = true;
2012        s->enabled              = true;
2013
2014        if (s->size && !rec->opts.no_buffering)
2015                switch_output_size_warn(rec);
2016
2017        return 0;
2018}
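
    /*
     * Illustrative --switch-output arguments accepted by switch_output_setup()
     * above (each form also turns on timestamped output file names):
     *
     *   perf record --switch-output ...          # rotate on SIGUSR2 ("signal" default)
     *   perf record --switch-output=1G ...       # rotate on a size threshold (B/K/M/G)
     *   perf record --switch-output=10m ...      # rotate on a time threshold (s/m/h/d)
     *   perf record --switch-output --switch-max-files 5 ...   # keep at most 5 files
     */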
2019
2020static const char * const __record_usage[] = {
2021        "perf record [<options>] [<command>]",
2022        "perf record [<options>] -- <command> [<options>]",
2023        NULL
2024};
2025const char * const *record_usage = __record_usage;
2026
2027/*
2028 * XXX Ideally would be local to cmd_record() and passed to a record__new
2029 * because we need to have access to it in record__exit, that is called
2030 * after cmd_record() exits, but since record_options need to be accessible to
2031 * builtin-script, leave it here.
2032 *
2033 * At least we don't touch it in all the other functions here directly.
2034 *
2035 * Just say no to tons of global variables, sigh.
2036 */
2037static struct record record = {
2038        .opts = {
2039                .sample_time         = true,
2040                .mmap_pages          = UINT_MAX,
2041                .user_freq           = UINT_MAX,
2042                .user_interval       = ULLONG_MAX,
2043                .freq                = 4000,
2044                .target              = {
2045                        .uses_mmap   = true,
2046                        .default_per_cpu = true,
2047                },
2048                .mmap_flush          = MMAP_FLUSH_DEFAULT,
2049        },
2050        .tool = {
2051                .sample         = process_sample_event,
2052                .fork           = perf_event__process_fork,
2053                .exit           = perf_event__process_exit,
2054                .comm           = perf_event__process_comm,
2055                .namespaces     = perf_event__process_namespaces,
2056                .mmap           = perf_event__process_mmap,
2057                .mmap2          = perf_event__process_mmap2,
2058                .ordered_events = true,
2059        },
2060};
2061
2062const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2063        "\n\t\t\t\tDefault: fp";
2064
2065static bool dry_run;
2066
2067/*
2068 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2069 * with it and switch to using the library functions in perf_evlist that came
2070 * from builtin-record.c, i.e. use record_opts,
2071 * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2072 * using pipes, etc.
2073 */
2074static struct option __record_options[] = {
2075        OPT_CALLBACK('e', "event", &record.evlist, "event",
2076                     "event selector. use 'perf list' to list available events",
2077                     parse_events_option),
2078        OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2079                     "event filter", parse_filter),
2080        OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2081                           NULL, "don't record events from perf itself",
2082                           exclude_perf),
2083        OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2084                    "record events on existing process id"),
2085        OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2086                    "record events on existing thread id"),
2087        OPT_INTEGER('r', "realtime", &record.realtime_prio,
2088                    "collect data with this RT SCHED_FIFO priority"),
2089        OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2090                    "collect data without buffering"),
2091        OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2092                    "collect raw sample records from all opened counters"),
2093        OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2094                            "system-wide collection from all CPUs"),
2095        OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2096                    "list of cpus to monitor"),
2097        OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2098        OPT_STRING('o', "output", &record.data.path, "file",
2099                    "output file name"),
2100        OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2101                        &record.opts.no_inherit_set,
2102                        "child tasks do not inherit counters"),
2103        OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2104                    "synthesize non-sample events at the end of output"),
2105        OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2106        OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2107        OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2108                    "Fail if the specified frequency can't be used"),
2109        OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2110                     "profile at this frequency",
2111                      record__parse_freq),
2112        OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2113                     "number of mmap data pages and AUX area tracing mmap pages",
2114                     record__parse_mmap_pages),
2115        OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2116                     "Minimum number of bytes extracted from mmap data pages (default: 1)",
2117                     record__mmap_flush_parse),
2118        OPT_BOOLEAN(0, "group", &record.opts.group,
2119                    "put the counters into a counter group"),
2120        OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2121                           NULL, "enables call-graph recording",
2122                           &record_callchain_opt),
2123        OPT_CALLBACK(0, "call-graph", &record.opts,
2124                     "record_mode[,record_size]", record_callchain_help,
2125                     &record_parse_callchain_opt),
2126        OPT_INCR('v', "verbose", &verbose,
2127                    "be more verbose (show counter open errors, etc)"),
2128        OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2129        OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2130                    "per thread counts"),
2131        OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2132        OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2133                    "Record the sample physical addresses"),
2134        OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2135        OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2136                        &record.opts.sample_time_set,
2137                        "Record the sample timestamps"),
2138        OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2139                        "Record the sample period"),
2140        OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2141                    "don't sample"),
2142        OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2143                        &record.no_buildid_cache_set,
2144                        "do not update the buildid cache"),
2145        OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2146                        &record.no_buildid_set,
2147                        "do not collect buildids in perf.data"),
2148        OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2149                     "monitor event in cgroup name only",
2150                     parse_cgroups),
2151        OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2152                  "ms to wait before starting measurement after program start"),
2153        OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2154                   "user to profile"),
2155
2156        OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2157                     "branch any", "sample any taken branches",
2158                     parse_branch_stack),
2159
2160        OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2161                     "branch filter mask", "branch stack filter modes",
2162                     parse_branch_stack),
2163        OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2164                    "sample by weight (on special events only)"),
2165        OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2166                    "sample transaction flags (special events only)"),
2167        OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2168                    "use per-thread mmaps"),
2169        OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2170                    "sample selected machine registers on interrupt,"
2171                    " use '-I?' to list register names", parse_intr_regs),
2172        OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2173                    "sample selected user-space machine registers on interrupt,"
2174                    " use '--user-regs=?' to list register names", parse_user_regs),
2175        OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2176                    "Record running/enabled time of read (:S) events"),
2177        OPT_CALLBACK('k', "clockid", &record.opts,
2178        "clockid", "clockid to use for events, see clock_gettime()",
2179        parse_clockid),
2180        OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2181                          "opts", "AUX area tracing Snapshot Mode", ""),
2182        OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2183                        "per thread proc mmap processing timeout in ms"),
2184        OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2185                    "Record namespaces events"),
2186        OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2187                    "Record context switch events"),
2188        OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2189                         "Configure all used events to run in kernel space.",
2190                         PARSE_OPT_EXCLUSIVE),
2191        OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2192                         "Configure all used events to run in user space.",
2193                         PARSE_OPT_EXCLUSIVE),
2194        OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2195                    "collect kernel callchains"),
2196        OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2197                    "collect user callchains"),
2198        OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2199                   "clang binary to use for compiling BPF scriptlets"),
2200        OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2201                   "options passed to clang when compiling BPF scriptlets"),
2202        OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2203                   "file", "vmlinux pathname"),
2204        OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2205                    "Record build-id of all DSOs regardless of hits"),
2206        OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2207                    "append timestamp to output filename"),
2208        OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2209                    "Record timestamp boundary (time of first/last samples)"),
2210        OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2211                          &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2212                          "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2213                          "signal"),
2214        OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2215                   "Limit number of switch output generated files"),
2216        OPT_BOOLEAN(0, "dry-run", &dry_run,
2217                    "Parse options then exit"),
2218#ifdef HAVE_AIO_SUPPORT
2219        OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2220                     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2221                     record__aio_parse),
2222#endif
2223        OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2224                     "Set affinity mask of trace reading thread to the NUMA node CPU mask or the CPU of the processed mmap buffer",
2225                     record__parse_affinity),
2226#ifdef HAVE_ZSTD_SUPPORT
2227        OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2228                            "n", "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2229                            record__parse_comp_level),
2230#endif
2231        OPT_END()
2232};
2233
2234struct option *record_options = __record_options;
2235
2236int cmd_record(int argc, const char **argv)
2237{
2238        int err;
2239        struct record *rec = &record;
2240        char errbuf[BUFSIZ];
2241
2242        setlocale(LC_ALL, "");
2243
2244#ifndef HAVE_LIBBPF_SUPPORT
2245# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2246        set_nobuild('\0', "clang-path", true);
2247        set_nobuild('\0', "clang-opt", true);
2248# undef set_nobuild
2249#endif
2250
2251#ifndef HAVE_BPF_PROLOGUE
2252# if !defined (HAVE_DWARF_SUPPORT)
2253#  define REASON  "NO_DWARF=1"
2254# elif !defined (HAVE_LIBBPF_SUPPORT)
2255#  define REASON  "NO_LIBBPF=1"
2256# else
2257#  define REASON  "this architecture doesn't support BPF prologue"
2258# endif
2259# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2260        set_nobuild('\0', "vmlinux", true);
2261# undef set_nobuild
2262# undef REASON
2263#endif
2264
2265        CPU_ZERO(&rec->affinity_mask);
2266        rec->opts.affinity = PERF_AFFINITY_SYS;
2267
2268        rec->evlist = perf_evlist__new();
2269        if (rec->evlist == NULL)
2270                return -ENOMEM;
2271
2272        err = perf_config(perf_record_config, rec);
2273        if (err)
2274                return err;
2275
2276        argc = parse_options(argc, argv, record_options, record_usage,
2277                            PARSE_OPT_STOP_AT_NON_OPTION);
2278        if (quiet)
2279                perf_quiet_option();
2280
2281        /* Make system wide (-a) the default target. */
2282        if (!argc && target__none(&rec->opts.target))
2283                rec->opts.target.system_wide = true;
2284
2285        if (nr_cgroups && !rec->opts.target.system_wide) {
2286                usage_with_options_msg(record_usage, record_options,
2287                        "cgroup monitoring only available in system-wide mode");
2288
2289        }
2290
2291        if (rec->opts.comp_level != 0) {
2292                pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2293                rec->no_buildid = true;
2294        }
2295
2296        if (rec->opts.record_switch_events &&
2297            !perf_can_record_switch_events()) {
2298                ui__error("kernel does not support recording context switch events\n");
2299                parse_options_usage(record_usage, record_options, "switch-events", 0);
2300                return -EINVAL;
2301        }
2302
2303        if (switch_output_setup(rec)) {
2304                parse_options_usage(record_usage, record_options, "switch-output", 0);
2305                return -EINVAL;
2306        }
2307
2308        if (rec->switch_output.time) {
2309                signal(SIGALRM, alarm_sig_handler);
2310                alarm(rec->switch_output.time);
2311        }
2312
2313        if (rec->switch_output.num_files) {
2314                rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2315                                                      sizeof(char *));
2316                if (!rec->switch_output.filenames)
2317                        return -ENOMEM;
2318        }
2319
2320        /*
2321         * Allow aliases to facilitate the lookup of symbols for address
2322         * filters. Refer to auxtrace_parse_filters().
2323         */
2324        symbol_conf.allow_aliases = true;
2325
2326        symbol__init(NULL);
2327
2328        err = record__auxtrace_init(rec);
2329        if (err)
2330                goto out;
2331
2332        if (dry_run)
2333                goto out;
2334
2335        err = bpf__setup_stdout(rec->evlist);
2336        if (err) {
2337                bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2338                pr_err("ERROR: Setup BPF stdout failed: %s\n",
2339                         errbuf);
2340                goto out;
2341        }
2342
2343        err = -ENOMEM;
2344
2345        if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2346                pr_warning(
2347"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2348"check /proc/sys/kernel/kptr_restrict.\n\n"
2349"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2350"file is not found in the buildid cache or in the vmlinux path.\n\n"
2351"Samples in kernel modules won't be resolved at all.\n\n"
2352"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2353"even with a suitable vmlinux or kallsyms file.\n\n");
2354
2355        if (rec->no_buildid_cache || rec->no_buildid) {
2356                disable_buildid_cache();
2357        } else if (rec->switch_output.enabled) {
2358                /*
2359                 * In 'perf record --switch-output', disable buildid
2360                 * generation by default to reduce data file switching
2361                 * overhead. Still generate buildids if they are explicitly
2362                 * required using
2363                 *
2364                 *  perf record --switch-output --no-no-buildid \
2365                 *              --no-no-buildid-cache
2366                 *
2367                 * Following code equals to:
2368                 *
2369                 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2370                 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2371                 *         disable_buildid_cache();
2372                 */
2373                bool disable = true;
2374
2375                if (rec->no_buildid_set && !rec->no_buildid)
2376                        disable = false;
2377                if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2378                        disable = false;
2379                if (disable) {
2380                        rec->no_buildid = true;
2381                        rec->no_buildid_cache = true;
2382                        disable_buildid_cache();
2383                }
2384        }
2385
2386        if (record.opts.overwrite)
2387                record.opts.tail_synthesize = true;
2388
2389        if (rec->evlist->nr_entries == 0 &&
2390            __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2391                pr_err("Not enough memory for event selector list\n");
2392                goto out;
2393        }
2394
2395        if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2396                rec->opts.no_inherit = true;
2397
2398        err = target__validate(&rec->opts.target);
2399        if (err) {
2400                target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2401                ui__warning("%s\n", errbuf);
2402        }
2403
2404        err = target__parse_uid(&rec->opts.target);
2405        if (err) {
2406                int saved_errno = errno;
2407
2408                target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2409                ui__error("%s", errbuf);
2410
2411                err = -saved_errno;
2412                goto out;
2413        }
2414
2415        /* Enable ignoring missing threads when -u/-p option is defined. */
2416        rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2417
2418        err = -ENOMEM;
2419        if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2420                usage_with_options(record_usage, record_options);
2421
2422        err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2423        if (err)
2424                goto out;
2425
2426        /*
2427         * We take all buildids when the file contains
2428         * AUX area tracing data because we do not decode the
2429         * trace, which would take too long.
2430         */
2431        if (rec->opts.full_auxtrace)
2432                rec->buildid_all = true;
2433
2434        if (record_opts__config(&rec->opts)) {
2435                err = -EINVAL;
2436                goto out;
2437        }
2438
2439        if (rec->opts.nr_cblocks > nr_cblocks_max)
2440                rec->opts.nr_cblocks = nr_cblocks_max;
2441        pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2442
2443        pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2444        pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2445
2446        if (rec->opts.comp_level > comp_level_max)
2447                rec->opts.comp_level = comp_level_max;
2448        pr_debug("comp level: %d\n", rec->opts.comp_level);
2449
2450        err = __cmd_record(&record, argc, argv);
2451out:
2452        perf_evlist__delete(rec->evlist);
2453        symbol__exit();
2454        auxtrace_record__free(rec->itr);
2455        return err;
2456}
2457
2458static void snapshot_sig_handler(int sig __maybe_unused)
2459{
2460        struct record *rec = &record;
2461
2462        if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2463                trigger_hit(&auxtrace_snapshot_trigger);
2464                auxtrace_record__snapshot_started = 1;
2465                if (auxtrace_record__snapshot_start(record.itr))
2466                        trigger_error(&auxtrace_snapshot_trigger);
2467        }
2468
2469        if (switch_output_signal(rec))
2470                trigger_hit(&switch_output_trigger);
2471}
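
    /*
     * Both AUX area snapshots and --switch-output=signal are driven from the
     * handler above; a sketch of triggering it from another terminal, assuming
     * the handler is installed for SIGUSR2 as the --switch-output help text
     * states:
     *
     *   kill -USR2 $(pgrep -x perf)
     */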
2472
2473static void alarm_sig_handler(int sig __maybe_unused)
2474{
2475        struct record *rec = &record;
2476
2477        if (switch_output_time(rec))
2478                trigger_hit(&switch_output_trigger);
2479}
2480