linux/tools/perf/builtin-record.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * builtin-record.c
   4 *
   5 * Builtin record command: Record the profile of a workload
   6 * (or a CPU, or a PID) into the perf.data output file - for
   7 * later analysis via perf report.
   8 */
   9#include "builtin.h"
  10
  11#include "util/build-id.h"
  12#include <subcmd/parse-options.h>
  13#include "util/parse-events.h"
  14#include "util/config.h"
  15
  16#include "util/callchain.h"
  17#include "util/cgroup.h"
  18#include "util/header.h"
  19#include "util/event.h"
  20#include "util/evlist.h"
  21#include "util/evsel.h"
  22#include "util/debug.h"
  23#include "util/mmap.h"
  24#include "util/target.h"
  25#include "util/session.h"
  26#include "util/tool.h"
  27#include "util/symbol.h"
  28#include "util/record.h"
  29#include "util/cpumap.h"
  30#include "util/thread_map.h"
  31#include "util/data.h"
  32#include "util/perf_regs.h"
  33#include "util/auxtrace.h"
  34#include "util/tsc.h"
  35#include "util/parse-branch-options.h"
  36#include "util/parse-regs-options.h"
  37#include "util/llvm-utils.h"
  38#include "util/bpf-loader.h"
  39#include "util/trigger.h"
  40#include "util/perf-hooks.h"
  41#include "util/cpu-set-sched.h"
  42#include "util/synthetic-events.h"
  43#include "util/time-utils.h"
  44#include "util/units.h"
  45#include "util/bpf-event.h"
  46#include "asm/bug.h"
  47#include "perf.h"
  48
  49#include <errno.h>
  50#include <inttypes.h>
  51#include <locale.h>
  52#include <poll.h>
  53#include <unistd.h>
  54#include <sched.h>
  55#include <signal.h>
  56#include <sys/mman.h>
  57#include <sys/wait.h>
  58#include <sys/types.h>
  59#include <sys/stat.h>
  60#include <fcntl.h>
  61#include <linux/err.h>
  62#include <linux/string.h>
  63#include <linux/time64.h>
  64#include <linux/zalloc.h>
  65#include <linux/bitmap.h>
  66
  67struct switch_output {
  68        bool             enabled;
  69        bool             signal;
  70        unsigned long    size;
  71        unsigned long    time;
  72        const char      *str;
  73        bool             set;
  74        char             **filenames;
  75        int              num_files;
  76        int              cur_file;
  77};
  78
  79struct record {
  80        struct perf_tool        tool;
  81        struct record_opts      opts;
  82        u64                     bytes_written;
  83        struct perf_data        data;
  84        struct auxtrace_record  *itr;
  85        struct evlist   *evlist;
  86        struct perf_session     *session;
  87        int                     realtime_prio;
  88        bool                    no_buildid;
  89        bool                    no_buildid_set;
  90        bool                    no_buildid_cache;
  91        bool                    no_buildid_cache_set;
  92        bool                    buildid_all;
  93        bool                    timestamp_filename;
  94        bool                    timestamp_boundary;
  95        struct switch_output    switch_output;
  96        unsigned long long      samples;
  97        struct mmap_cpu_mask    affinity_mask;
  98        unsigned long           output_max_size;        /* = 0: unlimited */
  99};
 100
 101static volatile int done;
 102
 103static volatile int auxtrace_record__snapshot_started;
 104static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
 105static DEFINE_TRIGGER(switch_output_trigger);
 106
 107static const char *affinity_tags[PERF_AFFINITY_MAX] = {
 108        "SYS", "NODE", "CPU"
 109};
 110
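    /*
     * Helpers telling which condition drives switching of the output file:
     * a signal, the amount of data written so far, or a time interval.
     */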
 111static bool switch_output_signal(struct record *rec)
 112{
 113        return rec->switch_output.signal &&
 114               trigger_is_ready(&switch_output_trigger);
 115}
 116
 117static bool switch_output_size(struct record *rec)
 118{
 119        return rec->switch_output.size &&
 120               trigger_is_ready(&switch_output_trigger) &&
 121               (rec->bytes_written >= rec->switch_output.size);
 122}
 123
 124static bool switch_output_time(struct record *rec)
 125{
 126        return rec->switch_output.time &&
 127               trigger_is_ready(&switch_output_trigger);
 128}
 129
 130static bool record__output_max_size_exceeded(struct record *rec)
 131{
 132        return rec->output_max_size &&
 133               (rec->bytes_written >= rec->output_max_size);
 134}
 135
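    /*
     * Write a block of data to the perf.data file, accounting the bytes
     * written so the output size limit and size-based output switching
     * can be checked after each write.
     */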
 136static int record__write(struct record *rec, struct mmap *map __maybe_unused,
 137                         void *bf, size_t size)
 138{
 139        struct perf_data_file *file = &rec->session->data->file;
 140
 141        if (perf_data_file__write(file, bf, size) < 0) {
 142                pr_err("failed to write perf data, error: %m\n");
 143                return -1;
 144        }
 145
 146        rec->bytes_written += size;
 147
 148        if (record__output_max_size_exceeded(rec) && !done) {
 149                fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
 150                                " stopping session ]\n",
 151                                rec->bytes_written >> 10);
 152                done = 1;
 153        }
 154
 155        if (switch_output_size(rec))
 156                trigger_hit(&switch_output_trigger);
 157
 158        return 0;
 159}
 160
 161static int record__aio_enabled(struct record *rec);
 162static int record__comp_enabled(struct record *rec);
 163static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
 164                            void *src, size_t src_size);
 165
 166#ifdef HAVE_AIO_SUPPORT
 167static int record__aio_write(struct aiocb *cblock, int trace_fd,
 168                void *buf, size_t size, off_t off)
 169{
 170        int rc;
 171
 172        cblock->aio_fildes = trace_fd;
 173        cblock->aio_buf    = buf;
 174        cblock->aio_nbytes = size;
 175        cblock->aio_offset = off;
 176        cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
 177
 178        do {
 179                rc = aio_write(cblock);
 180                if (rc == 0) {
 181                        break;
 182                } else if (errno != EAGAIN) {
 183                        cblock->aio_fildes = -1;
 184                        pr_err("failed to queue perf data, error: %m\n");
 185                        break;
 186                }
 187        } while (1);
 188
 189        return rc;
 190}
 191
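    /*
     * Check the state of a previously queued aio write: return 0 while it
     * is still in flight, restart it with the remaining bytes on a short
     * write, and drop the mmap reference once the request has fully
     * completed.
     */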
 192static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
 193{
 194        void *rem_buf;
 195        off_t rem_off;
 196        size_t rem_size;
 197        int rc, aio_errno;
 198        ssize_t aio_ret, written;
 199
 200        aio_errno = aio_error(cblock);
 201        if (aio_errno == EINPROGRESS)
 202                return 0;
 203
 204        written = aio_ret = aio_return(cblock);
 205        if (aio_ret < 0) {
 206                if (aio_errno != EINTR)
 207                        pr_err("failed to write perf data, error: %m\n");
 208                written = 0;
 209        }
 210
 211        rem_size = cblock->aio_nbytes - written;
 212
 213        if (rem_size == 0) {
 214                cblock->aio_fildes = -1;
 215                /*
 216                 * md->refcount is incremented in record__aio_pushfn() for
 217                 * every aio write request started in record__aio_push() so
 218                 * decrement it because the request is now complete.
 219                 */
 220                perf_mmap__put(&md->core);
 221                rc = 1;
 222        } else {
 223                /*
  224                 * The aio write request may require a restart with the
  225                 * remainder if the kernel didn't write the whole
  226                 * chunk at once.
 227                 */
 228                rem_off = cblock->aio_offset + written;
 229                rem_buf = (void *)(cblock->aio_buf + written);
 230                record__aio_write(cblock, cblock->aio_fildes,
 231                                rem_buf, rem_size, rem_off);
 232                rc = 0;
 233        }
 234
 235        return rc;
 236}
 237
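    /*
     * Wait for in-flight aio writes on this mmap: with sync_all, loop until
     * every control block has completed; otherwise return the index of the
     * first free control block so it can be reused for the next write.
     */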
 238static int record__aio_sync(struct mmap *md, bool sync_all)
 239{
 240        struct aiocb **aiocb = md->aio.aiocb;
 241        struct aiocb *cblocks = md->aio.cblocks;
 242        struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
 243        int i, do_suspend;
 244
 245        do {
 246                do_suspend = 0;
 247                for (i = 0; i < md->aio.nr_cblocks; ++i) {
 248                        if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
 249                                if (sync_all)
 250                                        aiocb[i] = NULL;
 251                                else
 252                                        return i;
 253                        } else {
 254                                /*
  255                                 * The started aio write is not complete yet,
  256                                 * so it has to be waited on before the
  257                                 * next allocation.
 258                                 */
 259                                aiocb[i] = &cblocks[i];
 260                                do_suspend = 1;
 261                        }
 262                }
 263                if (!do_suspend)
 264                        return -1;
 265
 266                while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
 267                        if (!(errno == EAGAIN || errno == EINTR))
 268                                pr_err("failed to sync perf data, error: %m\n");
 269                }
 270        } while (1);
 271}
 272
 273struct record_aio {
 274        struct record   *rec;
 275        void            *data;
 276        size_t          size;
 277};
 278
 279static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
 280{
 281        struct record_aio *aio = to;
 282
 283        /*
  284         * map->core.base data pointed to by buf is copied into a free map->aio.data[]
  285         * buffer to release space in the kernel buffer as fast as possible, by calling
  286         * perf_mmap__consume() from the perf_mmap__push() function.
  287         *
  288         * That lets the kernel proceed with storing more profiling data into
  289         * the kernel buffer earlier than other per-cpu kernel buffers are handled.
  290         *
  291         * Copying can be done in two steps in case the chunk of profiling data
  292         * crosses the upper bound of the kernel buffer. In this case we first move
  293         * the part of the data from map->start to the upper bound and then the
  294         * remainder from the beginning of the kernel buffer to the end of the data chunk.
 295         */
 296
 297        if (record__comp_enabled(aio->rec)) {
 298                size = zstd_compress(aio->rec->session, aio->data + aio->size,
 299                                     mmap__mmap_len(map) - aio->size,
 300                                     buf, size);
 301        } else {
 302                memcpy(aio->data + aio->size, buf, size);
 303        }
 304
 305        if (!aio->size) {
 306                /*
 307                 * Increment map->refcount to guard map->aio.data[] buffer
 308                 * from premature deallocation because map object can be
 309                 * released earlier than aio write request started on
 310                 * map->aio.data[] buffer is complete.
 311                 *
 312                 * perf_mmap__put() is done at record__aio_complete()
 313                 * after started aio request completion or at record__aio_push()
 314                 * if the request failed to start.
 315                 */
 316                perf_mmap__get(&map->core);
 317        }
 318
 319        aio->size += size;
 320
 321        return size;
 322}
 323
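    /*
     * Queue the data from one mmap for asynchronous writing: copy (or
     * compress) it into an aio buffer, start the aio write at *off and
     * advance *off by the amount queued.
     */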
 324static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
 325{
 326        int ret, idx;
 327        int trace_fd = rec->session->data->file.fd;
 328        struct record_aio aio = { .rec = rec, .size = 0 };
 329
 330        /*
  331         * Call record__aio_sync() to wait until a map->aio.data[] buffer
  332         * becomes available after the previous aio write operation.
 333         */
 334
 335        idx = record__aio_sync(map, false);
 336        aio.data = map->aio.data[idx];
 337        ret = perf_mmap__push(map, &aio, record__aio_pushfn);
 338        if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
 339                return ret;
 340
 341        rec->samples++;
 342        ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
 343        if (!ret) {
 344                *off += aio.size;
 345                rec->bytes_written += aio.size;
 346                if (switch_output_size(rec))
 347                        trigger_hit(&switch_output_trigger);
 348        } else {
 349                /*
  350                 * Decrement the map->refcount taken in record__aio_pushfn()
  351                 * if the record__aio_write() operation failed to start; otherwise
  352                 * map->refcount is decremented in record__aio_complete() after the
  353                 * aio write operation finishes successfully.
 354                 */
 355                perf_mmap__put(&map->core);
 356        }
 357
 358        return ret;
 359}
 360
 361static off_t record__aio_get_pos(int trace_fd)
 362{
 363        return lseek(trace_fd, 0, SEEK_CUR);
 364}
 365
 366static void record__aio_set_pos(int trace_fd, off_t pos)
 367{
 368        lseek(trace_fd, pos, SEEK_SET);
 369}
 370
 371static void record__aio_mmap_read_sync(struct record *rec)
 372{
 373        int i;
 374        struct evlist *evlist = rec->evlist;
 375        struct mmap *maps = evlist->mmap;
 376
 377        if (!record__aio_enabled(rec))
 378                return;
 379
 380        for (i = 0; i < evlist->core.nr_mmaps; i++) {
 381                struct mmap *map = &maps[i];
 382
 383                if (map->core.base)
 384                        record__aio_sync(map, true);
 385        }
 386}
 387
 388static int nr_cblocks_default = 1;
 389static int nr_cblocks_max = 4;
 390
 391static int record__aio_parse(const struct option *opt,
 392                             const char *str,
 393                             int unset)
 394{
 395        struct record_opts *opts = (struct record_opts *)opt->value;
 396
 397        if (unset) {
 398                opts->nr_cblocks = 0;
 399        } else {
 400                if (str)
 401                        opts->nr_cblocks = strtol(str, NULL, 0);
 402                if (!opts->nr_cblocks)
 403                        opts->nr_cblocks = nr_cblocks_default;
 404        }
 405
 406        return 0;
 407}
 408#else /* HAVE_AIO_SUPPORT */
 409static int nr_cblocks_max = 0;
 410
 411static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
 412                            off_t *off __maybe_unused)
 413{
 414        return -1;
 415}
 416
 417static off_t record__aio_get_pos(int trace_fd __maybe_unused)
 418{
 419        return -1;
 420}
 421
 422static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
 423{
 424}
 425
 426static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
 427{
 428}
 429#endif
 430
 431static int record__aio_enabled(struct record *rec)
 432{
 433        return rec->opts.nr_cblocks > 0;
 434}
 435
 436#define MMAP_FLUSH_DEFAULT 1
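    /*
     * Parse the mmap flush threshold: accept a plain number or a B/K/M/G
     * suffixed size, and clamp the result to a quarter of the mmap
     * buffer size.
     */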
 437static int record__mmap_flush_parse(const struct option *opt,
 438                                    const char *str,
 439                                    int unset)
 440{
 441        int flush_max;
 442        struct record_opts *opts = (struct record_opts *)opt->value;
 443        static struct parse_tag tags[] = {
 444                        { .tag  = 'B', .mult = 1       },
 445                        { .tag  = 'K', .mult = 1 << 10 },
 446                        { .tag  = 'M', .mult = 1 << 20 },
 447                        { .tag  = 'G', .mult = 1 << 30 },
 448                        { .tag  = 0 },
 449        };
 450
 451        if (unset)
 452                return 0;
 453
 454        if (str) {
 455                opts->mmap_flush = parse_tag_value(str, tags);
 456                if (opts->mmap_flush == (int)-1)
 457                        opts->mmap_flush = strtol(str, NULL, 0);
 458        }
 459
 460        if (!opts->mmap_flush)
 461                opts->mmap_flush = MMAP_FLUSH_DEFAULT;
 462
 463        flush_max = evlist__mmap_size(opts->mmap_pages);
 464        flush_max /= 4;
 465        if (opts->mmap_flush > flush_max)
 466                opts->mmap_flush = flush_max;
 467
 468        return 0;
 469}
 470
 471#ifdef HAVE_ZSTD_SUPPORT
 472static unsigned int comp_level_default = 1;
 473
 474static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
 475{
 476        struct record_opts *opts = opt->value;
 477
 478        if (unset) {
 479                opts->comp_level = 0;
 480        } else {
 481                if (str)
 482                        opts->comp_level = strtol(str, NULL, 0);
 483                if (!opts->comp_level)
 484                        opts->comp_level = comp_level_default;
 485        }
 486
 487        return 0;
 488}
 489#endif
 490static unsigned int comp_level_max = 22;
 491
 492static int record__comp_enabled(struct record *rec)
 493{
 494        return rec->opts.comp_level > 0;
 495}
 496
 497static int process_synthesized_event(struct perf_tool *tool,
 498                                     union perf_event *event,
 499                                     struct perf_sample *sample __maybe_unused,
 500                                     struct machine *machine __maybe_unused)
 501{
 502        struct record *rec = container_of(tool, struct record, tool);
 503        return record__write(rec, NULL, event, event->header.size);
 504}
 505
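    /*
     * perf_mmap__push() callback used in the synchronous (non-aio) path:
     * optionally compress the chunk into the map's data buffer, then write
     * it to the output file.
     */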
 506static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
 507{
 508        struct record *rec = to;
 509
 510        if (record__comp_enabled(rec)) {
 511                size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
 512                bf   = map->data;
 513        }
 514
 515        rec->samples++;
 516        return record__write(rec, map, bf, size);
 517}
 518
 519static volatile int signr = -1;
 520static volatile int child_finished;
 521
 522static void sig_handler(int sig)
 523{
 524        if (sig == SIGCHLD)
 525                child_finished = 1;
 526        else
 527                signr = sig;
 528
 529        done = 1;
 530}
 531
 532static void sigsegv_handler(int sig)
 533{
 534        perf_hooks__recover();
 535        sighandler_dump_stack(sig);
 536}
 537
 538static void record__sig_exit(void)
 539{
 540        if (signr == -1)
 541                return;
 542
 543        signal(signr, SIG_DFL);
 544        raise(signr);
 545}
 546
 547#ifdef HAVE_AUXTRACE_SUPPORT
 548
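    /*
     * Write one AUX area trace event followed by its data payload, padding
     * the payload to an 8-byte boundary; for non-pipe, single-file output
     * also record the file offset in the auxtrace index.
     */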
 549static int record__process_auxtrace(struct perf_tool *tool,
 550                                    struct mmap *map,
 551                                    union perf_event *event, void *data1,
 552                                    size_t len1, void *data2, size_t len2)
 553{
 554        struct record *rec = container_of(tool, struct record, tool);
 555        struct perf_data *data = &rec->data;
 556        size_t padding;
 557        u8 pad[8] = {0};
 558
 559        if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
 560                off_t file_offset;
 561                int fd = perf_data__fd(data);
 562                int err;
 563
 564                file_offset = lseek(fd, 0, SEEK_CUR);
 565                if (file_offset == -1)
 566                        return -1;
 567                err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
 568                                                     event, file_offset);
 569                if (err)
 570                        return err;
 571        }
 572
 573        /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
 574        padding = (len1 + len2) & 7;
 575        if (padding)
 576                padding = 8 - padding;
 577
 578        record__write(rec, map, event, event->header.size);
 579        record__write(rec, map, data1, len1);
 580        if (len2)
 581                record__write(rec, map, data2, len2);
 582        record__write(rec, map, &pad, padding);
 583
 584        return 0;
 585}
 586
 587static int record__auxtrace_mmap_read(struct record *rec,
 588                                      struct mmap *map)
 589{
 590        int ret;
 591
 592        ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
 593                                  record__process_auxtrace);
 594        if (ret < 0)
 595                return ret;
 596
 597        if (ret)
 598                rec->samples++;
 599
 600        return 0;
 601}
 602
 603static int record__auxtrace_mmap_read_snapshot(struct record *rec,
 604                                               struct mmap *map)
 605{
 606        int ret;
 607
 608        ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
 609                                           record__process_auxtrace,
 610                                           rec->opts.auxtrace_snapshot_size);
 611        if (ret < 0)
 612                return ret;
 613
 614        if (ret)
 615                rec->samples++;
 616
 617        return 0;
 618}
 619
 620static int record__auxtrace_read_snapshot_all(struct record *rec)
 621{
 622        int i;
 623        int rc = 0;
 624
 625        for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
 626                struct mmap *map = &rec->evlist->mmap[i];
 627
 628                if (!map->auxtrace_mmap.base)
 629                        continue;
 630
 631                if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
 632                        rc = -1;
 633                        goto out;
 634                }
 635        }
 636out:
 637        return rc;
 638}
 639
 640static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
 641{
 642        pr_debug("Recording AUX area tracing snapshot\n");
 643        if (record__auxtrace_read_snapshot_all(rec) < 0) {
 644                trigger_error(&auxtrace_snapshot_trigger);
 645        } else {
 646                if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
 647                        trigger_error(&auxtrace_snapshot_trigger);
 648                else
 649                        trigger_ready(&auxtrace_snapshot_trigger);
 650        }
 651}
 652
 653static int record__auxtrace_snapshot_exit(struct record *rec)
 654{
 655        if (trigger_is_error(&auxtrace_snapshot_trigger))
 656                return 0;
 657
 658        if (!auxtrace_record__snapshot_started &&
 659            auxtrace_record__snapshot_start(rec->itr))
 660                return -1;
 661
 662        record__read_auxtrace_snapshot(rec, true);
 663        if (trigger_is_error(&auxtrace_snapshot_trigger))
 664                return -1;
 665
 666        return 0;
 667}
 668
 669static int record__auxtrace_init(struct record *rec)
 670{
 671        int err;
 672
 673        if (!rec->itr) {
 674                rec->itr = auxtrace_record__init(rec->evlist, &err);
 675                if (err)
 676                        return err;
 677        }
 678
 679        err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
 680                                              rec->opts.auxtrace_snapshot_opts);
 681        if (err)
 682                return err;
 683
 684        err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
 685                                            rec->opts.auxtrace_sample_opts);
 686        if (err)
 687                return err;
 688
 689        return auxtrace_parse_filters(rec->evlist);
 690}
 691
 692#else
 693
 694static inline
 695int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
 696                               struct mmap *map __maybe_unused)
 697{
 698        return 0;
 699}
 700
 701static inline
 702void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
 703                                    bool on_exit __maybe_unused)
 704{
 705}
 706
 707static inline
 708int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
 709{
 710        return 0;
 711}
 712
 713static inline
 714int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
 715{
 716        return 0;
 717}
 718
 719static int record__auxtrace_init(struct record *rec __maybe_unused)
 720{
 721        return 0;
 722}
 723
 724#endif
 725
 726static bool record__kcore_readable(struct machine *machine)
 727{
 728        char kcore[PATH_MAX];
 729        int fd;
 730
 731        scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
 732
 733        fd = open(kcore, O_RDONLY);
 734        if (fd < 0)
 735                return false;
 736
 737        close(fd);
 738
 739        return true;
 740}
 741
 742static int record__kcore_copy(struct machine *machine, struct perf_data *data)
 743{
 744        char from_dir[PATH_MAX];
 745        char kcore_dir[PATH_MAX];
 746        int ret;
 747
 748        snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
 749
 750        ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
 751        if (ret)
 752                return ret;
 753
 754        return kcore_copy(from_dir, kcore_dir);
 755}
 756
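    /*
     * mmap the event ring buffers for the evlist, honoring the AUX area,
     * aio, affinity, flush and compression settings from record_opts, and
     * print a hint about perf_event_mlock_kb when EPERM is returned.
     */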
 757static int record__mmap_evlist(struct record *rec,
 758                               struct evlist *evlist)
 759{
 760        struct record_opts *opts = &rec->opts;
 761        bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
 762                                  opts->auxtrace_sample_mode;
 763        char msg[512];
 764
 765        if (opts->affinity != PERF_AFFINITY_SYS)
 766                cpu__setup_cpunode_map();
 767
 768        if (evlist__mmap_ex(evlist, opts->mmap_pages,
 769                                 opts->auxtrace_mmap_pages,
 770                                 auxtrace_overwrite,
 771                                 opts->nr_cblocks, opts->affinity,
 772                                 opts->mmap_flush, opts->comp_level) < 0) {
 773                if (errno == EPERM) {
 774                        pr_err("Permission error mapping pages.\n"
 775                               "Consider increasing "
 776                               "/proc/sys/kernel/perf_event_mlock_kb,\n"
 777                               "or try again with a smaller value of -m/--mmap_pages.\n"
 778                               "(current value: %u,%u)\n",
 779                               opts->mmap_pages, opts->auxtrace_mmap_pages);
 780                        return -errno;
 781                } else {
 782                        pr_err("failed to mmap with %d (%s)\n", errno,
 783                                str_error_r(errno, msg, sizeof(msg)));
 784                        if (errno)
 785                                return -errno;
 786                        else
 787                                return -EINVAL;
 788                }
 789        }
 790        return 0;
 791}
 792
 793static int record__mmap(struct record *rec)
 794{
 795        return record__mmap_evlist(rec, rec->evlist);
 796}
 797
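    /*
     * Open all events in the evlist, retrying with fallbacks or with weak
     * group members removed when an open fails, then mmap the ring buffers
     * and attach the evlist to the session.
     */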
 798static int record__open(struct record *rec)
 799{
 800        char msg[BUFSIZ];
 801        struct evsel *pos;
 802        struct evlist *evlist = rec->evlist;
 803        struct perf_session *session = rec->session;
 804        struct record_opts *opts = &rec->opts;
 805        int rc = 0;
 806
 807        /*
 808         * For initial_delay we need to add a dummy event so that we can track
 809         * PERF_RECORD_MMAP while we wait for the initial delay to enable the
  810         * real events, the ones asked for by the user.
 811         */
 812        if (opts->initial_delay) {
 813                if (perf_evlist__add_dummy(evlist))
 814                        return -ENOMEM;
 815
 816                pos = evlist__first(evlist);
 817                pos->tracking = 0;
 818                pos = evlist__last(evlist);
 819                pos->tracking = 1;
 820                pos->core.attr.enable_on_exec = 1;
 821        }
 822
 823        perf_evlist__config(evlist, opts, &callchain_param);
 824
 825        evlist__for_each_entry(evlist, pos) {
 826try_again:
 827                if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
 828                        if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
 829                                if (verbose > 0)
 830                                        ui__warning("%s\n", msg);
 831                                goto try_again;
 832                        }
 833                        if ((errno == EINVAL || errno == EBADF) &&
 834                            pos->leader != pos &&
 835                            pos->weak_group) {
 836                                pos = perf_evlist__reset_weak_group(evlist, pos, true);
 837                                goto try_again;
 838                        }
 839                        rc = -errno;
 840                        perf_evsel__open_strerror(pos, &opts->target,
 841                                                  errno, msg, sizeof(msg));
 842                        ui__error("%s\n", msg);
 843                        goto out;
 844                }
 845
 846                pos->supported = true;
 847        }
 848
 849        if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
 850                pr_warning(
 851"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
 852"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
 853"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
 854"file is not found in the buildid cache or in the vmlinux path.\n\n"
 855"Samples in kernel modules won't be resolved at all.\n\n"
 856"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
 857"even with a suitable vmlinux or kallsyms file.\n\n");
 858        }
 859
 860        if (perf_evlist__apply_filters(evlist, &pos)) {
 861                pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
 862                        pos->filter, perf_evsel__name(pos), errno,
 863                        str_error_r(errno, msg, sizeof(msg)));
 864                rc = -1;
 865                goto out;
 866        }
 867
 868        rc = record__mmap(rec);
 869        if (rc)
 870                goto out;
 871
 872        session->evlist = evlist;
 873        perf_session__set_id_hdr_size(session);
 874out:
 875        return rc;
 876}
 877
 878static int process_sample_event(struct perf_tool *tool,
 879                                union perf_event *event,
 880                                struct perf_sample *sample,
 881                                struct evsel *evsel,
 882                                struct machine *machine)
 883{
 884        struct record *rec = container_of(tool, struct record, tool);
 885
 886        if (rec->evlist->first_sample_time == 0)
 887                rec->evlist->first_sample_time = sample->time;
 888
 889        rec->evlist->last_sample_time = sample->time;
 890
 891        if (rec->buildid_all)
 892                return 0;
 893
 894        rec->samples++;
 895        return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
 896}
 897
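    /*
     * Post-process the recorded data to collect build-ids for the DSOs that
     * were hit by samples (or for all of them with --buildid-all).
     */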
 898static int process_buildids(struct record *rec)
 899{
 900        struct perf_session *session = rec->session;
 901
 902        if (perf_data__size(&rec->data) == 0)
 903                return 0;
 904
 905        /*
  906         * During this process, it'll load the kernel map and replace
  907         * dso->long_name with the real pathname it found.  In this case
  908         * we prefer a vmlinux path like
  909         *   /lib/modules/3.16.4/build/vmlinux
  910         *
  911         * rather than the build-id path (in the debug directory):
 912         *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
 913         */
 914        symbol_conf.ignore_vmlinux_buildid = true;
 915
 916        /*
  917         * If --buildid-all is given, it marks all DSOs regardless of hits,
  918         * so there is no need to process samples. But if timestamp_boundary is
  919         * enabled, it still needs to walk all samples to get the timestamps of
  920         * the first/last samples.
 921         */
 922        if (rec->buildid_all && !rec->timestamp_boundary)
 923                rec->tool.sample = NULL;
 924
 925        return perf_session__process_events(session);
 926}
 927
 928static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
 929{
 930        int err;
 931        struct perf_tool *tool = data;
 932        /*
  933         * As for the guest kernel, when processing the record & report
  934         * subcommands we arrange the module mmaps prior to the guest kernel
  935         * mmap and trigger a dso preload, because default guest module symbols
  936         * are loaded from guest kallsyms instead of /lib/modules/XXX/XXX. This
  937         * method is used to avoid missing symbols when the first address is
  938         * in a module instead of in the guest kernel.
 939         */
 940        err = perf_event__synthesize_modules(tool, process_synthesized_event,
 941                                             machine);
 942        if (err < 0)
 943                pr_err("Couldn't record guest kernel [%d]'s reference"
 944                       " relocation symbol.\n", machine->pid);
 945
 946        /*
  947         * We use _stext for the guest kernel because the guest kernel's
  948         * /proc/kallsyms sometimes has no _text.
 949         */
 950        err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
 951                                                 machine);
 952        if (err < 0)
 953                pr_err("Couldn't record guest kernel [%d]'s reference"
 954                       " relocation symbol.\n", machine->pid);
 955}
 956
 957static struct perf_event_header finished_round_event = {
 958        .size = sizeof(struct perf_event_header),
 959        .type = PERF_RECORD_FINISHED_ROUND,
 960};
 961
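    /*
     * When the affinity mode is not the default (system), move the recording
     * thread next to the memory of the mmap being read by adjusting its CPU
     * affinity mask.
     */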
 962static void record__adjust_affinity(struct record *rec, struct mmap *map)
 963{
 964        if (rec->opts.affinity != PERF_AFFINITY_SYS &&
 965            !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
 966                          rec->affinity_mask.nbits)) {
 967                bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
 968                bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
 969                          map->affinity_mask.bits, rec->affinity_mask.nbits);
 970                sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
 971                                  (cpu_set_t *)rec->affinity_mask.bits);
 972                if (verbose == 2)
 973                        mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
 974        }
 975}
 976
 977static size_t process_comp_header(void *record, size_t increment)
 978{
 979        struct perf_record_compressed *event = record;
 980        size_t size = sizeof(*event);
 981
 982        if (increment) {
 983                event->header.size += increment;
 984                return increment;
 985        }
 986
 987        event->header.type = PERF_RECORD_COMPRESSED;
 988        event->header.size = size;
 989
 990        return size;
 991}
 992
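    /*
     * Compress a chunk of trace data into one or more PERF_RECORD_COMPRESSED
     * records, updating the session's transferred and compressed byte
     * counters.
     */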
 993static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
 994                            void *src, size_t src_size)
 995{
 996        size_t compressed;
 997        size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
 998
 999        compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1000                                                     max_record_size, process_comp_header);
1001
1002        session->bytes_transferred += src_size;
1003        session->bytes_compressed  += compressed;
1004
1005        return compressed;
1006}
1007
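    /*
     * Drain the evlist's ring buffers (regular or overwrite maps), pushing
     * the data out either synchronously or via aio, reading the AUX area
     * maps as needed, and emit a PERF_RECORD_FINISHED_ROUND once something
     * was written.
     */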
1008static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1009                                    bool overwrite, bool synch)
1010{
1011        u64 bytes_written = rec->bytes_written;
1012        int i;
1013        int rc = 0;
1014        struct mmap *maps;
1015        int trace_fd = rec->data.file.fd;
1016        off_t off = 0;
1017
1018        if (!evlist)
1019                return 0;
1020
1021        maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1022        if (!maps)
1023                return 0;
1024
1025        if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1026                return 0;
1027
1028        if (record__aio_enabled(rec))
1029                off = record__aio_get_pos(trace_fd);
1030
1031        for (i = 0; i < evlist->core.nr_mmaps; i++) {
1032                u64 flush = 0;
1033                struct mmap *map = &maps[i];
1034
1035                if (map->core.base) {
1036                        record__adjust_affinity(rec, map);
1037                        if (synch) {
1038                                flush = map->core.flush;
1039                                map->core.flush = 1;
1040                        }
1041                        if (!record__aio_enabled(rec)) {
1042                                if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1043                                        if (synch)
1044                                                map->core.flush = flush;
1045                                        rc = -1;
1046                                        goto out;
1047                                }
1048                        } else {
1049                                if (record__aio_push(rec, map, &off) < 0) {
1050                                        record__aio_set_pos(trace_fd, off);
1051                                        if (synch)
1052                                                map->core.flush = flush;
1053                                        rc = -1;
1054                                        goto out;
1055                                }
1056                        }
1057                        if (synch)
1058                                map->core.flush = flush;
1059                }
1060
1061                if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1062                    !rec->opts.auxtrace_sample_mode &&
1063                    record__auxtrace_mmap_read(rec, map) != 0) {
1064                        rc = -1;
1065                        goto out;
1066                }
1067        }
1068
1069        if (record__aio_enabled(rec))
1070                record__aio_set_pos(trace_fd, off);
1071
1072        /*
1073         * Mark the round finished in case we wrote
1074         * at least one event.
1075         */
1076        if (bytes_written != rec->bytes_written)
1077                rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1078
1079        if (overwrite)
1080                perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1081out:
1082        return rc;
1083}
1084
1085static int record__mmap_read_all(struct record *rec, bool synch)
1086{
1087        int err;
1088
1089        err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1090        if (err)
1091                return err;
1092
1093        return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1094}
1095
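    /*
     * Start from all header features set, then clear the ones that do not
     * apply to this session (build-ids, tracing data, branch stack, etc.).
     */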
1096static void record__init_features(struct record *rec)
1097{
1098        struct perf_session *session = rec->session;
1099        int feat;
1100
1101        for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1102                perf_header__set_feat(&session->header, feat);
1103
1104        if (rec->no_buildid)
1105                perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1106
1107        if (!have_tracepoints(&rec->evlist->core.entries))
1108                perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1109
1110        if (!rec->opts.branch_stack)
1111                perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1112
1113        if (!rec->opts.full_auxtrace)
1114                perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1115
1116        if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1117                perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1118
1119        perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1120        if (!record__comp_enabled(rec))
1121                perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1122
1123        perf_header__clear_feat(&session->header, HEADER_STAT);
1124}
1125
1126static void
1127record__finish_output(struct record *rec)
1128{
1129        struct perf_data *data = &rec->data;
1130        int fd = perf_data__fd(data);
1131
1132        if (data->is_pipe)
1133                return;
1134
1135        rec->session->header.data_size += rec->bytes_written;
1136        data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1137
1138        if (!rec->no_buildid) {
1139                process_buildids(rec);
1140
1141                if (rec->buildid_all)
1142                        dsos__hit_all(rec->session);
1143        }
1144        perf_session__write_header(rec->session, rec->evlist, fd, true);
1145
1146        return;
1147}
1148
1149static int record__synthesize_workload(struct record *rec, bool tail)
1150{
1151        int err;
1152        struct perf_thread_map *thread_map;
1153
1154        if (rec->opts.tail_synthesize != tail)
1155                return 0;
1156
1157        thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1158        if (thread_map == NULL)
1159                return -1;
1160
1161        err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1162                                                 process_synthesized_event,
1163                                                 &rec->session->machines.host,
1164                                                 rec->opts.sample_address);
1165        perf_thread_map__put(thread_map);
1166        return err;
1167}
1168
1169static int record__synthesize(struct record *rec, bool tail);
1170
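    /*
     * Finalize the current perf.data file and switch to a new,
     * timestamp-suffixed one, keeping at most switch_output.num_files
     * old files around when that limit is configured.
     */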
1171static int
1172record__switch_output(struct record *rec, bool at_exit)
1173{
1174        struct perf_data *data = &rec->data;
1175        int fd, err;
1176        char *new_filename;
1177
 1178        /* Same size as a real timestamp: "2015122520103046" */
1179        char timestamp[] = "InvalidTimestamp";
1180
1181        record__aio_mmap_read_sync(rec);
1182
1183        record__synthesize(rec, true);
1184        if (target__none(&rec->opts.target))
1185                record__synthesize_workload(rec, true);
1186
1187        rec->samples = 0;
1188        record__finish_output(rec);
1189        err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1190        if (err) {
1191                pr_err("Failed to get current timestamp\n");
1192                return -EINVAL;
1193        }
1194
1195        fd = perf_data__switch(data, timestamp,
1196                                    rec->session->header.data_offset,
1197                                    at_exit, &new_filename);
1198        if (fd >= 0 && !at_exit) {
1199                rec->bytes_written = 0;
1200                rec->session->header.data_size = 0;
1201        }
1202
1203        if (!quiet)
1204                fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1205                        data->path, timestamp);
1206
1207        if (rec->switch_output.num_files) {
1208                int n = rec->switch_output.cur_file + 1;
1209
1210                if (n >= rec->switch_output.num_files)
1211                        n = 0;
1212                rec->switch_output.cur_file = n;
1213                if (rec->switch_output.filenames[n]) {
1214                        remove(rec->switch_output.filenames[n]);
1215                        zfree(&rec->switch_output.filenames[n]);
1216                }
1217                rec->switch_output.filenames[n] = new_filename;
1218        } else {
1219                free(new_filename);
1220        }
1221
1222        /* Output tracking events */
1223        if (!at_exit) {
1224                record__synthesize(rec, false);
1225
1226                /*
1227                 * In 'perf record --switch-output' without -a,
1228                 * record__synthesize() in record__switch_output() won't
1229                 * generate tracking events because there's no thread_map
 1230                 * in the evlist. As a result, the newly created perf.data
 1231                 * doesn't contain map and comm information.
1232                 * Create a fake thread_map and directly call
1233                 * perf_event__synthesize_thread_map() for those events.
1234                 */
1235                if (target__none(&rec->opts.target))
1236                        record__synthesize_workload(rec, false);
1237        }
1238        return fd;
1239}
1240
1241static volatile int workload_exec_errno;
1242
1243/*
1244 * perf_evlist__prepare_workload will send a SIGUSR1
 1245 * if the fork fails, since we asked for it by setting its
1246 * want_signal to true.
1247 */
1248static void workload_exec_failed_signal(int signo __maybe_unused,
1249                                        siginfo_t *info,
1250                                        void *ucontext __maybe_unused)
1251{
1252        workload_exec_errno = info->si_value.sival_int;
1253        done = 1;
1254        child_finished = 1;
1255}
1256
1257static void snapshot_sig_handler(int sig);
1258static void alarm_sig_handler(int sig);
1259
1260static const struct perf_event_mmap_page *
1261perf_evlist__pick_pc(struct evlist *evlist)
1262{
1263        if (evlist) {
1264                if (evlist->mmap && evlist->mmap[0].core.base)
1265                        return evlist->mmap[0].core.base;
1266                if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1267                        return evlist->overwrite_mmap[0].core.base;
1268        }
1269        return NULL;
1270}
1271
1272static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1273{
1274        const struct perf_event_mmap_page *pc;
1275
1276        pc = perf_evlist__pick_pc(rec->evlist);
1277        if (pc)
1278                return pc;
1279        return NULL;
1280}
1281
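    /*
     * Emit the synthetic (non-hardware) events that describe the system
     * state: attributes and tracing data for pipe output, kernel and module
     * maps, thread and cpu maps, bpf and cgroup events, and the existing
     * threads of the target.
     */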
1282static int record__synthesize(struct record *rec, bool tail)
1283{
1284        struct perf_session *session = rec->session;
1285        struct machine *machine = &session->machines.host;
1286        struct perf_data *data = &rec->data;
1287        struct record_opts *opts = &rec->opts;
1288        struct perf_tool *tool = &rec->tool;
1289        int fd = perf_data__fd(data);
1290        int err = 0;
1291
1292        if (rec->opts.tail_synthesize != tail)
1293                return 0;
1294
1295        if (data->is_pipe) {
1296                /*
1297                 * We need to synthesize events first, because some
 1298                 * features work on top of them (on the report side).
1299                 */
1300                err = perf_event__synthesize_attrs(tool, rec->evlist,
1301                                                   process_synthesized_event);
1302                if (err < 0) {
1303                        pr_err("Couldn't synthesize attrs.\n");
1304                        goto out;
1305                }
1306
1307                err = perf_event__synthesize_features(tool, session, rec->evlist,
1308                                                      process_synthesized_event);
1309                if (err < 0) {
1310                        pr_err("Couldn't synthesize features.\n");
1311                        return err;
1312                }
1313
1314                if (have_tracepoints(&rec->evlist->core.entries)) {
1315                        /*
1316                         * FIXME err <= 0 here actually means that
 1317                         * there were no tracepoints so it's not really
 1318                         * an error, just that we don't need to
 1319                         * synthesize anything.  We really have to
 1320                         * return this more properly and also
 1321                         * propagate errors that are now calling die()
1322                         */
1323                        err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1324                                                                  process_synthesized_event);
1325                        if (err <= 0) {
1326                                pr_err("Couldn't record tracing data.\n");
1327                                goto out;
1328                        }
1329                        rec->bytes_written += err;
1330                }
1331        }
1332
1333        err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1334                                          process_synthesized_event, machine);
1335        if (err)
1336                goto out;
1337
1338        /* Synthesize id_index before auxtrace_info */
1339        if (rec->opts.auxtrace_sample_mode) {
1340                err = perf_event__synthesize_id_index(tool,
1341                                                      process_synthesized_event,
1342                                                      session->evlist, machine);
1343                if (err)
1344                        goto out;
1345        }
1346
1347        if (rec->opts.full_auxtrace) {
1348                err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1349                                        session, process_synthesized_event);
1350                if (err)
1351                        goto out;
1352        }
1353
1354        if (!perf_evlist__exclude_kernel(rec->evlist)) {
1355                err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1356                                                         machine);
1357                WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1358                                   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1359                                   "Check /proc/kallsyms permission or run as root.\n");
1360
1361                err = perf_event__synthesize_modules(tool, process_synthesized_event,
1362                                                     machine);
1363                WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1364                                   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1365                                   "Check /proc/modules permission or run as root.\n");
1366        }
1367
1368        if (perf_guest) {
1369                machines__process_guests(&session->machines,
1370                                         perf_event__synthesize_guest_os, tool);
1371        }
1372
1373        err = perf_event__synthesize_extra_attr(&rec->tool,
1374                                                rec->evlist,
1375                                                process_synthesized_event,
1376                                                data->is_pipe);
1377        if (err)
1378                goto out;
1379
1380        err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1381                                                 process_synthesized_event,
1382                                                NULL);
1383        if (err < 0) {
1384                pr_err("Couldn't synthesize thread map.\n");
1385                return err;
1386        }
1387
1388        err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1389                                             process_synthesized_event, NULL);
1390        if (err < 0) {
1391                pr_err("Couldn't synthesize cpu map.\n");
1392                return err;
1393        }
1394
1395        err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1396                                                machine, opts);
1397        if (err < 0)
1398                pr_warning("Couldn't synthesize bpf events.\n");
1399
1400        err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1401                                             machine);
1402        if (err < 0)
1403                pr_warning("Couldn't synthesize cgroup events.\n");
1404
1405        err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1406                                            process_synthesized_event, opts->sample_address,
1407                                            1);
1408out:
1409        return err;
1410}
1411
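    /*
     * The record command proper: set up signal handling, the session, the
     * output header and the workload (if one was given), emit the initial
     * synthetic events and then enter the main recording loop.
     */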
1412static int __cmd_record(struct record *rec, int argc, const char **argv)
1413{
1414        int err;
1415        int status = 0;
1416        unsigned long waking = 0;
1417        const bool forks = argc > 0;
1418        struct perf_tool *tool = &rec->tool;
1419        struct record_opts *opts = &rec->opts;
1420        struct perf_data *data = &rec->data;
1421        struct perf_session *session;
1422        bool disabled = false, draining = false;
1423        struct evlist *sb_evlist = NULL;
1424        int fd;
1425        float ratio = 0;
1426
1427        atexit(record__sig_exit);
1428        signal(SIGCHLD, sig_handler);
1429        signal(SIGINT, sig_handler);
1430        signal(SIGTERM, sig_handler);
1431        signal(SIGSEGV, sigsegv_handler);
1432
1433        if (rec->opts.record_namespaces)
1434                tool->namespace_events = true;
1435
1436        if (rec->opts.record_cgroup) {
1437#ifdef HAVE_FILE_HANDLE
1438                tool->cgroup_events = true;
1439#else
1440                pr_err("cgroup tracking is not supported\n");
1441                return -1;
1442#endif
1443        }
1444
1445        if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1446                signal(SIGUSR2, snapshot_sig_handler);
1447                if (rec->opts.auxtrace_snapshot_mode)
1448                        trigger_on(&auxtrace_snapshot_trigger);
1449                if (rec->switch_output.enabled)
1450                        trigger_on(&switch_output_trigger);
1451        } else {
1452                signal(SIGUSR2, SIG_IGN);
1453        }
1454
1455        session = perf_session__new(data, false, tool);
1456        if (IS_ERR(session)) {
1457                pr_err("Perf session creation failed.\n");
1458                return PTR_ERR(session);
1459        }
1460
1461        fd = perf_data__fd(data);
1462        rec->session = session;
1463
1464        if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1465                pr_err("Compression initialization failed.\n");
1466                return -1;
1467        }
1468
1469        session->header.env.comp_type  = PERF_COMP_ZSTD;
1470        session->header.env.comp_level = rec->opts.comp_level;
1471
1472        if (rec->opts.kcore &&
1473            !record__kcore_readable(&session->machines.host)) {
1474                pr_err("ERROR: kcore is not readable.\n");
1475                return -1;
1476        }
1477
1478        record__init_features(rec);
1479
1480        if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1481                session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1482
1483        if (forks) {
1484                err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1485                                                    argv, data->is_pipe,
1486                                                    workload_exec_failed_signal);
1487                if (err < 0) {
1488                        pr_err("Couldn't run the workload!\n");
1489                        status = err;
1490                        goto out_delete_session;
1491                }
1492        }
1493
1494        /*
 1495         * If we have just a single event and are sending data
 1496         * through a pipe, we need to force id allocation,
 1497         * because we synthesize the event name through the pipe
1498         * and need the id for that.
1499         */
1500        if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1501                rec->opts.sample_id = true;
1502
1503        if (record__open(rec) != 0) {
1504                err = -1;
1505                goto out_child;
1506        }
1507        session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1508
1509        if (rec->opts.kcore) {
1510                err = record__kcore_copy(&session->machines.host, data);
1511                if (err) {
1512                        pr_err("ERROR: Failed to copy kcore\n");
1513                        goto out_child;
1514                }
1515        }
1516
1517        err = bpf__apply_obj_config();
1518        if (err) {
1519                char errbuf[BUFSIZ];
1520
1521                bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1522                pr_err("ERROR: Apply config to BPF failed: %s\n",
1523                         errbuf);
1524                goto out_child;
1525        }
1526
1527        /*
1528         * Normally perf_session__new would do this, but it doesn't have the
1529         * evlist.
1530         */
1531        if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1532                pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1533                rec->tool.ordered_events = false;
1534        }
1535
1536        if (!rec->evlist->nr_groups)
1537                perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1538
1539        if (data->is_pipe) {
1540                err = perf_header__write_pipe(fd);
1541                if (err < 0)
1542                        goto out_child;
1543        } else {
1544                err = perf_session__write_header(session, rec->evlist, fd, false);
1545                if (err < 0)
1546                        goto out_child;
1547        }
1548
1549        if (!rec->no_buildid
1550            && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1551                pr_err("Couldn't generate buildids. "
1552                       "Use --no-buildid to profile anyway.\n");
1553                err = -1;
1554                goto out_child;
1555        }
1556
1557        if (!opts->no_bpf_event)
1558                bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1559
1560        if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1561                pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1562                opts->no_bpf_event = true;
1563        }
1564
1565        err = record__synthesize(rec, false);
1566        if (err < 0)
1567                goto out_child;
1568
1569        if (rec->realtime_prio) {
1570                struct sched_param param;
1571
1572                param.sched_priority = rec->realtime_prio;
1573                if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1574                        pr_err("Could not set realtime priority.\n");
1575                        err = -1;
1576                        goto out_child;
1577                }
1578        }
1579
1580        /*
1581         * When perf is starting the traced process, all the events
1582         * (apart from group members) have enable_on_exec=1 set,
1583         * so don't spoil it by prematurely enabling them.
1584         */
1585        if (!target__none(&opts->target) && !opts->initial_delay)
1586                evlist__enable(rec->evlist);
1587
1588        /*
1589         * Let the child rip
1590         */
1591        if (forks) {
1592                struct machine *machine = &session->machines.host;
1593                union perf_event *event;
1594                pid_t tgid;
1595
1596                event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1597                if (event == NULL) {
1598                        err = -ENOMEM;
1599                        goto out_child;
1600                }
1601
1602                /*
1603                 * Some H/W events are generated before the COMM event,
1604                 * which is emitted during exec(), so perf script
1605                 * cannot see the correct process name for those events.
1606                 * Synthesize a COMM event to prevent that.
1607                 */
1608                tgid = perf_event__synthesize_comm(tool, event,
1609                                                   rec->evlist->workload.pid,
1610                                                   process_synthesized_event,
1611                                                   machine);
1612                free(event);
1613
1614                if (tgid == -1)
1615                        goto out_child;
1616
1617                event = malloc(sizeof(event->namespaces) +
1618                               (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1619                               machine->id_hdr_size);
1620                if (event == NULL) {
1621                        err = -ENOMEM;
1622                        goto out_child;
1623                }
1624
1625                /*
1626                 * Synthesize NAMESPACES event for the command specified.
1627                 */
1628                perf_event__synthesize_namespaces(tool, event,
1629                                                  rec->evlist->workload.pid,
1630                                                  tgid, process_synthesized_event,
1631                                                  machine);
1632                free(event);
1633
1634                perf_evlist__start_workload(rec->evlist);
1635        }
1636
1637        if (opts->initial_delay) {
1638                usleep(opts->initial_delay * USEC_PER_MSEC);
1639                evlist__enable(rec->evlist);
1640        }
1641
1642        trigger_ready(&auxtrace_snapshot_trigger);
1643        trigger_ready(&switch_output_trigger);
1644        perf_hooks__invoke_record_start();
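        /*
         * Editor's note (summary of the loop below; the code is authoritative):
         * each iteration drains the mmap ring buffers, services a pending AUX
         * area snapshot if one was requested, rotates the output file when the
         * switch-output trigger fires, and otherwise polls for new data when
         * no samples arrived in this round.  The loop ends once 'done' is set
         * by a signal handler (or all pollfds have drained) and no new samples
         * arrived in the last pass.
         */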
1645        for (;;) {
1646                unsigned long long hits = rec->samples;
1647
1648                /*
1649                 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1650                 * here: when done == true and hits != rec->samples
1651                 * in the previous round.
1652                 *
1653                 * perf_evlist__toggle_bkw_mmap() ensures we never
1654                 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1655                 */
1656                if (trigger_is_hit(&switch_output_trigger) || done || draining)
1657                        perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1658
1659                if (record__mmap_read_all(rec, false) < 0) {
1660                        trigger_error(&auxtrace_snapshot_trigger);
1661                        trigger_error(&switch_output_trigger);
1662                        err = -1;
1663                        goto out_child;
1664                }
1665
1666                if (auxtrace_record__snapshot_started) {
1667                        auxtrace_record__snapshot_started = 0;
1668                        if (!trigger_is_error(&auxtrace_snapshot_trigger))
1669                                record__read_auxtrace_snapshot(rec, false);
1670                        if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1671                                pr_err("AUX area tracing snapshot failed\n");
1672                                err = -1;
1673                                goto out_child;
1674                        }
1675                }
1676
1677                if (trigger_is_hit(&switch_output_trigger)) {
1678                        /*
1679                         * If switch_output_trigger is hit, the data in the
1680                         * overwritable ring buffer should have been collected,
1681                         * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1682                         *
1683                         * If SIGUSR2 is raised after or during record__mmap_read_all(),
1684                         * record__mmap_read_all() didn't collect data from the
1685                         * overwritable ring buffer. Read again.
1686                         */
1687                        if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1688                                continue;
1689                        trigger_ready(&switch_output_trigger);
1690
1691                        /*
1692                         * Reenable events in overwrite ring buffer after
1693                         * record__mmap_read_all(): we should have collected
1694                         * data from it.
1695                         */
1696                        perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1697
1698                        if (!quiet)
1699                                fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1700                                        waking);
1701                        waking = 0;
1702                        fd = record__switch_output(rec, false);
1703                        if (fd < 0) {
1704                                pr_err("Failed to switch to new file\n");
1705                                trigger_error(&switch_output_trigger);
1706                                err = fd;
1707                                goto out_child;
1708                        }
1709
1710                        /* re-arm the alarm */
1711                        if (rec->switch_output.time)
1712                                alarm(rec->switch_output.time);
1713                }
1714
1715                if (hits == rec->samples) {
1716                        if (done || draining)
1717                                break;
1718                        err = evlist__poll(rec->evlist, -1);
1719                        /*
1720                         * Propagate the error only if there is one. Ignore a
1721                         * positive number of returned events and interruption (EINTR).
1722                         */
1723                        if (err > 0 || (err < 0 && errno == EINTR))
1724                                err = 0;
1725                        waking++;
1726
1727                        if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1728                                draining = true;
1729                }
1730
1731                /*
1732                 * When perf is starting the traced process, the events die
1733                 * with the process at the end and we wait for that. Thus there
1734                 * is no need to disable events in this case.
1735                 */
1736                if (done && !disabled && !target__none(&opts->target)) {
1737                        trigger_off(&auxtrace_snapshot_trigger);
1738                        evlist__disable(rec->evlist);
1739                        disabled = true;
1740                }
1741        }
1742
1743        trigger_off(&auxtrace_snapshot_trigger);
1744        trigger_off(&switch_output_trigger);
1745
1746        if (opts->auxtrace_snapshot_on_exit)
1747                record__auxtrace_snapshot_exit(rec);
1748
1749        if (forks && workload_exec_errno) {
1750                char msg[STRERR_BUFSIZE];
1751                const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1752                pr_err("Workload failed: %s\n", emsg);
1753                err = -1;
1754                goto out_child;
1755        }
1756
1757        if (!quiet)
1758                fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1759
1760        if (target__none(&rec->opts.target))
1761                record__synthesize_workload(rec, true);
1762
1763out_child:
1764        record__mmap_read_all(rec, true);
1765        record__aio_mmap_read_sync(rec);
1766
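        /*
         * Editor's note: comp_ratio is stored as an integer in the header, so
         * adding 0.5 before the assignment rounds the float ratio to the
         * nearest integer when it is truncated.
         */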
1767        if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1768                ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1769                session->header.env.comp_ratio = ratio + 0.5;
1770        }
1771
1772        if (forks) {
1773                int exit_status;
1774
1775                if (!child_finished)
1776                        kill(rec->evlist->workload.pid, SIGTERM);
1777
1778                wait(&exit_status);
1779
1780                if (err < 0)
1781                        status = err;
1782                else if (WIFEXITED(exit_status))
1783                        status = WEXITSTATUS(exit_status);
1784                else if (WIFSIGNALED(exit_status))
1785                        signr = WTERMSIG(exit_status);
1786        } else
1787                status = err;
1788
1789        record__synthesize(rec, true);
1790        /* this will be recalculated during process_buildids() */
1791        rec->samples = 0;
1792
1793        if (!err) {
1794                if (!rec->timestamp_filename) {
1795                        record__finish_output(rec);
1796                } else {
1797                        fd = record__switch_output(rec, true);
1798                        if (fd < 0) {
1799                                status = fd;
1800                                goto out_delete_session;
1801                        }
1802                }
1803        }
1804
1805        perf_hooks__invoke_record_end();
1806
1807        if (!err && !quiet) {
1808                char samples[128];
1809                const char *postfix = rec->timestamp_filename ?
1810                                        ".<timestamp>" : "";
1811
1812                if (rec->samples && !rec->opts.full_auxtrace)
1813                        scnprintf(samples, sizeof(samples),
1814                                  " (%" PRIu64 " samples)", rec->samples);
1815                else
1816                        samples[0] = '\0';
1817
1818                fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1819                        perf_data__size(data) / 1024.0 / 1024.0,
1820                        data->path, postfix, samples);
1821                if (ratio) {
1822                        fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1823                                        rec->session->bytes_transferred / 1024.0 / 1024.0,
1824                                        ratio);
1825                }
1826                fprintf(stderr, " ]\n");
1827        }
1828
1829out_delete_session:
1830        zstd_fini(&session->zstd_data);
1831        perf_session__delete(session);
1832
1833        if (!opts->no_bpf_event)
1834                perf_evlist__stop_sb_thread(sb_evlist);
1835        return status;
1836}
1837
1838static void callchain_debug(struct callchain_param *callchain)
1839{
1840        static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1841
1842        pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1843
1844        if (callchain->record_mode == CALLCHAIN_DWARF)
1845                pr_debug("callchain: stack dump size %d\n",
1846                         callchain->dump_size);
1847}
1848
1849int record_opts__parse_callchain(struct record_opts *record,
1850                                 struct callchain_param *callchain,
1851                                 const char *arg, bool unset)
1852{
1853        int ret;
1854        callchain->enabled = !unset;
1855
1856        /* --no-call-graph */
1857        if (unset) {
1858                callchain->record_mode = CALLCHAIN_NONE;
1859                pr_debug("callchain: disabled\n");
1860                return 0;
1861        }
1862
1863        ret = parse_callchain_record_opt(arg, callchain);
1864        if (!ret) {
1865                /* Enable data address sampling for DWARF unwind. */
1866                if (callchain->record_mode == CALLCHAIN_DWARF)
1867                        record->sample_address = true;
1868                callchain_debug(callchain);
1869        }
1870
1871        return ret;
1872}
1873
1874int record_parse_callchain_opt(const struct option *opt,
1875                               const char *arg,
1876                               int unset)
1877{
1878        return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1879}
1880
1881int record_callchain_opt(const struct option *opt,
1882                         const char *arg __maybe_unused,
1883                         int unset __maybe_unused)
1884{
1885        struct callchain_param *callchain = opt->value;
1886
1887        callchain->enabled = true;
1888
1889        if (callchain->record_mode == CALLCHAIN_NONE)
1890                callchain->record_mode = CALLCHAIN_FP;
1891
1892        callchain_debug(callchain);
1893        return 0;
1894}
1895
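/*
 * Editor's note: this handles the 'record.*' section of perf's config file.
 * An illustrative ~/.perfconfig snippet (values are examples only):
 *
 *     [record]
 *         build-id = no-cache    # one of: cache, no-cache, skip
 *         call-graph = dwarf     # forwarded as call-graph.record-mode
 *         aio = 4                # nr of AIO control blocks (HAVE_AIO_SUPPORT only)
 */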
1896static int perf_record_config(const char *var, const char *value, void *cb)
1897{
1898        struct record *rec = cb;
1899
1900        if (!strcmp(var, "record.build-id")) {
1901                if (!strcmp(value, "cache"))
1902                        rec->no_buildid_cache = false;
1903                else if (!strcmp(value, "no-cache"))
1904                        rec->no_buildid_cache = true;
1905                else if (!strcmp(value, "skip"))
1906                        rec->no_buildid = true;
1907                else
1908                        return -1;
1909                return 0;
1910        }
1911        if (!strcmp(var, "record.call-graph")) {
1912                var = "call-graph.record-mode";
1913                return perf_default_config(var, value, cb);
1914        }
1915#ifdef HAVE_AIO_SUPPORT
1916        if (!strcmp(var, "record.aio")) {
1917                rec->opts.nr_cblocks = strtol(value, NULL, 0);
1918                if (!rec->opts.nr_cblocks)
1919                        rec->opts.nr_cblocks = nr_cblocks_default;
1920        }
1921#endif
1922
1923        return 0;
1924}
1925
1926struct clockid_map {
1927        const char *name;
1928        int clockid;
1929};
1930
1931#define CLOCKID_MAP(n, c)       \
1932        { .name = n, .clockid = (c), }
1933
1934#define CLOCKID_END     { .name = NULL, }
1935
1936
1937/*
1938 * Add the missing ones, we need to build on many distros...
1939 */
1940#ifndef CLOCK_MONOTONIC_RAW
1941#define CLOCK_MONOTONIC_RAW 4
1942#endif
1943#ifndef CLOCK_BOOTTIME
1944#define CLOCK_BOOTTIME 7
1945#endif
1946#ifndef CLOCK_TAI
1947#define CLOCK_TAI 11
1948#endif
1949
1950static const struct clockid_map clockids[] = {
1951        /* available for all events, NMI safe */
1952        CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1953        CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1954
1955        /* available for some events */
1956        CLOCKID_MAP("realtime", CLOCK_REALTIME),
1957        CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1958        CLOCKID_MAP("tai", CLOCK_TAI),
1959
1960        /* available for the lazy */
1961        CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1962        CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1963        CLOCKID_MAP("real", CLOCK_REALTIME),
1964        CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1965
1966        CLOCKID_END,
1967};
1968
1969static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1970{
1971        struct timespec res;
1972
1973        *res_ns = 0;
1974        if (!clock_getres(clk_id, &res))
1975                *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1976        else
1977                pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1978
1979        return 0;
1980}
1981
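/*
 * Editor's note: -k/--clockid accepts either a raw clockid number or one of
 * the names from clockids[] above, optionally prefixed with "CLOCK_", e.g.
 * 'perf record -k monotonic_raw ...' or 'perf record -k CLOCK_BOOTTIME ...'.
 */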
1982static int parse_clockid(const struct option *opt, const char *str, int unset)
1983{
1984        struct record_opts *opts = (struct record_opts *)opt->value;
1985        const struct clockid_map *cm;
1986        const char *ostr = str;
1987
1988        if (unset) {
1989                opts->use_clockid = 0;
1990                return 0;
1991        }
1992
1993        /* no arg passed */
1994        if (!str)
1995                return 0;
1996
1997        /* no setting it twice */
1998        if (opts->use_clockid)
1999                return -1;
2000
2001        opts->use_clockid = true;
2002
2003        /* if it's a number, we're done */
2004        if (sscanf(str, "%d", &opts->clockid) == 1)
2005                return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
2006
2007        /* allow a "CLOCK_" prefix to the name */
2008        if (!strncasecmp(str, "CLOCK_", 6))
2009                str += 6;
2010
2011        for (cm = clockids; cm->name; cm++) {
2012                if (!strcasecmp(str, cm->name)) {
2013                        opts->clockid = cm->clockid;
2014                        return get_clockid_res(opts->clockid,
2015                                               &opts->clockid_res_ns);
2016                }
2017        }
2018
2019        opts->use_clockid = false;
2020        ui__warning("unknown clockid %s, check man page\n", ostr);
2021        return -1;
2022}
2023
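/*
 * Editor's note: --affinity accepts "node" or "cpu"; any other value silently
 * keeps the default PERF_AFFINITY_SYS set in cmd_record().
 */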
2024static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2025{
2026        struct record_opts *opts = (struct record_opts *)opt->value;
2027
2028        if (unset || !str)
2029                return 0;
2030
2031        if (!strcasecmp(str, "node"))
2032                opts->affinity = PERF_AFFINITY_NODE;
2033        else if (!strcasecmp(str, "cpu"))
2034                opts->affinity = PERF_AFFINITY_CPU;
2035
2036        return 0;
2037}
2038
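/*
 * Editor's note: --max-size takes a size with a B/K/M/G suffix, e.g.
 * '--max-size=200M'; an unset value of 0 means unlimited (see struct record).
 */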
2039static int parse_output_max_size(const struct option *opt,
2040                                 const char *str, int unset)
2041{
2042        unsigned long *s = (unsigned long *)opt->value;
2043        static struct parse_tag tags_size[] = {
2044                { .tag  = 'B', .mult = 1       },
2045                { .tag  = 'K', .mult = 1 << 10 },
2046                { .tag  = 'M', .mult = 1 << 20 },
2047                { .tag  = 'G', .mult = 1 << 30 },
2048                { .tag  = 0 },
2049        };
2050        unsigned long val;
2051
2052        if (unset) {
2053                *s = 0;
2054                return 0;
2055        }
2056
2057        val = parse_tag_value(str, tags_size);
2058        if (val != (unsigned long) -1) {
2059                *s = val;
2060                return 0;
2061        }
2062
2063        return -1;
2064}
2065
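/*
 * Editor's note: -m/--mmap-pages takes "pages[,pages]"; the first value sizes
 * the data mmaps and the optional second one the AUX area tracing mmaps,
 * e.g. 'perf record -m 512,128 ...'.
 */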
2066static int record__parse_mmap_pages(const struct option *opt,
2067                                    const char *str,
2068                                    int unset __maybe_unused)
2069{
2070        struct record_opts *opts = opt->value;
2071        char *s, *p;
2072        unsigned int mmap_pages;
2073        int ret;
2074
2075        if (!str)
2076                return -EINVAL;
2077
2078        s = strdup(str);
2079        if (!s)
2080                return -ENOMEM;
2081
2082        p = strchr(s, ',');
2083        if (p)
2084                *p = '\0';
2085
2086        if (*s) {
2087                ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
2088                if (ret)
2089                        goto out_free;
2090                opts->mmap_pages = mmap_pages;
2091        }
2092
2093        if (!p) {
2094                ret = 0;
2095                goto out_free;
2096        }
2097
2098        ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
2099        if (ret)
2100                goto out_free;
2101
2102        opts->auxtrace_mmap_pages = mmap_pages;
2103
2104out_free:
2105        free(s);
2106        return ret;
2107}
2108
2109static void switch_output_size_warn(struct record *rec)
2110{
2111        u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2112        struct switch_output *s = &rec->switch_output;
2113
2114        wakeup_size /= 2;
2115
2116        if (s->size < wakeup_size) {
2117                char buf[100];
2118
2119                unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2120                pr_warning("WARNING: switch-output data size is lower than the "
2121                           "wakeup kernel buffer size (%s), "
2122                           "expect bigger perf.data sizes\n", buf);
2123        }
2124}
2125
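/*
 * Editor's note: --switch-output accepts "signal" (rotate on SIGUSR2), a size
 * threshold such as "100M", or a time threshold such as "30s"; enabling it
 * also turns on timestamped output filenames.
 */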
2126static int switch_output_setup(struct record *rec)
2127{
2128        struct switch_output *s = &rec->switch_output;
2129        static struct parse_tag tags_size[] = {
2130                { .tag  = 'B', .mult = 1       },
2131                { .tag  = 'K', .mult = 1 << 10 },
2132                { .tag  = 'M', .mult = 1 << 20 },
2133                { .tag  = 'G', .mult = 1 << 30 },
2134                { .tag  = 0 },
2135        };
2136        static struct parse_tag tags_time[] = {
2137                { .tag  = 's', .mult = 1        },
2138                { .tag  = 'm', .mult = 60       },
2139                { .tag  = 'h', .mult = 60*60    },
2140                { .tag  = 'd', .mult = 60*60*24 },
2141                { .tag  = 0 },
2142        };
2143        unsigned long val;
2144
2145        if (!s->set)
2146                return 0;
2147
2148        if (!strcmp(s->str, "signal")) {
2149                s->signal = true;
2150                pr_debug("switch-output with SIGUSR2 signal\n");
2151                goto enabled;
2152        }
2153
2154        val = parse_tag_value(s->str, tags_size);
2155        if (val != (unsigned long) -1) {
2156                s->size = val;
2157                pr_debug("switch-output with %s size threshold\n", s->str);
2158                goto enabled;
2159        }
2160
2161        val = parse_tag_value(s->str, tags_time);
2162        if (val != (unsigned long) -1) {
2163                s->time = val;
2164                pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2165                         s->str, s->time);
2166                goto enabled;
2167        }
2168
2169        return -1;
2170
2171enabled:
2172        rec->timestamp_filename = true;
2173        s->enabled              = true;
2174
2175        if (s->size && !rec->opts.no_buffering)
2176                switch_output_size_warn(rec);
2177
2178        return 0;
2179}
2180
2181static const char * const __record_usage[] = {
2182        "perf record [<options>] [<command>]",
2183        "perf record [<options>] -- <command> [<options>]",
2184        NULL
2185};
2186const char * const *record_usage = __record_usage;
2187
2188static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2189                                  struct perf_sample *sample, struct machine *machine)
2190{
2191        /*
2192         * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2193         * so there is no need to add them twice.
2194         */
2195        if (!(event->header.misc & PERF_RECORD_MISC_USER))
2196                return 0;
2197        return perf_event__process_mmap(tool, event, sample, machine);
2198}
2199
2200static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2201                                   struct perf_sample *sample, struct machine *machine)
2202{
2203        /*
2204         * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2205         * so there is no need to add them twice.
2206         */
2207        if (!(event->header.misc & PERF_RECORD_MISC_USER))
2208                return 0;
2209
2210        return perf_event__process_mmap2(tool, event, sample, machine);
2211}
2212
2213/*
2214 * XXX Ideally this would be local to cmd_record() and passed to a record__new()
2215 * because we need access to it in record__exit(), which is called
2216 * after cmd_record() exits, but since record_options needs to be accessible to
2217 * builtin-script, leave it here.
2218 *
2219 * At least we don't touch it in all the other functions here directly.
2220 *
2221 * Just say no to tons of global variables, sigh.
2222 */
2223static struct record record = {
2224        .opts = {
2225                .sample_time         = true,
2226                .mmap_pages          = UINT_MAX,
2227                .user_freq           = UINT_MAX,
2228                .user_interval       = ULLONG_MAX,
2229                .freq                = 4000,
2230                .target              = {
2231                        .uses_mmap   = true,
2232                        .default_per_cpu = true,
2233                },
2234                .mmap_flush          = MMAP_FLUSH_DEFAULT,
2235        },
2236        .tool = {
2237                .sample         = process_sample_event,
2238                .fork           = perf_event__process_fork,
2239                .exit           = perf_event__process_exit,
2240                .comm           = perf_event__process_comm,
2241                .namespaces     = perf_event__process_namespaces,
2242                .mmap           = build_id__process_mmap,
2243                .mmap2          = build_id__process_mmap2,
2244                .ordered_events = true,
2245        },
2246};
2247
2248const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2249        "\n\t\t\t\tDefault: fp";
2250
2251static bool dry_run;
2252
2253/*
2254 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2255 * with it and switch to using the library functions in perf_evlist that came
2256 * from builtin-record.c, i.e. use record_opts,
2257 * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2258 * using pipes, etc.
2259 */
2260static struct option __record_options[] = {
2261        OPT_CALLBACK('e', "event", &record.evlist, "event",
2262                     "event selector. use 'perf list' to list available events",
2263                     parse_events_option),
2264        OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2265                     "event filter", parse_filter),
2266        OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2267                           NULL, "don't record events from perf itself",
2268                           exclude_perf),
2269        OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2270                    "record events on existing process id"),
2271        OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2272                    "record events on existing thread id"),
2273        OPT_INTEGER('r', "realtime", &record.realtime_prio,
2274                    "collect data with this RT SCHED_FIFO priority"),
2275        OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2276                    "collect data without buffering"),
2277        OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2278                    "collect raw sample records from all opened counters"),
2279        OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2280                            "system-wide collection from all CPUs"),
2281        OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2282                    "list of cpus to monitor"),
2283        OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2284        OPT_STRING('o', "output", &record.data.path, "file",
2285                    "output file name"),
2286        OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2287                        &record.opts.no_inherit_set,
2288                        "child tasks do not inherit counters"),
2289        OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2290                    "synthesize non-sample events at the end of output"),
2291        OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2292        OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
2293        OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2294                    "Fail if the specified frequency can't be used"),
2295        OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2296                     "profile at this frequency",
2297                      record__parse_freq),
2298        OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2299                     "number of mmap data pages and AUX area tracing mmap pages",
2300                     record__parse_mmap_pages),
2301        OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2302                     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2303                     record__mmap_flush_parse),
2304        OPT_BOOLEAN(0, "group", &record.opts.group,
2305                    "put the counters into a counter group"),
2306        OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2307                           NULL, "enables call-graph recording" ,
2308                           &record_callchain_opt),
2309        OPT_CALLBACK(0, "call-graph", &record.opts,
2310                     "record_mode[,record_size]", record_callchain_help,
2311                     &record_parse_callchain_opt),
2312        OPT_INCR('v', "verbose", &verbose,
2313                    "be more verbose (show counter open errors, etc)"),
2314        OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2315        OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2316                    "per thread counts"),
2317        OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2318        OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2319                    "Record the sample physical addresses"),
2320        OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2321        OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2322                        &record.opts.sample_time_set,
2323                        "Record the sample timestamps"),
2324        OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2325                        "Record the sample period"),
2326        OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2327                    "don't sample"),
2328        OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2329                        &record.no_buildid_cache_set,
2330                        "do not update the buildid cache"),
2331        OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2332                        &record.no_buildid_set,
2333                        "do not collect buildids in perf.data"),
2334        OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2335                     "monitor event in cgroup name only",
2336                     parse_cgroups),
2337        OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2338                  "ms to wait before starting measurement after program start"),
2339        OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2340        OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2341                   "user to profile"),
2342
2343        OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2344                     "branch any", "sample any taken branches",
2345                     parse_branch_stack),
2346
2347        OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2348                     "branch filter mask", "branch stack filter modes",
2349                     parse_branch_stack),
2350        OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2351                    "sample by weight (on special events only)"),
2352        OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2353                    "sample transaction flags (special events only)"),
2354        OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2355                    "use per-thread mmaps"),
2356        OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2357                    "sample selected machine registers on interrupt,"
2358                    " use '-I?' to list register names", parse_intr_regs),
2359        OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2360                    "sample selected machine registers on interrupt,"
2361                    " use '--user-regs=?' to list register names", parse_user_regs),
2362        OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2363                    "Record running/enabled time of read (:S) events"),
2364        OPT_CALLBACK('k', "clockid", &record.opts,
2365        "clockid", "clockid to use for events, see clock_gettime()",
2366        parse_clockid),
2367        OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2368                          "opts", "AUX area tracing Snapshot Mode", ""),
2369        OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2370                          "opts", "sample AUX area", ""),
2371        OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2372                        "per thread proc mmap processing timeout in ms"),
2373        OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2374                    "Record namespaces events"),
2375        OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2376                    "Record cgroup events"),
2377        OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2378                    "Record context switch events"),
2379        OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2380                         "Configure all used events to run in kernel space.",
2381                         PARSE_OPT_EXCLUSIVE),
2382        OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2383                         "Configure all used events to run in user space.",
2384                         PARSE_OPT_EXCLUSIVE),
2385        OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2386                    "collect kernel callchains"),
2387        OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2388                    "collect user callchains"),
2389        OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2390                   "clang binary to use for compiling BPF scriptlets"),
2391        OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2392                   "options passed to clang when compiling BPF scriptlets"),
2393        OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2394                   "file", "vmlinux pathname"),
2395        OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2396                    "Record build-id of all DSOs regardless of hits"),
2397        OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2398                    "append timestamp to output filename"),
2399        OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2400                    "Record timestamp boundary (time of first/last samples)"),
2401        OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2402                          &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2403                          "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2404                          "signal"),
2405        OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2406                   "Limit number of switch output generated files"),
2407        OPT_BOOLEAN(0, "dry-run", &dry_run,
2408                    "Parse options then exit"),
2409#ifdef HAVE_AIO_SUPPORT
2410        OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2411                     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2412                     record__aio_parse),
2413#endif
2414        OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2415                     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2416                     record__parse_affinity),
2417#ifdef HAVE_ZSTD_SUPPORT
2418        OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2419                            "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2420                            record__parse_comp_level),
2421#endif
2422        OPT_CALLBACK(0, "max-size", &record.output_max_size,
2423                     "size", "Limit the maximum size of the output file", parse_output_max_size),
2424        OPT_END()
2425};
2426
2427struct option *record_options = __record_options;
2428
2429int cmd_record(int argc, const char **argv)
2430{
2431        int err;
2432        struct record *rec = &record;
2433        char errbuf[BUFSIZ];
2434
2435        setlocale(LC_ALL, "");
2436
2437#ifndef HAVE_LIBBPF_SUPPORT
2438# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2439        set_nobuild('\0', "clang-path", true);
2440        set_nobuild('\0', "clang-opt", true);
2441# undef set_nobuild
2442#endif
2443
2444#ifndef HAVE_BPF_PROLOGUE
2445# if !defined (HAVE_DWARF_SUPPORT)
2446#  define REASON  "NO_DWARF=1"
2447# elif !defined (HAVE_LIBBPF_SUPPORT)
2448#  define REASON  "NO_LIBBPF=1"
2449# else
2450#  define REASON  "this architecture doesn't support BPF prologue"
2451# endif
2452# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2453        set_nobuild('\0', "vmlinux", true);
2454# undef set_nobuild
2455# undef REASON
2456#endif
2457
2458        rec->opts.affinity = PERF_AFFINITY_SYS;
2459
2460        rec->evlist = evlist__new();
2461        if (rec->evlist == NULL)
2462                return -ENOMEM;
2463
2464        err = perf_config(perf_record_config, rec);
2465        if (err)
2466                return err;
2467
2468        argc = parse_options(argc, argv, record_options, record_usage,
2469                            PARSE_OPT_STOP_AT_NON_OPTION);
2470        if (quiet)
2471                perf_quiet_option();
2472
2473        /* Make system wide (-a) the default target. */
2474        if (!argc && target__none(&rec->opts.target))
2475                rec->opts.target.system_wide = true;
2476
2477        if (nr_cgroups && !rec->opts.target.system_wide) {
2478                usage_with_options_msg(record_usage, record_options,
2479                        "cgroup monitoring only available in system-wide mode");
2480
2481        }
2482
2483        if (rec->opts.kcore)
2484                rec->data.is_dir = true;
2485
2486        if (rec->opts.comp_level != 0) {
2487                pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2488                rec->no_buildid = true;
2489        }
2490
2491        if (rec->opts.record_switch_events &&
2492            !perf_can_record_switch_events()) {
2493                ui__error("kernel does not support recording context switch events\n");
2494                parse_options_usage(record_usage, record_options, "switch-events", 0);
2495                return -EINVAL;
2496        }
2497
2498        if (switch_output_setup(rec)) {
2499                parse_options_usage(record_usage, record_options, "switch-output", 0);
2500                return -EINVAL;
2501        }
2502
2503        if (rec->switch_output.time) {
2504                signal(SIGALRM, alarm_sig_handler);
2505                alarm(rec->switch_output.time);
2506        }
2507
2508        if (rec->switch_output.num_files) {
2509                rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2510                                                      sizeof(char *));
2511                if (!rec->switch_output.filenames)
2512                        return -EINVAL;
2513        }
2514
2515        /*
2516         * Allow aliases to facilitate the lookup of symbols for address
2517         * filters. Refer to auxtrace_parse_filters().
2518         */
2519        symbol_conf.allow_aliases = true;
2520
2521        symbol__init(NULL);
2522
2523        if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2524                rec->affinity_mask.nbits = cpu__max_cpu();
2525                rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2526                if (!rec->affinity_mask.bits) {
2527                        pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2528                        return -ENOMEM;
2529                }
2530                pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2531        }
2532
2533        err = record__auxtrace_init(rec);
2534        if (err)
2535                goto out;
2536
2537        if (dry_run)
2538                goto out;
2539
2540        err = bpf__setup_stdout(rec->evlist);
2541        if (err) {
2542                bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2543                pr_err("ERROR: Setup BPF stdout failed: %s\n",
2544                         errbuf);
2545                goto out;
2546        }
2547
2548        err = -ENOMEM;
2549
2550        if (rec->no_buildid_cache || rec->no_buildid) {
2551                disable_buildid_cache();
2552        } else if (rec->switch_output.enabled) {
2553                /*
2554                 * In 'perf record --switch-output', disable buildid
2555                 * generation by default to reduce data file switching
2556                 * overhead. Still generate buildids if they are explicitly
2557                 * required using
2558                 *
2559                 *  perf record --switch-output --no-no-buildid \
2560                 *              --no-no-buildid-cache
2561                 *
2562                 * The following code is equivalent to:
2563                 *
2564                 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2565                 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2566                 *         disable_buildid_cache();
2567                 */
2568                bool disable = true;
2569
2570                if (rec->no_buildid_set && !rec->no_buildid)
2571                        disable = false;
2572                if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2573                        disable = false;
2574                if (disable) {
2575                        rec->no_buildid = true;
2576                        rec->no_buildid_cache = true;
2577                        disable_buildid_cache();
2578                }
2579        }
2580
2581        if (record.opts.overwrite)
2582                record.opts.tail_synthesize = true;
2583
2584        if (rec->evlist->core.nr_entries == 0 &&
2585            __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2586                pr_err("Not enough memory for event selector list\n");
2587                goto out;
2588        }
2589
2590        if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2591                rec->opts.no_inherit = true;
2592
2593        err = target__validate(&rec->opts.target);
2594        if (err) {
2595                target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2596                ui__warning("%s\n", errbuf);
2597        }
2598
2599        err = target__parse_uid(&rec->opts.target);
2600        if (err) {
2601                int saved_errno = errno;
2602
2603                target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2604                ui__error("%s", errbuf);
2605
2606                err = -saved_errno;
2607                goto out;
2608        }
2609
2610        /* Enable ignoring missing threads when -u/-p option is defined. */
2611        rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2612
2613        err = -ENOMEM;
2614        if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2615                usage_with_options(record_usage, record_options);
2616
2617        err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2618        if (err)
2619                goto out;
2620
2621        /*
2622         * We take all buildids when the file contains
2623         * AUX area tracing data because we do not decode the
2624         * trace, as that would take too long.
2625         */
2626        if (rec->opts.full_auxtrace)
2627                rec->buildid_all = true;
2628
2629        if (record_opts__config(&rec->opts)) {
2630                err = -EINVAL;
2631                goto out;
2632        }
2633
2634        if (rec->opts.nr_cblocks > nr_cblocks_max)
2635                rec->opts.nr_cblocks = nr_cblocks_max;
2636        pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2637
2638        pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2639        pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2640
2641        if (rec->opts.comp_level > comp_level_max)
2642                rec->opts.comp_level = comp_level_max;
2643        pr_debug("comp level: %d\n", rec->opts.comp_level);
2644
2645        err = __cmd_record(&record, argc, argv);
2646out:
2647        bitmap_free(rec->affinity_mask.bits);
2648        evlist__delete(rec->evlist);
2649        symbol__exit();
2650        auxtrace_record__free(rec->itr);
2651        return err;
2652}
2653
2654static void snapshot_sig_handler(int sig __maybe_unused)
2655{
2656        struct record *rec = &record;
2657
2658        if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2659                trigger_hit(&auxtrace_snapshot_trigger);
2660                auxtrace_record__snapshot_started = 1;
2661                if (auxtrace_record__snapshot_start(record.itr))
2662                        trigger_error(&auxtrace_snapshot_trigger);
2663        }
2664
2665        if (switch_output_signal(rec))
2666                trigger_hit(&switch_output_trigger);
2667}
2668
2669static void alarm_sig_handler(int sig __maybe_unused)
2670{
2671        struct record *rec = &record;
2672
2673        if (switch_output_time(rec))
2674                trigger_hit(&switch_output_trigger);
2675}
2676