/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"

#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/llvm-utils.h"
#include "util/bpf-loader.h"
#include "asm/bug.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>
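
/*
 * State for one 'perf record' session: the output file, the event list
 * being recorded, the live perf_session, plus bookkeeping such as the
 * number of bytes written and the build-id handling flags.
 */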
struct record {
        struct perf_tool        tool;
        struct record_opts      opts;
        u64                     bytes_written;
        struct perf_data_file   file;
        struct auxtrace_record  *itr;
        struct perf_evlist      *evlist;
        struct perf_session     *session;
        const char              *progname;
        int                     realtime_prio;
        bool                    no_buildid;
        bool                    no_buildid_set;
        bool                    no_buildid_cache;
        bool                    no_buildid_cache_set;
        bool                    buildid_all;
        unsigned long long      samples;
};

static int record__write(struct record *rec, void *bf, size_t size)
{
        if (perf_data_file__write(rec->session->file, bf, size) < 0) {
                pr_err("failed to write perf data, error: %m\n");
                return -1;
        }

        rec->bytes_written += size;
        return 0;
}

static int process_synthesized_event(struct perf_tool *tool,
                                     union perf_event *event,
                                     struct perf_sample *sample __maybe_unused,
                                     struct machine *machine __maybe_unused)
{
        struct record *rec = container_of(tool, struct record, tool);
        return record__write(rec, event, event->header.size);
}
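
/*
 * Drain one mmap'ed ring buffer into the output file.  The buffer is a
 * power-of-two sized ring, so when the unread data wraps past the end of
 * the buffer it has to be written out in two chunks: first the tail up to
 * the end of the ring, then the remainder from the start of the ring.
 */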
static int record__mmap_read(struct record *rec, int idx)
{
        struct perf_mmap *md = &rec->evlist->mmap[idx];
        u64 head = perf_mmap__read_head(md);
        u64 old = md->prev;
        unsigned char *data = md->base + page_size;
        unsigned long size;
        void *buf;
        int rc = 0;

        if (old == head)
                return 0;

        rec->samples++;

        size = head - old;

        if ((old & md->mask) + size != (head & md->mask)) {
                buf = &data[old & md->mask];
                size = md->mask + 1 - (old & md->mask);
                old += size;

                if (record__write(rec, buf, size) < 0) {
                        rc = -1;
                        goto out;
                }
        }

        buf = &data[old & md->mask];
        size = head - old;
        old += size;

        if (record__write(rec, buf, size) < 0) {
                rc = -1;
                goto out;
        }

        md->prev = old;
        perf_evlist__mmap_consume(rec->evlist, idx);
out:
        return rc;
}
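
/*
 * Flags set from signal context and polled by the main record loop,
 * hence volatile.
 */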
static volatile int done;
static volatile int signr = -1;
static volatile int child_finished;
static volatile int auxtrace_snapshot_enabled;
static volatile int auxtrace_snapshot_err;
static volatile int auxtrace_record__snapshot_started;

static void sig_handler(int sig)
{
        if (sig == SIGCHLD)
                child_finished = 1;
        else
                signr = sig;

        done = 1;
}

static void record__sig_exit(void)
{
        if (signr == -1)
                return;

        signal(signr, SIG_DFL);
        raise(signr);
}
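
/*
 * AUX area tracing (e.g. Intel PT) needs extra mmap handling; when perf
 * is built without auxtrace support, the stubs below keep the main
 * record path unchanged.
 */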
#ifdef HAVE_AUXTRACE_SUPPORT

static int record__process_auxtrace(struct perf_tool *tool,
                                    union perf_event *event, void *data1,
                                    size_t len1, void *data2, size_t len2)
{
        struct record *rec = container_of(tool, struct record, tool);
        struct perf_data_file *file = &rec->file;
        size_t padding;
        u8 pad[8] = {0};

        if (!perf_data_file__is_pipe(file)) {
                off_t file_offset;
                int fd = perf_data_file__fd(file);
                int err;

                file_offset = lseek(fd, 0, SEEK_CUR);
                if (file_offset == -1)
                        return -1;
                err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
                                                     event, file_offset);
                if (err)
                        return err;
        }

        /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
        padding = (len1 + len2) & 7;
        if (padding)
                padding = 8 - padding;
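        /* e.g. len1 + len2 == 13 -> padding == 3, keeping the payload 8-byte aligned */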

        record__write(rec, event, event->header.size);
        record__write(rec, data1, len1);
        if (len2)
                record__write(rec, data2, len2);
        record__write(rec, &pad, padding);

        return 0;
}

static int record__auxtrace_mmap_read(struct record *rec,
                                      struct auxtrace_mmap *mm)
{
        int ret;

        ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
                                  record__process_auxtrace);
        if (ret < 0)
                return ret;

        if (ret)
                rec->samples++;

        return 0;
}

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
                                               struct auxtrace_mmap *mm)
{
        int ret;

        ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
                                           record__process_auxtrace,
                                           rec->opts.auxtrace_snapshot_size);
        if (ret < 0)
                return ret;

        if (ret)
                rec->samples++;

        return 0;
}

static int record__auxtrace_read_snapshot_all(struct record *rec)
{
        int i;
        int rc = 0;

        for (i = 0; i < rec->evlist->nr_mmaps; i++) {
                struct auxtrace_mmap *mm =
                                &rec->evlist->mmap[i].auxtrace_mmap;

                if (!mm->base)
                        continue;

                if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
                        rc = -1;
                        goto out;
                }
        }
out:
        return rc;
}

static void record__read_auxtrace_snapshot(struct record *rec)
{
        pr_debug("Recording AUX area tracing snapshot\n");
        if (record__auxtrace_read_snapshot_all(rec) < 0) {
                auxtrace_snapshot_err = -1;
        } else {
                auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
                if (!auxtrace_snapshot_err)
                        auxtrace_snapshot_enabled = 1;
        }
}

#else

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
                               struct auxtrace_mmap *mm __maybe_unused)
{
        return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
        return 0;
}

#endif
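
/*
 * Open all counters, falling back to softer configurations when the
 * kernel rejects an event, then apply tracepoint filters and mmap the
 * ring buffers (plus the AUX area buffers, when requested).
 */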
static int record__open(struct record *rec)
{
        char msg[512];
        struct perf_evsel *pos;
        struct perf_evlist *evlist = rec->evlist;
        struct perf_session *session = rec->session;
        struct record_opts *opts = &rec->opts;
        int rc = 0;

        perf_evlist__config(evlist, opts);

        evlist__for_each(evlist, pos) {
try_again:
                if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
                        if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
                                if (verbose)
                                        ui__warning("%s\n", msg);
                                goto try_again;
                        }

                        rc = -errno;
                        perf_evsel__open_strerror(pos, &opts->target,
                                                  errno, msg, sizeof(msg));
                        ui__error("%s\n", msg);
                        goto out;
                }
        }

        if (perf_evlist__apply_filters(evlist, &pos)) {
                error("failed to set filter \"%s\" on event %s with %d (%s)\n",
                        pos->filter, perf_evsel__name(pos), errno,
                        strerror_r(errno, msg, sizeof(msg)));
                rc = -1;
                goto out;
        }

        if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
                                 opts->auxtrace_mmap_pages,
                                 opts->auxtrace_snapshot_mode) < 0) {
                if (errno == EPERM) {
                        pr_err("Permission error mapping pages.\n"
                               "Consider increasing "
                               "/proc/sys/kernel/perf_event_mlock_kb,\n"
                               "or try again with a smaller value of -m/--mmap_pages.\n"
                               "(current value: %u,%u)\n",
                               opts->mmap_pages, opts->auxtrace_mmap_pages);
                        rc = -errno;
                } else {
                        pr_err("failed to mmap with %d (%s)\n", errno,
                                strerror_r(errno, msg, sizeof(msg)));
                        if (errno)
                                rc = -errno;
                        else
                                rc = -EINVAL;
                }
                goto out;
        }

        session->evlist = evlist;
        perf_session__set_id_hdr_size(session);
out:
        return rc;
}

static int process_sample_event(struct perf_tool *tool,
                                union perf_event *event,
                                struct perf_sample *sample,
                                struct perf_evsel *evsel,
                                struct machine *machine)
{
        struct record *rec = container_of(tool, struct record, tool);

        rec->samples++;

        return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}

static int process_buildids(struct record *rec)
{
        struct perf_data_file *file  = &rec->file;
        struct perf_session *session = rec->session;

        if (file->size == 0)
                return 0;

        /*
         * During this process, it'll load the kernel map and replace
         * dso->long_name with the real pathname it found.  In this case
         * we prefer the vmlinux path like
         *   /lib/modules/3.16.4/build/vmlinux
         *
         * rather than the build-id path (in the debug directory).
         *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
         */
        symbol_conf.ignore_vmlinux_buildid = true;

        /*
         * If --buildid-all is given, it marks all DSOs regardless of hits,
         * so there is no need to process samples.
         */
        if (rec->buildid_all)
                rec->tool.sample = NULL;

        return perf_session__process_events(session);
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
        int err;
        struct perf_tool *tool = data;
        /*
         * When processing the record & report subcommands for a guest
         * kernel, we arrange the module mmaps prior to the guest kernel
         * mmap and trigger a DSO preload, because by default guest module
         * symbols are loaded from guest kallsyms instead of
         * /lib/modules/XXX/XXX.  This avoids missing symbols when the
         * first address is in a module instead of in the guest kernel.
         */
        err = perf_event__synthesize_modules(tool, process_synthesized_event,
                                             machine);
        if (err < 0)
                pr_err("Couldn't record guest kernel [%d]'s reference"
                       " relocation symbol.\n", machine->pid);

        /*
         * We use _stext for the guest kernel because the guest kernel's
         * /proc/kallsyms may have no _text.
         */
        err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
                                                 machine);
        if (err < 0)
                pr_err("Couldn't record guest kernel [%d]'s reference"
                       " relocation symbol.\n", machine->pid);
}
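
/*
 * PERF_RECORD_FINISHED_ROUND is a synthetic event marking a flush point:
 * tools processing the file can sort and process everything accumulated
 * up to this point without waiting for the rest of the data.
 */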
static struct perf_event_header finished_round_event = {
        .size = sizeof(struct perf_event_header),
        .type = PERF_RECORD_FINISHED_ROUND,
};

static int record__mmap_read_all(struct record *rec)
{
        u64 bytes_written = rec->bytes_written;
        int i;
        int rc = 0;

        for (i = 0; i < rec->evlist->nr_mmaps; i++) {
                struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;

                if (rec->evlist->mmap[i].base) {
                        if (record__mmap_read(rec, i) != 0) {
                                rc = -1;
                                goto out;
                        }
                }

                if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
                    record__auxtrace_mmap_read(rec, mm) != 0) {
                        rc = -1;
                        goto out;
                }
        }

        /*
         * Mark the round finished if we wrote at least one event.
         */
        if (bytes_written != rec->bytes_written)
                rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));

out:
        return rc;
}

static void record__init_features(struct record *rec)
{
        struct perf_session *session = rec->session;
        int feat;

        for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
                perf_header__set_feat(&session->header, feat);

        if (rec->no_buildid)
                perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

        if (!have_tracepoints(&rec->evlist->entries))
                perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

        if (!rec->opts.branch_stack)
                perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

        if (!rec->opts.full_auxtrace)
                perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

        perf_header__clear_feat(&session->header, HEADER_STAT);
}

static void record__finish_output(struct record *rec)
{
        struct perf_data_file *file = &rec->file;
        int fd = perf_data_file__fd(file);

        if (file->is_pipe)
                return;

        rec->session->header.data_size += rec->bytes_written;
        file->size = lseek(fd, 0, SEEK_CUR);

        if (!rec->no_buildid) {
                process_buildids(rec);

                if (rec->buildid_all)
                        dsos__hit_all(rec->session);
        }
        perf_session__write_header(rec->session, rec->evlist, fd, true);
}

static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1 if the fork fails,
 * since we asked for that by setting its want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
                                        siginfo_t *info,
                                        void *ucontext __maybe_unused)
{
        workload_exec_errno = info->si_value.sival_int;
        done = 1;
        child_finished = 1;
}

static void snapshot_sig_handler(int sig);
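
/*
 * Synthesize events describing pre-existing state that the kernel will
 * not replay for us: event attributes and tracing data for pipe output,
 * AUX trace info, kernel and module mmaps, guest machines, and the
 * already-running threads being monitored.
 */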
static int record__synthesize(struct record *rec)
{
        struct perf_session *session = rec->session;
        struct machine *machine = &session->machines.host;
        struct perf_data_file *file = &rec->file;
        struct record_opts *opts = &rec->opts;
        struct perf_tool *tool = &rec->tool;
        int fd = perf_data_file__fd(file);
        int err = 0;

        if (file->is_pipe) {
                err = perf_event__synthesize_attrs(tool, session,
                                                   process_synthesized_event);
                if (err < 0) {
                        pr_err("Couldn't synthesize attrs.\n");
                        goto out;
                }

                if (have_tracepoints(&rec->evlist->entries)) {
                        /*
                         * FIXME err <= 0 here actually means that
                         * there were no tracepoints so it's not really
                         * an error, just that we don't need to
                         * synthesize anything.  We really have to
                         * return this more properly and also
                         * propagate errors that now are calling die()
                         */
                        err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
                                                                  process_synthesized_event);
                        if (err <= 0) {
                                pr_err("Couldn't record tracing data.\n");
                                goto out;
                        }
                        rec->bytes_written += err;
                }
        }

        if (rec->opts.full_auxtrace) {
                err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
                                        session, process_synthesized_event);
                if (err)
                        goto out;
        }

        err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
                                                 machine);
        WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
                           "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
                           "Check /proc/kallsyms permission or run as root.\n");

        err = perf_event__synthesize_modules(tool, process_synthesized_event,
                                             machine);
        WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
                           "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
                           "Check /proc/modules permission or run as root.\n");

        if (perf_guest) {
                machines__process_guests(&session->machines,
                                         perf_event__synthesize_guest_os, tool);
        }

        err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
                                            process_synthesized_event, opts->sample_address,
                                            opts->proc_map_timeout);
out:
        return err;
}

static int __cmd_record(struct record *rec, int argc, const char **argv)
{
        int err;
        int status = 0;
        unsigned long waking = 0;
        const bool forks = argc > 0;
        struct machine *machine;
        struct perf_tool *tool = &rec->tool;
        struct record_opts *opts = &rec->opts;
        struct perf_data_file *file = &rec->file;
        struct perf_session *session;
        bool disabled = false, draining = false;
        int fd;

        rec->progname = argv[0];

        atexit(record__sig_exit);
        signal(SIGCHLD, sig_handler);
        signal(SIGINT, sig_handler);
        signal(SIGTERM, sig_handler);
        if (rec->opts.auxtrace_snapshot_mode)
                signal(SIGUSR2, snapshot_sig_handler);
        else
                signal(SIGUSR2, SIG_IGN);

        session = perf_session__new(file, false, tool);
        if (session == NULL) {
                pr_err("Perf session creation failed.\n");
                return -1;
        }

        fd = perf_data_file__fd(file);
        rec->session = session;

        record__init_features(rec);

        if (forks) {
                err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
                                                    argv, file->is_pipe,
                                                    workload_exec_failed_signal);
                if (err < 0) {
                        pr_err("Couldn't run the workload!\n");
                        status = err;
                        goto out_delete_session;
                }
        }

        if (record__open(rec) != 0) {
                err = -1;
                goto out_child;
        }

        err = bpf__apply_obj_config();
        if (err) {
                char errbuf[BUFSIZ];

                bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
                pr_err("ERROR: Apply config to BPF failed: %s\n",
                       errbuf);
                goto out_child;
        }

        /*
         * Normally perf_session__new would do this, but it doesn't have the
         * evlist.
         */
        if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
                pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
                rec->tool.ordered_events = false;
        }

        if (!rec->evlist->nr_groups)
                perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

        if (file->is_pipe) {
                err = perf_header__write_pipe(fd);
                if (err < 0)
                        goto out_child;
        } else {
                err = perf_session__write_header(session, rec->evlist, fd, false);
                if (err < 0)
                        goto out_child;
        }

        if (!rec->no_buildid
            && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
                pr_err("Couldn't generate buildids. "
                       "Use --no-buildid to profile anyway.\n");
                err = -1;
                goto out_child;
        }

        machine = &session->machines.host;

        err = record__synthesize(rec);
        if (err < 0)
                goto out_child;

        if (rec->realtime_prio) {
                struct sched_param param;

                param.sched_priority = rec->realtime_prio;
                if (sched_setscheduler(0, SCHED_FIFO, &param)) {
                        pr_err("Could not set realtime priority.\n");
                        err = -1;
                        goto out_child;
                }
        }

        /*
         * When perf is starting the traced process, all the events
         * (apart from group members) have enable_on_exec=1 set,
         * so don't spoil it by prematurely enabling them.
         */
        if (!target__none(&opts->target) && !opts->initial_delay)
                perf_evlist__enable(rec->evlist);

        /*
         * Let the child rip
         */
        if (forks) {
                union perf_event *event;

                event = malloc(sizeof(event->comm) + machine->id_hdr_size);
                if (event == NULL) {
                        err = -ENOMEM;
                        goto out_child;
                }

                /*
                 * Some H/W events are generated before the COMM event,
                 * which is emitted during exec(), so perf script cannot
                 * see a correct process name for those events.
                 * Synthesize a COMM event to prevent that.
                 */
                perf_event__synthesize_comm(tool, event,
                                            rec->evlist->workload.pid,
                                            process_synthesized_event,
                                            machine);
                free(event);

                perf_evlist__start_workload(rec->evlist);
        }

        if (opts->initial_delay) {
                usleep(opts->initial_delay * 1000);
                perf_evlist__enable(rec->evlist);
        }
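
        /*
         * Main capture loop: drain the mmap ring buffers, then sleep in
         * poll() until the kernel signals new data or the workload/user
         * ends the session.
         */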
        auxtrace_snapshot_enabled = 1;
        for (;;) {
                unsigned long long hits = rec->samples;

                if (record__mmap_read_all(rec) < 0) {
                        auxtrace_snapshot_enabled = 0;
                        err = -1;
                        goto out_child;
                }

                if (auxtrace_record__snapshot_started) {
                        auxtrace_record__snapshot_started = 0;
                        if (!auxtrace_snapshot_err)
                                record__read_auxtrace_snapshot(rec);
                        if (auxtrace_snapshot_err) {
                                pr_err("AUX area tracing snapshot failed\n");
                                err = -1;
                                goto out_child;
                        }
                }

                if (hits == rec->samples) {
                        if (done || draining)
                                break;
                        err = perf_evlist__poll(rec->evlist, -1);
                        /*
                         * Propagate error, only if there's any. Ignore positive
                         * number of returned events and interrupt error.
                         */
                        if (err > 0 || (err < 0 && errno == EINTR))
                                err = 0;
                        waking++;

                        if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
                                draining = true;
                }

                /*
                 * When perf is starting the traced process, the events die
                 * with the process at the end and we wait for that. Thus
                 * there is no need to disable the events in this case.
                 */
                if (done && !disabled && !target__none(&opts->target)) {
                        auxtrace_snapshot_enabled = 0;
                        perf_evlist__disable(rec->evlist);
                        disabled = true;
                }
        }
        auxtrace_snapshot_enabled = 0;

        if (forks && workload_exec_errno) {
                char msg[STRERR_BUFSIZE];
                const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
                pr_err("Workload failed: %s\n", emsg);
                err = -1;
                goto out_child;
        }

        if (!quiet)
                fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

out_child:
        if (forks) {
                int exit_status;

                if (!child_finished)
                        kill(rec->evlist->workload.pid, SIGTERM);

                wait(&exit_status);

                if (err < 0)
                        status = err;
                else if (WIFEXITED(exit_status))
                        status = WEXITSTATUS(exit_status);
                else if (WIFSIGNALED(exit_status))
                        signr = WTERMSIG(exit_status);
        } else
                status = err;

        /* this will be recalculated during process_buildids() */
        rec->samples = 0;

        if (!err)
                record__finish_output(rec);

        if (!err && !quiet) {
                char samples[128];

                if (rec->samples && !rec->opts.full_auxtrace)
                        scnprintf(samples, sizeof(samples),
                                  " (%" PRIu64 " samples)", rec->samples);
                else
                        samples[0] = '\0';

                fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s ]\n",
                        perf_data_file__size(file) / 1024.0 / 1024.0,
                        file->path, samples);
        }

out_delete_session:
        perf_session__delete(session);
        return status;
}

static void callchain_debug(void)
{
        static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };

        pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);

        if (callchain_param.record_mode == CALLCHAIN_DWARF)
                pr_debug("callchain: stack dump size %d\n",
                         callchain_param.dump_size);
}

int record_parse_callchain_opt(const struct option *opt,
                               const char *arg,
                               int unset)
{
        int ret;
        struct record_opts *record = (struct record_opts *)opt->value;

        record->callgraph_set = true;
        callchain_param.enabled = !unset;

        /* --no-call-graph */
        if (unset) {
                callchain_param.record_mode = CALLCHAIN_NONE;
                pr_debug("callchain: disabled\n");
                return 0;
        }

        ret = parse_callchain_record_opt(arg, &callchain_param);
        if (!ret) {
                /* Enable data address sampling for DWARF unwind. */
                if (callchain_param.record_mode == CALLCHAIN_DWARF)
                        record->sample_address = true;
                callchain_debug();
        }

        return ret;
}

int record_callchain_opt(const struct option *opt,
                         const char *arg __maybe_unused,
                         int unset __maybe_unused)
{
        struct record_opts *record = (struct record_opts *)opt->value;

        record->callgraph_set = true;
        callchain_param.enabled = true;

        if (callchain_param.record_mode == CALLCHAIN_NONE)
                callchain_param.record_mode = CALLCHAIN_FP;

        callchain_debug();
        return 0;
}
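
/*
 * Handle 'perf config' keys, e.g. in ~/.perfconfig:
 *
 *   [record]
 *       build-id = cache | no-cache | skip
 *       call-graph = fp
 *
 * record.call-graph is rewritten to call-graph.record-mode and handled
 * by the default config code.
 */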
static int perf_record_config(const char *var, const char *value, void *cb)
{
        struct record *rec = cb;

        if (!strcmp(var, "record.build-id")) {
                if (!strcmp(value, "cache"))
                        rec->no_buildid_cache = false;
                else if (!strcmp(value, "no-cache"))
                        rec->no_buildid_cache = true;
                else if (!strcmp(value, "skip"))
                        rec->no_buildid = true;
                else
                        return -1;
                return 0;
        }
        if (!strcmp(var, "record.call-graph"))
                var = "call-graph.record-mode"; /* fall-through */

        return perf_default_config(var, value, cb);
}

struct clockid_map {
        const char *name;
        int clockid;
};

#define CLOCKID_MAP(n, c)       \
        { .name = n, .clockid = (c), }

#define CLOCKID_END     { .name = NULL, }

/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

static const struct clockid_map clockids[] = {
        /* available for all events, NMI safe */
        CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
        CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

        /* available for some events */
        CLOCKID_MAP("realtime", CLOCK_REALTIME),
        CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
        CLOCKID_MAP("tai", CLOCK_TAI),

        /* available for the lazy */
        CLOCKID_MAP("mono", CLOCK_MONOTONIC),
        CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
        CLOCKID_MAP("real", CLOCK_REALTIME),
        CLOCKID_MAP("boot", CLOCK_BOOTTIME),

        CLOCKID_END,
};
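
/*
 * -k/--clockid accepts a raw clockid number or a name from the table
 * above, with an optional CLOCK_ prefix, e.g. "-k mono" or
 * "-k CLOCK_MONOTONIC_RAW".
 */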
static int parse_clockid(const struct option *opt, const char *str, int unset)
{
        struct record_opts *opts = (struct record_opts *)opt->value;
        const struct clockid_map *cm;
        const char *ostr = str;

        if (unset) {
                opts->use_clockid = 0;
                return 0;
        }

        /* no arg passed */
        if (!str)
                return 0;

        /* don't allow setting it twice */
        if (opts->use_clockid)
                return -1;

        opts->use_clockid = true;

        /* if it's a number, we're done */
        if (sscanf(str, "%d", &opts->clockid) == 1)
                return 0;

        /* allow a "CLOCK_" prefix to the name */
        if (!strncasecmp(str, "CLOCK_", 6))
                str += 6;

        for (cm = clockids; cm->name; cm++) {
                if (!strcasecmp(str, cm->name)) {
                        opts->clockid = cm->clockid;
                        return 0;
                }
        }

        opts->use_clockid = false;
        ui__warning("unknown clockid %s, check man page\n", ostr);
        return -1;
}
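
/*
 * -m/--mmap-pages takes "pages[,pages]", e.g. "-m 512,1024" would size
 * the data ring at 512 pages and the AUX area ring at 1024 pages.
 */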
static int record__parse_mmap_pages(const struct option *opt,
                                    const char *str,
                                    int unset __maybe_unused)
{
        struct record_opts *opts = opt->value;
        char *s, *p;
        unsigned int mmap_pages;
        int ret;

        if (!str)
                return -EINVAL;

        s = strdup(str);
        if (!s)
                return -ENOMEM;

        p = strchr(s, ',');
        if (p)
                *p = '\0';

        if (*s) {
                ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
                if (ret)
                        goto out_free;
                opts->mmap_pages = mmap_pages;
        }

        if (!p) {
                ret = 0;
                goto out_free;
        }

        ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
        if (ret)
                goto out_free;

        opts->auxtrace_mmap_pages = mmap_pages;

out_free:
        free(s);
        return ret;
}

static const char * const __record_usage[] = {
        "perf record [<options>] [<command>]",
        "perf record [<options>] -- <command> [<options>]",
        NULL
};
const char * const *record_usage = __record_usage;

/*
 * XXX Ideally this would be local to cmd_record() and passed to a record__new,
 * because we need to have access to it in record__exit, which is called
 * after cmd_record() exits. But since record_options needs to be accessible
 * to builtin-script, leave it here.
 *
 * At least we don't clobber it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
        .opts = {
                .sample_time         = true,
                .mmap_pages          = UINT_MAX,
                .user_freq           = UINT_MAX,
                .user_interval       = ULLONG_MAX,
                .freq                = 4000,
                .target              = {
                        .uses_mmap   = true,
                        .default_per_cpu = true,
                },
                .proc_map_timeout     = 500,
        },
        .tool = {
                .sample         = process_sample_event,
                .fork           = perf_event__process_fork,
                .exit           = perf_event__process_exit,
                .comm           = perf_event__process_comm,
                .mmap           = perf_event__process_mmap,
                .mmap2          = perf_event__process_mmap2,
                .ordered_events = true,
        },
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
        "\n\t\t\t\tDefault: fp";

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to using the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
 * using pipes, etc.
 */
struct option __record_options[] = {
        OPT_CALLBACK('e', "event", &record.evlist, "event",
                     "event selector. use 'perf list' to list available events",
                     parse_events_option),
        OPT_CALLBACK(0, "filter", &record.evlist, "filter",
                     "event filter", parse_filter),
        OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
                           NULL, "don't record events from perf itself",
                           exclude_perf),
        OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
                    "record events on existing process id"),
        OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
                    "record events on existing thread id"),
        OPT_INTEGER('r', "realtime", &record.realtime_prio,
                    "collect data with this RT SCHED_FIFO priority"),
        OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
                    "collect data without buffering"),
        OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
                    "collect raw sample records from all opened counters"),
        OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
                    "system-wide collection from all CPUs"),
        OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
                    "list of cpus to monitor"),
        OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
        OPT_STRING('o', "output", &record.file.path, "file",
                    "output file name"),
        OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
                        &record.opts.no_inherit_set,
                        "child tasks do not inherit counters"),
        OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
        OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
                     "number of mmap data pages and AUX area tracing mmap pages",
                     record__parse_mmap_pages),
        OPT_BOOLEAN(0, "group", &record.opts.group,
                    "put the counters into a counter group"),
        OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
                           NULL, "enables call-graph recording",
                           &record_callchain_opt),
        OPT_CALLBACK(0, "call-graph", &record.opts,
                     "record_mode[,record_size]", record_callchain_help,
                     &record_parse_callchain_opt),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show counter open errors, etc)"),
        OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
        OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
                    "per thread counts"),
        OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
        OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
                        &record.opts.sample_time_set,
                        "Record the sample timestamps"),
        OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
        OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
                    "don't sample"),
        OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
                        &record.no_buildid_cache_set,
                        "do not update the buildid cache"),
        OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
                        &record.no_buildid_set,
                        "do not collect buildids in perf.data"),
        OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
                     "monitor event in cgroup name only",
                     parse_cgroups),
        OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
                  "ms to wait before starting measurement after program start"),
        OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
                   "user to profile"),

        OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
                     "branch any", "sample any taken branches",
                     parse_branch_stack),

        OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
                     "branch filter mask", "branch stack filter modes",
                     parse_branch_stack),
        OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
                    "sample by weight (on special events only)"),
        OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
                    "sample transaction flags (special events only)"),
        OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
                    "use per-thread mmaps"),
        OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
                    "sample selected machine registers on interrupt,"
                    " use -I ? to list register names", parse_regs),
        OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
                    "Record running/enabled time of read (:S) events"),
        OPT_CALLBACK('k', "clockid", &record.opts,
                     "clockid", "clockid to use for events, see clock_gettime()",
                     parse_clockid),
        OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
                          "opts", "AUX area tracing Snapshot Mode", ""),
        OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
                        "per thread proc mmap processing timeout in ms"),
        OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
                    "Record context switch events"),
        OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
                         "Configure all used events to run in kernel space.",
                         PARSE_OPT_EXCLUSIVE),
        OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
                         "Configure all used events to run in user space.",
                         PARSE_OPT_EXCLUSIVE),
        OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
                   "clang binary to use for compiling BPF scriptlets"),
        OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
                   "options passed to clang when compiling BPF scriptlets"),
        OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
                   "file", "vmlinux pathname"),
        OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
                    "Record build-id of all DSOs regardless of hits"),
        OPT_END()
};

struct option *record_options = __record_options;

int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
        int err;
        struct record *rec = &record;
        char errbuf[BUFSIZ];

#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
        set_nobuild('\0', "clang-path", true);
        set_nobuild('\0', "clang-opt", true);
# undef set_nobuild
#endif

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON  "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON  "NO_LIBBPF=1"
# else
#  define REASON  "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
        set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

        rec->evlist = perf_evlist__new();
        if (rec->evlist == NULL)
                return -ENOMEM;

        perf_config(perf_record_config, rec);

        argc = parse_options(argc, argv, record_options, record_usage,
                            PARSE_OPT_STOP_AT_NON_OPTION);
        if (!argc && target__none(&rec->opts.target))
                usage_with_options(record_usage, record_options);

        if (nr_cgroups && !rec->opts.target.system_wide) {
                usage_with_options_msg(record_usage, record_options,
                        "cgroup monitoring only available in system-wide mode");
        }
        if (rec->opts.record_switch_events &&
            !perf_can_record_switch_events()) {
                ui__error("kernel does not support recording context switch events\n");
                parse_options_usage(record_usage, record_options, "switch-events", 0);
                return -EINVAL;
        }

        if (!rec->itr) {
                rec->itr = auxtrace_record__init(rec->evlist, &err);
                if (err)
                        return err;
        }

        err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
                                              rec->opts.auxtrace_snapshot_opts);
        if (err)
                return err;

        err = -ENOMEM;

        symbol__init(NULL);

        if (symbol_conf.kptr_restrict)
                pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

        if (rec->no_buildid_cache || rec->no_buildid)
                disable_buildid_cache();

        if (rec->evlist->nr_entries == 0 &&
            perf_evlist__add_default(rec->evlist) < 0) {
                pr_err("Not enough memory for event selector list\n");
                goto out_symbol_exit;
        }

        if (rec->opts.target.tid && !rec->opts.no_inherit_set)
                rec->opts.no_inherit = true;

        err = target__validate(&rec->opts.target);
        if (err) {
                target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
                ui__warning("%s", errbuf);
        }

        err = target__parse_uid(&rec->opts.target);
        if (err) {
                int saved_errno = errno;

                target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
                ui__error("%s", errbuf);

                err = -saved_errno;
                goto out_symbol_exit;
        }

        err = -ENOMEM;
        if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
                usage_with_options(record_usage, record_options);

        err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
        if (err)
                goto out_symbol_exit;

        /*
         * We take all buildids when the file contains AUX area tracing
         * data, because we do not decode the trace (it would take too
         * long) to find out which DSOs were actually hit.
         */
        if (rec->opts.full_auxtrace)
                rec->buildid_all = true;

        if (record_opts__config(&rec->opts)) {
                err = -EINVAL;
                goto out_symbol_exit;
        }

        err = __cmd_record(&record, argc, argv);
out_symbol_exit:
        perf_evlist__delete(rec->evlist);
        symbol__exit();
        auxtrace_record__free(rec->itr);
        return err;
}
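
/*
 * SIGUSR2 handler for AUX area snapshot mode: kick off a snapshot and
 * flag the main loop to read it out; auxtrace_snapshot_enabled guards
 * against re-entry while the previous snapshot is still being written.
 */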
static void snapshot_sig_handler(int sig __maybe_unused)
{
        if (!auxtrace_snapshot_enabled)
                return;
        auxtrace_snapshot_enabled = 0;
        auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
        auxtrace_record__snapshot_started = 1;
}