linux/tools/perf/builtin-trace.c
<<
>>
Prefs
   1/*
   2 * builtin-trace.c
   3 *
   4 * Builtin 'trace' command:
   5 *
   6 * Display a continuously updated trace of any workload, CPU, specific PID,
   7 * system wide, etc.  Default format is loosely strace like, but any other
   8 * event may be specified using --event.
   9 *
  10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  11 *
  12 * Initially based on the 'trace' prototype by Thomas Gleixner:
  13 *
  14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
  15 *
  16 * Released under the GPL v2. (and only v2, not any later version)
  17 */
  18
  19#include <traceevent/event-parse.h>
  20#include <api/fs/tracing_path.h>
  21#include "builtin.h"
  22#include "util/cgroup.h"
  23#include "util/color.h"
  24#include "util/debug.h"
  25#include "util/env.h"
  26#include "util/event.h"
  27#include "util/evlist.h"
  28#include <subcmd/exec-cmd.h>
  29#include "util/machine.h"
  30#include "util/path.h"
  31#include "util/session.h"
  32#include "util/thread.h"
  33#include <subcmd/parse-options.h>
  34#include "util/strlist.h"
  35#include "util/intlist.h"
  36#include "util/thread_map.h"
  37#include "util/stat.h"
  38#include "trace/beauty/beauty.h"
  39#include "trace-event.h"
  40#include "util/parse-events.h"
  41#include "util/bpf-loader.h"
  42#include "callchain.h"
  43#include "print_binary.h"
  44#include "string2.h"
  45#include "syscalltbl.h"
  46#include "rb_resort.h"
  47
  48#include <errno.h>
  49#include <inttypes.h>
  50#include <poll.h>
  51#include <signal.h>
  52#include <stdlib.h>
  53#include <string.h>
  54#include <linux/err.h>
  55#include <linux/filter.h>
  56#include <linux/kernel.h>
  57#include <linux/random.h>
  58#include <linux/stringify.h>
  59#include <linux/time64.h>
  60#include <fcntl.h>
  61
  62#include "sane_ctype.h"
  63
  64#ifndef O_CLOEXEC
  65# define O_CLOEXEC              02000000
  66#endif
  67
  68#ifndef F_LINUX_SPECIFIC_BASE
  69# define F_LINUX_SPECIFIC_BASE  1024
  70#endif
  71
  72struct trace {
  73        struct perf_tool        tool;
  74        struct syscalltbl       *sctbl;
  75        struct {
  76                int             max;
  77                struct syscall  *table;
  78                struct {
  79                        struct perf_evsel *sys_enter,
  80                                          *sys_exit;
  81                }               events;
  82        } syscalls;
  83        struct record_opts      opts;
  84        struct perf_evlist      *evlist;
  85        struct machine          *host;
  86        struct thread           *current;
  87        struct cgroup           *cgroup;
  88        u64                     base_time;
  89        FILE                    *output;
  90        unsigned long           nr_events;
  91        struct strlist          *ev_qualifier;
  92        struct {
  93                size_t          nr;
  94                int             *entries;
  95        }                       ev_qualifier_ids;
  96        struct {
  97                size_t          nr;
  98                pid_t           *entries;
  99        }                       filter_pids;
 100        double                  duration_filter;
 101        double                  runtime_ms;
 102        struct {
 103                u64             vfs_getname,
 104                                proc_getname;
 105        } stats;
 106        unsigned int            max_stack;
 107        unsigned int            min_stack;
 108        bool                    not_ev_qualifier;
 109        bool                    live;
 110        bool                    full_time;
 111        bool                    sched;
 112        bool                    multiple_threads;
 113        bool                    summary;
 114        bool                    summary_only;
 115        bool                    failure_only;
 116        bool                    show_comm;
 117        bool                    print_sample;
 118        bool                    show_tool_stats;
 119        bool                    trace_syscalls;
 120        bool                    kernel_syscallchains;
 121        bool                    force;
 122        bool                    vfs_getname;
 123        int                     trace_pgfaults;
 124        int                     open_id;
 125};
 126
 127struct tp_field {
 128        int offset;
 129        union {
 130                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
 131                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
 132        };
 133};
 134
 135#define TP_UINT_FIELD(bits) \
 136static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
 137{ \
 138        u##bits value; \
 139        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 140        return value;  \
 141}
 142
 143TP_UINT_FIELD(8);
 144TP_UINT_FIELD(16);
 145TP_UINT_FIELD(32);
 146TP_UINT_FIELD(64);
 147
 148#define TP_UINT_FIELD__SWAPPED(bits) \
 149static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
 150{ \
 151        u##bits value; \
 152        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 153        return bswap_##bits(value);\
 154}
 155
 156TP_UINT_FIELD__SWAPPED(16);
 157TP_UINT_FIELD__SWAPPED(32);
 158TP_UINT_FIELD__SWAPPED(64);
 159
 160static int tp_field__init_uint(struct tp_field *field,
 161                               struct format_field *format_field,
 162                               bool needs_swap)
 163{
 164        field->offset = format_field->offset;
 165
 166        switch (format_field->size) {
 167        case 1:
 168                field->integer = tp_field__u8;
 169                break;
 170        case 2:
 171                field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
 172                break;
 173        case 4:
 174                field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
 175                break;
 176        case 8:
 177                field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
 178                break;
 179        default:
 180                return -1;
 181        }
 182
 183        return 0;
 184}
 185
 186static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
 187{
 188        return sample->raw_data + field->offset;
 189}
 190
 191static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
 192{
 193        field->offset = format_field->offset;
 194        field->pointer = tp_field__ptr;
 195        return 0;
 196}
 197
 198struct syscall_tp {
 199        struct tp_field id;
 200        union {
 201                struct tp_field args, ret;
 202        };
 203};
 204
 205static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
 206                                          struct tp_field *field,
 207                                          const char *name)
 208{
 209        struct format_field *format_field = perf_evsel__field(evsel, name);
 210
 211        if (format_field == NULL)
 212                return -1;
 213
 214        return tp_field__init_uint(field, format_field, evsel->needs_swap);
 215}
 216
 217#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
 218        ({ struct syscall_tp *sc = evsel->priv;\
 219           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
 220
 221static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
 222                                         struct tp_field *field,
 223                                         const char *name)
 224{
 225        struct format_field *format_field = perf_evsel__field(evsel, name);
 226
 227        if (format_field == NULL)
 228                return -1;
 229
 230        return tp_field__init_ptr(field, format_field);
 231}
 232
 233#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
 234        ({ struct syscall_tp *sc = evsel->priv;\
 235           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
 236
 237static void perf_evsel__delete_priv(struct perf_evsel *evsel)
 238{
 239        zfree(&evsel->priv);
 240        perf_evsel__delete(evsel);
 241}
 242
 243static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
 244{
 245        evsel->priv = malloc(sizeof(struct syscall_tp));
 246        if (evsel->priv != NULL) {
 247                if (perf_evsel__init_sc_tp_uint_field(evsel, id))
 248                        goto out_delete;
 249
 250                evsel->handler = handler;
 251                return 0;
 252        }
 253
 254        return -ENOMEM;
 255
 256out_delete:
 257        zfree(&evsel->priv);
 258        return -ENOENT;
 259}
 260
 261static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
 262{
 263        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
 264
 265        /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
 266        if (IS_ERR(evsel))
 267                evsel = perf_evsel__newtp("syscalls", direction);
 268
 269        if (IS_ERR(evsel))
 270                return NULL;
 271
 272        if (perf_evsel__init_syscall_tp(evsel, handler))
 273                goto out_delete;
 274
 275        return evsel;
 276
 277out_delete:
 278        perf_evsel__delete_priv(evsel);
 279        return NULL;
 280}
 281
 282#define perf_evsel__sc_tp_uint(evsel, name, sample) \
 283        ({ struct syscall_tp *fields = evsel->priv; \
 284           fields->name.integer(&fields->name, sample); })
 285
 286#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
 287        ({ struct syscall_tp *fields = evsel->priv; \
 288           fields->name.pointer(&fields->name, sample); })
 289
 290size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
 291{
 292        int idx = val - sa->offset;
 293
 294        if (idx < 0 || idx >= sa->nr_entries)
 295                return scnprintf(bf, size, intfmt, val);
 296
 297        return scnprintf(bf, size, "%s", sa->entries[idx]);
 298}
 299
 300static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 301                                                const char *intfmt,
 302                                                struct syscall_arg *arg)
 303{
 304        return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
 305}
 306
 307static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 308                                              struct syscall_arg *arg)
 309{
 310        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
 311}
 312
 313#define SCA_STRARRAY syscall_arg__scnprintf_strarray
 314
 315struct strarrays {
 316        int             nr_entries;
 317        struct strarray **entries;
 318};
 319
 320#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
 321        .nr_entries = ARRAY_SIZE(array), \
 322        .entries = array, \
 323}
 324
 325size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
 326                                        struct syscall_arg *arg)
 327{
 328        struct strarrays *sas = arg->parm;
 329        int i;
 330
 331        for (i = 0; i < sas->nr_entries; ++i) {
 332                struct strarray *sa = sas->entries[i];
 333                int idx = arg->val - sa->offset;
 334
 335                if (idx >= 0 && idx < sa->nr_entries) {
 336                        if (sa->entries[idx] == NULL)
 337                                break;
 338                        return scnprintf(bf, size, "%s", sa->entries[idx]);
 339                }
 340        }
 341
 342        return scnprintf(bf, size, "%d", arg->val);
 343}
 344
 345#ifndef AT_FDCWD
 346#define AT_FDCWD        -100
 347#endif
 348
 349static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 350                                           struct syscall_arg *arg)
 351{
 352        int fd = arg->val;
 353
 354        if (fd == AT_FDCWD)
 355                return scnprintf(bf, size, "CWD");
 356
 357        return syscall_arg__scnprintf_fd(bf, size, arg);
 358}
 359
 360#define SCA_FDAT syscall_arg__scnprintf_fd_at
 361
 362static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 363                                              struct syscall_arg *arg);
 364
 365#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 366
 367size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
 368{
 369        return scnprintf(bf, size, "%#lx", arg->val);
 370}
 371
 372size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
 373{
 374        return scnprintf(bf, size, "%d", arg->val);
 375}
 376
 377size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
 378{
 379        return scnprintf(bf, size, "%ld", arg->val);
 380}
 381
 382static const char *bpf_cmd[] = {
 383        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
 384        "MAP_GET_NEXT_KEY", "PROG_LOAD",
 385};
 386static DEFINE_STRARRAY(bpf_cmd);
 387
 388static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
 389static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
 390
 391static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
 392static DEFINE_STRARRAY(itimers);
 393
 394static const char *keyctl_options[] = {
 395        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
 396        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
 397        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
 398        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
 399        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
 400};
 401static DEFINE_STRARRAY(keyctl_options);
 402
 403static const char *whences[] = { "SET", "CUR", "END",
 404#ifdef SEEK_DATA
 405"DATA",
 406#endif
 407#ifdef SEEK_HOLE
 408"HOLE",
 409#endif
 410};
 411static DEFINE_STRARRAY(whences);
 412
 413static const char *fcntl_cmds[] = {
 414        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
 415        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
 416        "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
 417        "GETOWNER_UIDS",
 418};
 419static DEFINE_STRARRAY(fcntl_cmds);
 420
 421static const char *fcntl_linux_specific_cmds[] = {
 422        "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
 423        "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
 424        "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
 425};
 426
 427static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
 428
 429static struct strarray *fcntl_cmds_arrays[] = {
 430        &strarray__fcntl_cmds,
 431        &strarray__fcntl_linux_specific_cmds,
 432};
 433
 434static DEFINE_STRARRAYS(fcntl_cmds_arrays);
 435
 436static const char *rlimit_resources[] = {
 437        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
 438        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
 439        "RTTIME",
 440};
 441static DEFINE_STRARRAY(rlimit_resources);
 442
 443static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
 444static DEFINE_STRARRAY(sighow);
 445
 446static const char *clockid[] = {
 447        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
 448        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
 449        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
 450};
 451static DEFINE_STRARRAY(clockid);
 452
 453static const char *socket_families[] = {
 454        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
 455        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
 456        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
 457        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
 458        "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
 459        "ALG", "NFC", "VSOCK",
 460};
 461static DEFINE_STRARRAY(socket_families);
 462
 463static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
 464                                                 struct syscall_arg *arg)
 465{
 466        size_t printed = 0;
 467        int mode = arg->val;
 468
 469        if (mode == F_OK) /* 0 */
 470                return scnprintf(bf, size, "F");
 471#define P_MODE(n) \
 472        if (mode & n##_OK) { \
 473                printed += scnprintf(bf + printed, size - printed, "%s", #n); \
 474                mode &= ~n##_OK; \
 475        }
 476
 477        P_MODE(R);
 478        P_MODE(W);
 479        P_MODE(X);
 480#undef P_MODE
 481
 482        if (mode)
 483                printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
 484
 485        return printed;
 486}
 487
 488#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
 489
 490static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 491                                              struct syscall_arg *arg);
 492
 493#define SCA_FILENAME syscall_arg__scnprintf_filename
 494
 495static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 496                                                struct syscall_arg *arg)
 497{
 498        int printed = 0, flags = arg->val;
 499
 500#define P_FLAG(n) \
 501        if (flags & O_##n) { \
 502                printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 503                flags &= ~O_##n; \
 504        }
 505
 506        P_FLAG(CLOEXEC);
 507        P_FLAG(NONBLOCK);
 508#undef P_FLAG
 509
 510        if (flags)
 511                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 512
 513        return printed;
 514}
 515
 516#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 517
 518#ifndef GRND_NONBLOCK
 519#define GRND_NONBLOCK   0x0001
 520#endif
 521#ifndef GRND_RANDOM
 522#define GRND_RANDOM     0x0002
 523#endif
 524
 525static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 526                                                   struct syscall_arg *arg)
 527{
 528        int printed = 0, flags = arg->val;
 529
 530#define P_FLAG(n) \
 531        if (flags & GRND_##n) { \
 532                printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 533                flags &= ~GRND_##n; \
 534        }
 535
 536        P_FLAG(RANDOM);
 537        P_FLAG(NONBLOCK);
 538#undef P_FLAG
 539
 540        if (flags)
 541                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 542
 543        return printed;
 544}
 545
 546#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 547
 548#define STRARRAY(name, array) \
 549          { .scnprintf  = SCA_STRARRAY, \
 550            .parm       = &strarray__##array, }
 551
 552#include "trace/beauty/arch_errno_names.c"
 553#include "trace/beauty/eventfd.c"
 554#include "trace/beauty/futex_op.c"
 555#include "trace/beauty/futex_val3.c"
 556#include "trace/beauty/mmap.c"
 557#include "trace/beauty/mode_t.c"
 558#include "trace/beauty/msg_flags.c"
 559#include "trace/beauty/open_flags.c"
 560#include "trace/beauty/perf_event_open.c"
 561#include "trace/beauty/pid.c"
 562#include "trace/beauty/sched_policy.c"
 563#include "trace/beauty/seccomp.c"
 564#include "trace/beauty/signum.c"
 565#include "trace/beauty/socket_type.c"
 566#include "trace/beauty/waitid_options.c"
 567
 568struct syscall_arg_fmt {
 569        size_t     (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
 570        void       *parm;
 571        const char *name;
 572        bool       show_zero;
 573};
 574
 575static struct syscall_fmt {
 576        const char *name;
 577        const char *alias;
 578        struct syscall_arg_fmt arg[6];
 579        u8         nr_args;
 580        bool       errpid;
 581        bool       timeout;
 582        bool       hexret;
 583} syscall_fmts[] = {
 584        { .name     = "access",
 585          .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
 586        { .name     = "bpf",
 587          .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
 588        { .name     = "brk",        .hexret = true,
 589          .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
 590        { .name     = "clock_gettime",
 591          .arg = { [0] = STRARRAY(clk_id, clockid), }, },
 592        { .name     = "clone",      .errpid = true, .nr_args = 5,
 593          .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
 594                   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
 595                   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
 596                   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
 597                   [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
 598        { .name     = "close",
 599          .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
 600        { .name     = "epoll_ctl",
 601          .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
 602        { .name     = "eventfd2",
 603          .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
 604        { .name     = "fchmodat",
 605          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 606        { .name     = "fchownat",
 607          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 608        { .name     = "fcntl",
 609          .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
 610                           .parm      = &strarrays__fcntl_cmds_arrays,
 611                           .show_zero = true, },
 612                   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
 613        { .name     = "flock",
 614          .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
 615        { .name     = "fstat", .alias = "newfstat", },
 616        { .name     = "fstatat", .alias = "newfstatat", },
 617        { .name     = "futex",
 618          .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
 619                   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
 620        { .name     = "futimesat",
 621          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 622        { .name     = "getitimer",
 623          .arg = { [0] = STRARRAY(which, itimers), }, },
 624        { .name     = "getpid",     .errpid = true, },
 625        { .name     = "getpgid",    .errpid = true, },
 626        { .name     = "getppid",    .errpid = true, },
 627        { .name     = "getrandom",
 628          .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
 629        { .name     = "getrlimit",
 630          .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
 631        { .name     = "gettid",     .errpid = true, },
 632        { .name     = "ioctl",
 633          .arg = {
 634#if defined(__i386__) || defined(__x86_64__)
 635/*
 636 * FIXME: Make this available to all arches.
 637 */
 638                   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
 639                   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 640#else
 641                   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 642#endif
 643        { .name     = "kcmp",       .nr_args = 5,
 644          .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
 645                   [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
 646                   [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
 647                   [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
 648                   [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
 649        { .name     = "keyctl",
 650          .arg = { [0] = STRARRAY(option, keyctl_options), }, },
 651        { .name     = "kill",
 652          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 653        { .name     = "linkat",
 654          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 655        { .name     = "lseek",
 656          .arg = { [2] = STRARRAY(whence, whences), }, },
 657        { .name     = "lstat", .alias = "newlstat", },
 658        { .name     = "madvise",
 659          .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
 660                   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
 661        { .name     = "mkdirat",
 662          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 663        { .name     = "mknodat",
 664          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 665        { .name     = "mlock",
 666          .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 667        { .name     = "mlockall",
 668          .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 669        { .name     = "mmap",       .hexret = true,
 670/* The standard mmap maps to old_mmap on s390x */
 671#if defined(__s390x__)
 672        .alias = "old_mmap",
 673#endif
 674          .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
 675                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
 676                   [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
 677        { .name     = "mprotect",
 678          .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
 679                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
 680        { .name     = "mq_unlink",
 681          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
 682        { .name     = "mremap",     .hexret = true,
 683          .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
 684                   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
 685                   [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
 686        { .name     = "munlock",
 687          .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 688        { .name     = "munmap",
 689          .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 690        { .name     = "name_to_handle_at",
 691          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 692        { .name     = "newfstatat",
 693          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 694        { .name     = "open",
 695          .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 696        { .name     = "open_by_handle_at",
 697          .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
 698                   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 699        { .name     = "openat",
 700          .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
 701                   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 702        { .name     = "perf_event_open",
 703          .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
 704                   [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
 705                   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
 706        { .name     = "pipe2",
 707          .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
 708        { .name     = "pkey_alloc",
 709          .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
 710        { .name     = "pkey_free",
 711          .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
 712        { .name     = "pkey_mprotect",
 713          .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
 714                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
 715                   [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
 716        { .name     = "poll", .timeout = true, },
 717        { .name     = "ppoll", .timeout = true, },
 718        { .name     = "prctl", .alias = "arch_prctl",
 719          .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
 720                   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
 721                   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
 722        { .name     = "pread", .alias = "pread64", },
 723        { .name     = "preadv", .alias = "pread", },
 724        { .name     = "prlimit64",
 725          .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
 726        { .name     = "pwrite", .alias = "pwrite64", },
 727        { .name     = "readlinkat",
 728          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 729        { .name     = "recvfrom",
 730          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 731        { .name     = "recvmmsg",
 732          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 733        { .name     = "recvmsg",
 734          .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 735        { .name     = "renameat",
 736          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 737        { .name     = "rt_sigaction",
 738          .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 739        { .name     = "rt_sigprocmask",
 740          .arg = { [0] = STRARRAY(how, sighow), }, },
 741        { .name     = "rt_sigqueueinfo",
 742          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 743        { .name     = "rt_tgsigqueueinfo",
 744          .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 745        { .name     = "sched_setscheduler",
 746          .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
 747        { .name     = "seccomp",
 748          .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
 749                   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
 750        { .name     = "select", .timeout = true, },
 751        { .name     = "sendmmsg",
 752          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 753        { .name     = "sendmsg",
 754          .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 755        { .name     = "sendto",
 756          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 757        { .name     = "set_tid_address", .errpid = true, },
 758        { .name     = "setitimer",
 759          .arg = { [0] = STRARRAY(which, itimers), }, },
 760        { .name     = "setrlimit",
 761          .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
 762        { .name     = "socket",
 763          .arg = { [0] = STRARRAY(family, socket_families),
 764                   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
 765        { .name     = "socketpair",
 766          .arg = { [0] = STRARRAY(family, socket_families),
 767                   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
 768        { .name     = "stat", .alias = "newstat", },
 769        { .name     = "statx",
 770          .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
 771                   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
 772                   [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
 773        { .name     = "swapoff",
 774          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
 775        { .name     = "swapon",
 776          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
 777        { .name     = "symlinkat",
 778          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 779        { .name     = "tgkill",
 780          .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 781        { .name     = "tkill",
 782          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 783        { .name     = "uname", .alias = "newuname", },
 784        { .name     = "unlinkat",
 785          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 786        { .name     = "utimensat",
 787          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
 788        { .name     = "wait4",      .errpid = true,
 789          .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 790        { .name     = "waitid",     .errpid = true,
 791          .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 792};
 793
 794static int syscall_fmt__cmp(const void *name, const void *fmtp)
 795{
 796        const struct syscall_fmt *fmt = fmtp;
 797        return strcmp(name, fmt->name);
 798}
 799
 800static struct syscall_fmt *syscall_fmt__find(const char *name)
 801{
 802        const int nmemb = ARRAY_SIZE(syscall_fmts);
 803        return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
 804}
 805
/*
 * Per syscall id information, filled in lazily by trace__read_syscall_info()
 * and kept in trace->syscalls.table[id].
 */
struct syscall {
        /* "syscalls:sys_enter_<name>" tracepoint format, may be an ERR_PTR */
        struct event_format *tp_format;
        /* Number of arguments, the syscall number field is not counted */
        int                 nr_args;
        /* First argument field in tp_format, NULL if there are none */
        struct format_field *args;
        /* Name as returned by syscalltbl__name() */
        const char          *name;
        /* True for "exit" and "exit_group" */
        bool                is_exit;
        /* Entry in syscall_fmts for this syscall, NULL if there is none */
        struct syscall_fmt  *fmt;
        /* Per argument formatters, allocated by syscall__alloc_arg_fmts() */
        struct syscall_arg_fmt *arg_fmt;
};
 815
 816/*
 817 * We need to have this 'calculated' boolean because in some cases we really
 818 * don't know what is the duration of a syscall, for instance, when we start
 819 * a session and some threads are waiting for a syscall to finish, say 'poll',
 820 * in which case all we can do is to print "( ? ) for duration and for the
 821 * start timestamp.
 822 */
 823static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
 824{
 825        double duration = (double)t / NSEC_PER_MSEC;
 826        size_t printed = fprintf(fp, "(");
 827
 828        if (!calculated)
 829                printed += fprintf(fp, "         ");
 830        else if (duration >= 1.0)
 831                printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
 832        else if (duration >= 0.01)
 833                printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
 834        else
 835                printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
 836        return printed + fprintf(fp, "): ");
 837}
 838
/**
 * Per thread tracing state, stored as the thread's priv area.
 *
 * entry_time: sample time of the last sys_enter seen for this thread.
 * entry_pending: the formatted entry in entry_str was not printed yet.
 * nr_events: bumped each time thread__trace() is called for the thread.
 * entry_str: buffer of trace__entry_str_size bytes with the "name(args" text
 *            of the last syscall entry.
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 * paths.table: fd -> pathname cache, indexed by fd, NULL for unknown fds.
 * paths.max: highest fd that can index paths.table, -1 while it is empty.
 * syscall_stats: per syscall id 'struct stats', kept in each node's priv.
 */
struct thread_trace {
        u64               entry_time;
        bool              entry_pending;
        unsigned long     nr_events;
        unsigned long     pfmaj, pfmin;
        char              *entry_str;
        double            runtime_ms;
        size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
                unsigned long ptr;
                short int     entry_str_pos;
                bool          pending_open;
                unsigned int  namelen;
                char          *name;
        } filename;
        struct {
                int       max;
                char      **table;
        } paths;

        struct intlist *syscall_stats;
};
 868
 869static struct thread_trace *thread_trace__new(void)
 870{
 871        struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
 872
 873        if (ttrace)
 874                ttrace->paths.max = -1;
 875
 876        ttrace->syscall_stats = intlist__new(NULL);
 877
 878        return ttrace;
 879}
 880
 881static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
 882{
 883        struct thread_trace *ttrace;
 884
 885        if (thread == NULL)
 886                goto fail;
 887
 888        if (thread__priv(thread) == NULL)
 889                thread__set_priv(thread, thread_trace__new());
 890
 891        if (thread__priv(thread) == NULL)
 892                goto fail;
 893
 894        ttrace = thread__priv(thread);
 895        ++ttrace->nr_events;
 896
 897        return ttrace;
 898fail:
 899        color_fprintf(fp, PERF_COLOR_RED,
 900                      "WARNING: not enough memory, dropping samples!\n");
 901        return NULL;
 902}
 903
 904
 905void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
 906                                    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
 907{
 908        struct thread_trace *ttrace = thread__priv(arg->thread);
 909
 910        ttrace->ret_scnprintf = ret_scnprintf;
 911}
 912
/*
 * Bit flags.
 * NOTE(review): presumably they select major/minor page fault tracing, the
 * users are not in this part of the file - confirm.
 */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/*
 * Size, in bytes, of the per thread entry_str buffer where trace__sys_enter()
 * formats the "name(args" text of a syscall entry.
 */
static const size_t trace__entry_str_size = 2048;
 917
 918static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
 919{
 920        struct thread_trace *ttrace = thread__priv(thread);
 921
 922        if (fd > ttrace->paths.max) {
 923                char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
 924
 925                if (npath == NULL)
 926                        return -1;
 927
 928                if (ttrace->paths.max != -1) {
 929                        memset(npath + ttrace->paths.max + 1, 0,
 930                               (fd - ttrace->paths.max) * sizeof(char *));
 931                } else {
 932                        memset(npath, 0, (fd + 1) * sizeof(char *));
 933                }
 934
 935                ttrace->paths.table = npath;
 936                ttrace->paths.max   = fd;
 937        }
 938
 939        ttrace->paths.table[fd] = strdup(pathname);
 940
 941        return ttrace->paths.table[fd] != NULL ? 0 : -1;
 942}
 943
 944static int thread__read_fd_path(struct thread *thread, int fd)
 945{
 946        char linkname[PATH_MAX], pathname[PATH_MAX];
 947        struct stat st;
 948        int ret;
 949
 950        if (thread->pid_ == thread->tid) {
 951                scnprintf(linkname, sizeof(linkname),
 952                          "/proc/%d/fd/%d", thread->pid_, fd);
 953        } else {
 954                scnprintf(linkname, sizeof(linkname),
 955                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
 956        }
 957
 958        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
 959                return -1;
 960
 961        ret = readlink(linkname, pathname, sizeof(pathname));
 962
 963        if (ret < 0 || ret > st.st_size)
 964                return -1;
 965
 966        pathname[ret] = '\0';
 967        return trace__set_fd_pathname(thread, fd, pathname);
 968}
 969
 970static const char *thread__fd_path(struct thread *thread, int fd,
 971                                   struct trace *trace)
 972{
 973        struct thread_trace *ttrace = thread__priv(thread);
 974
 975        if (ttrace == NULL)
 976                return NULL;
 977
 978        if (fd < 0)
 979                return NULL;
 980
 981        if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
 982                if (!trace->live)
 983                        return NULL;
 984                ++trace->stats.proc_getname;
 985                if (thread__read_fd_path(thread, fd))
 986                        return NULL;
 987        }
 988
 989        return ttrace->paths.table[fd];
 990}
 991
 992size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
 993{
 994        int fd = arg->val;
 995        size_t printed = scnprintf(bf, size, "%d", fd);
 996        const char *path = thread__fd_path(arg->thread, fd, arg->trace);
 997
 998        if (path)
 999                printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1000
1001        return printed;
1002}
1003
1004size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1005{
1006        size_t printed = scnprintf(bf, size, "%d", fd);
1007        struct thread *thread = machine__find_thread(trace->host, pid, pid);
1008
1009        if (thread) {
1010                const char *path = thread__fd_path(thread, fd, trace);
1011
1012                if (path)
1013                        printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1014
1015                thread__put(thread);
1016        }
1017
1018        return printed;
1019}
1020
1021static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1022                                              struct syscall_arg *arg)
1023{
1024        int fd = arg->val;
1025        size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1026        struct thread_trace *ttrace = thread__priv(arg->thread);
1027
1028        if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1029                zfree(&ttrace->paths.table[fd]);
1030
1031        return printed;
1032}
1033
1034static void thread__set_filename_pos(struct thread *thread, const char *bf,
1035                                     unsigned long ptr)
1036{
1037        struct thread_trace *ttrace = thread__priv(thread);
1038
1039        ttrace->filename.ptr = ptr;
1040        ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1041}
1042
1043static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1044                                              struct syscall_arg *arg)
1045{
1046        unsigned long ptr = arg->val;
1047
1048        if (!arg->trace->vfs_getname)
1049                return scnprintf(bf, size, "%#x", ptr);
1050
1051        thread__set_filename_pos(arg->thread, bf, ptr);
1052        return 0;
1053}
1054
1055static bool trace__filter_duration(struct trace *trace, double t)
1056{
1057        return t < (trace->duration_filter * NSEC_PER_MSEC);
1058}
1059
1060static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061{
1062        double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1063
1064        return fprintf(fp, "%10.3f ", ts);
1065}
1066
1067/*
1068 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1069 * using ttrace->entry_time for a thread that receives a sys_exit without
1070 * first having received a sys_enter ("poll" issued before tracing session
1071 * starts, lost sys_enter exit due to ring buffer overflow).
1072 */
1073static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1074{
1075        if (tstamp > 0)
1076                return __trace__fprintf_tstamp(trace, tstamp, fp);
1077
1078        return fprintf(fp, "         ? ");
1079}
1080
/*
 * Flags written by sig_handler(). They are 'volatile sig_atomic_t' because
 * that is the object type a signal handler can portably store to; a plain
 * bool gives no such guarantee and lets the compiler cache reads of it.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Ask for the tracing session to finish ('done') and record whether the
 * signal was SIGINT ('interrupted').
 */
static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}
1089
1090static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1091                                        u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1092{
1093        size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1094        printed += fprintf_duration(duration, duration_calculated, fp);
1095
1096        if (trace->multiple_threads) {
1097                if (trace->show_comm)
1098                        printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1099                printed += fprintf(fp, "%d ", thread->tid);
1100        }
1101
1102        return printed;
1103}
1104
1105static int trace__process_event(struct trace *trace, struct machine *machine,
1106                                union perf_event *event, struct perf_sample *sample)
1107{
1108        int ret = 0;
1109
1110        switch (event->header.type) {
1111        case PERF_RECORD_LOST:
1112                color_fprintf(trace->output, PERF_COLOR_RED,
1113                              "LOST %" PRIu64 " events!\n", event->lost.lost);
1114                ret = machine__process_lost_event(machine, event, sample);
1115                break;
1116        default:
1117                ret = machine__process_event(machine, event, sample);
1118                break;
1119        }
1120
1121        return ret;
1122}
1123
1124static int trace__tool_process(struct perf_tool *tool,
1125                               union perf_event *event,
1126                               struct perf_sample *sample,
1127                               struct machine *machine)
1128{
1129        struct trace *trace = container_of(tool, struct trace, tool);
1130        return trace__process_event(trace, machine, event, sample);
1131}
1132
1133static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1134{
1135        struct machine *machine = vmachine;
1136
1137        if (machine->kptr_restrict_warned)
1138                return NULL;
1139
1140        if (symbol_conf.kptr_restrict) {
1141                pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1142                           "Check /proc/sys/kernel/kptr_restrict.\n\n"
1143                           "Kernel samples will not be resolved.\n");
1144                machine->kptr_restrict_warned = true;
1145                return NULL;
1146        }
1147
1148        return machine__resolve_kernel_addr(vmachine, addrp, modp);
1149}
1150
1151static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1152{
1153        int err = symbol__init(NULL);
1154
1155        if (err)
1156                return err;
1157
1158        trace->host = machine__new_host();
1159        if (trace->host == NULL)
1160                return -ENOMEM;
1161
1162        err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1163        if (err < 0)
1164                goto out;
1165
1166        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1167                                            evlist->threads, trace__tool_process, false,
1168                                            trace->opts.proc_map_timeout, 1);
1169out:
1170        if (err)
1171                symbol__exit();
1172
1173        return err;
1174}
1175
1176static void trace__symbols__exit(struct trace *trace)
1177{
1178        machine__exit(trace->host);
1179        trace->host = NULL;
1180
1181        symbol__exit();
1182}
1183
1184static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1185{
1186        int idx;
1187
1188        if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1189                nr_args = sc->fmt->nr_args;
1190
1191        sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1192        if (sc->arg_fmt == NULL)
1193                return -1;
1194
1195        for (idx = 0; idx < nr_args; ++idx) {
1196                if (sc->fmt)
1197                        sc->arg_fmt[idx] = sc->fmt->arg[idx];
1198        }
1199
1200        sc->nr_args = nr_args;
1201        return 0;
1202}
1203
1204static int syscall__set_arg_fmts(struct syscall *sc)
1205{
1206        struct format_field *field;
1207        int idx = 0, len;
1208
1209        for (field = sc->args; field; field = field->next, ++idx) {
1210                if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1211                        continue;
1212
1213                if (strcmp(field->type, "const char *") == 0 &&
1214                         (strcmp(field->name, "filename") == 0 ||
1215                          strcmp(field->name, "path") == 0 ||
1216                          strcmp(field->name, "pathname") == 0))
1217                        sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1218                else if (field->flags & FIELD_IS_POINTER)
1219                        sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1220                else if (strcmp(field->type, "pid_t") == 0)
1221                        sc->arg_fmt[idx].scnprintf = SCA_PID;
1222                else if (strcmp(field->type, "umode_t") == 0)
1223                        sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1224                else if ((strcmp(field->type, "int") == 0 ||
1225                          strcmp(field->type, "unsigned int") == 0 ||
1226                          strcmp(field->type, "long") == 0) &&
1227                         (len = strlen(field->name)) >= 2 &&
1228                         strcmp(field->name + len - 2, "fd") == 0) {
1229                        /*
1230                         * /sys/kernel/tracing/events/syscalls/sys_enter*
1231                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1232                         * 65 int
1233                         * 23 unsigned int
1234                         * 7 unsigned long
1235                         */
1236                        sc->arg_fmt[idx].scnprintf = SCA_FD;
1237                }
1238        }
1239
1240        return 0;
1241}
1242
1243static int trace__read_syscall_info(struct trace *trace, int id)
1244{
1245        char tp_name[128];
1246        struct syscall *sc;
1247        const char *name = syscalltbl__name(trace->sctbl, id);
1248
1249        if (name == NULL)
1250                return -1;
1251
1252        if (id > trace->syscalls.max) {
1253                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1254
1255                if (nsyscalls == NULL)
1256                        return -1;
1257
1258                if (trace->syscalls.max != -1) {
1259                        memset(nsyscalls + trace->syscalls.max + 1, 0,
1260                               (id - trace->syscalls.max) * sizeof(*sc));
1261                } else {
1262                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1263                }
1264
1265                trace->syscalls.table = nsyscalls;
1266                trace->syscalls.max   = id;
1267        }
1268
1269        sc = trace->syscalls.table + id;
1270        sc->name = name;
1271
1272        sc->fmt  = syscall_fmt__find(sc->name);
1273
1274        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1275        sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1276
1277        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1278                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1279                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1280        }
1281
1282        if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1283                return -1;
1284
1285        if (IS_ERR(sc->tp_format))
1286                return -1;
1287
1288        sc->args = sc->tp_format->format.fields;
1289        /*
1290         * We need to check and discard the first variable '__syscall_nr'
1291         * or 'nr' that mean the syscall number. It is needless here.
1292         * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1293         */
1294        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1295                sc->args = sc->args->next;
1296                --sc->nr_args;
1297        }
1298
1299        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1300
1301        return syscall__set_arg_fmts(sc);
1302}
1303
/*
 * Translate the syscall names and globs in trace->ev_qualifier into syscall
 * ids, stored in trace->ev_qualifier_ids.entries[], with the count in .nr.
 *
 * Each list entry is first looked up as an exact name. If that fails it is
 * tried as a glob, in which case all the matching syscalls are added, with
 * the array being grown as needed. Entries that match nothing are reported
 * on trace->output as a comma separated list, followed by hints.
 *
 * Returns 0 on success or a negative errno. Except for a failure of the
 * initial allocation, on error the ids array is freed and .nr reset to 0.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
        int err = 0, i;
        size_t nr_allocated;
        struct str_node *pos;

        /* Start with one slot per list entry, globs may need more later */
        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
                                                 sizeof(trace->ev_qualifier_ids.entries[0]));

        if (trace->ev_qualifier_ids.entries == NULL) {
                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
                       trace->output);
                /* NOTE(review): reported as -EINVAL, the realloc below uses -ENOMEM */
                err = -EINVAL;
                goto out;
        }

        nr_allocated = trace->ev_qualifier_ids.nr;
        i = 0;

        strlist__for_each_entry(pos, trace->ev_qualifier) {
                const char *sc = pos->s;
                int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

                if (id < 0) {
                        /* Not an exact name, try it as a glob */
                        id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
                        if (id >= 0)
                                goto matches;

                        /* First invalid name starts the error line, others are appended */
                        if (err == 0) {
                                fputs("Error:\tInvalid syscall ", trace->output);
                                err = -EINVAL;
                        } else {
                                fputs(", ", trace->output);
                        }

                        fputs(sc, trace->output);
                }
matches:
                /*
                 * Also reached with a negative id for an invalid name: the
                 * slot is consumed anyway, the whole array is discarded at
                 * the end because err is set.
                 */
                trace->ev_qualifier_ids.entries[i++] = id;
                if (match_next == -1)
                        continue;

                /* A glob matched: collect the remaining matches */
                while (1) {
                        id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
                        if (id < 0)
                                break;
                        /* Array is full, grow it by 8 entries */
                        if (nr_allocated == trace->ev_qualifier_ids.nr) {
                                void *entries;

                                nr_allocated += 8;
                                entries = realloc(trace->ev_qualifier_ids.entries,
                                                  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
                                if (entries == NULL) {
                                        err = -ENOMEM;
                                        fputs("\nError:\t Not enough memory for parsing\n", trace->output);
                                        goto out_free;
                                }
                                trace->ev_qualifier_ids.entries = entries;
                        }
                        trace->ev_qualifier_ids.nr++;
                        trace->ev_qualifier_ids.entries[i++] = id;
                }
        }

        if (err < 0) {
                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
                      "\nHint:\tand: 'man syscalls'\n", trace->output);
                /* The realloc failure above jumps here, skipping the hints */
out_free:
                zfree(&trace->ev_qualifier_ids.entries);
                trace->ev_qualifier_ids.nr = 0;
        }
out:
        return err;
}
1379
1380/*
1381 * args is to be interpreted as a series of longs but we need to handle
1382 * 8-byte unaligned accesses. args points to raw_data within the event
1383 * and raw_data is guaranteed to be 8-byte unaligned because it is
1384 * preceded by raw_size which is a u32. So we need to copy args to a temp
1385 * variable to read it. Most notably this avoids extended load instructions
1386 * on unaligned addresses
1387 */
1388unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1389{
1390        unsigned long val;
1391        unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1392
1393        memcpy(&val, p, sizeof(val));
1394        return val;
1395}
1396
1397static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1398                                      struct syscall_arg *arg)
1399{
1400        if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1401                return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1402
1403        return scnprintf(bf, size, "arg%d: ", arg->idx);
1404}
1405
1406static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1407                                     struct syscall_arg *arg, unsigned long val)
1408{
1409        if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1410                arg->val = val;
1411                if (sc->arg_fmt[arg->idx].parm)
1412                        arg->parm = sc->arg_fmt[arg->idx].parm;
1413                return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1414        }
1415        return scnprintf(bf, size, "%ld", val);
1416}
1417
/*
 * Format into 'bf' the arguments of syscall 'sc', read from the raw payload
 * at 'args', as a ", " separated list of "name: value" pairs.
 *
 * With a tracepoint format the fields in sc->args provide the names; if the
 * format could not be read the first sc->nr_args raw values are printed.
 * Formatters can set bits in arg.mask to have the matching arguments skipped.
 *
 * Returns the number of characters written to 'bf'.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                                      unsigned char *args, struct trace *trace,
                                      struct thread *thread)
{
        size_t printed = 0;
        unsigned long val;
        /* Mask bit for the argument at arg.idx, kept in sync with it */
        u8 bit = 1;
        struct syscall_arg arg = {
                .args   = args,
                .idx    = 0,
                .mask   = 0,
                .trace  = trace,
                .thread = thread,
        };
        struct thread_trace *ttrace = thread__priv(thread);

        /*
         * Things like fcntl will set this in its 'cmd' formatter to pick the
         * right formatter for the return value (an fd? file flags?), which is
         * not needed for syscalls that always return a given type, say an fd.
         */
        ttrace->ret_scnprintf = NULL;

        if (sc->args != NULL) {
                struct format_field *field;

                for (field = sc->args; field;
                     field = field->next, ++arg.idx, bit <<= 1) {
                        if (arg.mask & bit)
                                continue;

                        val = syscall_arg__val(&arg, arg.idx);

                        /*
                         * Suppress this argument if its value is zero and
                         * we don't have a string associated in an
                         * strarray for it.
                         */
                        if (val == 0 &&
                            !(sc->arg_fmt &&
                              (sc->arg_fmt[arg.idx].show_zero ||
                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
                              sc->arg_fmt[arg.idx].parm))
                                continue;

                        printed += scnprintf(bf + printed, size - printed,
                                             "%s%s: ", printed ? ", " : "", field->name);
                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
                }
        } else if (IS_ERR(sc->tp_format)) {
                /*
                 * If we managed to read the tracepoint /format file, then we
                 * may end up not having any args, like with gettid(), so only
                 * print the raw args when we didn't manage to read it.
                 */
                while (arg.idx < sc->nr_args) {
                        if (arg.mask & bit)
                                goto next_arg;
                        val = syscall_arg__val(&arg, arg.idx);
                        if (printed)
                                printed += scnprintf(bf + printed, size - printed, ", ");
                        printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
                        ++arg.idx;
                        bit <<= 1;
                }
        }

        return printed;
}
1490
/*
 * Signature of the per tracepoint sample handlers; trace__sys_enter() is
 * one function with this signature.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1494
1495static struct syscall *trace__syscall_info(struct trace *trace,
1496                                           struct perf_evsel *evsel, int id)
1497{
1498
1499        if (id < 0) {
1500
1501                /*
1502                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1503                 * before that, leaving at a higher verbosity level till that is
1504                 * explained. Reproduced with plain ftrace with:
1505                 *
1506                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1507                 * grep "NR -1 " /t/trace_pipe
1508                 *
1509                 * After generating some load on the machine.
1510                 */
1511                if (verbose > 1) {
1512                        static u64 n;
1513                        fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1514                                id, perf_evsel__name(evsel), ++n);
1515                }
1516                return NULL;
1517        }
1518
1519        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1520            trace__read_syscall_info(trace, id))
1521                goto out_cant_read;
1522
1523        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1524                goto out_cant_read;
1525
1526        return &trace->syscalls.table[id];
1527
1528out_cant_read:
1529        if (verbose > 0) {
1530                fprintf(trace->output, "Problems reading syscall %d", id);
1531                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1532                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1533                fputs(" information\n", trace->output);
1534        }
1535        return NULL;
1536}
1537
1538static void thread__update_stats(struct thread_trace *ttrace,
1539                                 int id, struct perf_sample *sample)
1540{
1541        struct int_node *inode;
1542        struct stats *stats;
1543        u64 duration = 0;
1544
1545        inode = intlist__findnew(ttrace->syscall_stats, id);
1546        if (inode == NULL)
1547                return;
1548
1549        stats = inode->priv;
1550        if (stats == NULL) {
1551                stats = malloc(sizeof(struct stats));
1552                if (stats == NULL)
1553                        return;
1554                init_stats(stats);
1555                inode->priv = stats;
1556        }
1557
1558        if (ttrace->entry_time && sample->time > ttrace->entry_time)
1559                duration = sample->time - ttrace->entry_time;
1560
1561        update_stats(stats, duration);
1562}
1563
1564static int trace__printf_interrupted_entry(struct trace *trace)
1565{
1566        struct thread_trace *ttrace;
1567        size_t printed;
1568
1569        if (trace->failure_only || trace->current == NULL)
1570                return 0;
1571
1572        ttrace = thread__priv(trace->current);
1573
1574        if (!ttrace->entry_pending)
1575                return 0;
1576
1577        printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1578        printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1579        ttrace->entry_pending = false;
1580
1581        return printed;
1582}
1583
1584static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1585                                 struct perf_sample *sample, struct thread *thread)
1586{
1587        int printed = 0;
1588
1589        if (trace->print_sample) {
1590                double ts = (double)sample->time / NSEC_PER_MSEC;
1591
1592                printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1593                                   perf_evsel__name(evsel), ts,
1594                                   thread__comm_str(thread),
1595                                   sample->pid, sample->tid, sample->cpu);
1596        }
1597
1598        return printed;
1599}
1600
1601static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1602                            union perf_event *event __maybe_unused,
1603                            struct perf_sample *sample)
1604{
1605        char *msg;
1606        void *args;
1607        size_t printed = 0;
1608        struct thread *thread;
1609        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1610        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1611        struct thread_trace *ttrace;
1612
1613        if (sc == NULL)
1614                return -1;
1615
1616        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1617        ttrace = thread__trace(thread, trace->output);
1618        if (ttrace == NULL)
1619                goto out_put;
1620
1621        trace__fprintf_sample(trace, evsel, sample, thread);
1622
1623        args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1624
1625        if (ttrace->entry_str == NULL) {
1626                ttrace->entry_str = malloc(trace__entry_str_size);
1627                if (!ttrace->entry_str)
1628                        goto out_put;
1629        }
1630
1631        if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1632                trace__printf_interrupted_entry(trace);
1633
1634        ttrace->entry_time = sample->time;
1635        msg = ttrace->entry_str;
1636        printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1637
1638        printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1639                                           args, trace, thread);
1640
1641        if (sc->is_exit) {
1642                if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1643                        trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1644                        fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1645                }
1646        } else {
1647                ttrace->entry_pending = true;
1648                /* See trace__vfs_getname & trace__sys_exit */
1649                ttrace->filename.pending_open = false;
1650        }
1651
1652        if (trace->current != thread) {
1653                thread__put(trace->current);
1654                trace->current = thread__get(thread);
1655        }
1656        err = 0;
1657out_put:
1658        thread__put(thread);
1659        return err;
1660}
1661
1662static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1663                                    struct perf_sample *sample,
1664                                    struct callchain_cursor *cursor)
1665{
1666        struct addr_location al;
1667        int max_stack = evsel->attr.sample_max_stack ?
1668                        evsel->attr.sample_max_stack :
1669                        trace->max_stack;
1670
1671        if (machine__resolve(trace->host, &al, sample) < 0 ||
1672            thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1673                return -1;
1674
1675        return 0;
1676}
1677
1678static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1679{
1680        /* TODO: user-configurable print_opts */
1681        const unsigned int print_opts = EVSEL__PRINT_SYM |
1682                                        EVSEL__PRINT_DSO |
1683                                        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1684
1685        return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1686}
1687
/*
 * Map the errno value 'err' to its name using the errno table of the
 * architecture found in the perf environment associated with 'evsel'.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
        const char *arch = perf_env__arch(perf_evsel__env(evsel));

        return arch_syscalls__strerrno(arch, err);
}
1695
1696static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1697                           union perf_event *event __maybe_unused,
1698                           struct perf_sample *sample)
1699{
1700        long ret;
1701        u64 duration = 0;
1702        bool duration_calculated = false;
1703        struct thread *thread;
1704        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1705        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1706        struct thread_trace *ttrace;
1707
1708        if (sc == NULL)
1709                return -1;
1710
1711        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1712        ttrace = thread__trace(thread, trace->output);
1713        if (ttrace == NULL)
1714                goto out_put;
1715
1716        trace__fprintf_sample(trace, evsel, sample, thread);
1717
1718        if (trace->summary)
1719                thread__update_stats(ttrace, id, sample);
1720
1721        ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1722
1723        if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1724                trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1725                ttrace->filename.pending_open = false;
1726                ++trace->stats.vfs_getname;
1727        }
1728
1729        if (ttrace->entry_time) {
1730                duration = sample->time - ttrace->entry_time;
1731                if (trace__filter_duration(trace, duration))
1732                        goto out;
1733                duration_calculated = true;
1734        } else if (trace->duration_filter)
1735                goto out;
1736
1737        if (sample->callchain) {
1738                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1739                if (callchain_ret == 0) {
1740                        if (callchain_cursor.nr < trace->min_stack)
1741                                goto out;
1742                        callchain_ret = 1;
1743                }
1744        }
1745
1746        if (trace->summary_only || (ret >= 0 && trace->failure_only))
1747                goto out;
1748
1749        trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1750
1751        if (ttrace->entry_pending) {
1752                fprintf(trace->output, "%-70s", ttrace->entry_str);
1753        } else {
1754                fprintf(trace->output, " ... [");
1755                color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1756                fprintf(trace->output, "]: %s()", sc->name);
1757        }
1758
1759        if (sc->fmt == NULL) {
1760                if (ret < 0)
1761                        goto errno_print;
1762signed_print:
1763                fprintf(trace->output, ") = %ld", ret);
1764        } else if (ret < 0) {
1765errno_print: {
1766                char bf[STRERR_BUFSIZE];
1767                const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1768                           *e = errno_to_name(evsel, -ret);
1769
1770                fprintf(trace->output, ") = -1 %s %s", e, emsg);
1771        }
1772        } else if (ret == 0 && sc->fmt->timeout)
1773                fprintf(trace->output, ") = 0 Timeout");
1774        else if (ttrace->ret_scnprintf) {
1775                char bf[1024];
1776                struct syscall_arg arg = {
1777                        .val    = ret,
1778                        .thread = thread,
1779                        .trace  = trace,
1780                };
1781                ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1782                ttrace->ret_scnprintf = NULL;
1783                fprintf(trace->output, ") = %s", bf);
1784        } else if (sc->fmt->hexret)
1785                fprintf(trace->output, ") = %#lx", ret);
1786        else if (sc->fmt->errpid) {
1787                struct thread *child = machine__find_thread(trace->host, ret, ret);
1788
1789                if (child != NULL) {
1790                        fprintf(trace->output, ") = %ld", ret);
1791                        if (child->comm_set)
1792                                fprintf(trace->output, " (%s)", thread__comm_str(child));
1793                        thread__put(child);
1794                }
1795        } else
1796                goto signed_print;
1797
1798        fputc('\n', trace->output);
1799
1800        if (callchain_ret > 0)
1801                trace__fprintf_callchain(trace, sample);
1802        else if (callchain_ret < 0)
1803                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1804out:
1805        ttrace->entry_pending = false;
1806        err = 0;
1807out_put:
1808        thread__put(thread);
1809        return err;
1810}
1811
1812static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1813                              union perf_event *event __maybe_unused,
1814                              struct perf_sample *sample)
1815{
1816        struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1817        struct thread_trace *ttrace;
1818        size_t filename_len, entry_str_len, to_move;
1819        ssize_t remaining_space;
1820        char *pos;
1821        const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1822
1823        if (!thread)
1824                goto out;
1825
1826        ttrace = thread__priv(thread);
1827        if (!ttrace)
1828                goto out_put;
1829
1830        filename_len = strlen(filename);
1831        if (filename_len == 0)
1832                goto out_put;
1833
1834        if (ttrace->filename.namelen < filename_len) {
1835                char *f = realloc(ttrace->filename.name, filename_len + 1);
1836
1837                if (f == NULL)
1838                        goto out_put;
1839
1840                ttrace->filename.namelen = filename_len;
1841                ttrace->filename.name = f;
1842        }
1843
1844        strcpy(ttrace->filename.name, filename);
1845        ttrace->filename.pending_open = true;
1846
1847        if (!ttrace->filename.ptr)
1848                goto out_put;
1849
1850        entry_str_len = strlen(ttrace->entry_str);
1851        remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1852        if (remaining_space <= 0)
1853                goto out_put;
1854
1855        if (filename_len > (size_t)remaining_space) {
1856                filename += filename_len - remaining_space;
1857                filename_len = remaining_space;
1858        }
1859
1860        to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1861        pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1862        memmove(pos + filename_len, pos, to_move);
1863        memcpy(pos, filename, filename_len);
1864
1865        ttrace->filename.ptr = 0;
1866        ttrace->filename.entry_str_pos = 0;
1867out_put:
1868        thread__put(thread);
1869out:
1870        return 0;
1871}
1872
1873static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1874                                     union perf_event *event __maybe_unused,
1875                                     struct perf_sample *sample)
1876{
1877        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1878        double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1879        struct thread *thread = machine__findnew_thread(trace->host,
1880                                                        sample->pid,
1881                                                        sample->tid);
1882        struct thread_trace *ttrace = thread__trace(thread, trace->output);
1883
1884        if (ttrace == NULL)
1885                goto out_dump;
1886
1887        ttrace->runtime_ms += runtime_ms;
1888        trace->runtime_ms += runtime_ms;
1889out_put:
1890        thread__put(thread);
1891        return 0;
1892
1893out_dump:
1894        fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1895               evsel->name,
1896               perf_evsel__strval(evsel, sample, "comm"),
1897               (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1898               runtime,
1899               perf_evsel__intval(evsel, sample, "vruntime"));
1900        goto out_put;
1901}
1902
1903static int bpf_output__printer(enum binary_printer_ops op,
1904                               unsigned int val, void *extra __maybe_unused, FILE *fp)
1905{
1906        unsigned char ch = (unsigned char)val;
1907
1908        switch (op) {
1909        case BINARY_PRINT_CHAR_DATA:
1910                return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1911        case BINARY_PRINT_DATA_BEGIN:
1912        case BINARY_PRINT_LINE_BEGIN:
1913        case BINARY_PRINT_ADDR:
1914        case BINARY_PRINT_NUM_DATA:
1915        case BINARY_PRINT_NUM_PAD:
1916        case BINARY_PRINT_SEP:
1917        case BINARY_PRINT_CHAR_PAD:
1918        case BINARY_PRINT_LINE_END:
1919        case BINARY_PRINT_DATA_END:
1920        default:
1921                break;
1922        }
1923
1924        return 0;
1925}
1926
1927static void bpf_output__fprintf(struct trace *trace,
1928                                struct perf_sample *sample)
1929{
1930        binary__fprintf(sample->raw_data, sample->raw_size, 8,
1931                        bpf_output__printer, NULL, trace->output);
1932}
1933
1934static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1935                                union perf_event *event __maybe_unused,
1936                                struct perf_sample *sample)
1937{
1938        int callchain_ret = 0;
1939
1940        if (sample->callchain) {
1941                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1942                if (callchain_ret == 0) {
1943                        if (callchain_cursor.nr < trace->min_stack)
1944                                goto out;
1945                        callchain_ret = 1;
1946                }
1947        }
1948
1949        trace__printf_interrupted_entry(trace);
1950        trace__fprintf_tstamp(trace, sample->time, trace->output);
1951
1952        if (trace->trace_syscalls)
1953                fprintf(trace->output, "(         ): ");
1954
1955        fprintf(trace->output, "%s:", evsel->name);
1956
1957        if (perf_evsel__is_bpf_output(evsel)) {
1958                bpf_output__fprintf(trace, sample);
1959        } else if (evsel->tp_format) {
1960                event_format__fprintf(evsel->tp_format, sample->cpu,
1961                                      sample->raw_data, sample->raw_size,
1962                                      trace->output);
1963        }
1964
1965        fprintf(trace->output, "\n");
1966
1967        if (callchain_ret > 0)
1968                trace__fprintf_callchain(trace, sample);
1969        else if (callchain_ret < 0)
1970                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1971out:
1972        return 0;
1973}
1974
1975static void print_location(FILE *f, struct perf_sample *sample,
1976                           struct addr_location *al,
1977                           bool print_dso, bool print_sym)
1978{
1979
1980        if ((verbose > 0 || print_dso) && al->map)
1981                fprintf(f, "%s@", al->map->dso->long_name);
1982
1983        if ((verbose > 0 || print_sym) && al->sym)
1984                fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1985                        al->addr - al->sym->start);
1986        else if (al->map)
1987                fprintf(f, "0x%" PRIx64, al->addr);
1988        else
1989                fprintf(f, "0x%" PRIx64, sample->addr);
1990}
1991
1992static int trace__pgfault(struct trace *trace,
1993                          struct perf_evsel *evsel,
1994                          union perf_event *event __maybe_unused,
1995                          struct perf_sample *sample)
1996{
1997        struct thread *thread;
1998        struct addr_location al;
1999        char map_type = 'd';
2000        struct thread_trace *ttrace;
2001        int err = -1;
2002        int callchain_ret = 0;
2003
2004        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2005
2006        if (sample->callchain) {
2007                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2008                if (callchain_ret == 0) {
2009                        if (callchain_cursor.nr < trace->min_stack)
2010                                goto out_put;
2011                        callchain_ret = 1;
2012                }
2013        }
2014
2015        ttrace = thread__trace(thread, trace->output);
2016        if (ttrace == NULL)
2017                goto out_put;
2018
2019        if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2020                ttrace->pfmaj++;
2021        else
2022                ttrace->pfmin++;
2023
2024        if (trace->summary_only)
2025                goto out;
2026
2027        thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2028
2029        trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2030
2031        fprintf(trace->output, "%sfault [",
2032                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2033                "maj" : "min");
2034
2035        print_location(trace->output, sample, &al, false, true);
2036
2037        fprintf(trace->output, "] => ");
2038
2039        thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2040
2041        if (!al.map) {
2042                thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2043
2044                if (al.map)
2045                        map_type = 'x';
2046                else
2047                        map_type = '?';
2048        }
2049
2050        print_location(trace->output, sample, &al, true, false);
2051
2052        fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2053
2054        if (callchain_ret > 0)
2055                trace__fprintf_callchain(trace, sample);
2056        else if (callchain_ret < 0)
2057                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2058out:
2059        err = 0;
2060out_put:
2061        thread__put(thread);
2062        return err;
2063}
2064
2065static void trace__set_base_time(struct trace *trace,
2066                                 struct perf_evsel *evsel,
2067                                 struct perf_sample *sample)
2068{
2069        /*
2070         * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2071         * and don't use sample->time unconditionally, we may end up having
2072         * some other event in the future without PERF_SAMPLE_TIME for good
2073         * reason, i.e. we may not be interested in its timestamps, just in
2074         * it taking place, picking some piece of information when it
2075         * appears in our event stream (vfs_getname comes to mind).
2076         */
2077        if (trace->base_time == 0 && !trace->full_time &&
2078            (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2079                trace->base_time = sample->time;
2080}
2081
2082static int trace__process_sample(struct perf_tool *tool,
2083                                 union perf_event *event,
2084                                 struct perf_sample *sample,
2085                                 struct perf_evsel *evsel,
2086                                 struct machine *machine __maybe_unused)
2087{
2088        struct trace *trace = container_of(tool, struct trace, tool);
2089        struct thread *thread;
2090        int err = 0;
2091
2092        tracepoint_handler handler = evsel->handler;
2093
2094        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2095        if (thread && thread__is_filtered(thread))
2096                goto out;
2097
2098        trace__set_base_time(trace, evsel, sample);
2099
2100        if (handler) {
2101                ++trace->nr_events;
2102                handler(trace, evsel, event, sample);
2103        }
2104out:
2105        thread__put(thread);
2106        return err;
2107}
2108
2109static int trace__record(struct trace *trace, int argc, const char **argv)
2110{
2111        unsigned int rec_argc, i, j;
2112        const char **rec_argv;
2113        const char * const record_args[] = {
2114                "record",
2115                "-R",
2116                "-m", "1024",
2117                "-c", "1",
2118        };
2119
2120        const char * const sc_args[] = { "-e", };
2121        unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2122        const char * const majpf_args[] = { "-e", "major-faults" };
2123        unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2124        const char * const minpf_args[] = { "-e", "minor-faults" };
2125        unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2126
2127        /* +1 is for the event string below */
2128        rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2129                majpf_args_nr + minpf_args_nr + argc;
2130        rec_argv = calloc(rec_argc + 1, sizeof(char *));
2131
2132        if (rec_argv == NULL)
2133                return -ENOMEM;
2134
2135        j = 0;
2136        for (i = 0; i < ARRAY_SIZE(record_args); i++)
2137                rec_argv[j++] = record_args[i];
2138
2139        if (trace->trace_syscalls) {
2140                for (i = 0; i < sc_args_nr; i++)
2141                        rec_argv[j++] = sc_args[i];
2142
2143                /* event string may be different for older kernels - e.g., RHEL6 */
2144                if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2145                        rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2146                else if (is_valid_tracepoint("syscalls:sys_enter"))
2147                        rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2148                else {
2149                        pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2150                        free(rec_argv);
2151                        return -1;
2152                }
2153        }
2154
2155        if (trace->trace_pgfaults & TRACE_PFMAJ)
2156                for (i = 0; i < majpf_args_nr; i++)
2157                        rec_argv[j++] = majpf_args[i];
2158
2159        if (trace->trace_pgfaults & TRACE_PFMIN)
2160                for (i = 0; i < minpf_args_nr; i++)
2161                        rec_argv[j++] = minpf_args[i];
2162
2163        for (i = 0; i < (unsigned int)argc; i++)
2164                rec_argv[j++] = argv[i];
2165
2166        return cmd_record(j, rec_argv);
2167}
2168
/* Forward declaration; the definition is not in this part of the file. */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2170
2171static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2172{
2173        struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2174
2175        if (IS_ERR(evsel))
2176                return false;
2177
2178        if (perf_evsel__field(evsel, "pathname") == NULL) {
2179                perf_evsel__delete(evsel);
2180                return false;
2181        }
2182
2183        evsel->handler = trace__vfs_getname;
2184        perf_evlist__add(evlist, evsel);
2185        return true;
2186}
2187
2188static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2189{
2190        struct perf_evsel *evsel;
2191        struct perf_event_attr attr = {
2192                .type = PERF_TYPE_SOFTWARE,
2193                .mmap_data = 1,
2194        };
2195
2196        attr.config = config;
2197        attr.sample_period = 1;
2198
2199        event_attr_init(&attr);
2200
2201        evsel = perf_evsel__new(&attr);
2202        if (evsel)
2203                evsel->handler = trace__pgfault;
2204
2205        return evsel;
2206}
2207
2208static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2209{
2210        const u32 type = event->header.type;
2211        struct perf_evsel *evsel;
2212
2213        if (type != PERF_RECORD_SAMPLE) {
2214                trace__process_event(trace, trace->host, event, sample);
2215                return;
2216        }
2217
2218        evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2219        if (evsel == NULL) {
2220                fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2221                return;
2222        }
2223
2224        trace__set_base_time(trace, evsel, sample);
2225
2226        if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2227            sample->raw_data == NULL) {
2228                fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2229                       perf_evsel__name(evsel), sample->tid,
2230                       sample->cpu, sample->raw_size);
2231        } else {
2232                tracepoint_handler handler = evsel->handler;
2233                handler(trace, evsel, event, sample);
2234        }
2235}
2236
2237static int trace__add_syscall_newtp(struct trace *trace)
2238{
2239        int ret = -1;
2240        struct perf_evlist *evlist = trace->evlist;
2241        struct perf_evsel *sys_enter, *sys_exit;
2242
2243        sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2244        if (sys_enter == NULL)
2245                goto out;
2246
2247        if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2248                goto out_delete_sys_enter;
2249
2250        sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2251        if (sys_exit == NULL)
2252                goto out_delete_sys_enter;
2253
2254        if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2255                goto out_delete_sys_exit;
2256
2257        perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2258        perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2259
2260        perf_evlist__add(evlist, sys_enter);
2261        perf_evlist__add(evlist, sys_exit);
2262
2263        if (callchain_param.enabled && !trace->kernel_syscallchains) {
2264                /*
2265                 * We're interested only in the user space callchain
2266                 * leading to the syscall, allow overriding that for
2267                 * debugging reasons using --kernel_syscall_callchains
2268                 */
2269                sys_exit->attr.exclude_callchain_kernel = 1;
2270        }
2271
2272        trace->syscalls.events.sys_enter = sys_enter;
2273        trace->syscalls.events.sys_exit  = sys_exit;
2274
2275        ret = 0;
2276out:
2277        return ret;
2278
2279out_delete_sys_exit:
2280        perf_evsel__delete_priv(sys_exit);
2281out_delete_sys_enter:
2282        perf_evsel__delete_priv(sys_enter);
2283        goto out;
2284}
2285
2286static int trace__set_ev_qualifier_filter(struct trace *trace)
2287{
2288        int err = -1;
2289        struct perf_evsel *sys_exit;
2290        char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2291                                                trace->ev_qualifier_ids.nr,
2292                                                trace->ev_qualifier_ids.entries);
2293
2294        if (filter == NULL)
2295                goto out_enomem;
2296
2297        if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2298                                          filter)) {
2299                sys_exit = trace->syscalls.events.sys_exit;
2300                err = perf_evsel__append_tp_filter(sys_exit, filter);
2301        }
2302
2303        free(filter);
2304out:
2305        return err;
2306out_enomem:
2307        errno = ENOMEM;
2308        goto out;
2309}
2310
2311static int trace__set_filter_loop_pids(struct trace *trace)
2312{
2313        unsigned int nr = 1;
2314        pid_t pids[32] = {
2315                getpid(),
2316        };
2317        struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2318
2319        while (thread && nr < ARRAY_SIZE(pids)) {
2320                struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2321
2322                if (parent == NULL)
2323                        break;
2324
2325                if (!strcmp(thread__comm_str(parent), "sshd")) {
2326                        pids[nr++] = parent->tid;
2327                        break;
2328                }
2329                thread = parent;
2330        }
2331
2332        return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2333}
2334
2335static int trace__run(struct trace *trace, int argc, const char **argv)
2336{
2337        struct perf_evlist *evlist = trace->evlist;
2338        struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2339        int err = -1, i;
2340        unsigned long before;
2341        const bool forks = argc > 0;
2342        bool draining = false;
2343
2344        trace->live = true;
2345
2346        if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2347                goto out_error_raw_syscalls;
2348
2349        if (trace->trace_syscalls)
2350                trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2351
2352        if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2353                pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2354                if (pgfault_maj == NULL)
2355                        goto out_error_mem;
2356                perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2357                perf_evlist__add(evlist, pgfault_maj);
2358        }
2359
2360        if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2361                pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2362                if (pgfault_min == NULL)
2363                        goto out_error_mem;
2364                perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2365                perf_evlist__add(evlist, pgfault_min);
2366        }
2367
2368        if (trace->sched &&
2369            perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2370                                   trace__sched_stat_runtime))
2371                goto out_error_sched_stat_runtime;
2372
2373        /*
2374         * If a global cgroup was set, apply it to all the events without an
2375         * explicit cgroup. I.e.:
2376         *
2377         *      trace -G A -e sched:*switch
2378         *
2379         * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2380         * _and_ sched:sched_switch to the 'A' cgroup, while:
2381         *
2382         * trace -e sched:*switch -G A
2383         *
2384         * will only set the sched:sched_switch event to the 'A' cgroup, all the
2385         * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
2386         * a cgroup (on the root cgroup, sys wide, etc).
2387         *
2388         * Multiple cgroups:
2389         *
2390         * trace -G A -e sched:*switch -G B
2391         *
2392         * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2393         * to the 'B' cgroup.
2394         *
2395         * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2396         * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2397         */
2398        if (trace->cgroup)
2399                evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2400
2401        err = perf_evlist__create_maps(evlist, &trace->opts.target);
2402        if (err < 0) {
2403                fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2404                goto out_delete_evlist;
2405        }
2406
2407        err = trace__symbols_init(trace, evlist);
2408        if (err < 0) {
2409                fprintf(trace->output, "Problems initializing symbol libraries!\n");
2410                goto out_delete_evlist;
2411        }
2412
2413        perf_evlist__config(evlist, &trace->opts, &callchain_param);
2414
2415        signal(SIGCHLD, sig_handler);
2416        signal(SIGINT, sig_handler);
2417
2418        if (forks) {
2419                err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2420                                                    argv, false, NULL);
2421                if (err < 0) {
2422                        fprintf(trace->output, "Couldn't run the workload!\n");
2423                        goto out_delete_evlist;
2424                }
2425        }
2426
2427        err = perf_evlist__open(evlist);
2428        if (err < 0)
2429                goto out_error_open;
2430
2431        err = bpf__apply_obj_config();
2432        if (err) {
2433                char errbuf[BUFSIZ];
2434
2435                bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2436                pr_err("ERROR: Apply config to BPF failed: %s\n",
2437                         errbuf);
2438                goto out_error_open;
2439        }
2440
2441        /*
2442         * Better not use !target__has_task() here because we need to cover the
2443         * case where no threads were specified in the command line, but a
2444         * workload was, and in that case we will fill in the thread_map when
2445         * we fork the workload in perf_evlist__prepare_workload.
2446         */
2447        if (trace->filter_pids.nr > 0)
2448                err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2449        else if (thread_map__pid(evlist->threads, 0) == -1)
2450                err = trace__set_filter_loop_pids(trace);
2451
2452        if (err < 0)
2453                goto out_error_mem;
2454
2455        if (trace->ev_qualifier_ids.nr > 0) {
2456                err = trace__set_ev_qualifier_filter(trace);
2457                if (err < 0)
2458                        goto out_errno;
2459
2460                pr_debug("event qualifier tracepoint filter: %s\n",
2461                         trace->syscalls.events.sys_exit->filter);
2462        }
2463
2464        err = perf_evlist__apply_filters(evlist, &evsel);
2465        if (err < 0)
2466                goto out_error_apply_filters;
2467
2468        err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2469        if (err < 0)
2470                goto out_error_mmap;
2471
2472        if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2473                perf_evlist__enable(evlist);
2474
2475        if (forks)
2476                perf_evlist__start_workload(evlist);
2477
2478        if (trace->opts.initial_delay) {
2479                usleep(trace->opts.initial_delay * 1000);
2480                perf_evlist__enable(evlist);
2481        }
2482
2483        trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2484                                  evlist->threads->nr > 1 ||
2485                                  perf_evlist__first(evlist)->attr.inherit;
2486
2487        /*
2488         * Now that we already used evsel->attr to ask the kernel to setup the
2489         * events, lets reuse evsel->attr.sample_max_stack as the limit in
2490         * trace__resolve_callchain(), allowing per-event max-stack settings
2491         * to override an explicitly set --max-stack global setting.
2492         */
2493        evlist__for_each_entry(evlist, evsel) {
2494                if (evsel__has_callchain(evsel) &&
2495                    evsel->attr.sample_max_stack == 0)
2496                        evsel->attr.sample_max_stack = trace->max_stack;
2497        }
2498again:
2499        before = trace->nr_events;
2500
2501        for (i = 0; i < evlist->nr_mmaps; i++) {
2502                union perf_event *event;
2503                struct perf_mmap *md;
2504
2505                md = &evlist->mmap[i];
2506                if (perf_mmap__read_init(md) < 0)
2507                        continue;
2508
2509                while ((event = perf_mmap__read_event(md)) != NULL) {
2510                        struct perf_sample sample;
2511
2512                        ++trace->nr_events;
2513
2514                        err = perf_evlist__parse_sample(evlist, event, &sample);
2515                        if (err) {
2516                                fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2517                                goto next_event;
2518                        }
2519
2520                        trace__handle_event(trace, event, &sample);
2521next_event:
2522                        perf_mmap__consume(md);
2523
2524                        if (interrupted)
2525                                goto out_disable;
2526
2527                        if (done && !draining) {
2528                                perf_evlist__disable(evlist);
2529                                draining = true;
2530                        }
2531                }
2532                perf_mmap__read_done(md);
2533        }
2534
2535        if (trace->nr_events == before) {
2536                int timeout = done ? 100 : -1;
2537
2538                if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2539                        if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2540                                draining = true;
2541
2542                        goto again;
2543                }
2544        } else {
2545                goto again;
2546        }
2547
2548out_disable:
2549        thread__zput(trace->current);
2550
2551        perf_evlist__disable(evlist);
2552
2553        if (!err) {
2554                if (trace->summary)
2555                        trace__fprintf_thread_summary(trace, trace->output);
2556
2557                if (trace->show_tool_stats) {
2558                        fprintf(trace->output, "Stats:\n "
2559                                               " vfs_getname : %" PRIu64 "\n"
2560                                               " proc_getname: %" PRIu64 "\n",
2561                                trace->stats.vfs_getname,
2562                                trace->stats.proc_getname);
2563                }
2564        }
2565
2566out_delete_evlist:
2567        trace__symbols__exit(trace);
2568
2569        perf_evlist__delete(evlist);
2570        cgroup__put(trace->cgroup);
2571        trace->evlist = NULL;
2572        trace->live = false;
2573        return err;
2574{
2575        char errbuf[BUFSIZ];
2576
2577out_error_sched_stat_runtime:
2578        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2579        goto out_error;
2580
2581out_error_raw_syscalls:
2582        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2583        goto out_error;
2584
2585out_error_mmap:
2586        perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2587        goto out_error;
2588
2589out_error_open:
2590        perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2591
2592out_error:
2593        fprintf(trace->output, "%s\n", errbuf);
2594        goto out_delete_evlist;
2595
2596out_error_apply_filters:
2597        fprintf(trace->output,
2598                "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2599                evsel->filter, perf_evsel__name(evsel), errno,
2600                str_error_r(errno, errbuf, sizeof(errbuf)));
2601        goto out_delete_evlist;
2602}
2603out_error_mem:
2604        fprintf(trace->output, "Not enough memory to run!\n");
2605        goto out_delete_evlist;
2606
2607out_errno:
2608        fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2609        goto out_delete_evlist;
2610}
2611
2612static int trace__replay(struct trace *trace)
2613{
2614        const struct perf_evsel_str_handler handlers[] = {
2615                { "probe:vfs_getname",       trace__vfs_getname, },
2616        };
2617        struct perf_data data = {
2618                .file      = {
2619                        .path = input_name,
2620                },
2621                .mode      = PERF_DATA_MODE_READ,
2622                .force     = trace->force,
2623        };
2624        struct perf_session *session;
2625        struct perf_evsel *evsel;
2626        int err = -1;
2627
2628        trace->tool.sample        = trace__process_sample;
2629        trace->tool.mmap          = perf_event__process_mmap;
2630        trace->tool.mmap2         = perf_event__process_mmap2;
2631        trace->tool.comm          = perf_event__process_comm;
2632        trace->tool.exit          = perf_event__process_exit;
2633        trace->tool.fork          = perf_event__process_fork;
2634        trace->tool.attr          = perf_event__process_attr;
2635        trace->tool.tracing_data  = perf_event__process_tracing_data;
2636        trace->tool.build_id      = perf_event__process_build_id;
2637        trace->tool.namespaces    = perf_event__process_namespaces;
2638
2639        trace->tool.ordered_events = true;
2640        trace->tool.ordering_requires_timestamps = true;
2641
2642        /* add tid to output */
2643        trace->multiple_threads = true;
2644
2645        session = perf_session__new(&data, false, &trace->tool);
2646        if (session == NULL)
2647                return -1;
2648
2649        if (trace->opts.target.pid)
2650                symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2651
2652        if (trace->opts.target.tid)
2653                symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2654
2655        if (symbol__init(&session->header.env) < 0)
2656                goto out;
2657
2658        trace->host = &session->machines.host;
2659
2660        err = perf_session__set_tracepoints_handlers(session, handlers);
2661        if (err)
2662                goto out;
2663
2664        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2665                                                     "raw_syscalls:sys_enter");
2666        /* older kernels have syscalls tp versus raw_syscalls */
2667        if (evsel == NULL)
2668                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2669                                                             "syscalls:sys_enter");
2670
2671        if (evsel &&
2672            (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2673            perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2674                pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2675                goto out;
2676        }
2677
2678        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2679                                                     "raw_syscalls:sys_exit");
2680        if (evsel == NULL)
2681                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2682                                                             "syscalls:sys_exit");
2683        if (evsel &&
2684            (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2685            perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2686                pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2687                goto out;
2688        }
2689
2690        evlist__for_each_entry(session->evlist, evsel) {
2691                if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2692                    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2693                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2694                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2695                        evsel->handler = trace__pgfault;
2696        }
2697
2698        setup_pager();
2699
2700        err = perf_session__process_events(session);
2701        if (err)
2702                pr_err("Failed to process events, error %d", err);
2703
2704        else if (trace->summary)
2705                trace__fprintf_thread_summary(trace, trace->output);
2706
2707out:
2708        perf_session__delete(session);
2709
2710        return err;
2711}
2712
/*
 * Emit the banner that precedes the per-thread summary.
 *
 * Returns whatever fprintf() reported, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return (size_t)fprintf(fp, "\n Summary of events:\n\n");
}
2721
2722DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2723        struct stats    *stats;
2724        double          msecs;
2725        int             syscall;
2726)
2727{
2728        struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2729        struct stats *stats = source->priv;
2730
2731        entry->syscall = source->i;
2732        entry->stats   = stats;
2733        entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2734}
2735
2736static size_t thread__dump_stats(struct thread_trace *ttrace,
2737                                 struct trace *trace, FILE *fp)
2738{
2739        size_t printed = 0;
2740        struct syscall *sc;
2741        struct rb_node *nd;
2742        DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2743
2744        if (syscall_stats == NULL)
2745                return 0;
2746
2747        printed += fprintf(fp, "\n");
2748
2749        printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2750        printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2751        printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2752
2753        resort_rb__for_each_entry(nd, syscall_stats) {
2754                struct stats *stats = syscall_stats_entry->stats;
2755                if (stats) {
2756                        double min = (double)(stats->min) / NSEC_PER_MSEC;
2757                        double max = (double)(stats->max) / NSEC_PER_MSEC;
2758                        double avg = avg_stats(stats);
2759                        double pct;
2760                        u64 n = (u64) stats->n;
2761
2762                        pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2763                        avg /= NSEC_PER_MSEC;
2764
2765                        sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2766                        printed += fprintf(fp, "   %-15s", sc->name);
2767                        printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2768                                           n, syscall_stats_entry->msecs, min, avg);
2769                        printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2770                }
2771        }
2772
2773        resort_rb__delete(syscall_stats);
2774        printed += fprintf(fp, "\n\n");
2775
2776        return printed;
2777}
2778
2779static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2780{
2781        size_t printed = 0;
2782        struct thread_trace *ttrace = thread__priv(thread);
2783        double ratio;
2784
2785        if (ttrace == NULL)
2786                return 0;
2787
2788        ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2789
2790        printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2791        printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2792        printed += fprintf(fp, "%.1f%%", ratio);
2793        if (ttrace->pfmaj)
2794                printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2795        if (ttrace->pfmin)
2796                printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2797        if (trace->sched)
2798                printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2799        else if (fputc('\n', fp) != EOF)
2800                ++printed;
2801
2802        printed += thread__dump_stats(ttrace, trace, fp);
2803
2804        return printed;
2805}
2806
2807static unsigned long thread__nr_events(struct thread_trace *ttrace)
2808{
2809        return ttrace ? ttrace->nr_events : 0;
2810}
2811
2812DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2813        struct thread *thread;
2814)
2815{
2816        entry->thread = rb_entry(nd, struct thread, rb_node);
2817}
2818
2819static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2820{
2821        size_t printed = trace__fprintf_threads_header(fp);
2822        struct rb_node *nd;
2823        int i;
2824
2825        for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2826                DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2827
2828                if (threads == NULL) {
2829                        fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2830                        return 0;
2831                }
2832
2833                resort_rb__for_each_entry(nd, threads)
2834                        printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2835
2836                resort_rb__delete(threads);
2837        }
2838        return printed;
2839}
2840
2841static int trace__set_duration(const struct option *opt, const char *str,
2842                               int unset __maybe_unused)
2843{
2844        struct trace *trace = opt->value;
2845
2846        trace->duration_filter = atof(str);
2847        return 0;
2848}
2849
2850static int trace__set_filter_pids(const struct option *opt, const char *str,
2851                                  int unset __maybe_unused)
2852{
2853        int ret = -1;
2854        size_t i;
2855        struct trace *trace = opt->value;
2856        /*
2857         * FIXME: introduce a intarray class, plain parse csv and create a
2858         * { int nr, int entries[] } struct...
2859         */
2860        struct intlist *list = intlist__new(str);
2861
2862        if (list == NULL)
2863                return -1;
2864
2865        i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2866        trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2867
2868        if (trace->filter_pids.entries == NULL)
2869                goto out;
2870
2871        trace->filter_pids.entries[0] = getpid();
2872
2873        for (i = 1; i < trace->filter_pids.nr; ++i)
2874                trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2875
2876        intlist__delete(list);
2877        ret = 0;
2878out:
2879        return ret;
2880}
2881
2882static int trace__open_output(struct trace *trace, const char *filename)
2883{
2884        struct stat st;
2885
2886        if (!stat(filename, &st) && st.st_size) {
2887                char oldname[PATH_MAX];
2888
2889                scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2890                unlink(oldname);
2891                rename(filename, oldname);
2892        }
2893
2894        trace->output = fopen(filename, "w");
2895
2896        return trace->output == NULL ? -errno : 0;
2897}
2898
2899static int parse_pagefaults(const struct option *opt, const char *str,
2900                            int unset __maybe_unused)
2901{
2902        int *trace_pgfaults = opt->value;
2903
2904        if (strcmp(str, "all") == 0)
2905                *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2906        else if (strcmp(str, "maj") == 0)
2907                *trace_pgfaults |= TRACE_PFMAJ;
2908        else if (strcmp(str, "min") == 0)
2909                *trace_pgfaults |= TRACE_PFMIN;
2910        else
2911                return -1;
2912
2913        return 0;
2914}
2915
2916static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2917{
2918        struct perf_evsel *evsel;
2919
2920        evlist__for_each_entry(evlist, evsel)
2921                evsel->handler = handler;
2922}
2923
2924/*
2925 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2926 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2927 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2928 *
2929 * It'd be better to introduce a parse_options() variant that would return a
2930 * list with the terms it didn't match to an event...
2931 */
2932static int trace__parse_events_option(const struct option *opt, const char *str,
2933                                      int unset __maybe_unused)
2934{
2935        struct trace *trace = (struct trace *)opt->value;
2936        const char *s = str;
2937        char *sep = NULL, *lists[2] = { NULL, NULL, };
2938        int len = strlen(str) + 1, err = -1, list, idx;
2939        char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2940        char group_name[PATH_MAX];
2941
2942        if (strace_groups_dir == NULL)
2943                return -1;
2944
2945        if (*s == '!') {
2946                ++s;
2947                trace->not_ev_qualifier = true;
2948        }
2949
2950        while (1) {
2951                if ((sep = strchr(s, ',')) != NULL)
2952                        *sep = '\0';
2953
2954                list = 0;
2955                if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2956                    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2957                        list = 1;
2958                } else {
2959                        path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2960                        if (access(group_name, R_OK) == 0)
2961                                list = 1;
2962                }
2963
2964                if (lists[list]) {
2965                        sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2966                } else {
2967                        lists[list] = malloc(len);
2968                        if (lists[list] == NULL)
2969                                goto out;
2970                        strcpy(lists[list], s);
2971                }
2972
2973                if (!sep)
2974                        break;
2975
2976                *sep = ',';
2977                s = sep + 1;
2978        }
2979
2980        if (lists[1] != NULL) {
2981                struct strlist_config slist_config = {
2982                        .dirname = strace_groups_dir,
2983                };
2984
2985                trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2986                if (trace->ev_qualifier == NULL) {
2987                        fputs("Not enough memory to parse event qualifier", trace->output);
2988                        goto out;
2989                }
2990
2991                if (trace__validate_ev_qualifier(trace))
2992                        goto out;
2993        }
2994
2995        err = 0;
2996
2997        if (lists[0]) {
2998                struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2999                                               "event selector. use 'perf list' to list available events",
3000                                               parse_events_option);
3001                err = parse_events_option(&o, lists[0], 0);
3002        }
3003out:
3004        if (sep)
3005                *sep = ',';
3006
3007        return err;
3008}
3009
3010static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3011{
3012        struct trace *trace = opt->value;
3013
3014        if (!list_empty(&trace->evlist->entries))
3015                return parse_cgroups(opt, str, unset);
3016
3017        trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3018
3019        return 0;
3020}
3021
3022int cmd_trace(int argc, const char **argv)
3023{
3024        const char *trace_usage[] = {
3025                "perf trace [<options>] [<command>]",
3026                "perf trace [<options>] -- <command> [<options>]",
3027                "perf trace record [<options>] [<command>]",
3028                "perf trace record [<options>] -- <command> [<options>]",
3029                NULL
3030        };
3031        struct trace trace = {
3032                .syscalls = {
3033                        . max = -1,
3034                },
3035                .opts = {
3036                        .target = {
3037                                .uid       = UINT_MAX,
3038                                .uses_mmap = true,
3039                        },
3040                        .user_freq     = UINT_MAX,
3041                        .user_interval = ULLONG_MAX,
3042                        .no_buffering  = true,
3043                        .mmap_pages    = UINT_MAX,
3044                        .proc_map_timeout  = 500,
3045                },
3046                .output = stderr,
3047                .show_comm = true,
3048                .trace_syscalls = true,
3049                .kernel_syscallchains = false,
3050                .max_stack = UINT_MAX,
3051        };
3052        const char *output_name = NULL;
3053        const struct option trace_options[] = {
3054        OPT_CALLBACK('e', "event", &trace, "event",
3055                     "event/syscall selector. use 'perf list' to list available events",
3056                     trace__parse_events_option),
3057        OPT_BOOLEAN(0, "comm", &trace.show_comm,
3058                    "show the thread COMM next to its id"),
3059        OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3060        OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3061                     trace__parse_events_option),
3062        OPT_STRING('o', "output", &output_name, "file", "output file name"),
3063        OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3064        OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3065                    "trace events on existing process id"),
3066        OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3067                    "trace events on existing thread id"),
3068        OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3069                     "pids to filter (by the kernel)", trace__set_filter_pids),
3070        OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3071                    "system-wide collection from all CPUs"),
3072        OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3073                    "list of cpus to monitor"),
3074        OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3075                    "child tasks do not inherit counters"),
3076        OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3077                     "number of mmap data pages",
3078                     perf_evlist__parse_mmap_pages),
3079        OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3080                   "user to profile"),
3081        OPT_CALLBACK(0, "duration", &trace, "float",
3082                     "show only events with duration > N.M ms",
3083                     trace__set_duration),
3084        OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3085        OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3086        OPT_BOOLEAN('T', "time", &trace.full_time,
3087                    "Show full timestamp, not time relative to first start"),
3088        OPT_BOOLEAN(0, "failure", &trace.failure_only,
3089                    "Show only syscalls that failed"),
3090        OPT_BOOLEAN('s', "summary", &trace.summary_only,
3091                    "Show only syscall summary with statistics"),
3092        OPT_BOOLEAN('S', "with-summary", &trace.summary,
3093                    "Show all syscalls and summary with statistics"),
3094        OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3095                     "Trace pagefaults", parse_pagefaults, "maj"),
3096        OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3097        OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3098        OPT_CALLBACK(0, "call-graph", &trace.opts,
3099                     "record_mode[,record_size]", record_callchain_help,
3100                     &record_parse_callchain_opt),
3101        OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3102                    "Show the kernel callchains on the syscall exit path"),
3103        OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3104                     "Set the minimum stack depth when parsing the callchain, "
3105                     "anything below the specified depth will be ignored."),
3106        OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3107                     "Set the maximum stack depth when parsing the callchain, "
3108                     "anything beyond the specified depth will be ignored. "
3109                     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3110        OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3111                        "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3112        OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3113                        "per thread proc mmap processing timeout in ms"),
3114        OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3115                     trace__parse_cgroups),
3116        OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3117                     "ms to wait before starting measurement after program "
3118                     "start"),
3119        OPT_END()
3120        };
3121        bool __maybe_unused max_stack_user_set = true;
3122        bool mmap_pages_user_set = true;
3123        const char * const trace_subcommands[] = { "record", NULL };
3124        int err;
3125        char bf[BUFSIZ];
3126
3127        signal(SIGSEGV, sighandler_dump_stack);
3128        signal(SIGFPE, sighandler_dump_stack);
3129
3130        trace.evlist = perf_evlist__new();
3131        trace.sctbl = syscalltbl__new();
3132
3133        if (trace.evlist == NULL || trace.sctbl == NULL) {
3134                pr_err("Not enough memory to run!\n");
3135                err = -ENOMEM;
3136                goto out;
3137        }
3138
3139        argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3140                                 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3141
3142        if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3143                usage_with_options_msg(trace_usage, trace_options,
3144                                       "cgroup monitoring only available in system-wide mode");
3145        }
3146
3147        err = bpf__setup_stdout(trace.evlist);
3148        if (err) {
3149                bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3150                pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3151                goto out;
3152        }
3153
3154        err = -1;
3155
3156        if (trace.trace_pgfaults) {
3157                trace.opts.sample_address = true;
3158                trace.opts.sample_time = true;
3159        }
3160
3161        if (trace.opts.mmap_pages == UINT_MAX)
3162                mmap_pages_user_set = false;
3163
3164        if (trace.max_stack == UINT_MAX) {
3165                trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3166                max_stack_user_set = false;
3167        }
3168
3169#ifdef HAVE_DWARF_UNWIND_SUPPORT
3170        if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3171                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3172        }
3173#endif
3174
3175        if (callchain_param.enabled) {
3176                if (!mmap_pages_user_set && geteuid() == 0)
3177                        trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3178
3179                symbol_conf.use_callchain = true;
3180        }
3181
3182        if (trace.evlist->nr_entries > 0)
3183                evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3184
3185        if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3186                return trace__record(&trace, argc-1, &argv[1]);
3187
3188        /* summary_only implies summary option, but don't overwrite summary if set */
3189        if (trace.summary_only)
3190                trace.summary = trace.summary_only;
3191
3192        if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3193            trace.evlist->nr_entries == 0 /* Was --events used? */) {
3194                pr_err("Please specify something to trace.\n");
3195                return -1;
3196        }
3197
3198        if (!trace.trace_syscalls && trace.ev_qualifier) {
3199                pr_err("The -e option can't be used with --no-syscalls.\n");
3200                goto out;
3201        }
3202
3203        if (output_name != NULL) {
3204                err = trace__open_output(&trace, output_name);
3205                if (err < 0) {
3206                        perror("failed to create output file");
3207                        goto out;
3208                }
3209        }
3210
3211        trace.open_id = syscalltbl__id(trace.sctbl, "open");
3212
3213        err = target__validate(&trace.opts.target);
3214        if (err) {
3215                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3216                fprintf(trace.output, "%s", bf);
3217                goto out_close;
3218        }
3219
3220        err = target__parse_uid(&trace.opts.target);
3221        if (err) {
3222                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3223                fprintf(trace.output, "%s", bf);
3224                goto out_close;
3225        }
3226
3227        if (!argc && target__none(&trace.opts.target))
3228                trace.opts.target.system_wide = true;
3229
3230        if (input_name)
3231                err = trace__replay(&trace);
3232        else
3233                err = trace__run(&trace, argc, argv);
3234
3235out_close:
3236        if (output_name != NULL)
3237                fclose(trace.output);
3238out:
3239        return err;
3240}
3241