/* linux/tools/perf/builtin-trace.c */
   1/*
   2 * builtin-trace.c
   3 *
   4 * Builtin 'trace' command:
   5 *
   6 * Display a continuously updated trace of any workload, CPU, specific PID,
   7 * system wide, etc.  Default format is loosely strace like, but any other
   8 * event may be specified using --event.
   9 *
  10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  11 *
  12 * Initially based on the 'trace' prototype by Thomas Gleixner:
  13 *
  14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
  15 *
  16 * Released under the GPL v2. (and only v2, not any later version)
  17 */
  18
  19#include <traceevent/event-parse.h>
  20#include <api/fs/tracing_path.h>
  21#include "builtin.h"
  22#include "util/cgroup.h"
  23#include "util/color.h"
  24#include "util/debug.h"
  25#include "util/env.h"
  26#include "util/event.h"
  27#include "util/evlist.h"
  28#include <subcmd/exec-cmd.h>
  29#include "util/machine.h"
  30#include "util/path.h"
  31#include "util/session.h"
  32#include "util/thread.h"
  33#include <subcmd/parse-options.h>
  34#include "util/strlist.h"
  35#include "util/intlist.h"
  36#include "util/thread_map.h"
  37#include "util/stat.h"
  38#include "trace/beauty/beauty.h"
  39#include "trace-event.h"
  40#include "util/parse-events.h"
  41#include "util/bpf-loader.h"
  42#include "callchain.h"
  43#include "print_binary.h"
  44#include "string2.h"
  45#include "syscalltbl.h"
  46#include "rb_resort.h"
  47
  48#include <errno.h>
  49#include <inttypes.h>
  50#include <poll.h>
  51#include <signal.h>
  52#include <stdlib.h>
  53#include <string.h>
  54#include <linux/err.h>
  55#include <linux/filter.h>
  56#include <linux/kernel.h>
  57#include <linux/random.h>
  58#include <linux/stringify.h>
  59#include <linux/time64.h>
  60#include <fcntl.h>
  61
  62#include "sane_ctype.h"
  63
  64#ifndef O_CLOEXEC
  65# define O_CLOEXEC              02000000
  66#endif
  67
  68#ifndef F_LINUX_SPECIFIC_BASE
  69# define F_LINUX_SPECIFIC_BASE  1024
  70#endif
  71
/*
 * Global state for one 'perf trace' run: perf_tool callbacks, the syscall
 * name table and per-syscall descriptors, the record/evlist machinery, the
 * various filters and the output knobs parsed from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* NOTE(review): presumably the highest syscall id in 'table' — confirm */
		struct syscall	*table;
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls:sys_enter */
					  *sys_exit,	/* raw_syscalls:sys_exit */
					  *augmented;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* NOTE(review): looks like the last thread that emitted an event — confirm */
	struct cgroup		*cgroup;
	u64			base_time;	/* timestamps are printed relative to this */
	FILE			*output;	/* destination for the formatted trace */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names to trace (or skip, see not_ev_qualifier) */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids filtered out of the trace */
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {		/* tool self-statistics, shown with show_tool_stats */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;	/* callchain depth limits */
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is an exclude list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;	/* print per-thread summary at exit */
	bool			summary_only;	/* ... and suppress the live event stream */
	bool			failure_only;	/* only show syscalls that returned an error */
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* probe:vfs_getname is available for filename resolution */
	int			trace_pgfaults;
};
 126
/*
 * Accessor for one tracepoint payload field: its offset into
 * perf_sample->raw_data plus a reader that returns it either as an
 * integer or as a raw pointer into the payload (union: only one of the
 * two is ever set for a given field).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
 134
/*
 * Generate tp_field__u{8,16,32,64}(): read an unsigned integer of the given
 * width from the sample payload at field->offset.  memcpy() is used so that
 * offsets that are not naturally aligned are handled safely.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Same readers, but byte-swapping the value, for samples recorded with the
 * opposite endianness.  No 8-bit variant: a single byte needs no swap.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
 159
 160static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
 161{
 162        field->offset = offset;
 163
 164        switch (size) {
 165        case 1:
 166                field->integer = tp_field__u8;
 167                break;
 168        case 2:
 169                field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
 170                break;
 171        case 4:
 172                field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
 173                break;
 174        case 8:
 175                field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
 176                break;
 177        default:
 178                return -1;
 179        }
 180
 181        return 0;
 182}
 183
 184static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
 185{
 186        return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
 187}
 188
 189static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
 190{
 191        return sample->raw_data + field->offset;
 192}
 193
 194static int __tp_field__init_ptr(struct tp_field *field, int offset)
 195{
 196        field->offset = offset;
 197        field->pointer = tp_field__ptr;
 198        return 0;
 199}
 200
 201static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
 202{
 203        return __tp_field__init_ptr(field, format_field->offset);
 204}
 205
/*
 * Resolved payload fields of the raw_syscalls tracepoints: the syscall id
 * plus, depending on direction, the args array (sys_enter) or the return
 * value (sys_exit) — hence the union, only one applies per event.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
 212
 213static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
 214                                          struct tp_field *field,
 215                                          const char *name)
 216{
 217        struct format_field *format_field = perf_evsel__field(evsel, name);
 218
 219        if (format_field == NULL)
 220                return -1;
 221
 222        return tp_field__init_uint(field, format_field, evsel->needs_swap);
 223}
 224
/*
 * Resolve tracepoint field #name of @evsel into the same-named integer
 * tp_field of the syscall_tp descriptor hanging off evsel->priv.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
 228
 229static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
 230                                         struct tp_field *field,
 231                                         const char *name)
 232{
 233        struct format_field *format_field = perf_evsel__field(evsel, name);
 234
 235        if (format_field == NULL)
 236                return -1;
 237
 238        return tp_field__init_ptr(field, format_field);
 239}
 240
/*
 * Resolve tracepoint field #name of @evsel into the same-named pointer
 * tp_field of the syscall_tp descriptor hanging off evsel->priv.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
 244
/*
 * Free the syscall_tp descriptor stashed in ->priv, then delete the evsel
 * itself.  ->priv must be freed first: perf_evsel__delete() releases the
 * evsel that holds the pointer.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
 250
/*
 * Allocate the per-evsel syscall_tp descriptor (stored in evsel->priv) and
 * resolve the "__syscall_nr" tracepoint field into sc->id.
 * Returns 0 on success, -ENOMEM if the allocation fails, -ENOENT when the
 * field is missing from the event format (the descriptor is freed again).
 */
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {
		if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
			goto out_delete;
		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}
 266
/*
 * Like perf_evsel__init_syscall_tp() but for "augmented" syscall events,
 * whose payload layout is not described by a tracepoint format: the id is
 * wired up by hand as a long-sized integer at offset sizeof(long long),
 * i.e. skipping what is presumably a leading u64 header — confirm against
 * the producer of these events.
 * Returns 0, -ENOMEM on allocation failure or -EINVAL when the hardcoded
 * size is not one __tp_field__init_uint() supports.
 */
static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {       /* field, sizeof_field, offsetof_field */
		if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
			goto out_delete;

		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -EINVAL;
}
 283
/*
 * Wire up sc->args as a pointer one u64 past the id field
 * (sc->id.offset + sizeof(u64)).  Assumes ->priv was already set up by
 * perf_evsel__init_augmented_syscall_tp().
 */
static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;

	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}
 290
/*
 * Allocate the syscall_tp descriptor for a raw_syscalls tracepoint evsel,
 * resolve its "id" field and attach @handler as the sample callback.
 * Returns 0, -ENOMEM on allocation failure or -ENOENT when the "id" field
 * is missing (the descriptor is freed again).
 */
static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}
 308
/*
 * Create a tracepoint evsel for raw_syscalls:@direction (sys_enter or
 * sys_exit), falling back to the older "syscalls" subsystem name, then set
 * up its syscall_tp descriptor and @handler.  Returns NULL on any failure,
 * cleaning up the partially-initialized evsel.
 */
static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_raw_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}
 329
/*
 * Decode field 'name' of the current sample through the syscall_tp
 * descriptor in evsel->priv — as an integer or as a pointer into the raw
 * payload, respectively.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
 337
 338size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
 339{
 340        int idx = val - sa->offset;
 341
 342        if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
 343                return scnprintf(bf, size, intfmt, val);
 344
 345        return scnprintf(bf, size, "%s", sa->entries[idx]);
 346}
 347
 348static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 349                                                const char *intfmt,
 350                                                struct syscall_arg *arg)
 351{
 352        return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
 353}
 354
 355static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 356                                              struct syscall_arg *arg)
 357{
 358        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
 359}
 360
 361#define SCA_STRARRAY syscall_arg__scnprintf_strarray
 362
/*
 * An ordered set of strarrays tried one after another when decoding a
 * single argument (e.g. fcntl commands split over two value ranges).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Instantiate a struct strarrays named strarrays__<array> wrapping @array. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
 372
 373size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
 374                                        struct syscall_arg *arg)
 375{
 376        struct strarrays *sas = arg->parm;
 377        int i;
 378
 379        for (i = 0; i < sas->nr_entries; ++i) {
 380                struct strarray *sa = sas->entries[i];
 381                int idx = arg->val - sa->offset;
 382
 383                if (idx >= 0 && idx < sa->nr_entries) {
 384                        if (sa->entries[idx] == NULL)
 385                                break;
 386                        return scnprintf(bf, size, "%s", sa->entries[idx]);
 387                }
 388        }
 389
 390        return scnprintf(bf, size, "%d", arg->val);
 391}
 392
 393#ifndef AT_FDCWD
 394#define AT_FDCWD        -100
 395#endif
 396
 397static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 398                                           struct syscall_arg *arg)
 399{
 400        int fd = arg->val;
 401
 402        if (fd == AT_FDCWD)
 403                return scnprintf(bf, size, "CWD");
 404
 405        return syscall_arg__scnprintf_fd(bf, size, arg);
 406}
 407
 408#define SCA_FDAT syscall_arg__scnprintf_fd_at
 409
 410static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 411                                              struct syscall_arg *arg);
 412
 413#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 414
/* Print the argument as 0x-prefixed hex (addresses, opaque values). */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}
 419
/* Print the argument as a signed decimal int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}
 424
/* Print the argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
 429
/*
 * String tables used to pretty print integer syscall arguments.  Entries
 * are indexed by value, minus the table's offset where
 * DEFINE_STRARRAY_OFFSET() is used (e.g. epoll_ctl ops start at 1, the
 * linux-specific fcntl commands at F_LINUX_SPECIFIC_BASE).
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* SEEK_DATA/SEEK_HOLE only on systems whose headers define them */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* [5] skips a gap in the F_LINUX_SPECIFIC_BASE numbering before CANCELLK */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl ranges, tried in order by syscall_arg__scnprintf_strarrays() */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
 510
/*
 * Decode an access(2)/faccessat(2) mode argument: F_OK (0) prints as "F",
 * otherwise the R/W/X bits are spelled out and any leftover bits are
 * appended in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
/* Print and clear one of the R_OK/W_OK/X_OK bits. */
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}
 535
 536#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
 537
 538static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 539                                              struct syscall_arg *arg);
 540
 541#define SCA_FILENAME syscall_arg__scnprintf_filename
 542
/*
 * Decode a pipe2(2) flags argument: the O_CLOEXEC and O_NONBLOCK bits are
 * printed by name, '|'-separated, with any unrecognized leftover bits
 * appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

/* Print and clear one O_* flag bit, prefixing '|' after the first one. */
#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
 563
 564#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 565
 566#ifndef GRND_NONBLOCK
 567#define GRND_NONBLOCK   0x0001
 568#endif
 569#ifndef GRND_RANDOM
 570#define GRND_RANDOM     0x0002
 571#endif
 572
/*
 * Decode a getrandom(2) flags argument: GRND_RANDOM and GRND_NONBLOCK are
 * printed by name, '|'-separated, with any leftover bits appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

/* Print and clear one GRND_* flag bit, prefixing '|' after the first one. */
#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
 593
 594#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 595
 596#define STRARRAY(name, array) \
 597          { .scnprintf  = SCA_STRARRAY, \
 598            .parm       = &strarray__##array, }
 599
 600#include "trace/beauty/arch_errno_names.c"
 601#include "trace/beauty/eventfd.c"
 602#include "trace/beauty/futex_op.c"
 603#include "trace/beauty/futex_val3.c"
 604#include "trace/beauty/mmap.c"
 605#include "trace/beauty/mode_t.c"
 606#include "trace/beauty/msg_flags.c"
 607#include "trace/beauty/open_flags.c"
 608#include "trace/beauty/perf_event_open.c"
 609#include "trace/beauty/pid.c"
 610#include "trace/beauty/sched_policy.c"
 611#include "trace/beauty/seccomp.c"
 612#include "trace/beauty/signum.c"
 613#include "trace/beauty/socket_type.c"
 614#include "trace/beauty/waitid_options.c"
 615
/*
 * How to pretty print one syscall argument: the formatter callback, its
 * optional parameter (e.g. a strarray), the argument's name, and whether
 * a zero value should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
 622
 623static struct syscall_fmt {
 624        const char *name;
 625        const char *alias;
 626        struct syscall_arg_fmt arg[6];
 627        u8         nr_args;
 628        bool       errpid;
 629        bool       timeout;
 630        bool       hexret;
 631} syscall_fmts[] = {
 632        { .name     = "access",
 633          .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
 634        { .name     = "bpf",
 635          .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
 636        { .name     = "brk",        .hexret = true,
 637          .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
 638        { .name     = "clock_gettime",
 639          .arg = { [0] = STRARRAY(clk_id, clockid), }, },
 640        { .name     = "clone",      .errpid = true, .nr_args = 5,
 641          .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
 642                   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
 643                   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
 644                   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
 645                   [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
 646        { .name     = "close",
 647          .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
 648        { .name     = "epoll_ctl",
 649          .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
 650        { .name     = "eventfd2",
 651          .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
 652        { .name     = "fchmodat",
 653          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 654        { .name     = "fchownat",
 655          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 656        { .name     = "fcntl",
 657          .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
 658                           .parm      = &strarrays__fcntl_cmds_arrays,
 659                           .show_zero = true, },
 660                   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
 661        { .name     = "flock",
 662          .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
 663        { .name     = "fstat", .alias = "newfstat", },
 664        { .name     = "fstatat", .alias = "newfstatat", },
 665        { .name     = "futex",
 666          .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
 667                   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
 668        { .name     = "futimesat",
 669          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 670        { .name     = "getitimer",
 671          .arg = { [0] = STRARRAY(which, itimers), }, },
 672        { .name     = "getpid",     .errpid = true, },
 673        { .name     = "getpgid",    .errpid = true, },
 674        { .name     = "getppid",    .errpid = true, },
 675        { .name     = "getrandom",
 676          .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
 677        { .name     = "getrlimit",
 678          .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
 679        { .name     = "gettid",     .errpid = true, },
 680        { .name     = "ioctl",
 681          .arg = {
 682#if defined(__i386__) || defined(__x86_64__)
 683/*
 684 * FIXME: Make this available to all arches.
 685 */
 686                   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
 687                   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 688#else
 689                   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 690#endif
 691        { .name     = "kcmp",       .nr_args = 5,
 692          .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
 693                   [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
 694                   [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
 695                   [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
 696                   [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
 697        { .name     = "keyctl",
 698          .arg = { [0] = STRARRAY(option, keyctl_options), }, },
 699        { .name     = "kill",
 700          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 701        { .name     = "linkat",
 702          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 703        { .name     = "lseek",
 704          .arg = { [2] = STRARRAY(whence, whences), }, },
 705        { .name     = "lstat", .alias = "newlstat", },
 706        { .name     = "madvise",
 707          .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
 708                   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
 709        { .name     = "mkdirat",
 710          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 711        { .name     = "mknodat",
 712          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 713        { .name     = "mlock",
 714          .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 715        { .name     = "mlockall",
 716          .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 717        { .name     = "mmap",       .hexret = true,
 718/* The standard mmap maps to old_mmap on s390x */
 719#if defined(__s390x__)
 720        .alias = "old_mmap",
 721#endif
 722          .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
 723                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
 724                   [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
 725        { .name     = "mprotect",
 726          .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
 727                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
 728        { .name     = "mq_unlink",
 729          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
 730        { .name     = "mremap",     .hexret = true,
 731          .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
 732                   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
 733                   [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
 734        { .name     = "munlock",
 735          .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 736        { .name     = "munmap",
 737          .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 738        { .name     = "name_to_handle_at",
 739          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 740        { .name     = "newfstatat",
 741          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 742        { .name     = "open",
 743          .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 744        { .name     = "open_by_handle_at",
 745          .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
 746                   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 747        { .name     = "openat",
 748          .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
 749                   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 750        { .name     = "perf_event_open",
 751          .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
 752                   [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
 753                   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
 754        { .name     = "pipe2",
 755          .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
 756        { .name     = "pkey_alloc",
 757          .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
 758        { .name     = "pkey_free",
 759          .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
 760        { .name     = "pkey_mprotect",
 761          .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
 762                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
 763                   [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
 764        { .name     = "poll", .timeout = true, },
 765        { .name     = "ppoll", .timeout = true, },
 766        { .name     = "prctl", .alias = "arch_prctl",
 767          .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
 768                   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
 769                   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
 770        { .name     = "pread", .alias = "pread64", },
 771        { .name     = "preadv", .alias = "pread", },
 772        { .name     = "prlimit64",
 773          .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
 774        { .name     = "pwrite", .alias = "pwrite64", },
 775        { .name     = "readlinkat",
 776          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 777        { .name     = "recvfrom",
 778          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 779        { .name     = "recvmmsg",
 780          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 781        { .name     = "recvmsg",
 782          .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 783        { .name     = "renameat",
 784          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 785        { .name     = "rt_sigaction",
 786          .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 787        { .name     = "rt_sigprocmask",
 788          .arg = { [0] = STRARRAY(how, sighow), }, },
 789        { .name     = "rt_sigqueueinfo",
 790          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 791        { .name     = "rt_tgsigqueueinfo",
 792          .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 793        { .name     = "sched_setscheduler",
 794          .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
 795        { .name     = "seccomp",
 796          .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
 797                   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
 798        { .name     = "select", .timeout = true, },
 799        { .name     = "sendmmsg",
 800          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 801        { .name     = "sendmsg",
 802          .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 803        { .name     = "sendto",
 804          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 805        { .name     = "set_tid_address", .errpid = true, },
 806        { .name     = "setitimer",
 807          .arg = { [0] = STRARRAY(which, itimers), }, },
 808        { .name     = "setrlimit",
 809          .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
 810        { .name     = "socket",
 811          .arg = { [0] = STRARRAY(family, socket_families),
 812                   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
 813                   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
 814        { .name     = "socketpair",
 815          .arg = { [0] = STRARRAY(family, socket_families),
 816                   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
 817                   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
 818        { .name     = "stat", .alias = "newstat", },
 819        { .name     = "statx",
 820          .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
 821                   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
 822                   [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
 823        { .name     = "swapoff",
 824          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
 825        { .name     = "swapon",
 826          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
 827        { .name     = "symlinkat",
 828          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 829        { .name     = "tgkill",
 830          .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 831        { .name     = "tkill",
 832          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 833        { .name     = "uname", .alias = "newuname", },
 834        { .name     = "unlinkat",
 835          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 836        { .name     = "utimensat",
 837          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
 838        { .name     = "wait4",      .errpid = true,
 839          .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 840        { .name     = "waitid",     .errpid = true,
 841          .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 842};
 843
 844static int syscall_fmt__cmp(const void *name, const void *fmtp)
 845{
 846        const struct syscall_fmt *fmt = fmtp;
 847        return strcmp(name, fmt->name);
 848}
 849
 850static struct syscall_fmt *syscall_fmt__find(const char *name)
 851{
 852        const int nmemb = ARRAY_SIZE(syscall_fmts);
 853        return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
 854}
 855
 856/*
 857 * is_exit: is this "exit" or "exit_group"?
 858 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 859 */
struct syscall {
	struct event_format *tp_format;	/* syscalls:sys_enter_<name> tracepoint format (or PTR_ERR) */
	int                 nr_args;	/* number of entries described by 'args'/'arg_fmt' */
	bool                is_exit;	/* "exit"/"exit_group": never returns */
	bool                is_open;	/* "open"/"openat": pair sys_exit fd with sys_enter pathname */
	struct format_field *args;	/* tracepoint fields, with the leading syscall-nr field dropped */
	const char          *name;	/* canonical name from the syscall table */
	struct syscall_fmt  *fmt;	/* per-syscall formatting overrides, may be NULL */
	struct syscall_arg_fmt *arg_fmt; /* one formatter slot per argument, calloc'ed */
};
 870
 871/*
 872 * We need to have this 'calculated' boolean because in some cases we really
 873 * don't know what is the duration of a syscall, for instance, when we start
 874 * a session and some threads are waiting for a syscall to finish, say 'poll',
 875 * in which case all we can do is to print "( ? ) for duration and for the
 876 * start timestamp.
 877 */
 878static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
 879{
 880        double duration = (double)t / NSEC_PER_MSEC;
 881        size_t printed = fprintf(fp, "(");
 882
 883        if (!calculated)
 884                printed += fprintf(fp, "         ");
 885        else if (duration >= 1.0)
 886                printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
 887        else if (duration >= 0.01)
 888                printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
 889        else
 890                printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
 891        return printed + fprintf(fp, "): ");
 892}
 893
 894/**
 895 * filename.ptr: The filename char pointer that will be vfs_getname'd
 896 * filename.entry_str_pos: Where to insert the string translated from
 897 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 898 * ret_scnprintf: syscall args may set this to a different syscall return
 899 *                formatter, for instance, fcntl may return fds, file flags, etc.
 900 */
struct thread_trace {
	u64               entry_time;	/* sample time of the last sys_enter; 0 = unknown */
	bool              entry_pending;	/* sys_enter printed but its sys_exit not yet seen */
	unsigned long     nr_events;	/* events attributed to this thread */
	unsigned long     pfmaj, pfmin;	/* presumably major/minor page-fault counts — TODO confirm vs TRACE_PFMAJ/PFMIN users */
	char              *entry_str;	/* formatted sys_enter line, trace__entry_str_size bytes */
	double            runtime_ms;	/* accumulated runtime — NOTE(review): updated outside this chunk, verify units */
	size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;		/* userspace pointer awaiting vfs_getname resolution */
		short int     entry_str_pos;	/* offset in entry_str where the name goes */
		bool          pending_open;
		unsigned int  namelen;
		char          *name;
	} filename;
	struct {
		int       max;		/* highest fd with a cached path, -1 = none */
		char      **table;	/* fd -> strdup'ed pathname, grown by trace__set_fd_pathname() */
	} paths;

	struct intlist *syscall_stats;	/* per-syscall-id struct stats, see thread__update_stats() */
};
 923
 924static struct thread_trace *thread_trace__new(void)
 925{
 926        struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
 927
 928        if (ttrace)
 929                ttrace->paths.max = -1;
 930
 931        ttrace->syscall_stats = intlist__new(NULL);
 932
 933        return ttrace;
 934}
 935
 936static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
 937{
 938        struct thread_trace *ttrace;
 939
 940        if (thread == NULL)
 941                goto fail;
 942
 943        if (thread__priv(thread) == NULL)
 944                thread__set_priv(thread, thread_trace__new());
 945
 946        if (thread__priv(thread) == NULL)
 947                goto fail;
 948
 949        ttrace = thread__priv(thread);
 950        ++ttrace->nr_events;
 951
 952        return ttrace;
 953fail:
 954        color_fprintf(fp, PERF_COLOR_RED,
 955                      "WARNING: not enough memory, dropping samples!\n");
 956        return NULL;
 957}
 958
 959
/*
 * Let an argument formatter (e.g. fcntl's 'cmd') install a custom formatter
 * for this syscall instance's return value on the issuing thread.  Cleared
 * before each sys_enter in syscall__scnprintf_args().
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
 967
/* Page-fault tracing selection bits — presumably or'ed into a trace option mask; users outside this chunk. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of thread_trace->entry_str, the buffer holding a formatted sys_enter line. */
static const size_t trace__entry_str_size = 2048;
 972
 973static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
 974{
 975        struct thread_trace *ttrace = thread__priv(thread);
 976
 977        if (fd > ttrace->paths.max) {
 978                char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
 979
 980                if (npath == NULL)
 981                        return -1;
 982
 983                if (ttrace->paths.max != -1) {
 984                        memset(npath + ttrace->paths.max + 1, 0,
 985                               (fd - ttrace->paths.max) * sizeof(char *));
 986                } else {
 987                        memset(npath, 0, (fd + 1) * sizeof(char *));
 988                }
 989
 990                ttrace->paths.table = npath;
 991                ttrace->paths.max   = fd;
 992        }
 993
 994        ttrace->paths.table[fd] = strdup(pathname);
 995
 996        return ttrace->paths.table[fd] != NULL ? 0 : -1;
 997}
 998
 999static int thread__read_fd_path(struct thread *thread, int fd)
1000{
1001        char linkname[PATH_MAX], pathname[PATH_MAX];
1002        struct stat st;
1003        int ret;
1004
1005        if (thread->pid_ == thread->tid) {
1006                scnprintf(linkname, sizeof(linkname),
1007                          "/proc/%d/fd/%d", thread->pid_, fd);
1008        } else {
1009                scnprintf(linkname, sizeof(linkname),
1010                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1011        }
1012
1013        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1014                return -1;
1015
1016        ret = readlink(linkname, pathname, sizeof(pathname));
1017
1018        if (ret < 0 || ret > st.st_size)
1019                return -1;
1020
1021        pathname[ret] = '\0';
1022        return trace__set_fd_pathname(thread, fd, pathname);
1023}
1024
1025static const char *thread__fd_path(struct thread *thread, int fd,
1026                                   struct trace *trace)
1027{
1028        struct thread_trace *ttrace = thread__priv(thread);
1029
1030        if (ttrace == NULL)
1031                return NULL;
1032
1033        if (fd < 0)
1034                return NULL;
1035
1036        if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1037                if (!trace->live)
1038                        return NULL;
1039                ++trace->stats.proc_getname;
1040                if (thread__read_fd_path(thread, fd))
1041                        return NULL;
1042        }
1043
1044        return ttrace->paths.table[fd];
1045}
1046
1047size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1048{
1049        int fd = arg->val;
1050        size_t printed = scnprintf(bf, size, "%d", fd);
1051        const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1052
1053        if (path)
1054                printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1055
1056        return printed;
1057}
1058
1059size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1060{
1061        size_t printed = scnprintf(bf, size, "%d", fd);
1062        struct thread *thread = machine__find_thread(trace->host, pid, pid);
1063
1064        if (thread) {
1065                const char *path = thread__fd_path(thread, fd, trace);
1066
1067                if (path)
1068                        printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1069
1070                thread__put(thread);
1071        }
1072
1073        return printed;
1074}
1075
/*
 * Format close()'s fd argument, then drop the cached pathname for it:
 * once closed, the fd number may be reused for a different file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}
1088
/*
 * Remember where in the thread's entry_str the filename should be spliced
 * once the vfs_getname tracepoint/kprobe resolves 'ptr' to a string.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	/* 'bf' points into entry_str; store the offset, not the pointer. */
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1097
1098static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1099                                              struct syscall_arg *arg)
1100{
1101        unsigned long ptr = arg->val;
1102
1103        if (!arg->trace->vfs_getname)
1104                return scnprintf(bf, size, "%#x", ptr);
1105
1106        thread__set_filename_pos(arg->thread, bf, ptr);
1107        return 0;
1108}
1109
/* 't' is in nanoseconds, duration_filter in ms: true = event is too short to show. */
static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}
1114
1115static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1116{
1117        double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1118
1119        return fprintf(fp, "%10.3f ", ts);
1120}
1121
1122/*
1123 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1124 * using ttrace->entry_time for a thread that receives a sys_exit without
1125 * first having received a sys_enter ("poll" issued before tracing session
1126 * starts, lost sys_enter exit due to ring buffer overflow).
1127 */
1128static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1129{
1130        if (tstamp > 0)
1131                return __trace__fprintf_tstamp(trace, tstamp, fp);
1132
1133        return fprintf(fp, "         ? ");
1134}
1135
/* Main-loop exit flags, set from the signal handler below. */
/* NOTE(review): not volatile sig_atomic_t — relies on single-threaded polling; confirm. */
static bool done = false;
static bool interrupted = false;

/*
 * Signal handler: request the main loop to stop; 'interrupted' records
 * whether the stop was user-initiated (SIGINT).
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1144
1145static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1146                                        u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1147{
1148        size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1149        printed += fprintf_duration(duration, duration_calculated, fp);
1150
1151        if (trace->multiple_threads) {
1152                if (trace->show_comm)
1153                        printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1154                printed += fprintf(fp, "%d ", thread->tid);
1155        }
1156
1157        return printed;
1158}
1159
1160static int trace__process_event(struct trace *trace, struct machine *machine,
1161                                union perf_event *event, struct perf_sample *sample)
1162{
1163        int ret = 0;
1164
1165        switch (event->header.type) {
1166        case PERF_RECORD_LOST:
1167                color_fprintf(trace->output, PERF_COLOR_RED,
1168                              "LOST %" PRIu64 " events!\n", event->lost.lost);
1169                ret = machine__process_lost_event(machine, event, sample);
1170                break;
1171        default:
1172                ret = machine__process_event(machine, event, sample);
1173                break;
1174        }
1175
1176        return ret;
1177}
1178
/*
 * perf_tool callback adapter: recover the enclosing struct trace from the
 * embedded tool and forward to trace__process_event().
 */
static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}
1187
/*
 * Kernel-address resolver wrapper: when kptr_restrict hides kernel symbols,
 * warn once per machine and resolve nothing instead of producing garbage.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	/* Already warned for this machine: stay silent, keep failing. */
	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1205
/*
 * Set up symbol resolution for the live host machine and synthesize
 * existing threads from /proc so samples for already-running processes
 * resolve.  On any failure the symbol subsystem is torn down again.
 * Returns 0 on success, negative errno-style value otherwise.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	/* Undo symbol__init() on any failure path. */
	if (err)
		symbol__exit();

	return err;
}
1230
/* Tear down what trace__symbols_init() set up, in reverse order. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1238
1239static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1240{
1241        int idx;
1242
1243        if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1244                nr_args = sc->fmt->nr_args;
1245
1246        sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1247        if (sc->arg_fmt == NULL)
1248                return -1;
1249
1250        for (idx = 0; idx < nr_args; ++idx) {
1251                if (sc->fmt)
1252                        sc->arg_fmt[idx] = sc->fmt->arg[idx];
1253        }
1254
1255        sc->nr_args = nr_args;
1256        return 0;
1257}
1258
/*
 * Pick a default formatter for each argument that the override table left
 * unset, using heuristics on the tracepoint field's type and name.
 * The checks are ordered: filename-like strings, then pointers, then
 * typed values, then "*fd"-named integers.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* An explicit formatter from syscall_fmts[] wins. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1297
/*
 * Lazily fill trace->syscalls.table[id]: look up the name, grow the table
 * if needed, find the formatting overrides and the sys_enter tracepoint
 * format (falling back to the alias, e.g. "pread" -> "pread64"), then set
 * up per-argument formatters.  Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table up to 'id', zeroing the newly added entries. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation: zero everything. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* No tracepoint under the canonical name? Retry with the alias. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Allocate arg_fmt even on format failure (6-arg fallback) so raw args print. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	return syscall__set_arg_fmts(sc);
}
1359
/*
 * Translate the user-supplied syscall name list (trace->ev_qualifier) into
 * an array of syscall ids in trace->ev_qualifier_ids.  Each name may be an
 * exact syscall name or a glob that expands to several ids (the array is
 * grown on demand for the extra matches).  Unknown names are collected into
 * one error message.  Returns 0 on success, negative on error (the ids
 * array is freed in that case).
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name: try it as a glob. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* Unknown: start (or extend) the combined error message. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		/* match_next stays -1 for exact matches: no glob expansion needed. */
		if (match_next == -1)
			continue;

		/* Collect the remaining glob matches, growing the array as needed. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		/* Any error invalidates the whole ids array. */
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1435
1436/*
1437 * args is to be interpreted as a series of longs but we need to handle
1438 * 8-byte unaligned accesses. args points to raw_data within the event
1439 * and raw_data is guaranteed to be 8-byte unaligned because it is
1440 * preceded by raw_size which is a u32. So we need to copy args to a temp
1441 * variable to read it. Most notably this avoids extended load instructions
1442 * on unaligned addresses
1443 */
1444unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1445{
1446        unsigned long val;
1447        unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1448
1449        memcpy(&val, p, sizeof(val));
1450        return val;
1451}
1452
1453static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1454                                      struct syscall_arg *arg)
1455{
1456        if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1457                return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1458
1459        return scnprintf(bf, size, "arg%d: ", arg->idx);
1460}
1461
/*
 * Format one argument value: use the argument's registered formatter (and
 * its optional 'parm' cookie) when there is one, otherwise fall back to a
 * plain decimal long.
 */
static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}
1473
/*
 * Format all of a syscall's arguments into 'bf'.  When the tracepoint
 * format was read, iterate its fields by name; otherwise fall back to
 * printing the raw argument slots.  Returns the number of characters
 * written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* walks in lockstep with arg.idx; arg.mask bits suppress args */
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A formatter may have consumed this arg already. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1546
/* Per-evsel sample handler signature, stashed in the evsel's handler slot. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1550
/*
 * Return the (lazily initialized) syscall descriptor for 'id', or NULL
 * when the id is invalid or its info can't be read — in which case a
 * diagnostic is printed at sufficient verbosity.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* First sighting of this id: read its info; recheck, since that can fail. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1593
1594static void thread__update_stats(struct thread_trace *ttrace,
1595                                 int id, struct perf_sample *sample)
1596{
1597        struct int_node *inode;
1598        struct stats *stats;
1599        u64 duration = 0;
1600
1601        inode = intlist__findnew(ttrace->syscall_stats, id);
1602        if (inode == NULL)
1603                return;
1604
1605        stats = inode->priv;
1606        if (stats == NULL) {
1607                stats = malloc(sizeof(struct stats));
1608                if (stats == NULL)
1609                        return;
1610                init_stats(stats);
1611                inode->priv = stats;
1612        }
1613
1614        if (ttrace->entry_time && sample->time > ttrace->entry_time)
1615                duration = sample->time - ttrace->entry_time;
1616
1617        update_stats(stats, duration);
1618}
1619
1620static int trace__printf_interrupted_entry(struct trace *trace)
1621{
1622        struct thread_trace *ttrace;
1623        size_t printed;
1624
1625        if (trace->failure_only || trace->current == NULL)
1626                return 0;
1627
1628        ttrace = thread__priv(trace->current);
1629
1630        if (!ttrace->entry_pending)
1631                return 0;
1632
1633        printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1634        printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1635        ttrace->entry_pending = false;
1636
1637        return printed;
1638}
1639
1640static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1641                                 struct perf_sample *sample, struct thread *thread)
1642{
1643        int printed = 0;
1644
1645        if (trace->print_sample) {
1646                double ts = (double)sample->time / NSEC_PER_MSEC;
1647
1648                printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1649                                   perf_evsel__name(evsel), ts,
1650                                   thread__comm_str(thread),
1651                                   sample->pid, sample->tid, sample->cpu);
1652        }
1653
1654        return printed;
1655}
1656
/*
 * raw_syscalls:sys_enter handler: formats "name(args" into the per-thread
 * entry_str buffer.  The closing paren and return value are normally
 * printed later, by trace__sys_exit(); syscalls that never return
 * (sc->is_exit, e.g. exit/exit_group) are printed in full right away.
 * Returns 0 on success, -1 on failure to resolve the syscall or thread.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread formatting buffer. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/*
	 * Flush any pending entry from another thread, unless filtering
	 * options mean interleaved output isn't being shown anyway.
	 */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Remember who is mid-syscall for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1717
/*
 * Beautify and print just the argument list of a sys_enter-style sample
 * (used for the augmented syscalls BPF output and sys_enter_* tracepoints
 * in trace__event_handler()).  Returns 0 on success, -1 when the syscall
 * or per-thread state can't be resolved, letting the caller fall back to
 * plain tracepoint formatting.
 */
static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
	fprintf(trace->output, "%s", msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1748
1749static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1750                                    struct perf_sample *sample,
1751                                    struct callchain_cursor *cursor)
1752{
1753        struct addr_location al;
1754        int max_stack = evsel->attr.sample_max_stack ?
1755                        evsel->attr.sample_max_stack :
1756                        trace->max_stack;
1757
1758        if (machine__resolve(trace->host, &al, sample) < 0 ||
1759            thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1760                return -1;
1761
1762        return 0;
1763}
1764
1765static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1766{
1767        /* TODO: user-configurable print_opts */
1768        const unsigned int print_opts = EVSEL__PRINT_SYM |
1769                                        EVSEL__PRINT_DSO |
1770                                        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1771
1772        return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1773}
1774
/*
 * Translate a positive errno value into its symbolic name ("ENOENT", ...)
 * using the arch of the perf.data file/env, not the running machine.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	return arch_syscalls__strerrno(perf_env__arch(perf_evsel__env(evsel)), err);
}
1782
/*
 * raw_syscalls:sys_exit handler: pairs up with the pending entry formatted
 * by trace__sys_enter() (or prints a "continued" marker if the entry was
 * already flushed), then pretty-prints the return value according to the
 * syscall's fmt: errno name, pid with comm, hex, timeout or plain signed.
 * Also feeds --summary stats and the vfs_getname fd->path association.
 * Returns 0 on success, -1 on failure to resolve the syscall or thread.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* open()-like syscall succeeded: bind the fd to the vfs_getname path. */
	if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below the --min-stack threshold: suppress this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was flushed earlier by trace__printf_interrupted_entry(). */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Note the cross-branch gotos: syscalls without a fmt share the
	 * errno_print/signed_print paths of the fmt-driven branches below.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot return beautifier installed by an arg beautifier. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (e.g. wait4, fork): show its comm. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1898
/*
 * probe:vfs_getname handler: caches the pathname being resolved so that
 * trace__sys_exit() can associate it with the returned fd, and splices it
 * into the pending entry_str at the position recorded by the filename
 * beautifier (ttrace->filename.ptr/entry_str_pos), truncating the head of
 * the path if the buffer can't hold all of it.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the cached name buffer if this path is the longest seen. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No beautifier asked for this name to be spliced into entry_str. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the path: it is the most informative part. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at entry_str_pos and drop the pathname into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1959
/*
 * sched:sched_stat_runtime handler: accumulate on-CPU runtime per thread
 * and globally for the summary.  If per-thread state can't be obtained,
 * dump the raw tracepoint payload instead.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	/* NOTE(review): format string has an unbalanced trailing ')' — cosmetic. */
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1989
1990static int bpf_output__printer(enum binary_printer_ops op,
1991                               unsigned int val, void *extra __maybe_unused, FILE *fp)
1992{
1993        unsigned char ch = (unsigned char)val;
1994
1995        switch (op) {
1996        case BINARY_PRINT_CHAR_DATA:
1997                return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1998        case BINARY_PRINT_DATA_BEGIN:
1999        case BINARY_PRINT_LINE_BEGIN:
2000        case BINARY_PRINT_ADDR:
2001        case BINARY_PRINT_NUM_DATA:
2002        case BINARY_PRINT_NUM_PAD:
2003        case BINARY_PRINT_SEP:
2004        case BINARY_PRINT_CHAR_PAD:
2005        case BINARY_PRINT_LINE_END:
2006        case BINARY_PRINT_DATA_END:
2007        default:
2008                break;
2009        }
2010
2011        return 0;
2012}
2013
/* Dump a BPF output event's raw payload, 8 bytes per line, to trace->output. */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
2020
/*
 * Generic handler for non-syscall events (--event): prints timestamp and
 * event name, then either the beautified augmented-syscalls payload, a
 * BPF output hexdump, or the libtraceevent-formatted tracepoint fields.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below the --min-stack threshold: suppress this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with syscall lines, which show a duration. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		if (evsel == trace->syscalls.events.augmented)
			trace__fprintf_sys_enter(trace, evsel, sample);
		else
			bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		/*
		 * Try the syscall beautifiers for sys_enter_* tracepoints,
		 * falling back to plain tracepoint formatting on failure.
		 */
		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
		    trace__fprintf_sys_enter(trace, evsel, sample)) {
			event_format__fprintf(evsel->tp_format, sample->cpu,
					      sample->raw_data, sample->raw_size,
					      trace->output);
		}
	}

	fprintf(trace->output, "\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
2067
2068static void print_location(FILE *f, struct perf_sample *sample,
2069                           struct addr_location *al,
2070                           bool print_dso, bool print_sym)
2071{
2072
2073        if ((verbose > 0 || print_dso) && al->map)
2074                fprintf(f, "%s@", al->map->dso->long_name);
2075
2076        if ((verbose > 0 || print_sym) && al->sym)
2077                fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2078                        al->addr - al->sym->start);
2079        else if (al->map)
2080                fprintf(f, "0x%" PRIx64, al->addr);
2081        else
2082                fprintf(f, "0x%" PRIx64, sample->addr);
2083}
2084
2085static int trace__pgfault(struct trace *trace,
2086                          struct perf_evsel *evsel,
2087                          union perf_event *event __maybe_unused,
2088                          struct perf_sample *sample)
2089{
2090        struct thread *thread;
2091        struct addr_location al;
2092        char map_type = 'd';
2093        struct thread_trace *ttrace;
2094        int err = -1;
2095        int callchain_ret = 0;
2096
2097        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098
2099        if (sample->callchain) {
2100                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2101                if (callchain_ret == 0) {
2102                        if (callchain_cursor.nr < trace->min_stack)
2103                                goto out_put;
2104                        callchain_ret = 1;
2105                }
2106        }
2107
2108        ttrace = thread__trace(thread, trace->output);
2109        if (ttrace == NULL)
2110                goto out_put;
2111
2112        if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2113                ttrace->pfmaj++;
2114        else
2115                ttrace->pfmin++;
2116
2117        if (trace->summary_only)
2118                goto out;
2119
2120        thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2121
2122        trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2123
2124        fprintf(trace->output, "%sfault [",
2125                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2126                "maj" : "min");
2127
2128        print_location(trace->output, sample, &al, false, true);
2129
2130        fprintf(trace->output, "] => ");
2131
2132        thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2133
2134        if (!al.map) {
2135                thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2136
2137                if (al.map)
2138                        map_type = 'x';
2139                else
2140                        map_type = '?';
2141        }
2142
2143        print_location(trace->output, sample, &al, true, false);
2144
2145        fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2146
2147        if (callchain_ret > 0)
2148                trace__fprintf_callchain(trace, sample);
2149        else if (callchain_ret < 0)
2150                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2151out:
2152        err = 0;
2153out_put:
2154        thread__put(thread);
2155        return err;
2156}
2157
2158static void trace__set_base_time(struct trace *trace,
2159                                 struct perf_evsel *evsel,
2160                                 struct perf_sample *sample)
2161{
2162        /*
2163         * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2164         * and don't use sample->time unconditionally, we may end up having
2165         * some other event in the future without PERF_SAMPLE_TIME for good
2166         * reason, i.e. we may not be interested in its timestamps, just in
2167         * it taking place, picking some piece of information when it
2168         * appears in our event stream (vfs_getname comes to mind).
2169         */
2170        if (trace->base_time == 0 && !trace->full_time &&
2171            (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2172                trace->base_time = sample->time;
2173}
2174
2175static int trace__process_sample(struct perf_tool *tool,
2176                                 union perf_event *event,
2177                                 struct perf_sample *sample,
2178                                 struct perf_evsel *evsel,
2179                                 struct machine *machine __maybe_unused)
2180{
2181        struct trace *trace = container_of(tool, struct trace, tool);
2182        struct thread *thread;
2183        int err = 0;
2184
2185        tracepoint_handler handler = evsel->handler;
2186
2187        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2188        if (thread && thread__is_filtered(thread))
2189                goto out;
2190
2191        trace__set_base_time(trace, evsel, sample);
2192
2193        if (handler) {
2194                ++trace->nr_events;
2195                handler(trace, evsel, event, sample);
2196        }
2197out:
2198        thread__put(thread);
2199        return err;
2200}
2201
/*
 * 'perf trace record': build an argv for 'perf record' with the raw
 * syscall and/or pagefault events this trace session needs prepended,
 * then hand it off to cmd_record().  Returns cmd_record()'s result,
 * -ENOMEM on allocation failure, or -1 if no syscall tracepoints exist.
 */
static int trace__record(struct trace *trace, int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};

	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);

	/* +1 is for the event string below */
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
		majpf_args_nr + minpf_args_nr + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	j = 0;
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[j++] = record_args[i];

	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			free(rec_argv);
			return -1;
		}
	}

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	/* Finally the user-supplied arguments (workload, options). */
	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	return cmd_record(j, rec_argv);
}
2261
/* Forward declaration: the definition appears later in this file. */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2263
2264static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2265{
2266        struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2267
2268        if (IS_ERR(evsel))
2269                return false;
2270
2271        if (perf_evsel__field(evsel, "pathname") == NULL) {
2272                perf_evsel__delete(evsel);
2273                return false;
2274        }
2275
2276        evsel->handler = trace__vfs_getname;
2277        perf_evlist__add(evlist, evsel);
2278        return true;
2279}
2280
2281static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2282{
2283        struct perf_evsel *evsel;
2284        struct perf_event_attr attr = {
2285                .type = PERF_TYPE_SOFTWARE,
2286                .mmap_data = 1,
2287        };
2288
2289        attr.config = config;
2290        attr.sample_period = 1;
2291
2292        event_attr_init(&attr);
2293
2294        evsel = perf_evsel__new(&attr);
2295        if (evsel)
2296                evsel->handler = trace__pgfault;
2297
2298        return evsel;
2299}
2300
/*
 * Live-mode dispatcher: route non-sample events to the generic machinery
 * and samples to the per-evsel handler, discarding tracepoint samples
 * that arrived without a payload.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2329
/*
 * Set up the raw_syscalls:sys_enter/sys_exit tracepoint pair, resolve the
 * tracepoint fields used by the fast accessors, configure callchains and
 * add both to the evlist.  Returns 0 on success, -1 on failure (with any
 * partially created evsels deleted).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2378
/*
 * Turn the -e syscall list (trace->ev_qualifier_ids) into an "id in/not in
 * (...)" tracepoint filter and append it to both sys_enter and sys_exit.
 * Returns 0 on success, -1 on failure (errno set to ENOMEM if building
 * the filter string failed).
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* Only filter sys_exit if attaching the filter to sys_enter worked. */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2403
/*
 * Build the default pid filter used when no explicit --filter-pids/-p was
 * given: always filter out 'perf trace' itself (pids[0] == getpid()), and,
 * if an "sshd" ancestor is found in the parent chain, filter that one too.
 * NOTE(review): the sshd case presumably breaks the feedback loop where
 * tracing the ssh session generates more ssh traffic to trace -- inferred
 * from the comm comparison below, confirm against the perf changelog.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	/* Walk up the parent chain looking for an "sshd" ancestor. */
	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	/* Ask the kernel to drop samples for these pids on every evsel. */
	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2427
/*
 * Live mode: add the requested events (raw syscalls, vfs_getname, page
 * faults, sched_stat_runtime) to trace->evlist, optionally fork the
 * workload given on the command line, then consume the mmap ring buffers
 * until the workload exits or the user interrupts, pretty-printing each
 * event as it arrives.  Returns 0 on success, negative on error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0; /* leftover argv == workload to fork */
	bool draining = false;

	trace->live = true;

	/* The strace-like core: raw_syscalls:sys_{enter,exit} tracepoints. */
	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	/* Optional software page fault events (-F/--pf maj/min/all). */
	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 *	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	/* Restrict the raw_syscalls tracepoints to the -e syscall list. */
	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	/* With --delay, enabling is postponed until after the delay below. */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitely set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
	/* Main loop: drain every mmap ring, then poll until more arrives. */
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/*
			 * Workload exited (SIGCHLD): stop producing new
			 * events but keep draining what is already queued.
			 */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		/* Nothing new this pass: short poll if finishing, else block. */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/* Error labels below share an errbuf scoped to this bare block. */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2704
/*
 * Replay mode (-i perf.data): open a previously recorded session, wire up
 * the tool callbacks and the raw_syscalls (or legacy syscalls) tracepoint
 * handlers, process every event through the same pretty printers as live
 * mode, and optionally print the summary.  Returns 0 or negative error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	/* Route each record type to the stock perf processing helpers. */
	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	/* Restrict symbol resolution to the requested pids/tids, if any. */
	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Any software page fault event in the file gets the pgfault printer. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2805
/* Emit the banner that precedes the per-thread summary table. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2814
/*
 * Resort a thread's syscall stats (an intlist keyed by syscall id, whose
 * priv is a struct stats) into an rb tree keyed by total time spent; the
 * comparison is a->msecs > b->msecs (see rb_resort.h for how the macro
 * turns that into the traversal order).
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = number of calls * mean duration, scaled ns -> msec */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2828
/*
 * Print one thread's per-syscall statistics table (calls, total, min, avg,
 * max in msec, and stddev as a percentage of the mean), one row per
 * syscall, using the syscall_stats resort rb defined above for ordering.
 * Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats keep nanoseconds; convert to msec for display */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev expressed as a percentage of the mean */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2871
2872static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2873{
2874        size_t printed = 0;
2875        struct thread_trace *ttrace = thread__priv(thread);
2876        double ratio;
2877
2878        if (ttrace == NULL)
2879                return 0;
2880
2881        ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2882
2883        printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2884        printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2885        printed += fprintf(fp, "%.1f%%", ratio);
2886        if (ttrace->pfmaj)
2887                printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2888        if (ttrace->pfmin)
2889                printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2890        if (trace->sched)
2891                printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2892        else if (fputc('\n', fp) != EOF)
2893                ++printed;
2894
2895        printed += thread__dump_stats(ttrace, trace, fp);
2896
2897        return printed;
2898}
2899
2900static unsigned long thread__nr_events(struct thread_trace *ttrace)
2901{
2902        return ttrace ? ttrace->nr_events : 0;
2903}
2904
/*
 * Resort a machine's threads into an rb tree using the number of trace
 * events each thread generated as the comparison key, so the summary can
 * be printed in event-count order.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2911
/*
 * Print the end-of-run summary: header plus one trace__fprintf_thread()
 * entry per traced thread, walking every bucket of the machine's thread
 * hash table and resorting each bucket by event count via the 'threads'
 * resort rb defined above.  Returns characters printed (0 on sort error).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2933
2934static int trace__set_duration(const struct option *opt, const char *str,
2935                               int unset __maybe_unused)
2936{
2937        struct trace *trace = opt->value;
2938
2939        trace->duration_filter = atof(str);
2940        return 0;
2941}
2942
2943static int trace__set_filter_pids(const struct option *opt, const char *str,
2944                                  int unset __maybe_unused)
2945{
2946        int ret = -1;
2947        size_t i;
2948        struct trace *trace = opt->value;
2949        /*
2950         * FIXME: introduce a intarray class, plain parse csv and create a
2951         * { int nr, int entries[] } struct...
2952         */
2953        struct intlist *list = intlist__new(str);
2954
2955        if (list == NULL)
2956                return -1;
2957
2958        i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2959        trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2960
2961        if (trace->filter_pids.entries == NULL)
2962                goto out;
2963
2964        trace->filter_pids.entries[0] = getpid();
2965
2966        for (i = 1; i < trace->filter_pids.nr; ++i)
2967                trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2968
2969        intlist__delete(list);
2970        ret = 0;
2971out:
2972        return ret;
2973}
2974
2975static int trace__open_output(struct trace *trace, const char *filename)
2976{
2977        struct stat st;
2978
2979        if (!stat(filename, &st) && st.st_size) {
2980                char oldname[PATH_MAX];
2981
2982                scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2983                unlink(oldname);
2984                rename(filename, oldname);
2985        }
2986
2987        trace->output = fopen(filename, "w");
2988
2989        return trace->output == NULL ? -errno : 0;
2990}
2991
2992static int parse_pagefaults(const struct option *opt, const char *str,
2993                            int unset __maybe_unused)
2994{
2995        int *trace_pgfaults = opt->value;
2996
2997        if (strcmp(str, "all") == 0)
2998                *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2999        else if (strcmp(str, "maj") == 0)
3000                *trace_pgfaults |= TRACE_PFMAJ;
3001        else if (strcmp(str, "min") == 0)
3002                *trace_pgfaults |= TRACE_PFMIN;
3003        else
3004                return -1;
3005
3006        return 0;
3007}
3008
3009static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3010{
3011        struct perf_evsel *evsel;
3012
3013        evlist__for_each_entry(evlist, evsel)
3014                evsel->handler = handler;
3015}
3016
/*
 * For every "syscalls:" tracepoint evsel that doesn't yet have a private
 * syscall_tp, set one up and initialize the accessor for its payload:
 * the args pointer for sys_enter_* events, the return value for
 * sys_exit_* events.  Both live right after the u64 common id field.
 * Returns 0 on success, -1 on the first initialization failure.
 */
static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		/* Skip evsels already set up or without tracepoint format. */
		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls"))
			continue;

		if (perf_evsel__init_syscall_tp(evsel))
			return -1;

		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
			struct syscall_tp *sc = evsel->priv;

			/* args start right after the u64 syscall id field */
			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;
		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
			struct syscall_tp *sc = evsel->priv;

			/* return value likewise follows the u64 id field */
			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;
		}
	}

	return 0;
}
3046
3047/*
3048 * XXX: Hackish, just splitting the combined -e+--event (syscalls
3049 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3050 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3051 *
3052 * It'd be better to introduce a parse_options() variant that would return a
3053 * list with the terms it didn't match to an event...
3054 */
/*
 * -e/--event/--expr callback: split the comma-separated list in two:
 * lists[1] gets the terms that name a syscall (exact or glob match in the
 * syscall table) or a readable strace-groups file, and becomes the event
 * qualifier; lists[0] gets everything else and is handed to the regular
 * parse_events_option().  A leading '!' negates the qualifier.
 * NOTE(review): lists[0]/lists[1] are not freed -- one-shot option
 * parsing, the allocations live for the duration of the command.
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		/* Temporarily terminate the current term at the comma. */
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		/* list 1 == syscall/strace-group term, list 0 == perf event. */
		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		/* Append to the chosen list; len bounds the combined length. */
		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		/* Restore the comma and move to the next term. */
		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	if (lists[0]) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/* str was split in place; undo the last '\0' before returning. */
	if (sep)
		*sep = ',';

	return err;
}
3133
3134static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3135{
3136        struct trace *trace = opt->value;
3137
3138        if (!list_empty(&trace->evlist->entries))
3139                return parse_cgroups(opt, str, unset);
3140
3141        trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3142
3143        return 0;
3144}
3145
3146int cmd_trace(int argc, const char **argv)
3147{
3148        const char *trace_usage[] = {
3149                "perf trace [<options>] [<command>]",
3150                "perf trace [<options>] -- <command> [<options>]",
3151                "perf trace record [<options>] [<command>]",
3152                "perf trace record [<options>] -- <command> [<options>]",
3153                NULL
3154        };
3155        struct trace trace = {
3156                .syscalls = {
3157                        . max = -1,
3158                },
3159                .opts = {
3160                        .target = {
3161                                .uid       = UINT_MAX,
3162                                .uses_mmap = true,
3163                        },
3164                        .user_freq     = UINT_MAX,
3165                        .user_interval = ULLONG_MAX,
3166                        .no_buffering  = true,
3167                        .mmap_pages    = UINT_MAX,
3168                        .proc_map_timeout  = 500,
3169                },
3170                .output = stderr,
3171                .show_comm = true,
3172                .trace_syscalls = false,
3173                .kernel_syscallchains = false,
3174                .max_stack = UINT_MAX,
3175        };
3176        const char *output_name = NULL;
/*
 * Tail of cmd_trace() (entry point of 'perf trace'): the command-line
 * option table, option parsing, BPF helper setup, target validation and
 * the final dispatch to record / replay / live tracing.  The head of the
 * function — including the declarations of 'trace', 'trace_usage',
 * 'output_name' and 'input_name' — precedes this excerpt.
 */
3177        const struct option trace_options[] = {
3178        OPT_CALLBACK('e', "event", &trace, "event",
3179                     "event/syscall selector. use 'perf list' to list available events",
3180                     trace__parse_events_option),
3181        OPT_BOOLEAN(0, "comm", &trace.show_comm,
3182                    "show the thread COMM next to its id"),
3183        OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3184        OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3185                     trace__parse_events_option),
3186        OPT_STRING('o', "output", &output_name, "file", "output file name"),
3187        OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3188        OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3189                    "trace events on existing process id"),
3190        OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3191                    "trace events on existing thread id"),
3192        OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3193                     "pids to filter (by the kernel)", trace__set_filter_pids),
3194        OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3195                    "system-wide collection from all CPUs"),
3196        OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3197                    "list of cpus to monitor"),
3198        OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3199                    "child tasks do not inherit counters"),
3200        OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3201                     "number of mmap data pages",
3202                     perf_evlist__parse_mmap_pages),
3203        OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3204                   "user to profile"),
3205        OPT_CALLBACK(0, "duration", &trace, "float",
3206                     "show only events with duration > N.M ms",
3207                     trace__set_duration),
3208        OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3209        OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3210        OPT_BOOLEAN('T', "time", &trace.full_time,
3211                    "Show full timestamp, not time relative to first start"),
3212        OPT_BOOLEAN(0, "failure", &trace.failure_only,
3213                    "Show only syscalls that failed"),
3214        OPT_BOOLEAN('s', "summary", &trace.summary_only,
3215                    "Show only syscall summary with statistics"),
3216        OPT_BOOLEAN('S', "with-summary", &trace.summary,
3217                    "Show all syscalls and summary with statistics"),
3218        OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3219                     "Trace pagefaults", parse_pagefaults, "maj"),
3220        OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3221        OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3222        OPT_CALLBACK(0, "call-graph", &trace.opts,
3223                     "record_mode[,record_size]", record_callchain_help,
3224                     &record_parse_callchain_opt),
3225        OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3226                    "Show the kernel callchains on the syscall exit path"),
3227        OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3228                     "Set the minimum stack depth when parsing the callchain, "
3229                     "anything below the specified depth will be ignored."),
3230        OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3231                     "Set the maximum stack depth when parsing the callchain, "
3232                     "anything beyond the specified depth will be ignored. "
3233                     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3234        OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3235                        "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3236        OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3237                        "per thread proc mmap processing timeout in ms"),
3238        OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3239                     trace__parse_cgroups),
3240        OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3241                     "ms to wait before starting measurement after program "
3242                     "start"),
3243        OPT_END()
3244        };
            /*
             * "user set" flags start true and are cleared below when the
             * corresponding option still holds its UINT_MAX sentinel, i.e.
             * the user did not pass --mmap-pages / --max-stack.
             */
3245        bool __maybe_unused max_stack_user_set = true;
3246        bool mmap_pages_user_set = true;
3247        struct perf_evsel *evsel;
3248        const char * const trace_subcommands[] = { "record", NULL };
3249        int err = -1;
3250        char bf[BUFSIZ];           /* scratch buffer for strerror-style messages */
3251
            /* Dump a backtrace instead of dying silently on crashes. */
3252        signal(SIGSEGV, sighandler_dump_stack);
3253        signal(SIGFPE, sighandler_dump_stack);
3254
3255        trace.evlist = perf_evlist__new();
3256        trace.sctbl = syscalltbl__new();
3257
3258        if (trace.evlist == NULL || trace.sctbl == NULL) {
3259                pr_err("Not enough memory to run!\n");
3260                err = -ENOMEM;
3261                goto out;
3262        }
3263
            /*
             * Stop at the first non-option so a trailing workload command
             * line (or the "record" subcommand) is left in argv for later.
             */
3264        argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3265                                 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3266
3267        if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3268                usage_with_options_msg(trace_usage, trace_options,
3269                                       "cgroup monitoring only available in system-wide mode");
3270        }
3271
            /*
             * Hook up the "__augmented_syscalls__" BPF output event, if one
             * was set up (presumably by an eBPF program passed via -e; the
             * NULL return below is the "not present" case — confirm against
             * bpf__setup_output_event()).  An ERR_PTR return is fatal.
             */
3272        evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
3273        if (IS_ERR(evsel)) {
3274                bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
3275                pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
3276                goto out;
3277        }
3278
3279        if (evsel) {
3280                if (perf_evsel__init_augmented_syscall_tp(evsel) ||
3281                    perf_evsel__init_augmented_syscall_tp_args(evsel))
3282                        goto out;
3283                trace.syscalls.events.augmented = evsel;
3284        }
3285
3286        err = bpf__setup_stdout(trace.evlist);
3287        if (err) {
3288                bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3289                pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3290                goto out;
3291        }
3292
            /* Reset to the generic failure code for the paths below. */
3293        err = -1;
3294
            /* Page fault tracing needs the faulting address and a timestamp. */
3295        if (trace.trace_pgfaults) {
3296                trace.opts.sample_address = true;
3297                trace.opts.sample_time = true;
3298        }
3299
3300        if (trace.opts.mmap_pages == UINT_MAX)
3301                mmap_pages_user_set = false;
3302
            /*
             * No --max-stack given: use the kernel's current limit for live
             * tracing, but the compiled-in maximum when replaying a file
             * (the recording machine's sysctl is unknown here).
             */
3303        if (trace.max_stack == UINT_MAX) {
3304                trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3305                max_stack_user_set = false;
3306        }
3307
            /*
             * --min-stack/--max-stack imply callchains; if --call-graph was
             * not given, default to DWARF unwinding when available.
             */
3308#ifdef HAVE_DWARF_UNWIND_SUPPORT
3309        if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3310                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3311        }
3312#endif
3313
3314        if (callchain_param.enabled) {
                    /*
                     * Callchains need bigger buffers; as root, grow the ring
                     * buffer to 4x the mlock limit unless the user already
                     * chose a size with --mmap-pages.
                     */
3315                if (!mmap_pages_user_set && geteuid() == 0)
3316                        trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3317
3318                symbol_conf.use_callchain = true;
3319        }
3320
            /* Events given via -e/--event get the generic handler installed. */
3321        if (trace.evlist->nr_entries > 0) {
3322                evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3323                if (evlist__set_syscall_tp_fields(trace.evlist)) {
3324                        perror("failed to set syscalls:* tracepoint fields");
3325                        goto out;
3326                }
3327        }
3328
            /* 'perf trace record ...' delegates to 'perf record' and returns. */
3329        if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3330                return trace__record(&trace, argc-1, &argv[1]);
3331
3332        /* summary_only implies summary option, but don't overwrite summary if set */
3333        if (trace.summary_only)
3334                trace.summary = trace.summary_only;
3335
            /* Nothing selected at all: default to tracing raw syscalls. */
3336        if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3337            trace.evlist->nr_entries == 0 /* Was --events used? */) {
3338                trace.trace_syscalls = true;
3339        }
3340
3341        if (output_name != NULL) {
3342                err = trace__open_output(&trace, output_name);
3343                if (err < 0) {
3344                        perror("failed to create output file");
3345                        goto out;
3346                }
3347        }
3348
3349        err = target__validate(&trace.opts.target);
3350        if (err) {
3351                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3352                fprintf(trace.output, "%s", bf);
3353                goto out_close;
3354        }
3355
3356        err = target__parse_uid(&trace.opts.target);
3357        if (err) {
3358                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3359                fprintf(trace.output, "%s", bf);
3360                goto out_close;
3361        }
3362
            /* No workload and no pid/tid/cpu/uid target: trace the whole system. */
3363        if (!argc && target__none(&trace.opts.target))
3364                trace.opts.target.system_wide = true;
3365
            /* -i replays a recorded perf.data file; otherwise trace live. */
3366        if (input_name)
3367                err = trace__replay(&trace);
3368        else
3369                err = trace__run(&trace, argc, argv);
3370
3371out_close:
            /* Only close trace.output when we fopen()ed it above via -o. */
3372        if (output_name != NULL)
3373                fclose(trace.output);
3374out:
3375        return err;
3376}
3377