linux/tools/perf/builtin-trace.c
   1/*
   2 * builtin-trace.c
   3 *
   4 * Builtin 'trace' command:
   5 *
   6 * Display a continuously updated trace of any workload, CPU, specific PID,
   7 * system wide, etc.  Default format is loosely strace like, but any other
   8 * event may be specified using --event.
   9 *
  10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  11 *
  12 * Initially based on the 'trace' prototype by Thomas Gleixner:
  13 *
  14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
  15 *
  16 * Released under the GPL v2. (and only v2, not any later version)
  17 */
  18
  19#include <traceevent/event-parse.h>
  20#include <api/fs/tracing_path.h>
  21#include "builtin.h"
  22#include "util/color.h"
  23#include "util/debug.h"
  24#include "util/evlist.h"
  25#include <subcmd/exec-cmd.h>
  26#include "util/machine.h"
  27#include "util/session.h"
  28#include "util/thread.h"
  29#include <subcmd/parse-options.h>
  30#include "util/strlist.h"
  31#include "util/intlist.h"
  32#include "util/thread_map.h"
  33#include "util/stat.h"
  34#include "trace-event.h"
  35#include "util/parse-events.h"
  36#include "util/bpf-loader.h"
  37#include "callchain.h"
  38#include "syscalltbl.h"
  39#include "rb_resort.h"
  40
  41#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
  42#include <stdlib.h>
  43#include <linux/err.h>
  44#include <linux/filter.h>
  45#include <linux/audit.h>
  46#include <linux/random.h>
  47#include <linux/stringify.h>
  48
  49#ifndef O_CLOEXEC
  50# define O_CLOEXEC              02000000
  51#endif
  52
  53struct trace {
  54        struct perf_tool        tool;
  55        struct syscalltbl       *sctbl;
  56        struct {
  57                int             max;
  58                struct syscall  *table;
  59                struct {
  60                        struct perf_evsel *sys_enter,
  61                                          *sys_exit;
  62                }               events;
  63        } syscalls;
  64        struct record_opts      opts;
  65        struct perf_evlist      *evlist;
  66        struct machine          *host;
  67        struct thread           *current;
  68        u64                     base_time;
  69        FILE                    *output;
  70        unsigned long           nr_events;
  71        struct strlist          *ev_qualifier;
  72        struct {
  73                size_t          nr;
  74                int             *entries;
  75        }                       ev_qualifier_ids;
  76        struct intlist          *tid_list;
  77        struct intlist          *pid_list;
  78        struct {
  79                size_t          nr;
  80                pid_t           *entries;
  81        }                       filter_pids;
  82        double                  duration_filter;
  83        double                  runtime_ms;
  84        struct {
  85                u64             vfs_getname,
  86                                proc_getname;
  87        } stats;
  88        unsigned int            max_stack;
  89        unsigned int            min_stack;
  90        bool                    not_ev_qualifier;
  91        bool                    live;
  92        bool                    full_time;
  93        bool                    sched;
  94        bool                    multiple_threads;
  95        bool                    summary;
  96        bool                    summary_only;
  97        bool                    show_comm;
  98        bool                    show_tool_stats;
  99        bool                    trace_syscalls;
 100        bool                    kernel_syscallchains;
 101        bool                    force;
 102        bool                    vfs_getname;
 103        int                     trace_pgfaults;
 104        int                     open_id;
 105};
 106
 107struct tp_field {
 108        int offset;
 109        union {
 110                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
 111                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
 112        };
 113};
 114
 115#define TP_UINT_FIELD(bits) \
 116static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
 117{ \
 118        u##bits value; \
 119        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 120        return value;  \
 121}
 122
 123TP_UINT_FIELD(8);
 124TP_UINT_FIELD(16);
 125TP_UINT_FIELD(32);
 126TP_UINT_FIELD(64);
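/*
 * Illustrative expansion (not generated code): TP_UINT_FIELD(32) defines
 *
 *	static u64 tp_field__u32(struct tp_field *field, struct perf_sample *sample)
 *	{
 *		u32 value;
 *		memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *		return value;
 *	}
 *
 * i.e. the field is copied out of the raw tracepoint payload with memcpy()
 * (raw_data may be unaligned) and widened to u64.
 */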
 127
 128#define TP_UINT_FIELD__SWAPPED(bits) \
 129static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
 130{ \
 131        u##bits value; \
 132        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 133        return bswap_##bits(value);\
 134}
 135
 136TP_UINT_FIELD__SWAPPED(16);
 137TP_UINT_FIELD__SWAPPED(32);
 138TP_UINT_FIELD__SWAPPED(64);
 139
 140static int tp_field__init_uint(struct tp_field *field,
 141                               struct format_field *format_field,
 142                               bool needs_swap)
 143{
 144        field->offset = format_field->offset;
 145
 146        switch (format_field->size) {
 147        case 1:
 148                field->integer = tp_field__u8;
 149                break;
 150        case 2:
 151                field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
 152                break;
 153        case 4:
 154                field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
 155                break;
 156        case 8:
 157                field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
 158                break;
 159        default:
 160                return -1;
 161        }
 162
 163        return 0;
 164}
 165
 166static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
 167{
 168        return sample->raw_data + field->offset;
 169}
 170
 171static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
 172{
 173        field->offset = format_field->offset;
 174        field->pointer = tp_field__ptr;
 175        return 0;
 176}
 177
 178struct syscall_tp {
 179        struct tp_field id;
 180        union {
 181                struct tp_field args, ret;
 182        };
 183};
 184
 185static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
 186                                          struct tp_field *field,
 187                                          const char *name)
 188{
 189        struct format_field *format_field = perf_evsel__field(evsel, name);
 190
 191        if (format_field == NULL)
 192                return -1;
 193
 194        return tp_field__init_uint(field, format_field, evsel->needs_swap);
 195}
 196
 197#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
 198        ({ struct syscall_tp *sc = evsel->priv;\
 199           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
 200
 201static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
 202                                         struct tp_field *field,
 203                                         const char *name)
 204{
 205        struct format_field *format_field = perf_evsel__field(evsel, name);
 206
 207        if (format_field == NULL)
 208                return -1;
 209
 210        return tp_field__init_ptr(field, format_field);
 211}
 212
 213#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
 214        ({ struct syscall_tp *sc = evsel->priv;\
 215           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
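/*
 * Usage sketch: these helpers take the tracepoint field name both as the
 * struct syscall_tp member and as the string to look up, e.g.
 *
 *	perf_evsel__init_sc_tp_uint_field(evsel, id);
 *
 * resolves the "id" field of the evsel's tracepoint format and fills in the
 * syscall_tp 'id' member hanging off evsel->priv with its offset and an
 * accessor of the right width.
 */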
 216
 217static void perf_evsel__delete_priv(struct perf_evsel *evsel)
 218{
 219        zfree(&evsel->priv);
 220        perf_evsel__delete(evsel);
 221}
 222
 223static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
 224{
 225        evsel->priv = malloc(sizeof(struct syscall_tp));
 226        if (evsel->priv != NULL) {
 227                if (perf_evsel__init_sc_tp_uint_field(evsel, id))
 228                        goto out_delete;
 229
 230                evsel->handler = handler;
 231                return 0;
 232        }
 233
 234        return -ENOMEM;
 235
 236out_delete:
 237        zfree(&evsel->priv);
 238        return -ENOENT;
 239}
 240
 241static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
 242{
 243        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
 244
  245        /* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
 246        if (IS_ERR(evsel))
 247                evsel = perf_evsel__newtp("syscalls", direction);
 248
 249        if (IS_ERR(evsel))
 250                return NULL;
 251
 252        if (perf_evsel__init_syscall_tp(evsel, handler))
 253                goto out_delete;
 254
 255        return evsel;
 256
 257out_delete:
 258        perf_evsel__delete_priv(evsel);
 259        return NULL;
 260}
 261
 262#define perf_evsel__sc_tp_uint(evsel, name, sample) \
 263        ({ struct syscall_tp *fields = evsel->priv; \
 264           fields->name.integer(&fields->name, sample); })
 265
 266#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
 267        ({ struct syscall_tp *fields = evsel->priv; \
 268           fields->name.pointer(&fields->name, sample); })
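/*
 * Usage sketch: given a raw_syscalls sample, the handlers below read the
 * syscall number and the args/return value like
 *
 *	u64 id     = perf_evsel__sc_tp_uint(evsel, id, sample);
 *	void *args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 *
 * i.e. the accessor chosen at init time (width- and byte-swap-aware) is
 * called on the field cached in evsel->priv.
 */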
 269
 270struct syscall_arg {
 271        unsigned long val;
 272        struct thread *thread;
 273        struct trace  *trace;
 274        void          *parm;
 275        u8            idx;
 276        u8            mask;
 277};
 278
 279struct strarray {
 280        int         offset;
 281        int         nr_entries;
 282        const char **entries;
 283};
 284
 285#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
 286        .nr_entries = ARRAY_SIZE(array), \
 287        .entries = array, \
 288}
 289
 290#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
 291        .offset     = off, \
 292        .nr_entries = ARRAY_SIZE(array), \
 293        .entries = array, \
 294}
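/*
 * Illustrative use of the offset variant: DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1)
 * below maps EPOLL_CTL_ADD (1) to "ADD", EPOLL_CTL_DEL (2) to "DEL", etc.,
 * because the printer subtracts ->offset from the argument value before
 * indexing ->entries[].
 */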
 295
 296static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 297                                                const char *intfmt,
 298                                                struct syscall_arg *arg)
 299{
 300        struct strarray *sa = arg->parm;
 301        int idx = arg->val - sa->offset;
 302
 303        if (idx < 0 || idx >= sa->nr_entries)
 304                return scnprintf(bf, size, intfmt, arg->val);
 305
 306        return scnprintf(bf, size, "%s", sa->entries[idx]);
 307}
 308
 309static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 310                                              struct syscall_arg *arg)
 311{
 312        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
 313}
 314
 315#define SCA_STRARRAY syscall_arg__scnprintf_strarray
 316
 317#if defined(__i386__) || defined(__x86_64__)
 318/*
 319 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 320 *        gets rewritten to support all arches.
 321 */
 322static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
 323                                                 struct syscall_arg *arg)
 324{
 325        return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
 326}
 327
 328#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
 329#endif /* defined(__i386__) || defined(__x86_64__) */
 330
 331static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
 332                                        struct syscall_arg *arg);
 333
 334#define SCA_FD syscall_arg__scnprintf_fd
 335
 336#ifndef AT_FDCWD
 337#define AT_FDCWD        -100
 338#endif
 339
 340static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 341                                           struct syscall_arg *arg)
 342{
 343        int fd = arg->val;
 344
 345        if (fd == AT_FDCWD)
 346                return scnprintf(bf, size, "CWD");
 347
 348        return syscall_arg__scnprintf_fd(bf, size, arg);
 349}
 350
 351#define SCA_FDAT syscall_arg__scnprintf_fd_at
 352
 353static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 354                                              struct syscall_arg *arg);
 355
 356#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 357
 358static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
 359                                         struct syscall_arg *arg)
 360{
 361        return scnprintf(bf, size, "%#lx", arg->val);
 362}
 363
 364#define SCA_HEX syscall_arg__scnprintf_hex
 365
 366static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
 367                                         struct syscall_arg *arg)
 368{
 369        return scnprintf(bf, size, "%d", arg->val);
 370}
 371
 372#define SCA_INT syscall_arg__scnprintf_int
 373
 374static const char *bpf_cmd[] = {
 375        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
 376        "MAP_GET_NEXT_KEY", "PROG_LOAD",
 377};
 378static DEFINE_STRARRAY(bpf_cmd);
 379
 380static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
 381static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
 382
 383static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
 384static DEFINE_STRARRAY(itimers);
 385
 386static const char *keyctl_options[] = {
 387        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
 388        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
 389        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
 390        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
 391        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
 392};
 393static DEFINE_STRARRAY(keyctl_options);
 394
 395static const char *whences[] = { "SET", "CUR", "END",
 396#ifdef SEEK_DATA
 397"DATA",
 398#endif
 399#ifdef SEEK_HOLE
 400"HOLE",
 401#endif
 402};
 403static DEFINE_STRARRAY(whences);
 404
 405static const char *fcntl_cmds[] = {
 406        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
 407        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
 408        "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
 409        "F_GETOWNER_UIDS",
 410};
 411static DEFINE_STRARRAY(fcntl_cmds);
 412
 413static const char *rlimit_resources[] = {
 414        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
 415        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
 416        "RTTIME",
 417};
 418static DEFINE_STRARRAY(rlimit_resources);
 419
 420static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
 421static DEFINE_STRARRAY(sighow);
 422
 423static const char *clockid[] = {
 424        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
 425        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
 426        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
 427};
 428static DEFINE_STRARRAY(clockid);
 429
 430static const char *socket_families[] = {
 431        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
 432        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
 433        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
 434        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
 435        "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
 436        "ALG", "NFC", "VSOCK",
 437};
 438static DEFINE_STRARRAY(socket_families);
 439
 440static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
 441                                                 struct syscall_arg *arg)
 442{
 443        size_t printed = 0;
 444        int mode = arg->val;
 445
 446        if (mode == F_OK) /* 0 */
 447                return scnprintf(bf, size, "F");
 448#define P_MODE(n) \
 449        if (mode & n##_OK) { \
 450                printed += scnprintf(bf + printed, size - printed, "%s", #n); \
 451                mode &= ~n##_OK; \
 452        }
 453
 454        P_MODE(R);
 455        P_MODE(W);
 456        P_MODE(X);
 457#undef P_MODE
 458
 459        if (mode)
 460                printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
 461
 462        return printed;
 463}
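/*
 * Illustrative output: a mode of R_OK|W_OK is rendered as "RW", F_OK alone
 * as "F", and any leftover unknown bits are appended in hex via "|%#x".
 */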
 464
 465#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
 466
 467static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 468                                              struct syscall_arg *arg);
 469
 470#define SCA_FILENAME syscall_arg__scnprintf_filename
 471
 472static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 473                                                struct syscall_arg *arg)
 474{
 475        int printed = 0, flags = arg->val;
 476
 477#define P_FLAG(n) \
 478        if (flags & O_##n) { \
 479                printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 480                flags &= ~O_##n; \
 481        }
 482
 483        P_FLAG(CLOEXEC);
 484        P_FLAG(NONBLOCK);
 485#undef P_FLAG
 486
 487        if (flags)
 488                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 489
 490        return printed;
 491}
 492
 493#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 494
 495#if defined(__i386__) || defined(__x86_64__)
 496/*
 497 * FIXME: Make this available to all arches.
 498 */
 499#define TCGETS          0x5401
 500
 501static const char *tioctls[] = {
 502        "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
 503        "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
 504        "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
 505        "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
 506        "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
 507        "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
 508        "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
 509        "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
 510        "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
 511        "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
 512        "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
 513        [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
 514        "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
 515        "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
 516        "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
 517};
 518
 519static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
 520#endif /* defined(__i386__) || defined(__x86_64__) */
 521
 522#ifndef GRND_NONBLOCK
 523#define GRND_NONBLOCK   0x0001
 524#endif
 525#ifndef GRND_RANDOM
 526#define GRND_RANDOM     0x0002
 527#endif
 528
 529static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 530                                                   struct syscall_arg *arg)
 531{
 532        int printed = 0, flags = arg->val;
 533
 534#define P_FLAG(n) \
 535        if (flags & GRND_##n) { \
 536                printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 537                flags &= ~GRND_##n; \
 538        }
 539
 540        P_FLAG(RANDOM);
 541        P_FLAG(NONBLOCK);
 542#undef P_FLAG
 543
 544        if (flags)
 545                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 546
 547        return printed;
 548}
 549
 550#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 551
 552#define STRARRAY(arg, name, array) \
 553          .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
 554          .arg_parm      = { [arg] = &strarray__##array, }
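/*
 * Illustrative expansion: STRARRAY(0, cmd, bpf_cmd), as used in syscall_fmts[]
 * below, becomes
 *
 *	.arg_scnprintf = { [0] = SCA_STRARRAY, },
 *	.arg_parm      = { [0] = &strarray__bpf_cmd, }
 *
 * so the first bpf() argument is printed through strarray__bpf_cmd
 * ("MAP_CREATE", "PROG_LOAD", ...); the "name" parameter is documentation only.
 */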
 555
 556#include "trace/beauty/eventfd.c"
 557#include "trace/beauty/flock.c"
 558#include "trace/beauty/futex_op.c"
 559#include "trace/beauty/mmap.c"
 560#include "trace/beauty/mode_t.c"
 561#include "trace/beauty/msg_flags.c"
 562#include "trace/beauty/open_flags.c"
 563#include "trace/beauty/perf_event_open.c"
 564#include "trace/beauty/pid.c"
 565#include "trace/beauty/sched_policy.c"
 566#include "trace/beauty/seccomp.c"
 567#include "trace/beauty/signum.c"
 568#include "trace/beauty/socket_type.c"
 569#include "trace/beauty/waitid_options.c"
 570
 571static struct syscall_fmt {
 572        const char *name;
 573        const char *alias;
 574        size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
 575        void       *arg_parm[6];
 576        bool       errmsg;
 577        bool       errpid;
 578        bool       timeout;
 579        bool       hexret;
 580} syscall_fmts[] = {
 581        { .name     = "access",     .errmsg = true,
 582          .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
 583        { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
 584        { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
 585        { .name     = "brk",        .hexret = true,
 586          .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
 587        { .name     = "chdir",      .errmsg = true, },
 588        { .name     = "chmod",      .errmsg = true, },
 589        { .name     = "chroot",     .errmsg = true, },
 590        { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
 591        { .name     = "clone",      .errpid = true, },
 592        { .name     = "close",      .errmsg = true,
 593          .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
 594        { .name     = "connect",    .errmsg = true, },
 595        { .name     = "creat",      .errmsg = true, },
 596        { .name     = "dup",        .errmsg = true, },
 597        { .name     = "dup2",       .errmsg = true, },
 598        { .name     = "dup3",       .errmsg = true, },
 599        { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
 600        { .name     = "eventfd2",   .errmsg = true,
 601          .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
 602        { .name     = "faccessat",  .errmsg = true, },
 603        { .name     = "fadvise64",  .errmsg = true, },
 604        { .name     = "fallocate",  .errmsg = true, },
 605        { .name     = "fchdir",     .errmsg = true, },
 606        { .name     = "fchmod",     .errmsg = true, },
 607        { .name     = "fchmodat",   .errmsg = true,
 608          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 609        { .name     = "fchown",     .errmsg = true, },
 610        { .name     = "fchownat",   .errmsg = true,
 611          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 612        { .name     = "fcntl",      .errmsg = true,
 613          .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
 614          .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
 615        { .name     = "fdatasync",  .errmsg = true, },
 616        { .name     = "flock",      .errmsg = true,
 617          .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
 618        { .name     = "fsetxattr",  .errmsg = true, },
 619        { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
 620        { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
 621        { .name     = "fstatfs",    .errmsg = true, },
 622        { .name     = "fsync",    .errmsg = true, },
 623        { .name     = "ftruncate", .errmsg = true, },
 624        { .name     = "futex",      .errmsg = true,
 625          .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
 626        { .name     = "futimesat", .errmsg = true,
 627          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 628        { .name     = "getdents",   .errmsg = true, },
 629        { .name     = "getdents64", .errmsg = true, },
 630        { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
 631        { .name     = "getpid",     .errpid = true, },
 632        { .name     = "getpgid",    .errpid = true, },
 633        { .name     = "getppid",    .errpid = true, },
 634        { .name     = "getrandom",  .errmsg = true,
 635          .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
 636        { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
 637        { .name     = "getxattr",   .errmsg = true, },
 638        { .name     = "inotify_add_watch",          .errmsg = true, },
 639        { .name     = "ioctl",      .errmsg = true,
 640          .arg_scnprintf = {
 641#if defined(__i386__) || defined(__x86_64__)
 642/*
 643 * FIXME: Make this available to all arches.
 644 */
 645                             [1] = SCA_STRHEXARRAY, /* cmd */
 646                             [2] = SCA_HEX, /* arg */ },
 647          .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
 648#else
 649                             [2] = SCA_HEX, /* arg */ }, },
 650#endif
 651        { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
 652        { .name     = "kill",       .errmsg = true,
 653          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
 654        { .name     = "lchown",    .errmsg = true, },
 655        { .name     = "lgetxattr",  .errmsg = true, },
 656        { .name     = "linkat",     .errmsg = true,
 657          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 658        { .name     = "listxattr",  .errmsg = true, },
 659        { .name     = "llistxattr", .errmsg = true, },
 660        { .name     = "lremovexattr",  .errmsg = true, },
 661        { .name     = "lseek",      .errmsg = true,
 662          .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
 663          .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
 664        { .name     = "lsetxattr",  .errmsg = true, },
 665        { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
 666        { .name     = "lsxattr",    .errmsg = true, },
 667        { .name     = "madvise",    .errmsg = true,
 668          .arg_scnprintf = { [0] = SCA_HEX,      /* start */
 669                             [2] = SCA_MADV_BHV, /* behavior */ }, },
 670        { .name     = "mkdir",    .errmsg = true, },
 671        { .name     = "mkdirat",    .errmsg = true,
 672          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 673        { .name     = "mknod",      .errmsg = true, },
 674        { .name     = "mknodat",    .errmsg = true,
 675          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 676        { .name     = "mlock",      .errmsg = true,
 677          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
 678        { .name     = "mlockall",   .errmsg = true,
 679          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
 680        { .name     = "mmap",       .hexret = true,
 681          .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
 682                             [2] = SCA_MMAP_PROT, /* prot */
 683                             [3] = SCA_MMAP_FLAGS, /* flags */ }, },
 684        { .name     = "mprotect",   .errmsg = true,
 685          .arg_scnprintf = { [0] = SCA_HEX, /* start */
 686                             [2] = SCA_MMAP_PROT, /* prot */ }, },
 687        { .name     = "mq_unlink", .errmsg = true,
 688          .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
 689        { .name     = "mremap",     .hexret = true,
 690          .arg_scnprintf = { [0] = SCA_HEX, /* addr */
 691                             [3] = SCA_MREMAP_FLAGS, /* flags */
 692                             [4] = SCA_HEX, /* new_addr */ }, },
 693        { .name     = "munlock",    .errmsg = true,
 694          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
 695        { .name     = "munmap",     .errmsg = true,
 696          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
 697        { .name     = "name_to_handle_at", .errmsg = true,
 698          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 699        { .name     = "newfstatat", .errmsg = true,
 700          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 701        { .name     = "open",       .errmsg = true,
 702          .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
 703        { .name     = "open_by_handle_at", .errmsg = true,
 704          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
 705                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
 706        { .name     = "openat",     .errmsg = true,
 707          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
 708                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
 709        { .name     = "perf_event_open", .errmsg = true,
 710          .arg_scnprintf = { [2] = SCA_INT, /* cpu */
 711                             [3] = SCA_FD,  /* group_fd */
 712                             [4] = SCA_PERF_FLAGS,  /* flags */ }, },
 713        { .name     = "pipe2",      .errmsg = true,
 714          .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
 715        { .name     = "poll",       .errmsg = true, .timeout = true, },
 716        { .name     = "ppoll",      .errmsg = true, .timeout = true, },
 717        { .name     = "pread",      .errmsg = true, .alias = "pread64", },
 718        { .name     = "preadv",     .errmsg = true, .alias = "pread", },
 719        { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
 720        { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
 721        { .name     = "pwritev",    .errmsg = true, },
 722        { .name     = "read",       .errmsg = true, },
 723        { .name     = "readlink",   .errmsg = true, },
 724        { .name     = "readlinkat", .errmsg = true,
 725          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 726        { .name     = "readv",      .errmsg = true, },
 727        { .name     = "recvfrom",   .errmsg = true,
 728          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
 729        { .name     = "recvmmsg",   .errmsg = true,
 730          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
 731        { .name     = "recvmsg",    .errmsg = true,
 732          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
 733        { .name     = "removexattr", .errmsg = true, },
 734        { .name     = "renameat",   .errmsg = true,
 735          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 736        { .name     = "rmdir",    .errmsg = true, },
 737        { .name     = "rt_sigaction", .errmsg = true,
 738          .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
 739        { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
 740        { .name     = "rt_sigqueueinfo", .errmsg = true,
 741          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
 742        { .name     = "rt_tgsigqueueinfo", .errmsg = true,
 743          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
 744        { .name     = "sched_setscheduler",   .errmsg = true,
 745          .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
 746        { .name     = "seccomp", .errmsg = true,
 747          .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
 748                             [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
 749        { .name     = "select",     .errmsg = true, .timeout = true, },
 750        { .name     = "sendmmsg",    .errmsg = true,
 751          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
 752        { .name     = "sendmsg",    .errmsg = true,
 753          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
 754        { .name     = "sendto",     .errmsg = true,
 755          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
 756        { .name     = "set_tid_address", .errpid = true, },
 757        { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
 758        { .name     = "setpgid",    .errmsg = true, },
 759        { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
 760        { .name     = "setxattr",   .errmsg = true, },
 761        { .name     = "shutdown",   .errmsg = true, },
 762        { .name     = "socket",     .errmsg = true,
 763          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
 764                             [1] = SCA_SK_TYPE, /* type */ },
 765          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
 766        { .name     = "socketpair", .errmsg = true,
 767          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
 768                             [1] = SCA_SK_TYPE, /* type */ },
 769          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
 770        { .name     = "stat",       .errmsg = true, .alias = "newstat", },
 771        { .name     = "statfs",     .errmsg = true, },
 772        { .name     = "swapoff",    .errmsg = true,
 773          .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
 774        { .name     = "swapon",     .errmsg = true,
 775          .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
 776        { .name     = "symlinkat",  .errmsg = true,
 777          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 778        { .name     = "tgkill",     .errmsg = true,
 779          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
 780        { .name     = "tkill",      .errmsg = true,
 781          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
 782        { .name     = "truncate",   .errmsg = true, },
 783        { .name     = "uname",      .errmsg = true, .alias = "newuname", },
 784        { .name     = "unlinkat",   .errmsg = true,
 785          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 786        { .name     = "utime",  .errmsg = true, },
 787        { .name     = "utimensat",  .errmsg = true,
 788          .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
 789        { .name     = "utimes",  .errmsg = true, },
 790        { .name     = "vmsplice",  .errmsg = true, },
 791        { .name     = "wait4",      .errpid = true,
 792          .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
 793        { .name     = "waitid",     .errpid = true,
 794          .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
 795        { .name     = "write",      .errmsg = true, },
 796        { .name     = "writev",     .errmsg = true, },
 797};
 798
 799static int syscall_fmt__cmp(const void *name, const void *fmtp)
 800{
 801        const struct syscall_fmt *fmt = fmtp;
 802        return strcmp(name, fmt->name);
 803}
 804
 805static struct syscall_fmt *syscall_fmt__find(const char *name)
 806{
 807        const int nmemb = ARRAY_SIZE(syscall_fmts);
 808        return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
 809}
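/*
 * Note: the bsearch() above requires syscall_fmts[] to stay sorted by ->name.
 * Hypothetical lookup: syscall_fmt__find("open") returns the entry whose
 * arg_scnprintf[1] is SCA_OPEN_FLAGS, so open()'s flags argument gets
 * beautified.
 */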
 810
 811struct syscall {
 812        struct event_format *tp_format;
 813        int                 nr_args;
 814        struct format_field *args;
 815        const char          *name;
 816        bool                is_exit;
 817        struct syscall_fmt  *fmt;
 818        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
 819        void                **arg_parm;
 820};
 821
 822static size_t fprintf_duration(unsigned long t, FILE *fp)
 823{
 824        double duration = (double)t / NSEC_PER_MSEC;
 825        size_t printed = fprintf(fp, "(");
 826
 827        if (duration >= 1.0)
 828                printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
 829        else if (duration >= 0.01)
 830                printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
 831        else
 832                printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
 833        return printed + fprintf(fp, "): ");
 834}
 835
 836/**
 837 * filename.ptr: The filename char pointer that will be vfs_getname'd
 838 * filename.entry_str_pos: Where to insert the string translated from
 839 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 840 */
 841struct thread_trace {
 842        u64               entry_time;
 843        u64               exit_time;
 844        bool              entry_pending;
 845        unsigned long     nr_events;
 846        unsigned long     pfmaj, pfmin;
 847        char              *entry_str;
 848        double            runtime_ms;
 849        struct {
 850                unsigned long ptr;
 851                short int     entry_str_pos;
 852                bool          pending_open;
 853                unsigned int  namelen;
 854                char          *name;
 855        } filename;
 856        struct {
 857                int       max;
 858                char      **table;
 859        } paths;
 860
 861        struct intlist *syscall_stats;
 862};
 863
 864static struct thread_trace *thread_trace__new(void)
 865{
  866        struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
  867
  868        if (ttrace) {
  869                ttrace->paths.max = -1;
  870                ttrace->syscall_stats = intlist__new(NULL);
  871        }
  872
 873        return ttrace;
 874}
 875
 876static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
 877{
 878        struct thread_trace *ttrace;
 879
 880        if (thread == NULL)
 881                goto fail;
 882
 883        if (thread__priv(thread) == NULL)
 884                thread__set_priv(thread, thread_trace__new());
 885
 886        if (thread__priv(thread) == NULL)
 887                goto fail;
 888
 889        ttrace = thread__priv(thread);
 890        ++ttrace->nr_events;
 891
 892        return ttrace;
 893fail:
 894        color_fprintf(fp, PERF_COLOR_RED,
 895                      "WARNING: not enough memory, dropping samples!\n");
 896        return NULL;
 897}
 898
 899#define TRACE_PFMAJ             (1 << 0)
 900#define TRACE_PFMIN             (1 << 1)
 901
 902static const size_t trace__entry_str_size = 2048;
 903
 904static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
 905{
 906        struct thread_trace *ttrace = thread__priv(thread);
 907
 908        if (fd > ttrace->paths.max) {
 909                char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
 910
 911                if (npath == NULL)
 912                        return -1;
 913
 914                if (ttrace->paths.max != -1) {
 915                        memset(npath + ttrace->paths.max + 1, 0,
 916                               (fd - ttrace->paths.max) * sizeof(char *));
 917                } else {
 918                        memset(npath, 0, (fd + 1) * sizeof(char *));
 919                }
 920
 921                ttrace->paths.table = npath;
 922                ttrace->paths.max   = fd;
 923        }
 924
 925        ttrace->paths.table[fd] = strdup(pathname);
 926
 927        return ttrace->paths.table[fd] != NULL ? 0 : -1;
 928}
 929
 930static int thread__read_fd_path(struct thread *thread, int fd)
 931{
 932        char linkname[PATH_MAX], pathname[PATH_MAX];
 933        struct stat st;
 934        int ret;
 935
 936        if (thread->pid_ == thread->tid) {
 937                scnprintf(linkname, sizeof(linkname),
 938                          "/proc/%d/fd/%d", thread->pid_, fd);
 939        } else {
 940                scnprintf(linkname, sizeof(linkname),
 941                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
 942        }
 943
 944        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
 945                return -1;
 946
 947        ret = readlink(linkname, pathname, sizeof(pathname));
 948
 949        if (ret < 0 || ret > st.st_size)
 950                return -1;
 951
 952        pathname[ret] = '\0';
 953        return trace__set_fd_pathname(thread, fd, pathname);
 954}
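/*
 * Illustrative resolution (made-up pid/fd): for the main thread of pid 1234
 * and fd 3, linkname is "/proc/1234/fd/3"; readlink() on it yields the
 * current path of that fd, which is then cached via trace__set_fd_pathname().
 */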
 955
 956static const char *thread__fd_path(struct thread *thread, int fd,
 957                                   struct trace *trace)
 958{
 959        struct thread_trace *ttrace = thread__priv(thread);
 960
 961        if (ttrace == NULL)
 962                return NULL;
 963
 964        if (fd < 0)
 965                return NULL;
 966
 967        if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
 968                if (!trace->live)
 969                        return NULL;
 970                ++trace->stats.proc_getname;
 971                if (thread__read_fd_path(thread, fd))
 972                        return NULL;
 973        }
 974
 975        return ttrace->paths.table[fd];
 976}
 977
 978static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
 979                                        struct syscall_arg *arg)
 980{
 981        int fd = arg->val;
 982        size_t printed = scnprintf(bf, size, "%d", fd);
 983        const char *path = thread__fd_path(arg->thread, fd, arg->trace);
 984
 985        if (path)
 986                printed += scnprintf(bf + printed, size - printed, "<%s>", path);
 987
 988        return printed;
 989}
 990
 991static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 992                                              struct syscall_arg *arg)
 993{
 994        int fd = arg->val;
 995        size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
 996        struct thread_trace *ttrace = thread__priv(arg->thread);
 997
 998        if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
 999                zfree(&ttrace->paths.table[fd]);
1000
1001        return printed;
1002}
1003
1004static void thread__set_filename_pos(struct thread *thread, const char *bf,
1005                                     unsigned long ptr)
1006{
1007        struct thread_trace *ttrace = thread__priv(thread);
1008
1009        ttrace->filename.ptr = ptr;
1010        ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1011}
1012
1013static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1014                                              struct syscall_arg *arg)
1015{
1016        unsigned long ptr = arg->val;
1017
1018        if (!arg->trace->vfs_getname)
 1019                return scnprintf(bf, size, "%#lx", ptr);
1020
1021        thread__set_filename_pos(arg->thread, bf, ptr);
1022        return 0;
1023}
1024
1025static bool trace__filter_duration(struct trace *trace, double t)
1026{
1027        return t < (trace->duration_filter * NSEC_PER_MSEC);
1028}
1029
1030static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1031{
1032        double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1033
1034        return fprintf(fp, "%10.3f ", ts);
1035}
1036
1037static bool done = false;
1038static bool interrupted = false;
1039
1040static void sig_handler(int sig)
1041{
1042        done = true;
1043        interrupted = sig == SIGINT;
1044}
1045
1046static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1047                                        u64 duration, u64 tstamp, FILE *fp)
1048{
1049        size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1050        printed += fprintf_duration(duration, fp);
1051
1052        if (trace->multiple_threads) {
1053                if (trace->show_comm)
1054                        printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1055                printed += fprintf(fp, "%d ", thread->tid);
1056        }
1057
1058        return printed;
1059}
1060
1061static int trace__process_event(struct trace *trace, struct machine *machine,
1062                                union perf_event *event, struct perf_sample *sample)
1063{
1064        int ret = 0;
1065
1066        switch (event->header.type) {
1067        case PERF_RECORD_LOST:
1068                color_fprintf(trace->output, PERF_COLOR_RED,
1069                              "LOST %" PRIu64 " events!\n", event->lost.lost);
1070                ret = machine__process_lost_event(machine, event, sample);
1071                break;
1072        default:
1073                ret = machine__process_event(machine, event, sample);
1074                break;
1075        }
1076
1077        return ret;
1078}
1079
1080static int trace__tool_process(struct perf_tool *tool,
1081                               union perf_event *event,
1082                               struct perf_sample *sample,
1083                               struct machine *machine)
1084{
1085        struct trace *trace = container_of(tool, struct trace, tool);
1086        return trace__process_event(trace, machine, event, sample);
1087}
1088
1089static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1090{
1091        struct machine *machine = vmachine;
1092
1093        if (machine->kptr_restrict_warned)
1094                return NULL;
1095
1096        if (symbol_conf.kptr_restrict) {
1097                pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1098                           "Check /proc/sys/kernel/kptr_restrict.\n\n"
1099                           "Kernel samples will not be resolved.\n");
1100                machine->kptr_restrict_warned = true;
1101                return NULL;
1102        }
1103
1104        return machine__resolve_kernel_addr(vmachine, addrp, modp);
1105}
1106
1107static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1108{
1109        int err = symbol__init(NULL);
1110
1111        if (err)
1112                return err;
1113
1114        trace->host = machine__new_host();
1115        if (trace->host == NULL)
1116                return -ENOMEM;
1117
1118        if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1119                return -errno;
1120
1121        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1122                                            evlist->threads, trace__tool_process, false,
1123                                            trace->opts.proc_map_timeout);
1124        if (err)
1125                symbol__exit();
1126
1127        return err;
1128}
1129
1130static int syscall__set_arg_fmts(struct syscall *sc)
1131{
1132        struct format_field *field;
1133        int idx = 0, len;
1134
1135        sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1136        if (sc->arg_scnprintf == NULL)
1137                return -1;
1138
1139        if (sc->fmt)
1140                sc->arg_parm = sc->fmt->arg_parm;
1141
1142        for (field = sc->args; field; field = field->next) {
1143                if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1144                        sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1145                else if (strcmp(field->type, "const char *") == 0 &&
1146                         (strcmp(field->name, "filename") == 0 ||
1147                          strcmp(field->name, "path") == 0 ||
1148                          strcmp(field->name, "pathname") == 0))
1149                        sc->arg_scnprintf[idx] = SCA_FILENAME;
1150                else if (field->flags & FIELD_IS_POINTER)
1151                        sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1152                else if (strcmp(field->type, "pid_t") == 0)
1153                        sc->arg_scnprintf[idx] = SCA_PID;
1154                else if (strcmp(field->type, "umode_t") == 0)
1155                        sc->arg_scnprintf[idx] = SCA_MODE_T;
1156                else if ((strcmp(field->type, "int") == 0 ||
1157                          strcmp(field->type, "unsigned int") == 0 ||
1158                          strcmp(field->type, "long") == 0) &&
1159                         (len = strlen(field->name)) >= 2 &&
1160                         strcmp(field->name + len - 2, "fd") == 0) {
1161                        /*
1162                         * /sys/kernel/tracing/events/syscalls/sys_enter*
1163                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1164                         * 65 int
1165                         * 23 unsigned int
1166                         * 7 unsigned long
1167                         */
1168                        sc->arg_scnprintf[idx] = SCA_FD;
1169                }
1170                ++idx;
1171        }
1172
1173        return 0;
1174}
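/*
 * Illustrative effect of the heuristics above for read(fd, buf, count):
 * "fd" is an (unsigned) int whose name ends in "fd" so it gets SCA_FD,
 * "buf" is a pointer so it is printed in hex, and "count" falls through to
 * the default decimal formatting in syscall__scnprintf_args().
 */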
1175
1176static int trace__read_syscall_info(struct trace *trace, int id)
1177{
1178        char tp_name[128];
1179        struct syscall *sc;
1180        const char *name = syscalltbl__name(trace->sctbl, id);
1181
1182        if (name == NULL)
1183                return -1;
1184
1185        if (id > trace->syscalls.max) {
1186                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1187
1188                if (nsyscalls == NULL)
1189                        return -1;
1190
1191                if (trace->syscalls.max != -1) {
1192                        memset(nsyscalls + trace->syscalls.max + 1, 0,
1193                               (id - trace->syscalls.max) * sizeof(*sc));
1194                } else {
1195                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1196                }
1197
1198                trace->syscalls.table = nsyscalls;
1199                trace->syscalls.max   = id;
1200        }
1201
1202        sc = trace->syscalls.table + id;
1203        sc->name = name;
1204
1205        sc->fmt  = syscall_fmt__find(sc->name);
1206
1207        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1208        sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1209
1210        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1211                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1212                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1213        }
1214
1215        if (IS_ERR(sc->tp_format))
1216                return -1;
1217
1218        sc->args = sc->tp_format->format.fields;
1219        sc->nr_args = sc->tp_format->format.nr_fields;
1220        /*
 1221         * The first field may be '__syscall_nr' or 'nr', which holds the
 1222         * syscall number and is redundant here, so drop it. Note that the
 1223         * field does not exist on older kernels.
1224         */
1225        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1226                sc->args = sc->args->next;
1227                --sc->nr_args;
1228        }
1229
1230        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1231
1232        return syscall__set_arg_fmts(sc);
1233}
1234
1235static int trace__validate_ev_qualifier(struct trace *trace)
1236{
1237        int err = 0, i;
1238        struct str_node *pos;
1239
1240        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1241        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1242                                                 sizeof(trace->ev_qualifier_ids.entries[0]));
1243
1244        if (trace->ev_qualifier_ids.entries == NULL) {
1245                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1246                       trace->output);
1247                err = -EINVAL;
1248                goto out;
1249        }
1250
1251        i = 0;
1252
1253        strlist__for_each_entry(pos, trace->ev_qualifier) {
1254                const char *sc = pos->s;
1255                int id = syscalltbl__id(trace->sctbl, sc);
1256
1257                if (id < 0) {
1258                        if (err == 0) {
1259                                fputs("Error:\tInvalid syscall ", trace->output);
1260                                err = -EINVAL;
1261                        } else {
1262                                fputs(", ", trace->output);
1263                        }
1264
1265                        fputs(sc, trace->output);
1266                }
1267
1268                trace->ev_qualifier_ids.entries[i++] = id;
1269        }
1270
1271        if (err < 0) {
1272                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1273                      "\nHint:\tand: 'man syscalls'\n", trace->output);
1274                zfree(&trace->ev_qualifier_ids.entries);
1275                trace->ev_qualifier_ids.nr = 0;
1276        }
1277out:
1278        return err;
1279}
1280
1281/*
1282 * args is to be interpreted as a series of longs but we need to handle
1283 * 8-byte unaligned accesses. args points to raw_data within the event
1284 * and raw_data is guaranteed to be 8-byte unaligned because it is
 1285 * preceded by raw_size, which is a u32. So we need to copy args to a temp
 1286 * variable to read it. Most notably this avoids extended load instructions
 1287 * on unaligned addresses.
1288 */
1289
1290static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1291                                      unsigned char *args, struct trace *trace,
1292                                      struct thread *thread)
1293{
1294        size_t printed = 0;
1295        unsigned char *p;
1296        unsigned long val;
1297
1298        if (sc->args != NULL) {
1299                struct format_field *field;
1300                u8 bit = 1;
1301                struct syscall_arg arg = {
1302                        .idx    = 0,
1303                        .mask   = 0,
1304                        .trace  = trace,
1305                        .thread = thread,
1306                };
1307
1308                for (field = sc->args; field;
1309                     field = field->next, ++arg.idx, bit <<= 1) {
1310                        if (arg.mask & bit)
1311                                continue;
1312
1313                        /* special care for unaligned accesses */
1314                        p = args + sizeof(unsigned long) * arg.idx;
1315                        memcpy(&val, p, sizeof(val));
1316
1317                        /*
 1318                         * Suppress this argument if its value is zero and
 1319                         * we don't have a string associated with it in a
 1320                         * strarray.
1321                         */
1322                        if (val == 0 &&
1323                            !(sc->arg_scnprintf &&
1324                              sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1325                              sc->arg_parm[arg.idx]))
1326                                continue;
1327
1328                        printed += scnprintf(bf + printed, size - printed,
1329                                             "%s%s: ", printed ? ", " : "", field->name);
1330                        if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1331                                arg.val = val;
1332                                if (sc->arg_parm)
1333                                        arg.parm = sc->arg_parm[arg.idx];
1334                                printed += sc->arg_scnprintf[arg.idx](bf + printed,
1335                                                                      size - printed, &arg);
1336                        } else {
1337                                printed += scnprintf(bf + printed, size - printed,
1338                                                     "%ld", val);
1339                        }
1340                }
1341        } else if (IS_ERR(sc->tp_format)) {
1342                /*
1343                 * If we managed to read the tracepoint /format file, then we
1344                 * may end up not having any args, like with gettid(), so only
1345                 * print the raw args when we didn't manage to read it.
1346                 */
1347                int i = 0;
1348
1349                while (i < 6) {
1350                        /* special care for unaligned accesses */
1351                        p = args + sizeof(unsigned long) * i;
1352                        memcpy(&val, p, sizeof(val));
1353                        printed += scnprintf(bf + printed, size - printed,
1354                                             "%sarg%d: %ld",
1355                                             printed ? ", " : "", i, val);
1356                        ++i;
1357                }
1358        }
1359
1360        return printed;
1361}
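/*
 * Illustrative result (made-up values) for an lseek(3, 0, SEEK_SET) entry:
 *
 *	"fd: 3</tmp/foo>, whence: SET"
 *
 * "offset" is suppressed because it is zero and has no strarray mapping,
 * while "whence" is kept even though SEEK_SET is 0 because it does map to a
 * strarray entry.
 */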
1362
1363typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1364                                  union perf_event *event,
1365                                  struct perf_sample *sample);
1366
1367static struct syscall *trace__syscall_info(struct trace *trace,
1368                                           struct perf_evsel *evsel, int id)
1369{
1370
1371        if (id < 0) {
1372
1373                /*
1374                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1375                 * before that, leaving at a higher verbosity level till that is
1376                 * explained. Reproduced with plain ftrace with:
1377                 *
1378                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1379                 * grep "NR -1 " /t/trace_pipe
1380                 *
1381                 * After generating some load on the machine.
1382                 */
1383                if (verbose > 1) {
1384                        static u64 n;
1385                        fprintf(trace->output, "Invalid syscall id %d, skipping (%s, %" PRIu64 ") ...\n",
1386                                id, perf_evsel__name(evsel), ++n);
1387                }
1388                return NULL;
1389        }
1390
1391        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1392            trace__read_syscall_info(trace, id))
1393                goto out_cant_read;
1394
1395        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1396                goto out_cant_read;
1397
1398        return &trace->syscalls.table[id];
1399
1400out_cant_read:
1401        if (verbose) {
1402                fprintf(trace->output, "Problems reading syscall %d", id);
1403                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1404                        fprintf(trace->output, " (%s)", trace->syscalls.table[id].name);
1405                fputs(" information\n", trace->output);
1406        }
1407        return NULL;
1408}
1409
1410static void thread__update_stats(struct thread_trace *ttrace,
1411                                 int id, struct perf_sample *sample)
1412{
1413        struct int_node *inode;
1414        struct stats *stats;
1415        u64 duration = 0;
1416
1417        inode = intlist__findnew(ttrace->syscall_stats, id);
1418        if (inode == NULL)
1419                return;
1420
1421        stats = inode->priv;
1422        if (stats == NULL) {
1423                stats = malloc(sizeof(struct stats));
1424                if (stats == NULL)
1425                        return;
1426                init_stats(stats);
1427                inode->priv = stats;
1428        }
1429
1430        if (ttrace->entry_time && sample->time > ttrace->entry_time)
1431                duration = sample->time - ttrace->entry_time;
1432
1433        update_stats(stats, duration);
1434}
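
/*
 * Background sketch, not the real util/stat.c helpers: the struct stats
 * filled in above keeps enough state to later report the call count, total,
 * min, max, average and stddev in thread__dump_stats().  Assuming a
 * Welford-style running accumulator, the core update step looks like this:
 */
struct example_running_stats {
        double  mean, m2;
        u64     n, min, max;
};

static void __maybe_unused example_running_stats__update(struct example_running_stats *s, u64 val)
{
        double delta = val - s->mean;

        s->n++;
        s->mean += delta / s->n;
        s->m2   += delta * (val - s->mean);     /* variance ~= m2 / (n - 1) */

        if (s->n == 1 || val < s->min)
                s->min = val;
        if (val > s->max)
                s->max = val;
}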
1435
1436static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1437{
1438        struct thread_trace *ttrace;
1439        u64 duration;
1440        size_t printed;
1441
1442        if (trace->current == NULL)
1443                return 0;
1444
1445        ttrace = thread__priv(trace->current);
1446
1447        if (!ttrace->entry_pending)
1448                return 0;
1449
1450        duration = sample->time - ttrace->entry_time;
1451
1452        printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1453        printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1454        ttrace->entry_pending = false;
1455
1456        return printed;
1457}
1458
1459static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1460                            union perf_event *event __maybe_unused,
1461                            struct perf_sample *sample)
1462{
1463        char *msg;
1464        void *args;
1465        size_t printed = 0;
1466        struct thread *thread;
1467        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1468        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1469        struct thread_trace *ttrace;
1470
1471        if (sc == NULL)
1472                return -1;
1473
1474        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1475        ttrace = thread__trace(thread, trace->output);
1476        if (ttrace == NULL)
1477                goto out_put;
1478
1479        args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1480
1481        if (ttrace->entry_str == NULL) {
1482                ttrace->entry_str = malloc(trace__entry_str_size);
1483                if (!ttrace->entry_str)
1484                        goto out_put;
1485        }
1486
1487        if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1488                trace__printf_interrupted_entry(trace, sample);
1489
1490        ttrace->entry_time = sample->time;
1491        msg = ttrace->entry_str;
1492        printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1493
1494        printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1495                                           args, trace, thread);
1496
1497        if (sc->is_exit) {
1498                if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1499                        trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1500                        fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1501                }
1502        } else {
1503                ttrace->entry_pending = true;
1504                /* See trace__vfs_getname & trace__sys_exit */
1505                ttrace->filename.pending_open = false;
1506        }
1507
1508        if (trace->current != thread) {
1509                thread__put(trace->current);
1510                trace->current = thread__get(thread);
1511        }
1512        err = 0;
1513out_put:
1514        thread__put(thread);
1515        return err;
1516}
1517
1518static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1519                                    struct perf_sample *sample,
1520                                    struct callchain_cursor *cursor)
1521{
1522        struct addr_location al;
1523
1524        if (machine__resolve(trace->host, &al, sample) < 0 ||
1525            thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1526                return -1;
1527
1528        return 0;
1529}
1530
1531static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1532{
1533        /* TODO: user-configurable print_opts */
1534        const unsigned int print_opts = EVSEL__PRINT_SYM |
1535                                        EVSEL__PRINT_DSO |
1536                                        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1537
1538        return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1539}
1540
1541static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1542                           union perf_event *event __maybe_unused,
1543                           struct perf_sample *sample)
1544{
1545        long ret;
1546        u64 duration = 0;
1547        struct thread *thread;
1548        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1549        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1550        struct thread_trace *ttrace;
1551
1552        if (sc == NULL)
1553                return -1;
1554
1555        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1556        ttrace = thread__trace(thread, trace->output);
1557        if (ttrace == NULL)
1558                goto out_put;
1559
1560        if (trace->summary)
1561                thread__update_stats(ttrace, id, sample);
1562
1563        ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1564
1565        if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1566                trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1567                ttrace->filename.pending_open = false;
1568                ++trace->stats.vfs_getname;
1569        }
1570
1571        ttrace->exit_time = sample->time;
1572
1573        if (ttrace->entry_time) {
1574                duration = sample->time - ttrace->entry_time;
1575                if (trace__filter_duration(trace, duration))
1576                        goto out;
1577        } else if (trace->duration_filter)
1578                goto out;
1579
1580        if (sample->callchain) {
1581                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1582                if (callchain_ret == 0) {
1583                        if (callchain_cursor.nr < trace->min_stack)
1584                                goto out;
1585                        callchain_ret = 1;
1586                }
1587        }
1588
1589        if (trace->summary_only)
1590                goto out;
1591
1592        trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1593
1594        if (ttrace->entry_pending) {
1595                fprintf(trace->output, "%-70s", ttrace->entry_str);
1596        } else {
1597                fprintf(trace->output, " ... [");
1598                color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1599                fprintf(trace->output, "]: %s()", sc->name);
1600        }
1601
1602        if (sc->fmt == NULL) {
1603signed_print:
1604                fprintf(trace->output, ") = %ld", ret);
1605        } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1606                char bf[STRERR_BUFSIZE];
1607                const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1608                           *e = audit_errno_to_name(-ret);
1609
1610                fprintf(trace->output, ") = -1 %s %s", e, emsg);
1611        } else if (ret == 0 && sc->fmt->timeout)
1612                fprintf(trace->output, ") = 0 Timeout");
1613        else if (sc->fmt->hexret)
1614                fprintf(trace->output, ") = %#lx", ret);
1615        else if (sc->fmt->errpid) {
1616                struct thread *child = machine__find_thread(trace->host, ret, ret);
1617
1618                if (child != NULL) {
1619                        fprintf(trace->output, ") = %ld", ret);
1620                        if (child->comm_set)
1621                                fprintf(trace->output, " (%s)", thread__comm_str(child));
1622                        thread__put(child);
1623                }
1624        } else
1625                goto signed_print;
1626
1627        fputc('\n', trace->output);
1628
1629        if (callchain_ret > 0)
1630                trace__fprintf_callchain(trace, sample);
1631        else if (callchain_ret < 0)
1632                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1633out:
1634        ttrace->entry_pending = false;
1635        err = 0;
1636out_put:
1637        thread__put(thread);
1638        return err;
1639}
1640
1641static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1642                              union perf_event *event __maybe_unused,
1643                              struct perf_sample *sample)
1644{
1645        struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1646        struct thread_trace *ttrace;
1647        size_t filename_len, entry_str_len, to_move;
1648        ssize_t remaining_space;
1649        char *pos;
1650        const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1651
1652        if (!thread)
1653                goto out;
1654
1655        ttrace = thread__priv(thread);
1656        if (!ttrace)
1657                goto out;
1658
1659        filename_len = strlen(filename);
1660
1661        if (ttrace->filename.namelen < filename_len) {
1662                char *f = realloc(ttrace->filename.name, filename_len + 1);
1663
1664                if (f == NULL)
1665                        goto out;
1666
1667                ttrace->filename.namelen = filename_len;
1668                ttrace->filename.name = f;
1669        }
1670
1671        strcpy(ttrace->filename.name, filename);
1672        ttrace->filename.pending_open = true;
1673
1674        if (!ttrace->filename.ptr)
1675                goto out;
1676
1677        entry_str_len = strlen(ttrace->entry_str);
1678        remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1679        if (remaining_space <= 0)
1680                goto out;
1681
1682        if (filename_len > (size_t)remaining_space) {
1683                filename += filename_len - remaining_space;
1684                filename_len = remaining_space;
1685        }
1686
1687        to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1688        pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1689        memmove(pos + filename_len, pos, to_move);
1690        memcpy(pos, filename, filename_len);
1691
1692        ttrace->filename.ptr = 0;
1693        ttrace->filename.entry_str_pos = 0;
1694out:
1695        return 0;
1696}
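
/*
 * Illustrative sketch (hypothetical helper, not used above): the memmove()
 * plus memcpy() dance in trace__vfs_getname() is just "insert a string into
 * the middle of a NUL-terminated buffer", e.g. turning
 * "open(filename: , flags: CLOEXEC" into "open(filename: /etc/passwd, flags: CLOEXEC".
 * The caller must guarantee insert_len bytes of slack in the buffer, which
 * is what the remaining_space check above is for.
 */
static void __maybe_unused example__splice_str(char *buf, size_t pos,
                                               const char *insert, size_t insert_len)
{
        size_t tail = strlen(buf) - pos + 1;    /* move the trailing NUL too */

        memmove(buf + pos + insert_len, buf + pos, tail);
        memcpy(buf + pos, insert, insert_len);
}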
1697
1698static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1699                                     union perf_event *event __maybe_unused,
1700                                     struct perf_sample *sample)
1701{
1702        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1703        double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1704        struct thread *thread = machine__findnew_thread(trace->host,
1705                                                        sample->pid,
1706                                                        sample->tid);
1707        struct thread_trace *ttrace = thread__trace(thread, trace->output);
1708
1709        if (ttrace == NULL)
1710                goto out_dump;
1711
1712        ttrace->runtime_ms += runtime_ms;
1713        trace->runtime_ms += runtime_ms;
1714        thread__put(thread);
1715        return 0;
1716
1717out_dump:
1718        fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1719               evsel->name,
1720               perf_evsel__strval(evsel, sample, "comm"),
1721               (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1722               runtime,
1723               perf_evsel__intval(evsel, sample, "vruntime"));
1724        thread__put(thread);
1725        return 0;
1726}
1727
1728static void bpf_output__printer(enum binary_printer_ops op,
1729                                unsigned int val, void *extra)
1730{
1731        FILE *output = extra;
1732        unsigned char ch = (unsigned char)val;
1733
1734        switch (op) {
1735        case BINARY_PRINT_CHAR_DATA:
1736                fprintf(output, "%c", isprint(ch) ? ch : '.');
1737                break;
1738        case BINARY_PRINT_DATA_BEGIN:
1739        case BINARY_PRINT_LINE_BEGIN:
1740        case BINARY_PRINT_ADDR:
1741        case BINARY_PRINT_NUM_DATA:
1742        case BINARY_PRINT_NUM_PAD:
1743        case BINARY_PRINT_SEP:
1744        case BINARY_PRINT_CHAR_PAD:
1745        case BINARY_PRINT_LINE_END:
1746        case BINARY_PRINT_DATA_END:
1747        default:
1748                break;
1749        }
1750}
1751
1752static void bpf_output__fprintf(struct trace *trace,
1753                                struct perf_sample *sample)
1754{
1755        print_binary(sample->raw_data, sample->raw_size, 8,
1756                     bpf_output__printer, trace->output);
1757}
1758
1759static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1760                                union perf_event *event __maybe_unused,
1761                                struct perf_sample *sample)
1762{
1763        int callchain_ret = 0;
1764
1765        if (sample->callchain) {
1766                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1767                if (callchain_ret == 0) {
1768                        if (callchain_cursor.nr < trace->min_stack)
1769                                goto out;
1770                        callchain_ret = 1;
1771                }
1772        }
1773
1774        trace__printf_interrupted_entry(trace, sample);
1775        trace__fprintf_tstamp(trace, sample->time, trace->output);
1776
1777        if (trace->trace_syscalls)
1778                fprintf(trace->output, "(         ): ");
1779
1780        fprintf(trace->output, "%s:", evsel->name);
1781
1782        if (perf_evsel__is_bpf_output(evsel)) {
1783                bpf_output__fprintf(trace, sample);
1784        } else if (evsel->tp_format) {
1785                event_format__fprintf(evsel->tp_format, sample->cpu,
1786                                      sample->raw_data, sample->raw_size,
1787                                      trace->output);
1788        }
1789
1790        fprintf(trace->output, ")\n");
1791
1792        if (callchain_ret > 0)
1793                trace__fprintf_callchain(trace, sample);
1794        else if (callchain_ret < 0)
1795                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1796out:
1797        return 0;
1798}
1799
1800static void print_location(FILE *f, struct perf_sample *sample,
1801                           struct addr_location *al,
1802                           bool print_dso, bool print_sym)
1803{
1804
1805        if ((verbose || print_dso) && al->map)
1806                fprintf(f, "%s@", al->map->dso->long_name);
1807
1808        if ((verbose || print_sym) && al->sym)
1809                fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1810                        al->addr - al->sym->start);
1811        else if (al->map)
1812                fprintf(f, "0x%" PRIx64, al->addr);
1813        else
1814                fprintf(f, "0x%" PRIx64, sample->addr);
1815}
1816
1817static int trace__pgfault(struct trace *trace,
1818                          struct perf_evsel *evsel,
1819                          union perf_event *event __maybe_unused,
1820                          struct perf_sample *sample)
1821{
1822        struct thread *thread;
1823        struct addr_location al;
1824        char map_type = 'd';
1825        struct thread_trace *ttrace;
1826        int err = -1;
1827        int callchain_ret = 0;
1828
1829        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1830
1831        if (sample->callchain) {
1832                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1833                if (callchain_ret == 0) {
1834                        if (callchain_cursor.nr < trace->min_stack)
1835                                goto out_put;
1836                        callchain_ret = 1;
1837                }
1838        }
1839
1840        ttrace = thread__trace(thread, trace->output);
1841        if (ttrace == NULL)
1842                goto out_put;
1843
1844        if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1845                ttrace->pfmaj++;
1846        else
1847                ttrace->pfmin++;
1848
1849        if (trace->summary_only)
1850                goto out;
1851
1852        thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1853                              sample->ip, &al);
1854
1855        trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1856
1857        fprintf(trace->output, "%sfault [",
1858                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1859                "maj" : "min");
1860
1861        print_location(trace->output, sample, &al, false, true);
1862
1863        fprintf(trace->output, "] => ");
1864
1865        thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1866                                   sample->addr, &al);
1867
1868        if (!al.map) {
1869                thread__find_addr_location(thread, sample->cpumode,
1870                                           MAP__FUNCTION, sample->addr, &al);
1871
1872                if (al.map)
1873                        map_type = 'x';
1874                else
1875                        map_type = '?';
1876        }
1877
1878        print_location(trace->output, sample, &al, true, false);
1879
1880        fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1881
1882        if (callchain_ret > 0)
1883                trace__fprintf_callchain(trace, sample);
1884        else if (callchain_ret < 0)
1885                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1886out:
1887        err = 0;
1888out_put:
1889        thread__put(thread);
1890        return err;
1891}
1892
1893static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1894{
1895        if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1896            (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1897                return false;
1898
1899        if (trace->pid_list || trace->tid_list)
1900                return true;
1901
1902        return false;
1903}
1904
1905static void trace__set_base_time(struct trace *trace,
1906                                 struct perf_evsel *evsel,
1907                                 struct perf_sample *sample)
1908{
1909        /*
1910         * BPF events were not setting PERF_SAMPLE_TIME, so be robust and
1911         * don't use sample->time unconditionally: we may end up with some
1912         * other event in the future that has no PERF_SAMPLE_TIME for a good
1913         * reason, i.e. we may not be interested in its timestamps, just in
1914         * the fact that it took place, picking up some piece of information
1915         * when it appears in our event stream (vfs_getname comes to mind).
1916         */
1917        if (trace->base_time == 0 && !trace->full_time &&
1918            (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1919                trace->base_time = sample->time;
1920}
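
/*
 * Illustrative sketch (hypothetical helper, not wired in anywhere): once
 * base_time is set, timestamps can be shown relative to the first timed
 * sample instead of as raw nanosecond counters, assuming sample->time is
 * in nanoseconds:
 */
static double __maybe_unused example__relative_ms(u64 sample_time, u64 base_time)
{
        return (double)(sample_time - base_time) / NSEC_PER_MSEC;
}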
1921
1922static int trace__process_sample(struct perf_tool *tool,
1923                                 union perf_event *event,
1924                                 struct perf_sample *sample,
1925                                 struct perf_evsel *evsel,
1926                                 struct machine *machine __maybe_unused)
1927{
1928        struct trace *trace = container_of(tool, struct trace, tool);
1929        int err = 0;
1930
1931        tracepoint_handler handler = evsel->handler;
1932
1933        if (skip_sample(trace, sample))
1934                return 0;
1935
1936        trace__set_base_time(trace, evsel, sample);
1937
1938        if (handler) {
1939                ++trace->nr_events;
1940                handler(trace, evsel, event, sample);
1941        }
1942
1943        return err;
1944}
1945
1946static int parse_target_str(struct trace *trace)
1947{
1948        if (trace->opts.target.pid) {
1949                trace->pid_list = intlist__new(trace->opts.target.pid);
1950                if (trace->pid_list == NULL) {
1951                        pr_err("Error parsing process id string\n");
1952                        return -EINVAL;
1953                }
1954        }
1955
1956        if (trace->opts.target.tid) {
1957                trace->tid_list = intlist__new(trace->opts.target.tid);
1958                if (trace->tid_list == NULL) {
1959                        pr_err("Error parsing thread id string\n");
1960                        return -EINVAL;
1961                }
1962        }
1963
1964        return 0;
1965}
1966
1967static int trace__record(struct trace *trace, int argc, const char **argv)
1968{
1969        unsigned int rec_argc, i, j;
1970        const char **rec_argv;
1971        const char * const record_args[] = {
1972                "record",
1973                "-R",
1974                "-m", "1024",
1975                "-c", "1",
1976        };
1977
1978        const char * const sc_args[] = { "-e", };
1979        unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1980        const char * const majpf_args[] = { "-e", "major-faults" };
1981        unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1982        const char * const minpf_args[] = { "-e", "minor-faults" };
1983        unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1984
1985        /* +1 is for the event string below */
1986        rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1987                majpf_args_nr + minpf_args_nr + argc;
1988        rec_argv = calloc(rec_argc + 1, sizeof(char *));
1989
1990        if (rec_argv == NULL)
1991                return -ENOMEM;
1992
1993        j = 0;
1994        for (i = 0; i < ARRAY_SIZE(record_args); i++)
1995                rec_argv[j++] = record_args[i];
1996
1997        if (trace->trace_syscalls) {
1998                for (i = 0; i < sc_args_nr; i++)
1999                        rec_argv[j++] = sc_args[i];
2000
2001                /* event string may be different for older kernels - e.g., RHEL6 */
2002                if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2003                        rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2004                else if (is_valid_tracepoint("syscalls:sys_enter"))
2005                        rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2006                else {
2007                        pr_err("Neither raw_syscalls nor syscalls events exist.\n");
                            free(rec_argv);
2008                        return -1;
2009                }
2010        }
2011
2012        if (trace->trace_pgfaults & TRACE_PFMAJ)
2013                for (i = 0; i < majpf_args_nr; i++)
2014                        rec_argv[j++] = majpf_args[i];
2015
2016        if (trace->trace_pgfaults & TRACE_PFMIN)
2017                for (i = 0; i < minpf_args_nr; i++)
2018                        rec_argv[j++] = minpf_args[i];
2019
2020        for (i = 0; i < (unsigned int)argc; i++)
2021                rec_argv[j++] = argv[i];
2022
2023        return cmd_record(j, rec_argv, NULL);
2024}
2025
2026static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2027
2028static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2029{
2030        struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2031
2032        if (IS_ERR(evsel))
2033                return false;
2034
2035        if (perf_evsel__field(evsel, "pathname") == NULL) {
2036                perf_evsel__delete(evsel);
2037                return false;
2038        }
2039
2040        evsel->handler = trace__vfs_getname;
2041        perf_evlist__add(evlist, evsel);
2042        return true;
2043}
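
/*
 * Note, not from the original sources: probe:vfs_getname is not a stock
 * tracepoint, it has to be created with 'perf probe' before 'perf trace'
 * can pick it up here.  One form that has been used looks roughly like the
 * line below; the exact probe point and offset are kernel dependent, so
 * treat it strictly as an example:
 *
 *   perf probe -a 'vfs_getname=getname_flags:72 pathname=result->name:string'
 */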
2044
2045static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2046{
2047        struct perf_evsel *evsel;
2048        struct perf_event_attr attr = {
2049                .type = PERF_TYPE_SOFTWARE,
2050                .mmap_data = 1,
2051        };
2052
2053        attr.config = config;
2054        attr.sample_period = 1;
2055
2056        event_attr_init(&attr);
2057
2058        evsel = perf_evsel__new(&attr);
2059        if (evsel)
2060                evsel->handler = trace__pgfault;
2061
2062        return evsel;
2063}
2064
2065static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2066{
2067        const u32 type = event->header.type;
2068        struct perf_evsel *evsel;
2069
2070        if (type != PERF_RECORD_SAMPLE) {
2071                trace__process_event(trace, trace->host, event, sample);
2072                return;
2073        }
2074
2075        evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2076        if (evsel == NULL) {
2077                fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2078                return;
2079        }
2080
2081        trace__set_base_time(trace, evsel, sample);
2082
2083        if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2084            sample->raw_data == NULL) {
2085                fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2086                       perf_evsel__name(evsel), sample->tid,
2087                       sample->cpu, sample->raw_size);
2088        } else {
2089                tracepoint_handler handler = evsel->handler;
2090                handler(trace, evsel, event, sample);
2091        }
2092}
2093
2094static int trace__add_syscall_newtp(struct trace *trace)
2095{
2096        int ret = -1;
2097        struct perf_evlist *evlist = trace->evlist;
2098        struct perf_evsel *sys_enter, *sys_exit;
2099
2100        sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2101        if (sys_enter == NULL)
2102                goto out;
2103
2104        if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2105                goto out_delete_sys_enter;
2106
2107        sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2108        if (sys_exit == NULL)
2109                goto out_delete_sys_enter;
2110
2111        if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2112                goto out_delete_sys_exit;
2113
2114        perf_evlist__add(evlist, sys_enter);
2115        perf_evlist__add(evlist, sys_exit);
2116
2117        if (callchain_param.enabled && !trace->kernel_syscallchains) {
2118                /*
2119                 * We're interested only in the user space callchain
2120                 * leading to the syscall, allow overriding that for
2121                 * debugging reasons using --kernel-syscall-graph
2122                 */
2123                sys_exit->attr.exclude_callchain_kernel = 1;
2124        }
2125
2126        trace->syscalls.events.sys_enter = sys_enter;
2127        trace->syscalls.events.sys_exit  = sys_exit;
2128
2129        ret = 0;
2130out:
2131        return ret;
2132
2133out_delete_sys_exit:
2134        perf_evsel__delete_priv(sys_exit);
2135out_delete_sys_enter:
2136        perf_evsel__delete_priv(sys_enter);
2137        goto out;
2138}
2139
2140static int trace__set_ev_qualifier_filter(struct trace *trace)
2141{
2142        int err = -1;
2143        char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2144                                                trace->ev_qualifier_ids.nr,
2145                                                trace->ev_qualifier_ids.entries);
2146
2147        if (filter == NULL)
2148                goto out_enomem;
2149
2150        if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2151                err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2152
2153        free(filter);
2154out:
2155        return err;
2156out_enomem:
2157        errno = ENOMEM;
2158        goto out;
2159}
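
/*
 * Illustrative sketch (simplified stand-in for asprintf_expr_inout_ints()):
 * for a qualifier like -e open,close the generated tracepoint filter ends
 * up looking roughly like "id == 2 || id == 3", while the negated form
 * chains "id != ... && id != ..." instead.  A minimal builder for the
 * first form:
 */
static char * __maybe_unused example__build_id_filter(const int *ids, size_t nr)
{
        size_t i, len = 0, alloc = nr * 32 + 1;
        char *filter = malloc(alloc);

        if (filter == NULL)
                return NULL;

        filter[0] = '\0';
        for (i = 0; i < nr; i++)
                len += scnprintf(filter + len, alloc - len, "%sid == %d",
                                 i ? " || " : "", ids[i]);
        return filter;
}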
2160
2161static int trace__run(struct trace *trace, int argc, const char **argv)
2162{
2163        struct perf_evlist *evlist = trace->evlist;
2164        struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2165        int err = -1, i;
2166        unsigned long before;
2167        const bool forks = argc > 0;
2168        bool draining = false;
2169
2170        trace->live = true;
2171
2172        if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2173                goto out_error_raw_syscalls;
2174
2175        if (trace->trace_syscalls)
2176                trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2177
2178        if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2179                pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2180                if (pgfault_maj == NULL)
2181                        goto out_error_mem;
2182                perf_evlist__add(evlist, pgfault_maj);
2183        }
2184
2185        if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2186                pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2187                if (pgfault_min == NULL)
2188                        goto out_error_mem;
2189                perf_evlist__add(evlist, pgfault_min);
2190        }
2191
2192        if (trace->sched &&
2193            perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2194                                   trace__sched_stat_runtime))
2195                goto out_error_sched_stat_runtime;
2196
2197        err = perf_evlist__create_maps(evlist, &trace->opts.target);
2198        if (err < 0) {
2199                fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2200                goto out_delete_evlist;
2201        }
2202
2203        err = trace__symbols_init(trace, evlist);
2204        if (err < 0) {
2205                fprintf(trace->output, "Problems initializing symbol libraries!\n");
2206                goto out_delete_evlist;
2207        }
2208
2209        perf_evlist__config(evlist, &trace->opts, NULL);
2210
2211        if (callchain_param.enabled) {
2212                bool use_identifier = false;
2213
2214                if (trace->syscalls.events.sys_exit) {
2215                        perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2216                                                     &trace->opts, &callchain_param);
2217                        use_identifier = true;
2218                }
2219
2220                if (pgfault_maj) {
2221                        perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2222                        use_identifier = true;
2223                }
2224
2225                if (pgfault_min) {
2226                        perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2227                        use_identifier = true;
2228                }
2229
2230                if (use_identifier) {
2231                       /*
2232                        * Now we have evsels with different sample_ids, use
2233                        * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2234                        * from a fixed position in each ring buffer record.
2235                        *
2236                        * As of the changeset introducing this comment, this
2237                        * isn't strictly needed, as the fields that can come before
2238                        * PERF_SAMPLE_ID are all used, but we'll probably disable
2239                        * some of those for things like copying the payload of
2240                        * pointer syscall arguments, and for vfs_getname we don't
2241                        * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2242                        * here as a reminder that we need to use PERF_SAMPLE_IDENTIFIER.
2243                        */
2244                        perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2245                        perf_evlist__reset_sample_bit(evlist, ID);
2246                }
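                /*
                 * Background note, not from the original sources: with
                 * PERF_SAMPLE_IDENTIFIER the event id sits at a fixed
                 * offset in every record (first u64 of a sample, last
                 * u64 of the sample_id trailer on other record types),
                 * so a record can be matched to its evsel before knowing
                 * which other sample_type bits that evsel enabled.
                 */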
2247        }
2248
2249        signal(SIGCHLD, sig_handler);
2250        signal(SIGINT, sig_handler);
2251
2252        if (forks) {
2253                err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2254                                                    argv, false, NULL);
2255                if (err < 0) {
2256                        fprintf(trace->output, "Couldn't run the workload!\n");
2257                        goto out_delete_evlist;
2258                }
2259        }
2260
2261        err = perf_evlist__open(evlist);
2262        if (err < 0)
2263                goto out_error_open;
2264
2265        err = bpf__apply_obj_config();
2266        if (err) {
2267                char errbuf[BUFSIZ];
2268
2269                bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2270                pr_err("ERROR: Apply config to BPF failed: %s\n",
2271                         errbuf);
2272                goto out_error_open;
2273        }
2274
2275        /*
2276         * Better not use !target__has_task() here because we need to cover the
2277         * case where no threads were specified in the command line, but a
2278         * workload was, and in that case we will fill in the thread_map when
2279         * we fork the workload in perf_evlist__prepare_workload.
2280         */
2281        if (trace->filter_pids.nr > 0)
2282                err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2283        else if (thread_map__pid(evlist->threads, 0) == -1)
2284                err = perf_evlist__set_filter_pid(evlist, getpid());
2285
2286        if (err < 0)
2287                goto out_error_mem;
2288
2289        if (trace->ev_qualifier_ids.nr > 0) {
2290                err = trace__set_ev_qualifier_filter(trace);
2291                if (err < 0)
2292                        goto out_errno;
2293
2294                pr_debug("event qualifier tracepoint filter: %s\n",
2295                         trace->syscalls.events.sys_exit->filter);
2296        }
2297
2298        err = perf_evlist__apply_filters(evlist, &evsel);
2299        if (err < 0)
2300                goto out_error_apply_filters;
2301
2302        err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2303        if (err < 0)
2304                goto out_error_mmap;
2305
2306        if (!target__none(&trace->opts.target))
2307                perf_evlist__enable(evlist);
2308
2309        if (forks)
2310                perf_evlist__start_workload(evlist);
2311
2312        trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2313                                  evlist->threads->nr > 1 ||
2314                                  perf_evlist__first(evlist)->attr.inherit;
2315again:
2316        before = trace->nr_events;
2317
2318        for (i = 0; i < evlist->nr_mmaps; i++) {
2319                union perf_event *event;
2320
2321                while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2322                        struct perf_sample sample;
2323
2324                        ++trace->nr_events;
2325
2326                        err = perf_evlist__parse_sample(evlist, event, &sample);
2327                        if (err) {
2328                                fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2329                                goto next_event;
2330                        }
2331
2332                        trace__handle_event(trace, event, &sample);
2333next_event:
2334                        perf_evlist__mmap_consume(evlist, i);
2335
2336                        if (interrupted)
2337                                goto out_disable;
2338
2339                        if (done && !draining) {
2340                                perf_evlist__disable(evlist);
2341                                draining = true;
2342                        }
2343                }
2344        }
2345
2346        if (trace->nr_events == before) {
2347                int timeout = done ? 100 : -1;
2348
2349                if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2350                        if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2351                                draining = true;
2352
2353                        goto again;
2354                }
2355        } else {
2356                goto again;
2357        }
2358
2359out_disable:
2360        thread__zput(trace->current);
2361
2362        perf_evlist__disable(evlist);
2363
2364        if (!err) {
2365                if (trace->summary)
2366                        trace__fprintf_thread_summary(trace, trace->output);
2367
2368                if (trace->show_tool_stats) {
2369                        fprintf(trace->output, "Stats:\n "
2370                                               " vfs_getname : %" PRIu64 "\n"
2371                                               " proc_getname: %" PRIu64 "\n",
2372                                trace->stats.vfs_getname,
2373                                trace->stats.proc_getname);
2374                }
2375        }
2376
2377out_delete_evlist:
2378        perf_evlist__delete(evlist);
2379        trace->evlist = NULL;
2380        trace->live = false;
2381        return err;
2382{
2383        char errbuf[BUFSIZ];
2384
2385out_error_sched_stat_runtime:
2386        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2387        goto out_error;
2388
2389out_error_raw_syscalls:
2390        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2391        goto out_error;
2392
2393out_error_mmap:
2394        perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2395        goto out_error;
2396
2397out_error_open:
2398        perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2399
2400out_error:
2401        fprintf(trace->output, "%s\n", errbuf);
2402        goto out_delete_evlist;
2403
2404out_error_apply_filters:
2405        fprintf(trace->output,
2406                "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2407                evsel->filter, perf_evsel__name(evsel), errno,
2408                str_error_r(errno, errbuf, sizeof(errbuf)));
2409        goto out_delete_evlist;
2410}
2411out_error_mem:
2412        fprintf(trace->output, "Not enough memory to run!\n");
2413        goto out_delete_evlist;
2414
2415out_errno:
2416        fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2417        goto out_delete_evlist;
2418}
2419
2420static int trace__replay(struct trace *trace)
2421{
2422        const struct perf_evsel_str_handler handlers[] = {
2423                { "probe:vfs_getname",       trace__vfs_getname, },
2424        };
2425        struct perf_data_file file = {
2426                .path  = input_name,
2427                .mode  = PERF_DATA_MODE_READ,
2428                .force = trace->force,
2429        };
2430        struct perf_session *session;
2431        struct perf_evsel *evsel;
2432        int err = -1;
2433
2434        trace->tool.sample        = trace__process_sample;
2435        trace->tool.mmap          = perf_event__process_mmap;
2436        trace->tool.mmap2         = perf_event__process_mmap2;
2437        trace->tool.comm          = perf_event__process_comm;
2438        trace->tool.exit          = perf_event__process_exit;
2439        trace->tool.fork          = perf_event__process_fork;
2440        trace->tool.attr          = perf_event__process_attr;
2441        trace->tool.tracing_data  = perf_event__process_tracing_data;
2442        trace->tool.build_id      = perf_event__process_build_id;
2443
2444        trace->tool.ordered_events = true;
2445        trace->tool.ordering_requires_timestamps = true;
2446
2447        /* add tid to output */
2448        trace->multiple_threads = true;
2449
2450        session = perf_session__new(&file, false, &trace->tool);
2451        if (session == NULL)
2452                return -1;
2453
2454        if (symbol__init(&session->header.env) < 0)
2455                goto out;
2456
2457        trace->host = &session->machines.host;
2458
2459        err = perf_session__set_tracepoints_handlers(session, handlers);
2460        if (err)
2461                goto out;
2462
2463        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2464                                                     "raw_syscalls:sys_enter");
2465        /* older kernels have syscalls tp versus raw_syscalls */
2466        if (evsel == NULL)
2467                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2468                                                             "syscalls:sys_enter");
2469
2470        if (evsel &&
2471            (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2472            perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2473                pr_err("Error initializing raw_syscalls:sys_enter event\n");
2474                goto out;
2475        }
2476
2477        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2478                                                     "raw_syscalls:sys_exit");
2479        if (evsel == NULL)
2480                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2481                                                             "syscalls:sys_exit");
2482        if (evsel &&
2483            (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2484            perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2485                pr_err("Error initializing raw_syscalls:sys_exit event\n");
2486                goto out;
2487        }
2488
2489        evlist__for_each_entry(session->evlist, evsel) {
2490                if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2491                    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2492                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2493                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2494                        evsel->handler = trace__pgfault;
2495        }
2496
2497        err = parse_target_str(trace);
2498        if (err != 0)
2499                goto out;
2500
2501        setup_pager();
2502
2503        err = perf_session__process_events(session);
2504        if (err)
2505                pr_err("Failed to process events, error %d\n", err);
2507        else if (trace->summary)
2508                trace__fprintf_thread_summary(trace, trace->output);
2509
2510out:
2511        perf_session__delete(session);
2512
2513        return err;
2514}
2515
2516static size_t trace__fprintf_threads_header(FILE *fp)
2517{
2518        size_t printed;
2519
2520        printed  = fprintf(fp, "\n Summary of events:\n\n");
2521
2522        return printed;
2523}
2524
2525DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2526        struct stats    *stats;
2527        double          msecs;
2528        int             syscall;
2529)
2530{
2531        struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2532        struct stats *stats = source->priv;
2533
2534        entry->syscall = source->i;
2535        entry->stats   = stats;
2536        entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2537}
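
/*
 * Worked example for the msecs computation above, not from the original
 * sources: avg_stats() returns the mean syscall duration in nanoseconds,
 * so 1000 calls averaging 250000 ns give
 * 1000 * (250000 / NSEC_PER_MSEC) = 1000 * 0.25 = 250 msecs total,
 * which is the value the per-thread summary is sorted on.
 */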
2538
2539static size_t thread__dump_stats(struct thread_trace *ttrace,
2540                                 struct trace *trace, FILE *fp)
2541{
2542        size_t printed = 0;
2543        struct syscall *sc;
2544        struct rb_node *nd;
2545        DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2546
2547        if (syscall_stats == NULL)
2548                return 0;
2549
2550        printed += fprintf(fp, "\n");
2551
2552        printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2553        printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2554        printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2555
2556        resort_rb__for_each_entry(nd, syscall_stats) {
2557                struct stats *stats = syscall_stats_entry->stats;
2558                if (stats) {
2559                        double min = (double)(stats->min) / NSEC_PER_MSEC;
2560                        double max = (double)(stats->max) / NSEC_PER_MSEC;
2561                        double avg = avg_stats(stats);
2562                        double pct;
2563                        u64 n = (u64) stats->n;
2564
2565                        pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2566                        avg /= NSEC_PER_MSEC;
2567
2568                        sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2569                        printed += fprintf(fp, "   %-15s", sc->name);
2570                        printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2571                                           n, syscall_stats_entry->msecs, min, avg);
2572                        printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2573                }
2574        }
2575
2576        resort_rb__delete(syscall_stats);
2577        printed += fprintf(fp, "\n\n");
2578
2579        return printed;
2580}
2581
2582static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2583{
2584        size_t printed = 0;
2585        struct thread_trace *ttrace = thread__priv(thread);
2586        double ratio;
2587
2588        if (ttrace == NULL)
2589                return 0;
2590
2591        ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2592
2593        printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2594        printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2595        printed += fprintf(fp, "%.1f%%", ratio);
2596        if (ttrace->pfmaj)
2597                printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2598        if (ttrace->pfmin)
2599                printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2600        if (trace->sched)
2601                printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2602        else if (fputc('\n', fp) != EOF)
2603                ++printed;
2604
2605        printed += thread__dump_stats(ttrace, trace, fp);
2606
2607        return printed;
2608}
2609
2610static unsigned long thread__nr_events(struct thread_trace *ttrace)
2611{
2612        return ttrace ? ttrace->nr_events : 0;
2613}
2614
2615DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2616        struct thread *thread;
2617)
2618{
2619        entry->thread = rb_entry(nd, struct thread, rb_node);
2620}
2621
2622static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2623{
2624        DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2625        size_t printed = trace__fprintf_threads_header(fp);
2626        struct rb_node *nd;
2627
2628        if (threads == NULL) {
2629                fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2630                return 0;
2631        }
2632
2633        resort_rb__for_each_entry(nd, threads)
2634                printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2635
2636        resort_rb__delete(threads);
2637
2638        return printed;
2639}
2640
2641static int trace__set_duration(const struct option *opt, const char *str,
2642                               int unset __maybe_unused)
2643{
2644        struct trace *trace = opt->value;
2645
2646        trace->duration_filter = atof(str);
2647        return 0;
2648}
2649
2650static int trace__set_filter_pids(const struct option *opt, const char *str,
2651                                  int unset __maybe_unused)
2652{
2653        int ret = -1;
2654        size_t i;
2655        struct trace *trace = opt->value;
2656        /*
2657         * FIXME: introduce a intarray class, plain parse csv and create a
2658         * { int nr, int entries[] } struct...
2659         */
2660        struct intlist *list = intlist__new(str);
2661
2662        if (list == NULL)
2663                return -1;
2664
2665        i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2666        trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2667
2668        if (trace->filter_pids.entries == NULL)
2669                goto out;
2670
2671        trace->filter_pids.entries[0] = getpid();
2672
2673        for (i = 1; i < trace->filter_pids.nr; ++i)
2674                trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2675
2676        intlist__delete(list);
2677        ret = 0;
2678out:
2679        return ret;
2680}
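
/*
 * Sketch of the intarray idea from the FIXME above (hypothetical, not wired
 * in anywhere): parse a plain CSV string straight into a { nr, entries[] }
 * pair instead of going through an intlist.  Allocation failures aside,
 * malformed input is silently tolerated to keep the sketch short.
 */
struct example_intarray {
        size_t  nr;
        int     *entries;
};

static int __maybe_unused example_intarray__parse(struct example_intarray *ia, const char *str)
{
        const char *p;
        size_t i = 0;

        ia->nr = 1;
        for (p = str; *p; p++)  /* upper bound: one entry per comma, plus one */
                if (*p == ',')
                        ia->nr++;

        ia->entries = calloc(ia->nr, sizeof(int));
        if (ia->entries == NULL)
                return -1;

        for (p = str; *p && i < ia->nr; i++) {
                char *end;

                ia->entries[i] = strtol(p, &end, 10);
                p = (*end == ',') ? end + 1 : end;
        }

        ia->nr = i;
        return 0;
}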
2681
2682static int trace__open_output(struct trace *trace, const char *filename)
2683{
2684        struct stat st;
2685
2686        if (!stat(filename, &st) && st.st_size) {
2687                char oldname[PATH_MAX];
2688
2689                scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2690                unlink(oldname);
2691                rename(filename, oldname);
2692        }
2693
2694        trace->output = fopen(filename, "w");
2695
2696        return trace->output == NULL ? -errno : 0;
2697}
2698
2699static int parse_pagefaults(const struct option *opt, const char *str,
2700                            int unset __maybe_unused)
2701{
2702        int *trace_pgfaults = opt->value;
2703
2704        if (strcmp(str, "all") == 0)
2705                *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2706        else if (strcmp(str, "maj") == 0)
2707                *trace_pgfaults |= TRACE_PFMAJ;
2708        else if (strcmp(str, "min") == 0)
2709                *trace_pgfaults |= TRACE_PFMIN;
2710        else
2711                return -1;
2712
2713        return 0;
2714}
2715
2716static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2717{
2718        struct perf_evsel *evsel;
2719
2720        evlist__for_each_entry(evlist, evsel)
2721                evsel->handler = handler;
2722}
2723
2724int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2725{
2726        const char *trace_usage[] = {
2727                "perf trace [<options>] [<command>]",
2728                "perf trace [<options>] -- <command> [<options>]",
2729                "perf trace record [<options>] [<command>]",
2730                "perf trace record [<options>] -- <command> [<options>]",
2731                NULL
2732        };
2733        struct trace trace = {
2734                .syscalls = {
2735                        .max = -1,
2736                },
2737                .opts = {
2738                        .target = {
2739                                .uid       = UINT_MAX,
2740                                .uses_mmap = true,
2741                        },
2742                        .user_freq     = UINT_MAX,
2743                        .user_interval = ULLONG_MAX,
2744                        .no_buffering  = true,
2745                        .mmap_pages    = UINT_MAX,
2746                        .proc_map_timeout  = 500,
2747                },
2748                .output = stderr,
2749                .show_comm = true,
2750                .trace_syscalls = true,
2751                .kernel_syscallchains = false,
2752                .max_stack = UINT_MAX,
2753        };
2754        const char *output_name = NULL;
2755        const char *ev_qualifier_str = NULL;
2756        const struct option trace_options[] = {
2757        OPT_CALLBACK(0, "event", &trace.evlist, "event",
2758                     "event selector. use 'perf list' to list available events",
2759                     parse_events_option),
2760        OPT_BOOLEAN(0, "comm", &trace.show_comm,
2761                    "show the thread COMM next to its id"),
2762        OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2763        OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2764        OPT_STRING('o', "output", &output_name, "file", "output file name"),
2765        OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2766        OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2767                    "trace events on existing process id"),
2768        OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2769                    "trace events on existing thread id"),
2770        OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2771                     "pids to filter (by the kernel)", trace__set_filter_pids),
2772        OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2773                    "system-wide collection from all CPUs"),
2774        OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2775                    "list of cpus to monitor"),
2776        OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2777                    "child tasks do not inherit counters"),
2778        OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2779                     "number of mmap data pages",
2780                     perf_evlist__parse_mmap_pages),
2781        OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2782                   "user to profile"),
2783        OPT_CALLBACK(0, "duration", &trace, "float",
2784                     "show only events with duration > N.M ms",
2785                     trace__set_duration),
2786        OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2787        OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2788        OPT_BOOLEAN('T', "time", &trace.full_time,
2789                    "Show full timestamp, not time relative to first start"),
2790        OPT_BOOLEAN('s', "summary", &trace.summary_only,
2791                    "Show only syscall summary with statistics"),
2792        OPT_BOOLEAN('S', "with-summary", &trace.summary,
2793                    "Show all syscalls and summary with statistics"),
2794        OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2795                     "Trace pagefaults", parse_pagefaults, "maj"),
2796        OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2797        OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2798        OPT_CALLBACK(0, "call-graph", &trace.opts,
2799                     "record_mode[,record_size]", record_callchain_help,
2800                     &record_parse_callchain_opt),
2801        OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2802                    "Show the kernel callchains on the syscall exit path"),
2803        OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2804                     "Set the minimum stack depth when parsing the callchain, "
2805                     "anything below the specified depth will be ignored."),
2806        OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2807                     "Set the maximum stack depth when parsing the callchain, "
2808                     "anything beyond the specified depth will be ignored. "
2809                     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2810        OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2811                        "per thread proc mmap processing timeout in ms"),
2812        OPT_END()
2813        };
2814        bool __maybe_unused max_stack_user_set = true;
2815        bool mmap_pages_user_set = true;
2816        const char * const trace_subcommands[] = { "record", NULL };
2817        int err;
2818        char bf[BUFSIZ];
2819
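            /*
             * Have 'perf trace' dump its own backtrace if it crashes or hits
             * a floating point exception.
             */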
2820        signal(SIGSEGV, sighandler_dump_stack);
2821        signal(SIGFPE, sighandler_dump_stack);
2822
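            /*
             * The evsel list and the per-architecture syscall table are both
             * mandatory, bail out if either allocation fails.
             */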
2823        trace.evlist = perf_evlist__new();
2824        trace.sctbl = syscalltbl__new();
2825
2826        if (trace.evlist == NULL || trace.sctbl == NULL) {
2827                pr_err("Not enough memory to run!\n");
2828                err = -ENOMEM;
2829                goto out;
2830        }
2831
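            /*
             * Stop parsing at the first non-option argument so that both the
             * 'record' subcommand and a workload command line can follow the
             * trace options.
             */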
2832        argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2833                                 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2834
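            /*
             * Hook up the special event used to collect the output emitted by
             * BPF programs loaded via --event, if any.
             */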
2835        err = bpf__setup_stdout(trace.evlist);
2836        if (err) {
2837                bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2838                pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2839                goto out;
2840        }
2841
2842        err = -1;
2843
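            /*
             * Page fault tracing needs the faulting address and a timestamp
             * in each sample.
             */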
2844        if (trace.trace_pgfaults) {
2845                trace.opts.sample_address = true;
2846                trace.opts.sample_time = true;
2847        }
2848
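            /*
             * UINT_MAX is the "not set by the user" sentinel for --mmap-pages
             * and --max-stack, remember which were given so that suitable
             * defaults can be applied below.
             */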
2849        if (trace.opts.mmap_pages == UINT_MAX)
2850                mmap_pages_user_set = false;
2851
2852        if (trace.max_stack == UINT_MAX) {
2853                trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2854                max_stack_user_set = false;
2855        }
2856
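            /*
             * --min-stack/--max-stack imply callchains: if --call-graph wasn't
             * given, default to DWARF unwinding for the syscall events.
             */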
2857#ifdef HAVE_DWARF_UNWIND_SUPPORT
2858        if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2859                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2860#endif
2861
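            /*
             * Callchains need larger ring buffers: when running as root and
             * --mmap-pages wasn't given, use four times the mlock limit.
             */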
2862        if (callchain_param.enabled) {
2863                if (!mmap_pages_user_set && geteuid() == 0)
2864                        trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2865
2866                symbol_conf.use_callchain = true;
2867        }
2868
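            /* Events added with --event are printed by the generic trace event handler. */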
2869        if (trace.evlist->nr_entries > 0)
2870                evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2871
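            /*
             * The 'record' subcommand hands off to trace__record(), which runs
             * the equivalent of 'perf record' with the syscall events added.
             */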
2872        if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2873                return trace__record(&trace, argc-1, &argv[1]);
2874
2875        /* summary_only implies summary option, but don't overwrite summary if set */
2876        if (trace.summary_only)
2877                trace.summary = trace.summary_only;
2878
2879        if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2880            trace.evlist->nr_entries == 0 /* Was --event used? */) {
2881                pr_err("Please specify something to trace.\n");
2882                return -1;
2883        }
2884
2885        if (!trace.trace_syscalls && ev_qualifier_str) {
2886                pr_err("The -e option can't be used with --no-syscalls.\n");
2887                goto out;
2888        }
2889
2890        if (output_name != NULL) {
2891                err = trace__open_output(&trace, output_name);
2892                if (err < 0) {
2893                        perror("failed to create output file");
2894                        goto out;
2895                }
2896        }
2897
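            /*
             * Look up the syscall id for "open" up front, it is used later
             * when pretty printing filenames.
             */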
2898        trace.open_id = syscalltbl__id(trace.sctbl, "open");
2899
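            /*
             * -e/--expr takes a comma-separated list of syscall names or of
             * strace-like group files looked up under STRACE_GROUPS_DIR; a
             * leading '!' turns it into a "trace everything but these" filter.
             */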
2900        if (ev_qualifier_str != NULL) {
2901                const char *s = ev_qualifier_str;
2902                struct strlist_config slist_config = {
2903                        .dirname = system_path(STRACE_GROUPS_DIR),
2904                };
2905
2906                trace.not_ev_qualifier = *s == '!';
2907                if (trace.not_ev_qualifier)
2908                        ++s;
2909                trace.ev_qualifier = strlist__new(s, &slist_config);
2910                if (trace.ev_qualifier == NULL) {
2911                        fputs("Not enough memory to parse event qualifier\n",
2912                              trace.output);
2913                        err = -ENOMEM;
2914                        goto out_close;
2915                }
2916
2917                err = trace__validate_ev_qualifier(&trace);
2918                if (err)
2919                        goto out_close;
2920        }
2921
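            /*
             * Check that the target options (--pid/--tid/--cpu/-a/--uid) form
             * a valid combination, then resolve the --uid string to a numeric
             * uid.
             */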
2922        err = target__validate(&trace.opts.target);
2923        if (err) {
2924                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2925                fprintf(trace.output, "%s", bf);
2926                goto out_close;
2927        }
2928
2929        err = target__parse_uid(&trace.opts.target);
2930        if (err) {
2931                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2932                fprintf(trace.output, "%s", bf);
2933                goto out_close;
2934        }
2935
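            /* No workload and no target specified: trace the whole system. */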
2936        if (!argc && target__none(&trace.opts.target))
2937                trace.opts.target.system_wide = true;
2938
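            /* -i replays a previously recorded perf.data file, otherwise trace live. */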
2939        if (input_name)
2940                err = trace__replay(&trace);
2941        else
2942                err = trace__run(&trace, argc, argv);
2943
2944out_close:
2945        if (output_name != NULL)
2946                fclose(trace.output);
2947out:
2948        return err;
2949}
2950
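    /*
     * Illustrative example invocations, based on the options defined in
     * trace_options above:
     *
     *   perf trace ls                    # strace-like trace of a workload
     *   perf trace -e open,close -p 1234 # only those syscalls, existing pid
     *   perf trace -s sleep 1            # per-syscall summary only
     *   perf trace --no-syscalls -F maj -a sleep 10  # major faults, system wide
     */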