linux/tools/perf/builtin-trace.c
   1/*
   2 * builtin-trace.c
   3 *
   4 * Builtin 'trace' command:
   5 *
   6 * Display a continuously updated trace of any workload, CPU, specific PID,
   7 * system wide, etc.  Default format is loosely strace like, but any other
   8 * event may be specified using --event.
   9 *
  10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  11 *
  12 * Initially based on the 'trace' prototype by Thomas Gleixner:
  13 *
  14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
  15 *
  16 * Released under the GPL v2. (and only v2, not any later version)
  17 */
  18
  19#include <traceevent/event-parse.h>
  20#include <api/fs/tracing_path.h>
  21#include "builtin.h"
  22#include "util/color.h"
  23#include "util/debug.h"
  24#include "util/evlist.h"
  25#include <subcmd/exec-cmd.h>
  26#include "util/machine.h"
  27#include "util/session.h"
  28#include "util/thread.h"
  29#include <subcmd/parse-options.h>
  30#include "util/strlist.h"
  31#include "util/intlist.h"
  32#include "util/thread_map.h"
  33#include "util/stat.h"
  34#include "trace-event.h"
  35#include "util/parse-events.h"
  36#include "util/bpf-loader.h"
  37#include "callchain.h"
  38#include "syscalltbl.h"
  39#include "rb_resort.h"
  40
  41#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
  42#include <stdlib.h>
  43#include <linux/err.h>
  44#include <linux/filter.h>
  45#include <linux/audit.h>
  46#include <linux/random.h>
  47#include <linux/stringify.h>
  48#include <linux/time64.h>
  49
  50#ifndef O_CLOEXEC
  51# define O_CLOEXEC              02000000
  52#endif
  53
  54struct trace {
  55        struct perf_tool        tool;
  56        struct syscalltbl       *sctbl;
  57        struct {
  58                int             max;
  59                struct syscall  *table;
  60                struct {
  61                        struct perf_evsel *sys_enter,
  62                                          *sys_exit;
  63                }               events;
  64        } syscalls;
  65        struct record_opts      opts;
  66        struct perf_evlist      *evlist;
  67        struct machine          *host;
  68        struct thread           *current;
  69        u64                     base_time;
  70        FILE                    *output;
  71        unsigned long           nr_events;
  72        struct strlist          *ev_qualifier;
  73        struct {
  74                size_t          nr;
  75                int             *entries;
  76        }                       ev_qualifier_ids;
  77        struct intlist          *tid_list;
  78        struct intlist          *pid_list;
  79        struct {
  80                size_t          nr;
  81                pid_t           *entries;
  82        }                       filter_pids;
  83        double                  duration_filter;
  84        double                  runtime_ms;
  85        struct {
  86                u64             vfs_getname,
  87                                proc_getname;
  88        } stats;
  89        unsigned int            max_stack;
  90        unsigned int            min_stack;
  91        bool                    not_ev_qualifier;
  92        bool                    live;
  93        bool                    full_time;
  94        bool                    sched;
  95        bool                    multiple_threads;
  96        bool                    summary;
  97        bool                    summary_only;
  98        bool                    show_comm;
  99        bool                    show_tool_stats;
 100        bool                    trace_syscalls;
 101        bool                    kernel_syscallchains;
 102        bool                    force;
 103        bool                    vfs_getname;
 104        int                     trace_pgfaults;
 105        int                     open_id;
 106};
 107
 108struct tp_field {
 109        int offset;
 110        union {
 111                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
 112                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
 113        };
 114};
 115
 116#define TP_UINT_FIELD(bits) \
 117static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
 118{ \
 119        u##bits value; \
 120        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 121        return value;  \
 122}
 123
 124TP_UINT_FIELD(8);
 125TP_UINT_FIELD(16);
 126TP_UINT_FIELD(32);
 127TP_UINT_FIELD(64);
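
/*
 * For reference, a sketch of what TP_UINT_FIELD(32) above expands to: a
 * reader that copies a possibly unaligned 32-bit value out of the raw
 * tracepoint payload and widens it to u64:
 *
 *      static u64 tp_field__u32(struct tp_field *field, struct perf_sample *sample)
 *      {
 *              u32 value;
 *              memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *              return value;
 *      }
 *
 * The memcpy() is what keeps the load safe when raw_data + offset is not
 * naturally aligned for the field type.
 */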
 128
 129#define TP_UINT_FIELD__SWAPPED(bits) \
 130static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
 131{ \
 132        u##bits value; \
 133        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 134        return bswap_##bits(value);\
 135}
 136
 137TP_UINT_FIELD__SWAPPED(16);
 138TP_UINT_FIELD__SWAPPED(32);
 139TP_UINT_FIELD__SWAPPED(64);
 140
 141static int tp_field__init_uint(struct tp_field *field,
 142                               struct format_field *format_field,
 143                               bool needs_swap)
 144{
 145        field->offset = format_field->offset;
 146
 147        switch (format_field->size) {
 148        case 1:
 149                field->integer = tp_field__u8;
 150                break;
 151        case 2:
 152                field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
 153                break;
 154        case 4:
 155                field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
 156                break;
 157        case 8:
 158                field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
 159                break;
 160        default:
 161                return -1;
 162        }
 163
 164        return 0;
 165}
 166
 167static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
 168{
 169        return sample->raw_data + field->offset;
 170}
 171
 172static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
 173{
 174        field->offset = format_field->offset;
 175        field->pointer = tp_field__ptr;
 176        return 0;
 177}
 178
 179struct syscall_tp {
 180        struct tp_field id;
 181        union {
 182                struct tp_field args, ret;
 183        };
 184};
 185
 186static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
 187                                          struct tp_field *field,
 188                                          const char *name)
 189{
 190        struct format_field *format_field = perf_evsel__field(evsel, name);
 191
 192        if (format_field == NULL)
 193                return -1;
 194
 195        return tp_field__init_uint(field, format_field, evsel->needs_swap);
 196}
 197
 198#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
 199        ({ struct syscall_tp *sc = evsel->priv;\
 200           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
 201
 202static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
 203                                         struct tp_field *field,
 204                                         const char *name)
 205{
 206        struct format_field *format_field = perf_evsel__field(evsel, name);
 207
 208        if (format_field == NULL)
 209                return -1;
 210
 211        return tp_field__init_ptr(field, format_field);
 212}
 213
 214#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
 215        ({ struct syscall_tp *sc = evsel->priv;\
 216           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
 217
 218static void perf_evsel__delete_priv(struct perf_evsel *evsel)
 219{
 220        zfree(&evsel->priv);
 221        perf_evsel__delete(evsel);
 222}
 223
 224static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
 225{
 226        evsel->priv = malloc(sizeof(struct syscall_tp));
 227        if (evsel->priv != NULL) {
 228                if (perf_evsel__init_sc_tp_uint_field(evsel, id))
 229                        goto out_delete;
 230
 231                evsel->handler = handler;
 232                return 0;
 233        }
 234
 235        return -ENOMEM;
 236
 237out_delete:
 238        zfree(&evsel->priv);
 239        return -ENOENT;
 240}
 241
 242static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
 243{
 244        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
 245
  246        /* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
 247        if (IS_ERR(evsel))
 248                evsel = perf_evsel__newtp("syscalls", direction);
 249
 250        if (IS_ERR(evsel))
 251                return NULL;
 252
 253        if (perf_evsel__init_syscall_tp(evsel, handler))
 254                goto out_delete;
 255
 256        return evsel;
 257
 258out_delete:
 259        perf_evsel__delete_priv(evsel);
 260        return NULL;
 261}
 262
 263#define perf_evsel__sc_tp_uint(evsel, name, sample) \
 264        ({ struct syscall_tp *fields = evsel->priv; \
 265           fields->name.integer(&fields->name, sample); })
 266
 267#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
 268        ({ struct syscall_tp *fields = evsel->priv; \
 269           fields->name.pointer(&fields->name, sample); })
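
/*
 * Usage sketch (names as used later in this file): once the tp_field
 * readers have been set up via perf_evsel__init_sc_tp_uint_field() and
 * perf_evsel__init_sc_tp_ptr_field(), a sample handler can do e.g.:
 *
 *      u64 id     = perf_evsel__sc_tp_uint(evsel, id, sample);
 *      void *args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 *
 * i.e. these macros just dispatch through the per-field integer/pointer
 * readers stashed in evsel->priv.
 */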
 270
 271struct syscall_arg {
 272        unsigned long val;
 273        struct thread *thread;
 274        struct trace  *trace;
 275        void          *parm;
 276        u8            idx;
 277        u8            mask;
 278};
 279
 280struct strarray {
 281        int         offset;
 282        int         nr_entries;
 283        const char **entries;
 284};
 285
 286#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
 287        .nr_entries = ARRAY_SIZE(array), \
 288        .entries = array, \
 289}
 290
 291#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
 292        .offset     = off, \
 293        .nr_entries = ARRAY_SIZE(array), \
 294        .entries = array, \
 295}
 296
 297static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 298                                                const char *intfmt,
 299                                                struct syscall_arg *arg)
 300{
 301        struct strarray *sa = arg->parm;
 302        int idx = arg->val - sa->offset;
 303
 304        if (idx < 0 || idx >= sa->nr_entries)
 305                return scnprintf(bf, size, intfmt, arg->val);
 306
 307        return scnprintf(bf, size, "%s", sa->entries[idx]);
 308}
 309
 310static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 311                                              struct syscall_arg *arg)
 312{
 313        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
 314}
 315
 316#define SCA_STRARRAY syscall_arg__scnprintf_strarray
 317
 318#if defined(__i386__) || defined(__x86_64__)
 319/*
 320 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 321 *        gets rewritten to support all arches.
 322 */
 323static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
 324                                                 struct syscall_arg *arg)
 325{
 326        return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
 327}
 328
 329#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
 330#endif /* defined(__i386__) || defined(__x86_64__) */
 331
 332static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
 333                                        struct syscall_arg *arg);
 334
 335#define SCA_FD syscall_arg__scnprintf_fd
 336
 337#ifndef AT_FDCWD
 338#define AT_FDCWD        -100
 339#endif
 340
 341static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 342                                           struct syscall_arg *arg)
 343{
 344        int fd = arg->val;
 345
 346        if (fd == AT_FDCWD)
 347                return scnprintf(bf, size, "CWD");
 348
 349        return syscall_arg__scnprintf_fd(bf, size, arg);
 350}
 351
 352#define SCA_FDAT syscall_arg__scnprintf_fd_at
 353
 354static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 355                                              struct syscall_arg *arg);
 356
 357#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 358
 359static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
 360                                         struct syscall_arg *arg)
 361{
 362        return scnprintf(bf, size, "%#lx", arg->val);
 363}
 364
 365#define SCA_HEX syscall_arg__scnprintf_hex
 366
 367static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
 368                                         struct syscall_arg *arg)
 369{
 370        return scnprintf(bf, size, "%d", arg->val);
 371}
 372
 373#define SCA_INT syscall_arg__scnprintf_int
 374
 375static const char *bpf_cmd[] = {
 376        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
 377        "MAP_GET_NEXT_KEY", "PROG_LOAD",
 378};
 379static DEFINE_STRARRAY(bpf_cmd);
 380
 381static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
 382static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
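
/*
 * Worked example: with the offset of 1 above, an epoll_ctl() op of
 * EPOLL_CTL_ADD (1) indexes entries[0] and is shown as "ADD" by
 * SCA_STRARRAY; values outside the table (idx < 0 or idx >= nr_entries)
 * fall back to the plain integer format.
 */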
 383
 384static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
 385static DEFINE_STRARRAY(itimers);
 386
 387static const char *keyctl_options[] = {
 388        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
 389        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
 390        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
 391        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
 392        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
 393};
 394static DEFINE_STRARRAY(keyctl_options);
 395
 396static const char *whences[] = { "SET", "CUR", "END",
 397#ifdef SEEK_DATA
 398"DATA",
 399#endif
 400#ifdef SEEK_HOLE
 401"HOLE",
 402#endif
 403};
 404static DEFINE_STRARRAY(whences);
 405
 406static const char *fcntl_cmds[] = {
 407        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
 408        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
 409        "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
 410        "F_GETOWNER_UIDS",
 411};
 412static DEFINE_STRARRAY(fcntl_cmds);
 413
 414static const char *rlimit_resources[] = {
 415        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
 416        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
 417        "RTTIME",
 418};
 419static DEFINE_STRARRAY(rlimit_resources);
 420
 421static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
 422static DEFINE_STRARRAY(sighow);
 423
 424static const char *clockid[] = {
 425        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
 426        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
 427        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
 428};
 429static DEFINE_STRARRAY(clockid);
 430
 431static const char *socket_families[] = {
 432        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
 433        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
 434        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
 435        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
 436        "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
 437        "ALG", "NFC", "VSOCK",
 438};
 439static DEFINE_STRARRAY(socket_families);
 440
 441static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
 442                                                 struct syscall_arg *arg)
 443{
 444        size_t printed = 0;
 445        int mode = arg->val;
 446
 447        if (mode == F_OK) /* 0 */
 448                return scnprintf(bf, size, "F");
 449#define P_MODE(n) \
 450        if (mode & n##_OK) { \
 451                printed += scnprintf(bf + printed, size - printed, "%s", #n); \
 452                mode &= ~n##_OK; \
 453        }
 454
 455        P_MODE(R);
 456        P_MODE(W);
 457        P_MODE(X);
 458#undef P_MODE
 459
 460        if (mode)
 461                printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
 462
 463        return printed;
 464}
 465
 466#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
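
/*
 * Example output: access(2) with mode R_OK|W_OK (4|2) prints as "RW",
 * plain F_OK (0) prints as "F", and any leftover unknown bits are
 * appended as "|%#x".
 */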
 467
 468static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 469                                              struct syscall_arg *arg);
 470
 471#define SCA_FILENAME syscall_arg__scnprintf_filename
 472
 473static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 474                                                struct syscall_arg *arg)
 475{
 476        int printed = 0, flags = arg->val;
 477
 478#define P_FLAG(n) \
 479        if (flags & O_##n) { \
 480                printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 481                flags &= ~O_##n; \
 482        }
 483
 484        P_FLAG(CLOEXEC);
 485        P_FLAG(NONBLOCK);
 486#undef P_FLAG
 487
 488        if (flags)
 489                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 490
 491        return printed;
 492}
 493
 494#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 495
 496#if defined(__i386__) || defined(__x86_64__)
 497/*
 498 * FIXME: Make this available to all arches.
 499 */
 500#define TCGETS          0x5401
 501
 502static const char *tioctls[] = {
 503        "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
 504        "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
 505        "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
 506        "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
 507        "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
 508        "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
 509        "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
 510        "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
 511        "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
 512        "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
 513        "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
 514        [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
 515        "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
 516        "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
 517        "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
 518};
 519
 520static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
 521#endif /* defined(__i386__) || defined(__x86_64__) */
 522
 523#ifndef GRND_NONBLOCK
 524#define GRND_NONBLOCK   0x0001
 525#endif
 526#ifndef GRND_RANDOM
 527#define GRND_RANDOM     0x0002
 528#endif
 529
 530static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 531                                                   struct syscall_arg *arg)
 532{
 533        int printed = 0, flags = arg->val;
 534
 535#define P_FLAG(n) \
 536        if (flags & GRND_##n) { \
 537                printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 538                flags &= ~GRND_##n; \
 539        }
 540
 541        P_FLAG(RANDOM);
 542        P_FLAG(NONBLOCK);
 543#undef P_FLAG
 544
 545        if (flags)
 546                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 547
 548        return printed;
 549}
 550
 551#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 552
 553#define STRARRAY(arg, name, array) \
 554          .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
 555          .arg_parm      = { [arg] = &strarray__##array, }
 556
 557#include "trace/beauty/eventfd.c"
 558#include "trace/beauty/flock.c"
 559#include "trace/beauty/futex_op.c"
 560#include "trace/beauty/mmap.c"
 561#include "trace/beauty/mode_t.c"
 562#include "trace/beauty/msg_flags.c"
 563#include "trace/beauty/open_flags.c"
 564#include "trace/beauty/perf_event_open.c"
 565#include "trace/beauty/pid.c"
 566#include "trace/beauty/sched_policy.c"
 567#include "trace/beauty/seccomp.c"
 568#include "trace/beauty/signum.c"
 569#include "trace/beauty/socket_type.c"
 570#include "trace/beauty/waitid_options.c"
 571
 572static struct syscall_fmt {
 573        const char *name;
 574        const char *alias;
 575        size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
 576        void       *arg_parm[6];
 577        bool       errmsg;
 578        bool       errpid;
 579        bool       timeout;
 580        bool       hexret;
 581} syscall_fmts[] = {
 582        { .name     = "access",     .errmsg = true,
 583          .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
 584        { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
 585        { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
 586        { .name     = "brk",        .hexret = true,
 587          .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
 588        { .name     = "chdir",      .errmsg = true, },
 589        { .name     = "chmod",      .errmsg = true, },
 590        { .name     = "chroot",     .errmsg = true, },
 591        { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
 592        { .name     = "clone",      .errpid = true, },
 593        { .name     = "close",      .errmsg = true,
 594          .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
 595        { .name     = "connect",    .errmsg = true, },
 596        { .name     = "creat",      .errmsg = true, },
 597        { .name     = "dup",        .errmsg = true, },
 598        { .name     = "dup2",       .errmsg = true, },
 599        { .name     = "dup3",       .errmsg = true, },
 600        { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
 601        { .name     = "eventfd2",   .errmsg = true,
 602          .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
 603        { .name     = "faccessat",  .errmsg = true, },
 604        { .name     = "fadvise64",  .errmsg = true, },
 605        { .name     = "fallocate",  .errmsg = true, },
 606        { .name     = "fchdir",     .errmsg = true, },
 607        { .name     = "fchmod",     .errmsg = true, },
 608        { .name     = "fchmodat",   .errmsg = true,
 609          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 610        { .name     = "fchown",     .errmsg = true, },
 611        { .name     = "fchownat",   .errmsg = true,
 612          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 613        { .name     = "fcntl",      .errmsg = true,
 614          .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
 615          .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
 616        { .name     = "fdatasync",  .errmsg = true, },
 617        { .name     = "flock",      .errmsg = true,
 618          .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
 619        { .name     = "fsetxattr",  .errmsg = true, },
 620        { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
 621        { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
 622        { .name     = "fstatfs",    .errmsg = true, },
 623        { .name     = "fsync",    .errmsg = true, },
 624        { .name     = "ftruncate", .errmsg = true, },
 625        { .name     = "futex",      .errmsg = true,
 626          .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
 627        { .name     = "futimesat", .errmsg = true,
 628          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 629        { .name     = "getdents",   .errmsg = true, },
 630        { .name     = "getdents64", .errmsg = true, },
 631        { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
 632        { .name     = "getpid",     .errpid = true, },
 633        { .name     = "getpgid",    .errpid = true, },
 634        { .name     = "getppid",    .errpid = true, },
 635        { .name     = "getrandom",  .errmsg = true,
 636          .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
 637        { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
 638        { .name     = "getxattr",   .errmsg = true, },
 639        { .name     = "inotify_add_watch",          .errmsg = true, },
 640        { .name     = "ioctl",      .errmsg = true,
 641          .arg_scnprintf = {
 642#if defined(__i386__) || defined(__x86_64__)
 643/*
 644 * FIXME: Make this available to all arches.
 645 */
 646                             [1] = SCA_STRHEXARRAY, /* cmd */
 647                             [2] = SCA_HEX, /* arg */ },
 648          .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
 649#else
 650                             [2] = SCA_HEX, /* arg */ }, },
 651#endif
 652        { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
 653        { .name     = "kill",       .errmsg = true,
 654          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
 655        { .name     = "lchown",    .errmsg = true, },
 656        { .name     = "lgetxattr",  .errmsg = true, },
 657        { .name     = "linkat",     .errmsg = true,
 658          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 659        { .name     = "listxattr",  .errmsg = true, },
 660        { .name     = "llistxattr", .errmsg = true, },
 661        { .name     = "lremovexattr",  .errmsg = true, },
 662        { .name     = "lseek",      .errmsg = true,
 663          .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
 664          .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
 665        { .name     = "lsetxattr",  .errmsg = true, },
 666        { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
 667        { .name     = "lsxattr",    .errmsg = true, },
 668        { .name     = "madvise",    .errmsg = true,
 669          .arg_scnprintf = { [0] = SCA_HEX,      /* start */
 670                             [2] = SCA_MADV_BHV, /* behavior */ }, },
 671        { .name     = "mkdir",    .errmsg = true, },
 672        { .name     = "mkdirat",    .errmsg = true,
 673          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 674        { .name     = "mknod",      .errmsg = true, },
 675        { .name     = "mknodat",    .errmsg = true,
 676          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
 677        { .name     = "mlock",      .errmsg = true,
 678          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
 679        { .name     = "mlockall",   .errmsg = true,
 680          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
 681        { .name     = "mmap",       .hexret = true,
 682          .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
 683                             [2] = SCA_MMAP_PROT, /* prot */
 684                             [3] = SCA_MMAP_FLAGS, /* flags */ }, },
 685        { .name     = "mprotect",   .errmsg = true,
 686          .arg_scnprintf = { [0] = SCA_HEX, /* start */
 687                             [2] = SCA_MMAP_PROT, /* prot */ }, },
 688        { .name     = "mq_unlink", .errmsg = true,
 689          .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
 690        { .name     = "mremap",     .hexret = true,
 691          .arg_scnprintf = { [0] = SCA_HEX, /* addr */
 692                             [3] = SCA_MREMAP_FLAGS, /* flags */
 693                             [4] = SCA_HEX, /* new_addr */ }, },
 694        { .name     = "munlock",    .errmsg = true,
 695          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
 696        { .name     = "munmap",     .errmsg = true,
 697          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
 698        { .name     = "name_to_handle_at", .errmsg = true,
 699          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 700        { .name     = "newfstatat", .errmsg = true,
 701          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 702        { .name     = "open",       .errmsg = true,
 703          .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
 704        { .name     = "open_by_handle_at", .errmsg = true,
 705          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
 706                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
 707        { .name     = "openat",     .errmsg = true,
 708          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
 709                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
 710        { .name     = "perf_event_open", .errmsg = true,
 711          .arg_scnprintf = { [2] = SCA_INT, /* cpu */
 712                             [3] = SCA_FD,  /* group_fd */
 713                             [4] = SCA_PERF_FLAGS,  /* flags */ }, },
 714        { .name     = "pipe2",      .errmsg = true,
 715          .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
 716        { .name     = "poll",       .errmsg = true, .timeout = true, },
 717        { .name     = "ppoll",      .errmsg = true, .timeout = true, },
 718        { .name     = "pread",      .errmsg = true, .alias = "pread64", },
 719        { .name     = "preadv",     .errmsg = true, .alias = "pread", },
 720        { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
 721        { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
 722        { .name     = "pwritev",    .errmsg = true, },
 723        { .name     = "read",       .errmsg = true, },
 724        { .name     = "readlink",   .errmsg = true, },
 725        { .name     = "readlinkat", .errmsg = true,
 726          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 727        { .name     = "readv",      .errmsg = true, },
 728        { .name     = "recvfrom",   .errmsg = true,
 729          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
 730        { .name     = "recvmmsg",   .errmsg = true,
 731          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
 732        { .name     = "recvmsg",    .errmsg = true,
 733          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
 734        { .name     = "removexattr", .errmsg = true, },
 735        { .name     = "renameat",   .errmsg = true,
 736          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 737        { .name     = "rmdir",    .errmsg = true, },
 738        { .name     = "rt_sigaction", .errmsg = true,
 739          .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
 740        { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
 741        { .name     = "rt_sigqueueinfo", .errmsg = true,
 742          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
 743        { .name     = "rt_tgsigqueueinfo", .errmsg = true,
 744          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
 745        { .name     = "sched_getattr",        .errmsg = true, },
 746        { .name     = "sched_setattr",        .errmsg = true, },
 747        { .name     = "sched_setscheduler",   .errmsg = true,
 748          .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
 749        { .name     = "seccomp", .errmsg = true,
 750          .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
 751                             [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
 752        { .name     = "select",     .errmsg = true, .timeout = true, },
 753        { .name     = "sendmmsg",    .errmsg = true,
 754          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
 755        { .name     = "sendmsg",    .errmsg = true,
 756          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
 757        { .name     = "sendto",     .errmsg = true,
 758          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
 759        { .name     = "set_tid_address", .errpid = true, },
 760        { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
 761        { .name     = "setpgid",    .errmsg = true, },
 762        { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
 763        { .name     = "setxattr",   .errmsg = true, },
 764        { .name     = "shutdown",   .errmsg = true, },
 765        { .name     = "socket",     .errmsg = true,
 766          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
 767                             [1] = SCA_SK_TYPE, /* type */ },
 768          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
 769        { .name     = "socketpair", .errmsg = true,
 770          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
 771                             [1] = SCA_SK_TYPE, /* type */ },
 772          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
 773        { .name     = "stat",       .errmsg = true, .alias = "newstat", },
 774        { .name     = "statfs",     .errmsg = true, },
 775        { .name     = "swapoff",    .errmsg = true,
 776          .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
 777        { .name     = "swapon",     .errmsg = true,
 778          .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
 779        { .name     = "symlinkat",  .errmsg = true,
 780          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 781        { .name     = "tgkill",     .errmsg = true,
 782          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
 783        { .name     = "tkill",      .errmsg = true,
 784          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
 785        { .name     = "truncate",   .errmsg = true, },
 786        { .name     = "uname",      .errmsg = true, .alias = "newuname", },
 787        { .name     = "unlinkat",   .errmsg = true,
 788          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
 789        { .name     = "utime",  .errmsg = true, },
 790        { .name     = "utimensat",  .errmsg = true,
 791          .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
 792        { .name     = "utimes",  .errmsg = true, },
 793        { .name     = "vmsplice",  .errmsg = true, },
 794        { .name     = "wait4",      .errpid = true,
 795          .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
 796        { .name     = "waitid",     .errpid = true,
 797          .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
 798        { .name     = "write",      .errmsg = true, },
 799        { .name     = "writev",     .errmsg = true, },
 800};
 801
 802static int syscall_fmt__cmp(const void *name, const void *fmtp)
 803{
 804        const struct syscall_fmt *fmt = fmtp;
 805        return strcmp(name, fmt->name);
 806}
 807
 808static struct syscall_fmt *syscall_fmt__find(const char *name)
 809{
 810        const int nmemb = ARRAY_SIZE(syscall_fmts);
 811        return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
 812}
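
/*
 * Note: syscall_fmt__find() uses bsearch(3), so the syscall_fmts[] table
 * above must stay sorted by ->name in strcmp() order.
 */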
 813
 814struct syscall {
 815        struct event_format *tp_format;
 816        int                 nr_args;
 817        struct format_field *args;
 818        const char          *name;
 819        bool                is_exit;
 820        struct syscall_fmt  *fmt;
 821        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
 822        void                **arg_parm;
 823};
 824
 825static size_t fprintf_duration(unsigned long t, FILE *fp)
 826{
 827        double duration = (double)t / NSEC_PER_MSEC;
 828        size_t printed = fprintf(fp, "(");
 829
 830        if (duration >= 1.0)
 831                printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
 832        else if (duration >= 0.01)
 833                printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
 834        else
 835                printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
 836        return printed + fprintf(fp, "): ");
 837}
 838
 839/**
 840 * filename.ptr: The filename char pointer that will be vfs_getname'd
 841 * filename.entry_str_pos: Where to insert the string translated from
 842 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 843 */
 844struct thread_trace {
 845        u64               entry_time;
 846        u64               exit_time;
 847        bool              entry_pending;
 848        unsigned long     nr_events;
 849        unsigned long     pfmaj, pfmin;
 850        char              *entry_str;
 851        double            runtime_ms;
 852        struct {
 853                unsigned long ptr;
 854                short int     entry_str_pos;
 855                bool          pending_open;
 856                unsigned int  namelen;
 857                char          *name;
 858        } filename;
 859        struct {
 860                int       max;
 861                char      **table;
 862        } paths;
 863
 864        struct intlist *syscall_stats;
 865};
 866
 867static struct thread_trace *thread_trace__new(void)
 868{
 869        struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
 870
  871        if (ttrace) {
  872                ttrace->paths.max = -1;
  873                ttrace->syscall_stats = intlist__new(NULL);
  874        }
  875
 876        return ttrace;
 877}
 878
 879static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
 880{
 881        struct thread_trace *ttrace;
 882
 883        if (thread == NULL)
 884                goto fail;
 885
 886        if (thread__priv(thread) == NULL)
 887                thread__set_priv(thread, thread_trace__new());
 888
 889        if (thread__priv(thread) == NULL)
 890                goto fail;
 891
 892        ttrace = thread__priv(thread);
 893        ++ttrace->nr_events;
 894
 895        return ttrace;
 896fail:
 897        color_fprintf(fp, PERF_COLOR_RED,
 898                      "WARNING: not enough memory, dropping samples!\n");
 899        return NULL;
 900}
 901
 902#define TRACE_PFMAJ             (1 << 0)
 903#define TRACE_PFMIN             (1 << 1)
 904
 905static const size_t trace__entry_str_size = 2048;
 906
 907static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
 908{
 909        struct thread_trace *ttrace = thread__priv(thread);
 910
 911        if (fd > ttrace->paths.max) {
 912                char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
 913
 914                if (npath == NULL)
 915                        return -1;
 916
 917                if (ttrace->paths.max != -1) {
 918                        memset(npath + ttrace->paths.max + 1, 0,
 919                               (fd - ttrace->paths.max) * sizeof(char *));
 920                } else {
 921                        memset(npath, 0, (fd + 1) * sizeof(char *));
 922                }
 923
 924                ttrace->paths.table = npath;
 925                ttrace->paths.max   = fd;
 926        }
 927
 928        ttrace->paths.table[fd] = strdup(pathname);
 929
 930        return ttrace->paths.table[fd] != NULL ? 0 : -1;
 931}
 932
 933static int thread__read_fd_path(struct thread *thread, int fd)
 934{
 935        char linkname[PATH_MAX], pathname[PATH_MAX];
 936        struct stat st;
 937        int ret;
 938
 939        if (thread->pid_ == thread->tid) {
 940                scnprintf(linkname, sizeof(linkname),
 941                          "/proc/%d/fd/%d", thread->pid_, fd);
 942        } else {
 943                scnprintf(linkname, sizeof(linkname),
 944                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
 945        }
 946
 947        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
 948                return -1;
 949
 950        ret = readlink(linkname, pathname, sizeof(pathname));
 951
 952        if (ret < 0 || ret > st.st_size)
 953                return -1;
 954
 955        pathname[ret] = '\0';
 956        return trace__set_fd_pathname(thread, fd, pathname);
 957}
 958
 959static const char *thread__fd_path(struct thread *thread, int fd,
 960                                   struct trace *trace)
 961{
 962        struct thread_trace *ttrace = thread__priv(thread);
 963
 964        if (ttrace == NULL)
 965                return NULL;
 966
 967        if (fd < 0)
 968                return NULL;
 969
 970        if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
 971                if (!trace->live)
 972                        return NULL;
 973                ++trace->stats.proc_getname;
 974                if (thread__read_fd_path(thread, fd))
 975                        return NULL;
 976        }
 977
 978        return ttrace->paths.table[fd];
 979}
 980
 981static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
 982                                        struct syscall_arg *arg)
 983{
 984        int fd = arg->val;
 985        size_t printed = scnprintf(bf, size, "%d", fd);
 986        const char *path = thread__fd_path(arg->thread, fd, arg->trace);
 987
 988        if (path)
 989                printed += scnprintf(bf + printed, size - printed, "<%s>", path);
 990
 991        return printed;
 992}
 993
 994static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 995                                              struct syscall_arg *arg)
 996{
 997        int fd = arg->val;
 998        size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
 999        struct thread_trace *ttrace = thread__priv(arg->thread);
1000
1001        if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1002                zfree(&ttrace->paths.table[fd]);
1003
1004        return printed;
1005}
1006
1007static void thread__set_filename_pos(struct thread *thread, const char *bf,
1008                                     unsigned long ptr)
1009{
1010        struct thread_trace *ttrace = thread__priv(thread);
1011
1012        ttrace->filename.ptr = ptr;
1013        ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1014}
1015
1016static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1017                                              struct syscall_arg *arg)
1018{
1019        unsigned long ptr = arg->val;
1020
1021        if (!arg->trace->vfs_getname)
1022                return scnprintf(bf, size, "%#x", ptr);
1023
1024        thread__set_filename_pos(arg->thread, bf, ptr);
1025        return 0;
1026}
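
/*
 * When the vfs_getname probe is in use nothing is printed above: the
 * position recorded by thread__set_filename_pos() marks where in
 * ttrace->entry_str the pathname resolved by vfs_getname is later
 * spliced in (see the filename.entry_str_pos comment further up).
 */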
1027
1028static bool trace__filter_duration(struct trace *trace, double t)
1029{
1030        return t < (trace->duration_filter * NSEC_PER_MSEC);
1031}
1032
1033static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1034{
1035        double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1036
1037        return fprintf(fp, "%10.3f ", ts);
1038}
1039
1040static bool done = false;
1041static bool interrupted = false;
1042
1043static void sig_handler(int sig)
1044{
1045        done = true;
1046        interrupted = sig == SIGINT;
1047}
1048
1049static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1050                                        u64 duration, u64 tstamp, FILE *fp)
1051{
1052        size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1053        printed += fprintf_duration(duration, fp);
1054
1055        if (trace->multiple_threads) {
1056                if (trace->show_comm)
1057                        printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1058                printed += fprintf(fp, "%d ", thread->tid);
1059        }
1060
1061        return printed;
1062}
1063
1064static int trace__process_event(struct trace *trace, struct machine *machine,
1065                                union perf_event *event, struct perf_sample *sample)
1066{
1067        int ret = 0;
1068
1069        switch (event->header.type) {
1070        case PERF_RECORD_LOST:
1071                color_fprintf(trace->output, PERF_COLOR_RED,
1072                              "LOST %" PRIu64 " events!\n", event->lost.lost);
1073                ret = machine__process_lost_event(machine, event, sample);
1074                break;
1075        default:
1076                ret = machine__process_event(machine, event, sample);
1077                break;
1078        }
1079
1080        return ret;
1081}
1082
1083static int trace__tool_process(struct perf_tool *tool,
1084                               union perf_event *event,
1085                               struct perf_sample *sample,
1086                               struct machine *machine)
1087{
1088        struct trace *trace = container_of(tool, struct trace, tool);
1089        return trace__process_event(trace, machine, event, sample);
1090}
1091
1092static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1093{
1094        struct machine *machine = vmachine;
1095
1096        if (machine->kptr_restrict_warned)
1097                return NULL;
1098
1099        if (symbol_conf.kptr_restrict) {
1100                pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1101                           "Check /proc/sys/kernel/kptr_restrict.\n\n"
1102                           "Kernel samples will not be resolved.\n");
1103                machine->kptr_restrict_warned = true;
1104                return NULL;
1105        }
1106
1107        return machine__resolve_kernel_addr(vmachine, addrp, modp);
1108}
1109
1110static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1111{
1112        int err = symbol__init(NULL);
1113
1114        if (err)
1115                return err;
1116
1117        trace->host = machine__new_host();
1118        if (trace->host == NULL)
1119                return -ENOMEM;
1120
1121        if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1122                return -errno;
1123
1124        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1125                                            evlist->threads, trace__tool_process, false,
1126                                            trace->opts.proc_map_timeout);
1127        if (err)
1128                symbol__exit();
1129
1130        return err;
1131}
1132
1133static int syscall__set_arg_fmts(struct syscall *sc)
1134{
1135        struct format_field *field;
1136        int idx = 0, len;
1137
1138        sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1139        if (sc->arg_scnprintf == NULL)
1140                return -1;
1141
1142        if (sc->fmt)
1143                sc->arg_parm = sc->fmt->arg_parm;
1144
1145        for (field = sc->args; field; field = field->next) {
1146                if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1147                        sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1148                else if (strcmp(field->type, "const char *") == 0 &&
1149                         (strcmp(field->name, "filename") == 0 ||
1150                          strcmp(field->name, "path") == 0 ||
1151                          strcmp(field->name, "pathname") == 0))
1152                        sc->arg_scnprintf[idx] = SCA_FILENAME;
1153                else if (field->flags & FIELD_IS_POINTER)
1154                        sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1155                else if (strcmp(field->type, "pid_t") == 0)
1156                        sc->arg_scnprintf[idx] = SCA_PID;
1157                else if (strcmp(field->type, "umode_t") == 0)
1158                        sc->arg_scnprintf[idx] = SCA_MODE_T;
1159                else if ((strcmp(field->type, "int") == 0 ||
1160                          strcmp(field->type, "unsigned int") == 0 ||
1161                          strcmp(field->type, "long") == 0) &&
1162                         (len = strlen(field->name)) >= 2 &&
1163                         strcmp(field->name + len - 2, "fd") == 0) {
1164                        /*
1165                         * /sys/kernel/tracing/events/syscalls/sys_enter*
1166                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1167                         * 65 int
1168                         * 23 unsigned int
1169                         * 7 unsigned long
1170                         */
1171                        sc->arg_scnprintf[idx] = SCA_FD;
1172                }
1173                ++idx;
1174        }
1175
1176        return 0;
1177}
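
/*
 * E.g. for open(2): the "filename" argument is a "const char *" in the
 * tracepoint format and so picks up SCA_FILENAME from the heuristics
 * above, "flags" comes from its syscall_fmts[] entry (SCA_OPEN_FLAGS),
 * and "mode", being an umode_t, gets SCA_MODE_T.
 */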
1178
1179static int trace__read_syscall_info(struct trace *trace, int id)
1180{
1181        char tp_name[128];
1182        struct syscall *sc;
1183        const char *name = syscalltbl__name(trace->sctbl, id);
1184
1185        if (name == NULL)
1186                return -1;
1187
1188        if (id > trace->syscalls.max) {
1189                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1190
1191                if (nsyscalls == NULL)
1192                        return -1;
1193
1194                if (trace->syscalls.max != -1) {
1195                        memset(nsyscalls + trace->syscalls.max + 1, 0,
1196                               (id - trace->syscalls.max) * sizeof(*sc));
1197                } else {
1198                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1199                }
1200
1201                trace->syscalls.table = nsyscalls;
1202                trace->syscalls.max   = id;
1203        }
1204
1205        sc = trace->syscalls.table + id;
1206        sc->name = name;
1207
1208        sc->fmt  = syscall_fmt__find(sc->name);
1209
1210        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1211        sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1212
1213        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1214                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1215                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1216        }
1217
1218        if (IS_ERR(sc->tp_format))
1219                return -1;
1220
1221        sc->args = sc->tp_format->format.fields;
1222        sc->nr_args = sc->tp_format->format.nr_fields;
 1223        /*
 1224         * Check for and discard the first field when it carries the syscall
 1225         * number, which is redundant here: it is named '__syscall_nr' on
 1226         * recent kernels and 'nr' on older ones.
 1227         */
1228        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1229                sc->args = sc->args->next;
1230                --sc->nr_args;
1231        }
1232
1233        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1234
1235        return syscall__set_arg_fmts(sc);
1236}
1237
1238static int trace__validate_ev_qualifier(struct trace *trace)
1239{
1240        int err = 0, i;
1241        struct str_node *pos;
1242
1243        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1244        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1245                                                 sizeof(trace->ev_qualifier_ids.entries[0]));
1246
1247        if (trace->ev_qualifier_ids.entries == NULL) {
1248                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1249                       trace->output);
1250                err = -EINVAL;
1251                goto out;
1252        }
1253
1254        i = 0;
1255
1256        strlist__for_each_entry(pos, trace->ev_qualifier) {
1257                const char *sc = pos->s;
1258                int id = syscalltbl__id(trace->sctbl, sc);
1259
1260                if (id < 0) {
1261                        if (err == 0) {
1262                                fputs("Error:\tInvalid syscall ", trace->output);
1263                                err = -EINVAL;
1264                        } else {
1265                                fputs(", ", trace->output);
1266                        }
1267
1268                        fputs(sc, trace->output);
1269                }
1270
1271                trace->ev_qualifier_ids.entries[i++] = id;
1272        }
1273
1274        if (err < 0) {
1275                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1276                      "\nHint:\tand: 'man syscalls'\n", trace->output);
1277                zfree(&trace->ev_qualifier_ids.entries);
1278                trace->ev_qualifier_ids.nr = 0;
1279        }
1280out:
1281        return err;
1282}
1283
 1284/*
 1285 * args is to be interpreted as a series of longs, but we need to handle
 1286 * 8-byte unaligned accesses: args points to raw_data within the event,
 1287 * and raw_data is guaranteed not to be 8-byte aligned because it is
 1288 * preceded by raw_size, which is a u32. So we need to copy args to a
 1289 * temp variable to read it. Most notably this avoids extended load
 1290 * instructions on unaligned addresses.
 1291 */
1292
1293static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1294                                      unsigned char *args, struct trace *trace,
1295                                      struct thread *thread)
1296{
1297        size_t printed = 0;
1298        unsigned char *p;
1299        unsigned long val;
1300
1301        if (sc->args != NULL) {
1302                struct format_field *field;
1303                u8 bit = 1;
1304                struct syscall_arg arg = {
1305                        .idx    = 0,
1306                        .mask   = 0,
1307                        .trace  = trace,
1308                        .thread = thread,
1309                };
1310
1311                for (field = sc->args; field;
1312                     field = field->next, ++arg.idx, bit <<= 1) {
1313                        if (arg.mask & bit)
1314                                continue;
1315
1316                        /* special care for unaligned accesses */
1317                        p = args + sizeof(unsigned long) * arg.idx;
1318                        memcpy(&val, p, sizeof(val));
1319
 1320                        /*
 1321                         * Suppress this argument if its value is zero and
 1322                         * we don't have a string associated with it in a
 1323                         * strarray.
 1324                         */
1325                        if (val == 0 &&
1326                            !(sc->arg_scnprintf &&
1327                              sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1328                              sc->arg_parm[arg.idx]))
1329                                continue;
1330
1331                        printed += scnprintf(bf + printed, size - printed,
1332                                             "%s%s: ", printed ? ", " : "", field->name);
1333                        if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1334                                arg.val = val;
1335                                if (sc->arg_parm)
1336                                        arg.parm = sc->arg_parm[arg.idx];
1337                                printed += sc->arg_scnprintf[arg.idx](bf + printed,
1338                                                                      size - printed, &arg);
1339                        } else {
1340                                printed += scnprintf(bf + printed, size - printed,
1341                                                     "%ld", val);
1342                        }
1343                }
1344        } else if (IS_ERR(sc->tp_format)) {
1345                /*
1346                 * If we managed to read the tracepoint /format file, then we
1347                 * may end up not having any args, like with gettid(), so only
1348                 * print the raw args when we didn't manage to read it.
1349                 */
1350                int i = 0;
1351
1352                while (i < 6) {
1353                        /* special care for unaligned accesses */
1354                        p = args + sizeof(unsigned long) * i;
1355                        memcpy(&val, p, sizeof(val));
1356                        printed += scnprintf(bf + printed, size - printed,
1357                                             "%sarg%d: %ld",
1358                                             printed ? ", " : "", i, val);
1359                        ++i;
1360                }
1361        }
1362
1363        return printed;
1364}
1365
1366typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1367                                  union perf_event *event,
1368                                  struct perf_sample *sample);
1369
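    /*
     * Map a raw syscall id to its struct syscall slot, reading the
     * per-syscall tracepoint info lazily on first use.  Returns NULL
     * (and complains, if verbose enough) for bogus ids such as the -1
     * occasionally seen in raw_syscalls:sys_exit, see the comment below.
     */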
1370static struct syscall *trace__syscall_info(struct trace *trace,
1371                                           struct perf_evsel *evsel, int id)
1372{
1373
1374        if (id < 0) {
1375
1376                /*
1377                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1378                 * before that, leaving at a higher verbosity level till that is
1379                 * explained. Reproduced with plain ftrace with:
1380                 *
1381                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1382                 * grep "NR -1 " /t/trace_pipe
1383                 *
1384                 * After generating some load on the machine.
1385                 */
1386                if (verbose > 1) {
1387                        static u64 n;
1388                        fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1389                                id, perf_evsel__name(evsel), ++n);
1390                }
1391                return NULL;
1392        }
1393
1394        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1395            trace__read_syscall_info(trace, id))
1396                goto out_cant_read;
1397
1398        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1399                goto out_cant_read;
1400
1401        return &trace->syscalls.table[id];
1402
1403out_cant_read:
1404        if (verbose) {
1405                fprintf(trace->output, "Problems reading syscall %d", id);
1406                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1407                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1408                fputs(" information\n", trace->output);
1409        }
1410        return NULL;
1411}
1412
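    /*
     * Accumulate per-thread, per-syscall statistics (entry -> exit
     * duration), keyed by syscall id in ttrace->syscall_stats, for the
     * syscall summary output.
     */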
1413static void thread__update_stats(struct thread_trace *ttrace,
1414                                 int id, struct perf_sample *sample)
1415{
1416        struct int_node *inode;
1417        struct stats *stats;
1418        u64 duration = 0;
1419
1420        inode = intlist__findnew(ttrace->syscall_stats, id);
1421        if (inode == NULL)
1422                return;
1423
1424        stats = inode->priv;
1425        if (stats == NULL) {
1426                stats = malloc(sizeof(struct stats));
1427                if (stats == NULL)
1428                        return;
1429                init_stats(stats);
1430                inode->priv = stats;
1431        }
1432
1433        if (ttrace->entry_time && sample->time > ttrace->entry_time)
1434                duration = sample->time - ttrace->entry_time;
1435
1436        update_stats(stats, duration);
1437}
1438
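    /*
     * If the current thread still has a formatted syscall entry pending
     * when some other event shows up, flush it now with a trailing ") ..."
     * marker; the matching sys_exit will later be printed as a
     * "... [continued]" line (see trace__sys_exit).
     */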
1439static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1440{
1441        struct thread_trace *ttrace;
1442        u64 duration;
1443        size_t printed;
1444
1445        if (trace->current == NULL)
1446                return 0;
1447
1448        ttrace = thread__priv(trace->current);
1449
1450        if (!ttrace->entry_pending)
1451                return 0;
1452
1453        duration = sample->time - ttrace->entry_time;
1454
1455        printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1456        printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1457        ttrace->entry_pending = false;
1458
1459        return printed;
1460}
1461
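    /*
     * raw_syscalls:sys_enter handler: format "name(arg: val, ...)" into
     * ttrace->entry_str and defer printing until sys_exit (or until some
     * other event interrupts it), except for syscalls flagged as not
     * returning (sc->is_exit, e.g. exit/exit_group), which are printed
     * right away.
     */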
1462static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1463                            union perf_event *event __maybe_unused,
1464                            struct perf_sample *sample)
1465{
1466        char *msg;
1467        void *args;
1468        size_t printed = 0;
1469        struct thread *thread;
1470        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1471        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1472        struct thread_trace *ttrace;
1473
1474        if (sc == NULL)
1475                return -1;
1476
1477        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1478        ttrace = thread__trace(thread, trace->output);
1479        if (ttrace == NULL)
1480                goto out_put;
1481
1482        args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1483
1484        if (ttrace->entry_str == NULL) {
1485                ttrace->entry_str = malloc(trace__entry_str_size);
1486                if (!ttrace->entry_str)
1487                        goto out_put;
1488        }
1489
1490        if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1491                trace__printf_interrupted_entry(trace, sample);
1492
1493        ttrace->entry_time = sample->time;
1494        msg = ttrace->entry_str;
1495        printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1496
1497        printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1498                                           args, trace, thread);
1499
1500        if (sc->is_exit) {
1501                if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1502                        trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1503                        fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1504                }
1505        } else {
1506                ttrace->entry_pending = true;
1507                /* See trace__vfs_getname & trace__sys_exit */
1508                ttrace->filename.pending_open = false;
1509        }
1510
1511        if (trace->current != thread) {
1512                thread__put(trace->current);
1513                trace->current = thread__get(thread);
1514        }
1515        err = 0;
1516out_put:
1517        thread__put(thread);
1518        return err;
1519}
1520
1521static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1522                                    struct perf_sample *sample,
1523                                    struct callchain_cursor *cursor)
1524{
1525        struct addr_location al;
1526
1527        if (machine__resolve(trace->host, &al, sample) < 0 ||
1528            thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1529                return -1;
1530
1531        return 0;
1532}
1533
1534static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1535{
1536        /* TODO: user-configurable print_opts */
1537        const unsigned int print_opts = EVSEL__PRINT_SYM |
1538                                        EVSEL__PRINT_DSO |
1539                                        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1540
1541        return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1542}
1543
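    /*
     * raw_syscalls:sys_exit handler: pairs with trace__sys_enter and
     * prints the completed line, decoding the return value according to
     * sc->fmt: errno names for failures (e.g. ") = -1 ENOENT No such file
     * or directory"), "Timeout" for 0 from timeout style syscalls, hex for
     * hexret ones and the child comm for errpid ones (those returning a
     * pid).
     */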
1544static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1545                           union perf_event *event __maybe_unused,
1546                           struct perf_sample *sample)
1547{
1548        long ret;
1549        u64 duration = 0;
1550        struct thread *thread;
1551        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1552        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1553        struct thread_trace *ttrace;
1554
1555        if (sc == NULL)
1556                return -1;
1557
1558        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1559        ttrace = thread__trace(thread, trace->output);
1560        if (ttrace == NULL)
1561                goto out_put;
1562
1563        if (trace->summary)
1564                thread__update_stats(ttrace, id, sample);
1565
1566        ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1567
1568        if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1569                trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1570                ttrace->filename.pending_open = false;
1571                ++trace->stats.vfs_getname;
1572        }
1573
1574        ttrace->exit_time = sample->time;
1575
1576        if (ttrace->entry_time) {
1577                duration = sample->time - ttrace->entry_time;
1578                if (trace__filter_duration(trace, duration))
1579                        goto out;
1580        } else if (trace->duration_filter)
1581                goto out;
1582
1583        if (sample->callchain) {
1584                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1585                if (callchain_ret == 0) {
1586                        if (callchain_cursor.nr < trace->min_stack)
1587                                goto out;
1588                        callchain_ret = 1;
1589                }
1590        }
1591
1592        if (trace->summary_only)
1593                goto out;
1594
1595        trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1596
1597        if (ttrace->entry_pending) {
1598                fprintf(trace->output, "%-70s", ttrace->entry_str);
1599        } else {
1600                fprintf(trace->output, " ... [");
1601                color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1602                fprintf(trace->output, "]: %s()", sc->name);
1603        }
1604
1605        if (sc->fmt == NULL) {
1606signed_print:
1607                fprintf(trace->output, ") = %ld", ret);
1608        } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1609                char bf[STRERR_BUFSIZE];
1610                const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1611                           *e = audit_errno_to_name(-ret);
1612
1613                fprintf(trace->output, ") = -1 %s %s", e, emsg);
1614        } else if (ret == 0 && sc->fmt->timeout)
1615                fprintf(trace->output, ") = 0 Timeout");
1616        else if (sc->fmt->hexret)
1617                fprintf(trace->output, ") = %#lx", ret);
1618        else if (sc->fmt->errpid) {
1619                struct thread *child = machine__find_thread(trace->host, ret, ret);
1620
1621                if (child != NULL) {
1622                        fprintf(trace->output, ") = %ld", ret);
1623                        if (child->comm_set)
1624                                fprintf(trace->output, " (%s)", thread__comm_str(child));
1625                        thread__put(child);
1626                }
1627        } else
1628                goto signed_print;
1629
1630        fputc('\n', trace->output);
1631
1632        if (callchain_ret > 0)
1633                trace__fprintf_callchain(trace, sample);
1634        else if (callchain_ret < 0)
1635                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1636out:
1637        ttrace->entry_pending = false;
1638        err = 0;
1639out_put:
1640        thread__put(thread);
1641        return err;
1642}
1643
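    /*
     * probe:vfs_getname handler: stash the pathname being resolved for the
     * current syscall and, if the formatted entry left a placeholder for
     * it (ttrace->filename.ptr/entry_str_pos), splice the string into
     * ttrace->entry_str so open()/openat() & friends show the filename
     * instead of just a pointer value.
     */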
1644static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1645                              union perf_event *event __maybe_unused,
1646                              struct perf_sample *sample)
1647{
1648        struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1649        struct thread_trace *ttrace;
1650        size_t filename_len, entry_str_len, to_move;
1651        ssize_t remaining_space;
1652        char *pos;
1653        const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1654
1655        if (!thread)
1656                goto out;
1657
1658        ttrace = thread__priv(thread);
1659        if (!ttrace)
1660                goto out;
1661
1662        filename_len = strlen(filename);
1663
1664        if (ttrace->filename.namelen < filename_len) {
1665                char *f = realloc(ttrace->filename.name, filename_len + 1);
1666
1667                if (f == NULL)
1668                        goto out;
1669
1670                ttrace->filename.namelen = filename_len;
1671                ttrace->filename.name = f;
1672        }
1673
1674        strcpy(ttrace->filename.name, filename);
1675        ttrace->filename.pending_open = true;
1676
1677        if (!ttrace->filename.ptr)
1678                goto out;
1679
1680        entry_str_len = strlen(ttrace->entry_str);
1681        remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1682        if (remaining_space <= 0)
1683                goto out;
1684
1685        if (filename_len > (size_t)remaining_space) {
1686                filename += filename_len - remaining_space;
1687                filename_len = remaining_space;
1688        }
1689
1690        to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1691        pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1692        memmove(pos + filename_len, pos, to_move);
1693        memcpy(pos, filename, filename_len);
1694
1695        ttrace->filename.ptr = 0;
1696        ttrace->filename.entry_str_pos = 0;
1697out:
1698        return 0;
1699}
1700
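    /*
     * sched:sched_stat_runtime handler (enabled with --sched): accumulate
     * per-thread and global runtime in milliseconds for the summary
     * output, dumping the raw fields if we can't get a thread_trace.
     */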
1701static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1702                                     union perf_event *event __maybe_unused,
1703                                     struct perf_sample *sample)
1704{
1705        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1706        double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1707        struct thread *thread = machine__findnew_thread(trace->host,
1708                                                        sample->pid,
1709                                                        sample->tid);
1710        struct thread_trace *ttrace = thread__trace(thread, trace->output);
1711
1712        if (ttrace == NULL)
1713                goto out_dump;
1714
1715        ttrace->runtime_ms += runtime_ms;
1716        trace->runtime_ms += runtime_ms;
1717        thread__put(thread);
1718        return 0;
1719
1720out_dump:
1721        fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1722               evsel->name,
1723               perf_evsel__strval(evsel, sample, "comm"),
1724               (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1725               runtime,
1726               perf_evsel__intval(evsel, sample, "vruntime"));
1727        thread__put(thread);
1728        return 0;
1729}
1730
1731static void bpf_output__printer(enum binary_printer_ops op,
1732                                unsigned int val, void *extra)
1733{
1734        FILE *output = extra;
1735        unsigned char ch = (unsigned char)val;
1736
1737        switch (op) {
1738        case BINARY_PRINT_CHAR_DATA:
1739                fprintf(output, "%c", isprint(ch) ? ch : '.');
1740                break;
1741        case BINARY_PRINT_DATA_BEGIN:
1742        case BINARY_PRINT_LINE_BEGIN:
1743        case BINARY_PRINT_ADDR:
1744        case BINARY_PRINT_NUM_DATA:
1745        case BINARY_PRINT_NUM_PAD:
1746        case BINARY_PRINT_SEP:
1747        case BINARY_PRINT_CHAR_PAD:
1748        case BINARY_PRINT_LINE_END:
1749        case BINARY_PRINT_DATA_END:
1750        default:
1751                break;
1752        }
1753}
1754
1755static void bpf_output__fprintf(struct trace *trace,
1756                                struct perf_sample *sample)
1757{
1758        print_binary(sample->raw_data, sample->raw_size, 8,
1759                     bpf_output__printer, trace->output);
1760}
1761
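    /*
     * Handler used for the extra events (e.g. those added via --event):
     * print a timestamp, the event name and either its raw BPF output
     * payload or its tracepoint fields via event_format__fprintf(), plus
     * an optional callchain.
     */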
1762static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1763                                union perf_event *event __maybe_unused,
1764                                struct perf_sample *sample)
1765{
1766        int callchain_ret = 0;
1767
1768        if (sample->callchain) {
1769                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1770                if (callchain_ret == 0) {
1771                        if (callchain_cursor.nr < trace->min_stack)
1772                                goto out;
1773                        callchain_ret = 1;
1774                }
1775        }
1776
1777        trace__printf_interrupted_entry(trace, sample);
1778        trace__fprintf_tstamp(trace, sample->time, trace->output);
1779
1780        if (trace->trace_syscalls)
1781                fprintf(trace->output, "(         ): ");
1782
1783        fprintf(trace->output, "%s:", evsel->name);
1784
1785        if (perf_evsel__is_bpf_output(evsel)) {
1786                bpf_output__fprintf(trace, sample);
1787        } else if (evsel->tp_format) {
1788                event_format__fprintf(evsel->tp_format, sample->cpu,
1789                                      sample->raw_data, sample->raw_size,
1790                                      trace->output);
1791        }
1792
1793        fprintf(trace->output, ")\n");
1794
1795        if (callchain_ret > 0)
1796                trace__fprintf_callchain(trace, sample);
1797        else if (callchain_ret < 0)
1798                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1799out:
1800        return 0;
1801}
1802
1803static void print_location(FILE *f, struct perf_sample *sample,
1804                           struct addr_location *al,
1805                           bool print_dso, bool print_sym)
1806{
1807
1808        if ((verbose || print_dso) && al->map)
1809                fprintf(f, "%s@", al->map->dso->long_name);
1810
1811        if ((verbose || print_sym) && al->sym)
1812                fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1813                        al->addr - al->sym->start);
1814        else if (al->map)
1815                fprintf(f, "0x%" PRIx64, al->addr);
1816        else
1817                fprintf(f, "0x%" PRIx64, sample->addr);
1818}
1819
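    /*
     * Page fault handler (-F/--pf): counts major/minor faults per thread
     * and, unless -s/--summary is in effect, prints a line roughly like
     * (illustrative):
     *
     *   ... majfault [some_symbol+0x10] => /usr/lib64/libc.so.6@0x12345 (d.)
     *
     * where the trailing characters are the map type ('d'ata, e'x'ecutable
     * or '?') and the map level.
     */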
1820static int trace__pgfault(struct trace *trace,
1821                          struct perf_evsel *evsel,
1822                          union perf_event *event __maybe_unused,
1823                          struct perf_sample *sample)
1824{
1825        struct thread *thread;
1826        struct addr_location al;
1827        char map_type = 'd';
1828        struct thread_trace *ttrace;
1829        int err = -1;
1830        int callchain_ret = 0;
1831
1832        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1833
1834        if (sample->callchain) {
1835                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1836                if (callchain_ret == 0) {
1837                        if (callchain_cursor.nr < trace->min_stack)
1838                                goto out_put;
1839                        callchain_ret = 1;
1840                }
1841        }
1842
1843        ttrace = thread__trace(thread, trace->output);
1844        if (ttrace == NULL)
1845                goto out_put;
1846
1847        if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1848                ttrace->pfmaj++;
1849        else
1850                ttrace->pfmin++;
1851
1852        if (trace->summary_only)
1853                goto out;
1854
1855        thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1856                              sample->ip, &al);
1857
1858        trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1859
1860        fprintf(trace->output, "%sfault [",
1861                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1862                "maj" : "min");
1863
1864        print_location(trace->output, sample, &al, false, true);
1865
1866        fprintf(trace->output, "] => ");
1867
1868        thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1869                                   sample->addr, &al);
1870
1871        if (!al.map) {
1872                thread__find_addr_location(thread, sample->cpumode,
1873                                           MAP__FUNCTION, sample->addr, &al);
1874
1875                if (al.map)
1876                        map_type = 'x';
1877                else
1878                        map_type = '?';
1879        }
1880
1881        print_location(trace->output, sample, &al, true, false);
1882
1883        fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1884
1885        if (callchain_ret > 0)
1886                trace__fprintf_callchain(trace, sample);
1887        else if (callchain_ret < 0)
1888                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1889out:
1890        err = 0;
1891out_put:
1892        thread__put(thread);
1893        return err;
1894}
1895
1896static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1897{
1898        if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1899            (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1900                return false;
1901
1902        if (trace->pid_list || trace->tid_list)
1903                return true;
1904
1905        return false;
1906}
1907
1908static void trace__set_base_time(struct trace *trace,
1909                                 struct perf_evsel *evsel,
1910                                 struct perf_sample *sample)
1911{
1912        /*
1913         * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1914         * and don't use sample->time unconditionally, we may end up having
1915         * some other event in the future without PERF_SAMPLE_TIME for good
1916         * reason, i.e. we may not be interested in its timestamps, just in
1917         * it taking place, picking some piece of information when it
1918         * appears in our event stream (vfs_getname comes to mind).
1919         */
1920        if (trace->base_time == 0 && !trace->full_time &&
1921            (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1922                trace->base_time = sample->time;
1923}
1924
1925static int trace__process_sample(struct perf_tool *tool,
1926                                 union perf_event *event,
1927                                 struct perf_sample *sample,
1928                                 struct perf_evsel *evsel,
1929                                 struct machine *machine __maybe_unused)
1930{
1931        struct trace *trace = container_of(tool, struct trace, tool);
1932        int err = 0;
1933
1934        tracepoint_handler handler = evsel->handler;
1935
1936        if (skip_sample(trace, sample))
1937                return 0;
1938
1939        trace__set_base_time(trace, evsel, sample);
1940
1941        if (handler) {
1942                ++trace->nr_events;
1943                handler(trace, evsel, event, sample);
1944        }
1945
1946        return err;
1947}
1948
1949static int parse_target_str(struct trace *trace)
1950{
1951        if (trace->opts.target.pid) {
1952                trace->pid_list = intlist__new(trace->opts.target.pid);
1953                if (trace->pid_list == NULL) {
1954                        pr_err("Error parsing process id string\n");
1955                        return -EINVAL;
1956                }
1957        }
1958
1959        if (trace->opts.target.tid) {
1960                trace->tid_list = intlist__new(trace->opts.target.tid);
1961                if (trace->tid_list == NULL) {
1962                        pr_err("Error parsing thread id string\n");
1963                        return -EINVAL;
1964                }
1965        }
1966
1967        return 0;
1968}
1969
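    /*
     * 'perf trace record' just execs 'perf record' with a canned argument
     * list; with syscall tracing enabled this is roughly equivalent to:
     *
     *   perf record -R -m 1024 -c 1 \
     *        -e raw_syscalls:sys_enter,raw_syscalls:sys_exit [user args]
     *
     * falling back to the older syscalls:sys_enter/sys_exit tracepoints
     * and appending -e major-faults / -e minor-faults when -F/--pf is
     * used.
     */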
1970static int trace__record(struct trace *trace, int argc, const char **argv)
1971{
1972        unsigned int rec_argc, i, j;
1973        const char **rec_argv;
1974        const char * const record_args[] = {
1975                "record",
1976                "-R",
1977                "-m", "1024",
1978                "-c", "1",
1979        };
1980
1981        const char * const sc_args[] = { "-e", };
1982        unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1983        const char * const majpf_args[] = { "-e", "major-faults" };
1984        unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1985        const char * const minpf_args[] = { "-e", "minor-faults" };
1986        unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1987
1988        /* +1 is for the event string below */
1989        rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1990                majpf_args_nr + minpf_args_nr + argc;
1991        rec_argv = calloc(rec_argc + 1, sizeof(char *));
1992
1993        if (rec_argv == NULL)
1994                return -ENOMEM;
1995
1996        j = 0;
1997        for (i = 0; i < ARRAY_SIZE(record_args); i++)
1998                rec_argv[j++] = record_args[i];
1999
2000        if (trace->trace_syscalls) {
2001                for (i = 0; i < sc_args_nr; i++)
2002                        rec_argv[j++] = sc_args[i];
2003
2004                /* event string may be different for older kernels - e.g., RHEL6 */
2005                if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2006                        rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2007                else if (is_valid_tracepoint("syscalls:sys_enter"))
2008                        rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2009                else {
2010                        pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2011                        return -1;
2012                }
2013        }
2014
2015        if (trace->trace_pgfaults & TRACE_PFMAJ)
2016                for (i = 0; i < majpf_args_nr; i++)
2017                        rec_argv[j++] = majpf_args[i];
2018
2019        if (trace->trace_pgfaults & TRACE_PFMIN)
2020                for (i = 0; i < minpf_args_nr; i++)
2021                        rec_argv[j++] = minpf_args[i];
2022
2023        for (i = 0; i < (unsigned int)argc; i++)
2024                rec_argv[j++] = argv[i];
2025
2026        return cmd_record(j, rec_argv, NULL);
2027}
2028
2029static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2030
2031static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2032{
2033        struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2034
2035        if (IS_ERR(evsel))
2036                return false;
2037
2038        if (perf_evsel__field(evsel, "pathname") == NULL) {
2039                perf_evsel__delete(evsel);
2040                return false;
2041        }
2042
2043        evsel->handler = trace__vfs_getname;
2044        perf_evlist__add(evlist, evsel);
2045        return true;
2046}
2047
2048static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2049{
2050        struct perf_evsel *evsel;
2051        struct perf_event_attr attr = {
2052                .type = PERF_TYPE_SOFTWARE,
2053                .mmap_data = 1,
2054        };
2055
2056        attr.config = config;
2057        attr.sample_period = 1;
2058
2059        event_attr_init(&attr);
2060
2061        evsel = perf_evsel__new(&attr);
2062        if (evsel)
2063                evsel->handler = trace__pgfault;
2064
2065        return evsel;
2066}
2067
2068static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2069{
2070        const u32 type = event->header.type;
2071        struct perf_evsel *evsel;
2072
2073        if (type != PERF_RECORD_SAMPLE) {
2074                trace__process_event(trace, trace->host, event, sample);
2075                return;
2076        }
2077
2078        evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2079        if (evsel == NULL) {
2080                fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2081                return;
2082        }
2083
2084        trace__set_base_time(trace, evsel, sample);
2085
2086        if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2087            sample->raw_data == NULL) {
2088                fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2089                       perf_evsel__name(evsel), sample->tid,
2090                       sample->cpu, sample->raw_size);
2091        } else {
2092                tracepoint_handler handler = evsel->handler;
2093                handler(trace, evsel, event, sample);
2094        }
2095}
2096
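    /*
     * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint
     * evsels, wiring up the handlers above and initializing accessors for
     * the 'args'/'ret' payload fields; kernel frames are excluded from the
     * sys_exit callchain unless --kernel-syscall-graph is given.
     */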
2097static int trace__add_syscall_newtp(struct trace *trace)
2098{
2099        int ret = -1;
2100        struct perf_evlist *evlist = trace->evlist;
2101        struct perf_evsel *sys_enter, *sys_exit;
2102
2103        sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2104        if (sys_enter == NULL)
2105                goto out;
2106
2107        if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2108                goto out_delete_sys_enter;
2109
2110        sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2111        if (sys_exit == NULL)
2112                goto out_delete_sys_enter;
2113
2114        if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2115                goto out_delete_sys_exit;
2116
2117        perf_evlist__add(evlist, sys_enter);
2118        perf_evlist__add(evlist, sys_exit);
2119
2120        if (callchain_param.enabled && !trace->kernel_syscallchains) {
2121                /*
2122                 * We're interested only in the user space callchain
2123                 * leading to the syscall, allow overriding that for
2124                 * debugging reasons using --kernel-syscall-graph
2125                 */
2126                sys_exit->attr.exclude_callchain_kernel = 1;
2127        }
2128
2129        trace->syscalls.events.sys_enter = sys_enter;
2130        trace->syscalls.events.sys_exit  = sys_exit;
2131
2132        ret = 0;
2133out:
2134        return ret;
2135
2136out_delete_sys_exit:
2137        perf_evsel__delete_priv(sys_exit);
2138out_delete_sys_enter:
2139        perf_evsel__delete_priv(sys_enter);
2140        goto out;
2141}
2142
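    /*
     * Turn the -e/--expr syscall list into a tracepoint filter appended to
     * both sys_enter and sys_exit, roughly of the form
     * "id == 2 || id == 257" (or a chain of "id != ..." when the list is
     * negated); the exact string comes from asprintf_expr_inout_ints() and
     * the ids here are illustrative.
     */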
2143static int trace__set_ev_qualifier_filter(struct trace *trace)
2144{
2145        int err = -1;
2146        struct perf_evsel *sys_exit;
2147        char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2148                                                trace->ev_qualifier_ids.nr,
2149                                                trace->ev_qualifier_ids.entries);
2150
2151        if (filter == NULL)
2152                goto out_enomem;
2153
2154        if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2155                                          filter)) {
2156                sys_exit = trace->syscalls.events.sys_exit;
2157                err = perf_evsel__append_tp_filter(sys_exit, filter);
2158        }
2159
2160        free(filter);
2161out:
2162        return err;
2163out_enomem:
2164        errno = ENOMEM;
2165        goto out;
2166}
2167
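    /*
     * Live mode: build the evlist (syscall tracepoints, vfs_getname, page
     * fault and sched_stat_runtime events as requested), configure
     * callchains and filters, mmap the ring buffers, optionally fork the
     * workload, then loop parsing samples and dispatching them to the
     * handlers until interrupted or the workload exits.
     */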
2168static int trace__run(struct trace *trace, int argc, const char **argv)
2169{
2170        struct perf_evlist *evlist = trace->evlist;
2171        struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2172        int err = -1, i;
2173        unsigned long before;
2174        const bool forks = argc > 0;
2175        bool draining = false;
2176
2177        trace->live = true;
2178
2179        if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2180                goto out_error_raw_syscalls;
2181
2182        if (trace->trace_syscalls)
2183                trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2184
2185        if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2186                pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2187                if (pgfault_maj == NULL)
2188                        goto out_error_mem;
2189                perf_evlist__add(evlist, pgfault_maj);
2190        }
2191
2192        if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2193                pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2194                if (pgfault_min == NULL)
2195                        goto out_error_mem;
2196                perf_evlist__add(evlist, pgfault_min);
2197        }
2198
2199        if (trace->sched &&
2200            perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2201                                   trace__sched_stat_runtime))
2202                goto out_error_sched_stat_runtime;
2203
2204        err = perf_evlist__create_maps(evlist, &trace->opts.target);
2205        if (err < 0) {
2206                fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2207                goto out_delete_evlist;
2208        }
2209
2210        err = trace__symbols_init(trace, evlist);
2211        if (err < 0) {
2212                fprintf(trace->output, "Problems initializing symbol libraries!\n");
2213                goto out_delete_evlist;
2214        }
2215
2216        perf_evlist__config(evlist, &trace->opts, NULL);
2217
2218        if (callchain_param.enabled) {
2219                bool use_identifier = false;
2220
2221                if (trace->syscalls.events.sys_exit) {
2222                        perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2223                                                     &trace->opts, &callchain_param);
2224                        use_identifier = true;
2225                }
2226
2227                if (pgfault_maj) {
2228                        perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2229                        use_identifier = true;
2230                }
2231
2232                if (pgfault_min) {
2233                        perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2234                        use_identifier = true;
2235                }
2236
2237                if (use_identifier) {
2238                       /*
2239                        * Now we have evsels with different sample_ids, use
2240                        * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2241                        * from a fixed position in each ring buffer record.
2242                        *
2243                        * As of the changeset introducing this comment, this
2244                        * isn't strictly needed, as the fields that can come before
2245                        * PERF_SAMPLE_ID are all used, but we'll probably disable
2246                        * some of those for things like copying the payload of
2247                        * pointer syscall arguments, and for vfs_getname we don't
2248                        * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2249                        * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2250                        */
2251                        perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2252                        perf_evlist__reset_sample_bit(evlist, ID);
2253                }
2254        }
2255
2256        signal(SIGCHLD, sig_handler);
2257        signal(SIGINT, sig_handler);
2258
2259        if (forks) {
2260                err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2261                                                    argv, false, NULL);
2262                if (err < 0) {
2263                        fprintf(trace->output, "Couldn't run the workload!\n");
2264                        goto out_delete_evlist;
2265                }
2266        }
2267
2268        err = perf_evlist__open(evlist);
2269        if (err < 0)
2270                goto out_error_open;
2271
2272        err = bpf__apply_obj_config();
2273        if (err) {
2274                char errbuf[BUFSIZ];
2275
2276                bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2277                pr_err("ERROR: Apply config to BPF failed: %s\n",
2278                         errbuf);
2279                goto out_error_open;
2280        }
2281
2282        /*
2283         * Better not use !target__has_task() here because we need to cover the
2284         * case where no threads were specified in the command line, but a
2285         * workload was, and in that case we will fill in the thread_map when
2286         * we fork the workload in perf_evlist__prepare_workload.
2287         */
2288        if (trace->filter_pids.nr > 0)
2289                err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2290        else if (thread_map__pid(evlist->threads, 0) == -1)
2291                err = perf_evlist__set_filter_pid(evlist, getpid());
2292
2293        if (err < 0)
2294                goto out_error_mem;
2295
2296        if (trace->ev_qualifier_ids.nr > 0) {
2297                err = trace__set_ev_qualifier_filter(trace);
2298                if (err < 0)
2299                        goto out_errno;
2300
2301                pr_debug("event qualifier tracepoint filter: %s\n",
2302                         trace->syscalls.events.sys_exit->filter);
2303        }
2304
2305        err = perf_evlist__apply_filters(evlist, &evsel);
2306        if (err < 0)
2307                goto out_error_apply_filters;
2308
2309        err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2310        if (err < 0)
2311                goto out_error_mmap;
2312
2313        if (!target__none(&trace->opts.target))
2314                perf_evlist__enable(evlist);
2315
2316        if (forks)
2317                perf_evlist__start_workload(evlist);
2318
2319        trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2320                                  evlist->threads->nr > 1 ||
2321                                  perf_evlist__first(evlist)->attr.inherit;
2322again:
2323        before = trace->nr_events;
2324
2325        for (i = 0; i < evlist->nr_mmaps; i++) {
2326                union perf_event *event;
2327
2328                while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2329                        struct perf_sample sample;
2330
2331                        ++trace->nr_events;
2332
2333                        err = perf_evlist__parse_sample(evlist, event, &sample);
2334                        if (err) {
2335                                fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2336                                goto next_event;
2337                        }
2338
2339                        trace__handle_event(trace, event, &sample);
2340next_event:
2341                        perf_evlist__mmap_consume(evlist, i);
2342
2343                        if (interrupted)
2344                                goto out_disable;
2345
2346                        if (done && !draining) {
2347                                perf_evlist__disable(evlist);
2348                                draining = true;
2349                        }
2350                }
2351        }
2352
2353        if (trace->nr_events == before) {
2354                int timeout = done ? 100 : -1;
2355
2356                if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2357                        if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2358                                draining = true;
2359
2360                        goto again;
2361                }
2362        } else {
2363                goto again;
2364        }
2365
2366out_disable:
2367        thread__zput(trace->current);
2368
2369        perf_evlist__disable(evlist);
2370
2371        if (!err) {
2372                if (trace->summary)
2373                        trace__fprintf_thread_summary(trace, trace->output);
2374
2375                if (trace->show_tool_stats) {
2376                        fprintf(trace->output, "Stats:\n "
2377                                               " vfs_getname : %" PRIu64 "\n"
2378                                               " proc_getname: %" PRIu64 "\n",
2379                                trace->stats.vfs_getname,
2380                                trace->stats.proc_getname);
2381                }
2382        }
2383
2384out_delete_evlist:
2385        perf_evlist__delete(evlist);
2386        trace->evlist = NULL;
2387        trace->live = false;
2388        return err;
2389{
2390        char errbuf[BUFSIZ];
2391
2392out_error_sched_stat_runtime:
2393        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2394        goto out_error;
2395
2396out_error_raw_syscalls:
2397        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2398        goto out_error;
2399
2400out_error_mmap:
2401        perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2402        goto out_error;
2403
2404out_error_open:
2405        perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2406
2407out_error:
2408        fprintf(trace->output, "%s\n", errbuf);
2409        goto out_delete_evlist;
2410
2411out_error_apply_filters:
2412        fprintf(trace->output,
2413                "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2414                evsel->filter, perf_evsel__name(evsel), errno,
2415                str_error_r(errno, errbuf, sizeof(errbuf)));
2416        goto out_delete_evlist;
2417}
2418out_error_mem:
2419        fprintf(trace->output, "Not enough memory to run!\n");
2420        goto out_delete_evlist;
2421
2422out_errno:
2423        fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2424        goto out_delete_evlist;
2425}
2426
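    /*
     * 'perf trace -i perf.data': replay a previously recorded session by
     * hooking the same sys_enter/sys_exit, page fault and vfs_getname
     * handlers into a perf_session and processing its events in timestamp
     * order.
     */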
2427static int trace__replay(struct trace *trace)
2428{
2429        const struct perf_evsel_str_handler handlers[] = {
2430                { "probe:vfs_getname",       trace__vfs_getname, },
2431        };
2432        struct perf_data_file file = {
2433                .path  = input_name,
2434                .mode  = PERF_DATA_MODE_READ,
2435                .force = trace->force,
2436        };
2437        struct perf_session *session;
2438        struct perf_evsel *evsel;
2439        int err = -1;
2440
2441        trace->tool.sample        = trace__process_sample;
2442        trace->tool.mmap          = perf_event__process_mmap;
2443        trace->tool.mmap2         = perf_event__process_mmap2;
2444        trace->tool.comm          = perf_event__process_comm;
2445        trace->tool.exit          = perf_event__process_exit;
2446        trace->tool.fork          = perf_event__process_fork;
2447        trace->tool.attr          = perf_event__process_attr;
2448        trace->tool.tracing_data  = perf_event__process_tracing_data;
2449        trace->tool.build_id      = perf_event__process_build_id;
2450
2451        trace->tool.ordered_events = true;
2452        trace->tool.ordering_requires_timestamps = true;
2453
2454        /* add tid to output */
2455        trace->multiple_threads = true;
2456
2457        session = perf_session__new(&file, false, &trace->tool);
2458        if (session == NULL)
2459                return -1;
2460
2461        if (symbol__init(&session->header.env) < 0)
2462                goto out;
2463
2464        trace->host = &session->machines.host;
2465
2466        err = perf_session__set_tracepoints_handlers(session, handlers);
2467        if (err)
2468                goto out;
2469
2470        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2471                                                     "raw_syscalls:sys_enter");
2472        /* older kernels have syscalls tp versus raw_syscalls */
2473        if (evsel == NULL)
2474                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2475                                                             "syscalls:sys_enter");
2476
2477        if (evsel &&
2478            (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2479            perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2480                pr_err("Error initializing raw_syscalls:sys_enter event\n");
2481                goto out;
2482        }
2483
2484        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2485                                                     "raw_syscalls:sys_exit");
2486        if (evsel == NULL)
2487                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2488                                                             "syscalls:sys_exit");
2489        if (evsel &&
2490            (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2491            perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2492                pr_err("Error initializing raw_syscalls:sys_exit event\n");
2493                goto out;
2494        }
2495
2496        evlist__for_each_entry(session->evlist, evsel) {
2497                if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2498                    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2499                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2500                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2501                        evsel->handler = trace__pgfault;
2502        }
2503
2504        err = parse_target_str(trace);
2505        if (err != 0)
2506                goto out;
2507
2508        setup_pager();
2509
2510        err = perf_session__process_events(session);
2511        if (err)
2512                pr_err("Failed to process events, error %d\n", err);
2513
2514        else if (trace->summary)
2515                trace__fprintf_thread_summary(trace, trace->output);
2516
2517out:
2518        perf_session__delete(session);
2519
2520        return err;
2521}
2522
2523static size_t trace__fprintf_threads_header(FILE *fp)
2524{
2525        size_t printed;
2526
2527        printed  = fprintf(fp, "\n Summary of events:\n\n");
2528
2529        return printed;
2530}
2531
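    /*
     * Resort the per-thread syscall_stats intlist into an rbtree ordered
     * by total time spent (msecs, descending), so thread__dump_stats() can
     * print the costliest syscalls first.
     */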
2532DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2533        struct stats    *stats;
2534        double          msecs;
2535        int             syscall;
2536)
2537{
2538        struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2539        struct stats *stats = source->priv;
2540
2541        entry->syscall = source->i;
2542        entry->stats   = stats;
2543        entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2544}
2545
2546static size_t thread__dump_stats(struct thread_trace *ttrace,
2547                                 struct trace *trace, FILE *fp)
2548{
2549        size_t printed = 0;
2550        struct syscall *sc;
2551        struct rb_node *nd;
2552        DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2553
2554        if (syscall_stats == NULL)
2555                return 0;
2556
2557        printed += fprintf(fp, "\n");
2558
2559        printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2560        printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2561        printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2562
2563        resort_rb__for_each_entry(nd, syscall_stats) {
2564                struct stats *stats = syscall_stats_entry->stats;
2565                if (stats) {
2566                        double min = (double)(stats->min) / NSEC_PER_MSEC;
2567                        double max = (double)(stats->max) / NSEC_PER_MSEC;
2568                        double avg = avg_stats(stats);
2569                        double pct;
2570                        u64 n = (u64) stats->n;
2571
2572                        pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2573                        avg /= NSEC_PER_MSEC;
2574
2575                        sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2576                        printed += fprintf(fp, "   %-15s", sc->name);
2577                        printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2578                                           n, syscall_stats_entry->msecs, min, avg);
2579                        printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2580                }
2581        }
2582
2583        resort_rb__delete(syscall_stats);
2584        printed += fprintf(fp, "\n\n");
2585
2586        return printed;
2587}
2588
2589static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2590{
2591        size_t printed = 0;
2592        struct thread_trace *ttrace = thread__priv(thread);
2593        double ratio;
2594
2595        if (ttrace == NULL)
2596                return 0;
2597
2598        ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2599
2600        printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2601        printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2602        printed += fprintf(fp, "%.1f%%", ratio);
2603        if (ttrace->pfmaj)
2604                printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2605        if (ttrace->pfmin)
2606                printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2607        if (trace->sched)
2608                printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2609        else if (fputc('\n', fp) != EOF)
2610                ++printed;
2611
2612        printed += thread__dump_stats(ttrace, trace, fp);
2613
2614        return printed;
2615}
2616
2617static unsigned long thread__nr_events(struct thread_trace *ttrace)
2618{
2619        return ttrace ? ttrace->nr_events : 0;
2620}
2621
2622DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2623        struct thread *thread;
2624)
2625{
2626        entry->thread = rb_entry(nd, struct thread, rb_node);
2627}
2628
2629static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2630{
2631        DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2632        size_t printed = trace__fprintf_threads_header(fp);
2633        struct rb_node *nd;
2634
2635        if (threads == NULL) {
2636                fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2637                return 0;
2638        }
2639
2640        resort_rb__for_each_entry(nd, threads)
2641                printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2642
2643        resort_rb__delete(threads);
2644
2645        return printed;
2646}
2647
2648static int trace__set_duration(const struct option *opt, const char *str,
2649                               int unset __maybe_unused)
2650{
2651        struct trace *trace = opt->value;
2652
2653        trace->duration_filter = atof(str);
2654        return 0;
2655}
2656
2657static int trace__set_filter_pids(const struct option *opt, const char *str,
2658                                  int unset __maybe_unused)
2659{
2660        int ret = -1;
2661        size_t i;
2662        struct trace *trace = opt->value;
2663        /*
2664         * FIXME: introduce a intarray class, plain parse csv and create a
2665         * { int nr, int entries[] } struct...
2666         */
2667        struct intlist *list = intlist__new(str);
2668
2669        if (list == NULL)
2670                return -1;
2671
2672        i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2673        trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2674
2675        if (trace->filter_pids.entries == NULL)
2676                goto out;
2677
2678        trace->filter_pids.entries[0] = getpid();
2679
2680        for (i = 1; i < trace->filter_pids.nr; ++i)
2681                trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2682
2683        intlist__delete(list);
2684        ret = 0;
2685out:
2686        return ret;
2687}
2688
2689static int trace__open_output(struct trace *trace, const char *filename)
2690{
2691        struct stat st;
2692
2693        if (!stat(filename, &st) && st.st_size) {
2694                char oldname[PATH_MAX];
2695
2696                scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2697                unlink(oldname);
2698                rename(filename, oldname);
2699        }
2700
2701        trace->output = fopen(filename, "w");
2702
2703        return trace->output == NULL ? -errno : 0;
2704}
2705
2706static int parse_pagefaults(const struct option *opt, const char *str,
2707                            int unset __maybe_unused)
2708{
2709        int *trace_pgfaults = opt->value;
2710
2711        if (strcmp(str, "all") == 0)
2712                *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2713        else if (strcmp(str, "maj") == 0)
2714                *trace_pgfaults |= TRACE_PFMAJ;
2715        else if (strcmp(str, "min") == 0)
2716                *trace_pgfaults |= TRACE_PFMIN;
2717        else
2718                return -1;
2719
2720        return 0;
2721}
2722
2723static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2724{
2725        struct perf_evsel *evsel;
2726
2727        evlist__for_each_entry(evlist, evsel)
2728                evsel->handler = handler;
2729}
2730
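    /*
     * Entry point for 'perf trace'.  A typical invocation (illustrative)
     * traces a couple of syscalls on an existing process and prints a
     * summary at the end:
     *
     *   perf trace -e open,close -p 1234 -S
     *
     * See the option table below for the full set of knobs.
     */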
2731int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2732{
2733        const char *trace_usage[] = {
2734                "perf trace [<options>] [<command>]",
2735                "perf trace [<options>] -- <command> [<options>]",
2736                "perf trace record [<options>] [<command>]",
2737                "perf trace record [<options>] -- <command> [<options>]",
2738                NULL
2739        };
2740        struct trace trace = {
2741                .syscalls = {
2742                        .max = -1,
2743                },
2744                .opts = {
2745                        .target = {
2746                                .uid       = UINT_MAX,
2747                                .uses_mmap = true,
2748                        },
2749                        .user_freq     = UINT_MAX,
2750                        .user_interval = ULLONG_MAX,
2751                        .no_buffering  = true,
2752                        .mmap_pages    = UINT_MAX,
2753                        .proc_map_timeout  = 500,
2754                },
2755                .output = stderr,
2756                .show_comm = true,
2757                .trace_syscalls = true,
2758                .kernel_syscallchains = false,
2759                .max_stack = UINT_MAX,
2760        };
2761        const char *output_name = NULL;
2762        const char *ev_qualifier_str = NULL;
2763        const struct option trace_options[] = {
2764        OPT_CALLBACK(0, "event", &trace.evlist, "event",
2765                     "event selector. use 'perf list' to list available events",
2766                     parse_events_option),
2767        OPT_BOOLEAN(0, "comm", &trace.show_comm,
2768                    "show the thread COMM next to its id"),
2769        OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2770        OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2771        OPT_STRING('o', "output", &output_name, "file", "output file name"),
2772        OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2773        OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2774                    "trace events on existing process id"),
2775        OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2776                    "trace events on existing thread id"),
2777        OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2778                     "pids to filter (by the kernel)", trace__set_filter_pids),
2779        OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2780                    "system-wide collection from all CPUs"),
2781        OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2782                    "list of cpus to monitor"),
2783        OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2784                    "child tasks do not inherit counters"),
2785        OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2786                     "number of mmap data pages",
2787                     perf_evlist__parse_mmap_pages),
2788        OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2789                   "user to profile"),
2790        OPT_CALLBACK(0, "duration", &trace, "float",
2791                     "show only events with duration > N.M ms",
2792                     trace__set_duration),
2793        OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2794        OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2795        OPT_BOOLEAN('T', "time", &trace.full_time,
2796                    "Show full timestamp, not time relative to first start"),
2797        OPT_BOOLEAN('s', "summary", &trace.summary_only,
2798                    "Show only syscall summary with statistics"),
2799        OPT_BOOLEAN('S', "with-summary", &trace.summary,
2800                    "Show all syscalls and summary with statistics"),
2801        OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2802                     "Trace pagefaults", parse_pagefaults, "maj"),
2803        OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2804        OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2805        OPT_CALLBACK(0, "call-graph", &trace.opts,
2806                     "record_mode[,record_size]", record_callchain_help,
2807                     &record_parse_callchain_opt),
2808        OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2809                    "Show the kernel callchains on the syscall exit path"),
2810        OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2811                     "Set the minimum stack depth when parsing the callchain, "
2812                     "anything below the specified depth will be ignored."),
2813        OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2814                     "Set the maximum stack depth when parsing the callchain, "
2815                     "anything beyond the specified depth will be ignored. "
2816                     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2817        OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2818                        "per thread proc mmap processing timeout in ms"),
2819        OPT_END()
2820        };
2821        bool __maybe_unused max_stack_user_set = true;
2822        bool mmap_pages_user_set = true;
2823        const char * const trace_subcommands[] = { "record", NULL };
2824        int err;
2825        char bf[BUFSIZ];
2826
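            /* Have perf dump its own backtrace if the tool itself crashes. */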
2827        signal(SIGSEGV, sighandler_dump_stack);
2828        signal(SIGFPE, sighandler_dump_stack);
2829
2830        trace.evlist = perf_evlist__new();
2831        trace.sctbl = syscalltbl__new();
2832
2833        if (trace.evlist == NULL || trace.sctbl == NULL) {
2834                pr_err("Not enough memory to run!\n");
2835                err = -ENOMEM;
2836                goto out;
2837        }
2838
2839        argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2840                                 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2841
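            /*
             * Add the bpf-output event used by eBPF programs (loaded via
             * --event) to stream their output into this session.
             */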
2842        err = bpf__setup_stdout(trace.evlist);
2843        if (err) {
2844                bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2845                pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2846                goto out;
2847        }
2848
2849        err = -1;
2850
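            /* Page fault tracing needs the faulting address and per-sample timestamps. */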
2851        if (trace.trace_pgfaults) {
2852                trace.opts.sample_address = true;
2853                trace.opts.sample_time = true;
2854        }
2855
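            /*
             * UINT_MAX here means the user did not pass --mmap-pages/--max-stack,
             * remember that so suitable defaults can be chosen below.
             */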
2856        if (trace.opts.mmap_pages == UINT_MAX)
2857                mmap_pages_user_set = false;
2858
2859        if (trace.max_stack == UINT_MAX) {
2860                trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2861                max_stack_user_set = false;
2862        }
2863
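            /*
             * Stack depth options given without an explicit --call-graph default
             * to DWARF unwinding, when perf was built with support for it.
             */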
2864#ifdef HAVE_DWARF_UNWIND_SUPPORT
2865        if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2866                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2867#endif
2868
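            /*
             * Callchains make samples much bigger: when running as root and the
             * user did not size the buffers, bump them to 4x the mlock limit.
             */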
2869        if (callchain_param.enabled) {
2870                if (!mmap_pages_user_set && geteuid() == 0)
2871                        trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2872
2873                symbol_conf.use_callchain = true;
2874        }
2875
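            /* Extra events requested with --event all share the generic event printer. */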
2876        if (trace.evlist->nr_entries > 0)
2877                evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2878
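            /* "perf trace record" is implemented on top of 'perf record'. */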
2879        if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2880                return trace__record(&trace, argc-1, &argv[1]);
2881
2882        /* summary_only implies summary option, but don't overwrite summary if set */
2883        if (trace.summary_only)
2884                trace.summary = trace.summary_only;
2885
2886        if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2887            trace.evlist->nr_entries == 0 /* Was --event used? */) {
2888                pr_err("Please specify something to trace.\n");
2889                return -1;
2890        }
2891
2892        if (!trace.trace_syscalls && ev_qualifier_str) {
2893                pr_err("The -e option can't be used with --no-syscalls.\n");
2894                goto out;
2895        }
2896
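            /* -o/--output: send the trace to a file instead of stderr. */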
2897        if (output_name != NULL) {
2898                err = trace__open_output(&trace, output_name);
2899                if (err < 0) {
2900                        perror("failed to create output file");
2901                        goto out;
2902                }
2903        }
2904
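            /*
             * Cache the id of the "open" syscall: it is checked on syscall exit
             * to pair pathnames captured by the vfs_getname probe with the file
             * descriptor being opened.
             */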
2905        trace.open_id = syscalltbl__id(trace.sctbl, "open");
2906
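            /*
             * -e/--expr list of syscalls: a leading '!' negates the filter, and
             * names may also refer to strace-like groups under STRACE_GROUPS_DIR.
             */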
2907        if (ev_qualifier_str != NULL) {
2908                const char *s = ev_qualifier_str;
2909                struct strlist_config slist_config = {
2910                        .dirname = system_path(STRACE_GROUPS_DIR),
2911                };
2912
2913                trace.not_ev_qualifier = *s == '!';
2914                if (trace.not_ev_qualifier)
2915                        ++s;
2916                trace.ev_qualifier = strlist__new(s, &slist_config);
2917                if (trace.ev_qualifier == NULL) {
2918                        fputs("Not enough memory to parse event qualifier\n",
2919                              trace.output);
2920                        err = -ENOMEM;
2921                        goto out_close;
2922                }
2923
2924                err = trace__validate_ev_qualifier(&trace);
2925                if (err)
2926                        goto out_close;
2927        }
2928
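            /* Validate the --pid/--tid/--cpu/--uid target and resolve the uid string. */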
2929        err = target__validate(&trace.opts.target);
2930        if (err) {
2931                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2932                fprintf(trace.output, "%s", bf);
2933                goto out_close;
2934        }
2935
2936        err = target__parse_uid(&trace.opts.target);
2937        if (err) {
2938                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2939                fprintf(trace.output, "%s", bf);
2940                goto out_close;
2941        }
2942
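            /* No workload and no target specified: trace the whole system. */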
2943        if (!argc && target__none(&trace.opts.target))
2944                trace.opts.target.system_wide = true;
2945
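            /* With -i, replay events from a perf.data file; otherwise do a live run. */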
2946        if (input_name)
2947                err = trace__replay(&trace);
2948        else
2949                err = trace__run(&trace, argc, argv);
2950
2951out_close:
2952        if (output_name != NULL)
2953                fclose(trace.output);
2954out:
2955        return err;
2956}
2957