linux/tools/perf/builtin-trace.c
/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace-like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include <bpf/bpf.h>
#include "builtin.h"
#include "util/cgroup.h"
#include "util/color.h"
#include "util/config.h"
#include "util/debug.h"
#include "util/env.h"
#include "util/event.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <errno.h>
#include <inttypes.h>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>
#include <fcntl.h>
#include <sys/sysmacros.h>

#include "sane_ctype.h"

#ifndef O_CLOEXEC
# define O_CLOEXEC              02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE  1024
#endif

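/*
 * All the state for one 'perf trace' session: the evlist and host machine
 * being traced, the syscall table plus per-syscall formatting state, event
 * qualifiers and pid filters, and the output knobs set from the command
 * line.
 */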
struct trace {
        struct perf_tool        tool;
        struct syscalltbl       *sctbl;
        struct {
                int             max;
                struct syscall  *table;
                struct bpf_map  *map;
                struct {
                        struct perf_evsel *sys_enter,
                                          *sys_exit,
                                          *augmented;
                }               events;
        } syscalls;
        struct record_opts      opts;
        struct perf_evlist      *evlist;
        struct machine          *host;
        struct thread           *current;
        struct cgroup           *cgroup;
        u64                     base_time;
        FILE                    *output;
        unsigned long           nr_events;
        unsigned long           nr_events_printed;
        unsigned long           max_events;
        struct strlist          *ev_qualifier;
        struct {
                size_t          nr;
                int             *entries;
        }                       ev_qualifier_ids;
        struct {
                size_t          nr;
                pid_t           *entries;
                struct bpf_map  *map;
        }                       filter_pids;
        double                  duration_filter;
        double                  runtime_ms;
        struct {
                u64             vfs_getname,
                                proc_getname;
        } stats;
        unsigned int            max_stack;
        unsigned int            min_stack;
        int                     raw_augmented_syscalls_args_size;
        bool                    raw_augmented_syscalls;
        bool                    sort_events;
        bool                    not_ev_qualifier;
        bool                    live;
        bool                    full_time;
        bool                    sched;
        bool                    multiple_threads;
        bool                    summary;
        bool                    summary_only;
        bool                    failure_only;
        bool                    show_comm;
        bool                    print_sample;
        bool                    show_tool_stats;
        bool                    trace_syscalls;
        bool                    kernel_syscallchains;
        s16                     args_alignment;
        bool                    show_tstamp;
        bool                    show_duration;
        bool                    show_zeros;
        bool                    show_arg_names;
        bool                    show_string_prefix;
        bool                    force;
        bool                    vfs_getname;
        int                     trace_pgfaults;
        struct {
                struct ordered_events   data;
                u64                     last;
        } oe;
};

struct tp_field {
        int offset;
        union {
                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
        };
};

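/*
 * Generate fixed-width accessors for integer tracepoint fields. The value
 * is memcpy'ed out of the raw sample payload because the field may not be
 * naturally aligned in the ring buffer; the __SWAPPED variants additionally
 * byte-swap, for perf.data files recorded with the opposite endianness.
 */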
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
{
        field->offset = offset;

        switch (size) {
        case 1:
                field->integer = tp_field__u8;
                break;
        case 2:
                field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
                break;
        case 4:
                field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
                break;
        case 8:
                field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
                break;
        default:
                return -1;
        }

        return 0;
}

static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
{
        return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
        return sample->raw_data + field->offset;
}

static int __tp_field__init_ptr(struct tp_field *field, int offset)
{
        field->offset = offset;
        field->pointer = tp_field__ptr;
        return 0;
}

static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
{
        return __tp_field__init_ptr(field, format_field->offset);
}

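/*
 * Layout of a syscall tracepoint as seen by an evsel: 'id' is the syscall
 * number field; 'args' (sys_enter) and 'ret' (sys_exit) can share storage
 * because any given evsel is only ever one of the two.
 */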
struct syscall_tp {
        struct tp_field id;
        union {
                struct tp_field args, ret;
        };
};

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
                                          struct tp_field *field,
                                          const char *name)
{
        struct tep_format_field *format_field = perf_evsel__field(evsel, name);

        if (format_field == NULL)
                return -1;

        return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

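/*
 * Stringify the struct syscall_tp member name and use it as the tracepoint
 * field name too, e.g. perf_evsel__init_sc_tp_uint_field(evsel, id)
 * initializes sc->id from the tracepoint's "id" field.
 */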
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct tep_format_field *format_field = perf_evsel__field(evsel, name);

        if (format_field == NULL)
                return -1;

        return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
        zfree(&evsel->priv);
        perf_evsel__delete(evsel);
}

static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
{
        struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

        if (evsel->priv != NULL) {
                if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
                    perf_evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
                        goto out_delete;
                return 0;
        }

        return -ENOMEM;
out_delete:
        zfree(&evsel->priv);
        return -ENOENT;
}

static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel, struct perf_evsel *tp)
{
        struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

        if (evsel->priv != NULL) {
                struct tep_format_field *syscall_id = perf_evsel__field(tp, "id");
                if (syscall_id == NULL)
                        syscall_id = perf_evsel__field(tp, "__syscall_nr");
                if (syscall_id == NULL)
                        goto out_delete;
                if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
                        goto out_delete;

                return 0;
        }

        return -ENOMEM;
out_delete:
        zfree(&evsel->priv);
        return -EINVAL;
}

static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
{
        struct syscall_tp *sc = evsel->priv;

        return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}

static int perf_evsel__init_augmented_syscall_tp_ret(struct perf_evsel *evsel)
{
        struct syscall_tp *sc = evsel->priv;

        return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
}

static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
{
        evsel->priv = malloc(sizeof(struct syscall_tp));
        if (evsel->priv != NULL) {
                if (perf_evsel__init_sc_tp_uint_field(evsel, id))
                        goto out_delete;

                evsel->handler = handler;
                return 0;
        }

        return -ENOMEM;

out_delete:
        zfree(&evsel->priv);
        return -ENOENT;
}

static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

        /* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
        if (IS_ERR(evsel))
                evsel = perf_evsel__newtp("syscalls", direction);

        if (IS_ERR(evsel))
                return NULL;

        if (perf_evsel__init_raw_syscall_tp(evsel, handler))
                goto out_delete;

        return evsel;

out_delete:
        perf_evsel__delete_priv(evsel);
        return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })

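/*
 * Map an integer value to its symbolic name via a string table. For
 * example, with the "itimers" array defined below, a val of 1 prints as
 * "ITIMER_VIRTUAL" (or just "VIRTUAL" when prefixes are suppressed);
 * values outside the table fall back to intfmt plus a "PREFIX???" hint.
 */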
size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
{
        int idx = val - sa->offset;

        if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
                size_t printed = scnprintf(bf, size, intfmt, val);
                if (show_prefix)
                        printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
                return printed;
        }

        return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
}

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                                const char *intfmt,
                                                struct syscall_arg *arg)
{
        return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
{
        size_t printed;
        int i;

        for (i = 0; i < sas->nr_entries; ++i) {
                struct strarray *sa = sas->entries[i];
                int idx = val - sa->offset;

                if (idx >= 0 && idx < sa->nr_entries) {
                        if (sa->entries[idx] == NULL)
                                break;
                        return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
                }
        }

        printed = scnprintf(bf, size, intfmt, val);
        if (show_prefix)
                printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
        return printed;
}

size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
                                        struct syscall_arg *arg)
{
        return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
}

#ifndef AT_FDCWD
#define AT_FDCWD        -100
#endif

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
                                           struct syscall_arg *arg)
{
        int fd = arg->val;
        const char *prefix = "AT_FD";

        if (fd == AT_FDCWD)
                return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");

        return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
{
        if (arg->val == 0)
                return scnprintf(bf, size, "NULL");
        return syscall_arg__scnprintf_hex(bf, size, arg);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%ld", arg->val);
}

static const char *bpf_cmd[] = {
        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
        "MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd, "BPF_");

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers, "ITIMER_");

static const char *keyctl_options[] = {
        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences, "SEEK_");

static const char *fcntl_cmds[] = {
        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
        "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
        "GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds, "F_");

static const char *fcntl_linux_specific_cmds[] = {
        "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
        "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
        "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
        &strarray__fcntl_cmds,
        &strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
        "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow, "SIG_");

static const char *clockid[] = {
        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid, "CLOCK_");

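/*
 * The P_MODE/P_FLAG style printers below all follow the same pattern:
 * test each known bit, print its name, clear it from a local copy, and
 * finally dump whatever unknown bits remain in hex so that new kernel
 * flags are never silently dropped.
 */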
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
                                                 struct syscall_arg *arg)
{
        bool show_prefix = arg->show_string_prefix;
        const char *suffix = "_OK";
        size_t printed = 0;
        int mode = arg->val;

        if (mode == F_OK) /* 0 */
                return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
#define P_MODE(n) \
        if (mode & n##_OK) { \
                printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
                mode &= ~n##_OK; \
        }

        P_MODE(R);
        P_MODE(W);
        P_MODE(X);
#undef P_MODE

        if (mode)
                printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

        return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
                                                struct syscall_arg *arg)
{
        bool show_prefix = arg->show_string_prefix;
        const char *prefix = "O_";
        int printed = 0, flags = arg->val;

#define P_FLAG(n) \
        if (flags & O_##n) { \
                printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
                flags &= ~O_##n; \
        }

        P_FLAG(CLOEXEC);
        P_FLAG(NONBLOCK);
#undef P_FLAG

        if (flags)
                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

        return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK   0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM     0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
                                                   struct syscall_arg *arg)
{
        bool show_prefix = arg->show_string_prefix;
        const char *prefix = "GRND_";
        int printed = 0, flags = arg->val;

#define P_FLAG(n) \
        if (flags & GRND_##n) { \
                printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
                flags &= ~GRND_##n; \
        }

        P_FLAG(RANDOM);
        P_FLAG(NONBLOCK);
#undef P_FLAG

        if (flags)
                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

        return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

#define STRARRAY(name, array) \
          { .scnprintf  = SCA_STRARRAY, \
            .parm       = &strarray__##array, }

#include "trace/beauty/arch_errno_names.c"
#include "trace/beauty/eventfd.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/futex_val3.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"

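/*
 * Per-argument formatting hooks: 'scnprintf' pretty-prints the value,
 * 'mask_val' may rewrite it first (see SCAMV_MOUNT_FLAGS below), and
 * 'parm' carries printer-private data such as a strarray. Note that the
 * syscall_fmts[] table that follows is looked up with bsearch(), so it
 * must be kept sorted by syscall name.
 */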
struct syscall_arg_fmt {
        size_t     (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
        void       *parm;
        const char *name;
        bool       show_zero;
};

static struct syscall_fmt {
        const char *name;
        const char *alias;
        struct syscall_arg_fmt arg[6];
        u8         nr_args;
        bool       errpid;
        bool       timeout;
        bool       hexret;
} syscall_fmts[] = {
        { .name     = "access",
          .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
        { .name     = "arch_prctl",
          .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
                   [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
        { .name     = "bind",
          .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, },
        { .name     = "bpf",
          .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
        { .name     = "brk",        .hexret = true,
          .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
        { .name     = "clock_gettime",
          .arg = { [0] = STRARRAY(clk_id, clockid), }, },
        { .name     = "clone",      .errpid = true, .nr_args = 5,
          .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
                   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
                   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
                   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
                   [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
        { .name     = "close",
          .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
        { .name     = "connect",
          .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, }, },
        { .name     = "epoll_ctl",
          .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
        { .name     = "eventfd2",
          .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
        { .name     = "fchmodat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
        { .name     = "fchownat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
        { .name     = "fcntl",
          .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
                           .parm      = &strarrays__fcntl_cmds_arrays,
                           .show_zero = true, },
                   [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
        { .name     = "flock",
          .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
        { .name     = "fstat", .alias = "newfstat", },
        { .name     = "fstatat", .alias = "newfstatat", },
        { .name     = "futex",
          .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
                   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
        { .name     = "futimesat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
        { .name     = "getitimer",
          .arg = { [0] = STRARRAY(which, itimers), }, },
        { .name     = "getpid",     .errpid = true, },
        { .name     = "getpgid",    .errpid = true, },
        { .name     = "getppid",    .errpid = true, },
        { .name     = "getrandom",
          .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
        { .name     = "getrlimit",
          .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
        { .name     = "gettid",     .errpid = true, },
        { .name     = "ioctl",
          .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
                   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
                   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
                   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
        { .name     = "kcmp",       .nr_args = 5,
          .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
                   [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
                   [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
                   [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
                   [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
        { .name     = "keyctl",
          .arg = { [0] = STRARRAY(option, keyctl_options), }, },
        { .name     = "kill",
          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
        { .name     = "linkat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
        { .name     = "lseek",
          .arg = { [2] = STRARRAY(whence, whences), }, },
        { .name     = "lstat", .alias = "newlstat", },
        { .name     = "madvise",
          .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
                   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
        { .name     = "mkdirat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
        { .name     = "mknodat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
        { .name     = "mmap",       .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
        .alias = "old_mmap",
#endif
          .arg = { [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
                   [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ },
                   [5] = { .scnprintf = SCA_HEX,        /* offset */ }, }, },
        { .name     = "mount",
          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
                   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
                           .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
        { .name     = "mprotect",
          .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
        { .name     = "mq_unlink",
          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
        { .name     = "mremap",     .hexret = true,
          .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
        { .name     = "name_to_handle_at",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
        { .name     = "newfstatat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
        { .name     = "open",
          .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
        { .name     = "open_by_handle_at",
          .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
                   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
        { .name     = "openat",
          .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
                   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
        { .name     = "perf_event_open",
          .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
                   [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
                   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
        { .name     = "pipe2",
          .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
        { .name     = "pkey_alloc",
          .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
        { .name     = "pkey_free",
          .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
        { .name     = "pkey_mprotect",
          .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
                   [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
        { .name     = "poll", .timeout = true, },
        { .name     = "ppoll", .timeout = true, },
        { .name     = "prctl",
          .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
                   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
                   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
        { .name     = "pread", .alias = "pread64", },
        { .name     = "preadv", .alias = "pread", },
        { .name     = "prlimit64",
          .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
        { .name     = "pwrite", .alias = "pwrite64", },
        { .name     = "readlinkat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
        { .name     = "recvfrom",
          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
        { .name     = "recvmmsg",
          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
        { .name     = "recvmsg",
          .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
        { .name     = "renameat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
                   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
        { .name     = "renameat2",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
                   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
                   [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
        { .name     = "rt_sigaction",
          .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
        { .name     = "rt_sigprocmask",
          .arg = { [0] = STRARRAY(how, sighow), }, },
        { .name     = "rt_sigqueueinfo",
          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
        { .name     = "rt_tgsigqueueinfo",
          .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
        { .name     = "sched_setscheduler",
          .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
        { .name     = "seccomp",
          .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
                   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
        { .name     = "select", .timeout = true, },
        { .name     = "sendmmsg",
          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
        { .name     = "sendmsg",
          .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
        { .name     = "sendto",
          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
                   [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
        { .name     = "set_tid_address", .errpid = true, },
        { .name     = "setitimer",
          .arg = { [0] = STRARRAY(which, itimers), }, },
        { .name     = "setrlimit",
          .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
        { .name     = "socket",
          .arg = { [0] = STRARRAY(family, socket_families),
                   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
                   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
        { .name     = "socketpair",
          .arg = { [0] = STRARRAY(family, socket_families),
                   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
                   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
        { .name     = "stat", .alias = "newstat", },
        { .name     = "statx",
          .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
                   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ },
                   [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
        { .name     = "swapoff",
          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
        { .name     = "swapon",
          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
        { .name     = "symlinkat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
        { .name     = "tgkill",
          .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
        { .name     = "tkill",
          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
        { .name     = "umount2", .alias = "umount",
          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
        { .name     = "uname", .alias = "newuname", },
        { .name     = "unlinkat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
        { .name     = "utimensat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
        { .name     = "wait4",      .errpid = true,
          .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
        { .name     = "waitid",     .errpid = true,
          .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
        const struct syscall_fmt *fmt = fmtp;
        return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
        const int nmemb = ARRAY_SIZE(syscall_fmts);
        return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}

static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
{
        int i, nmemb = ARRAY_SIZE(syscall_fmts);

        for (i = 0; i < nmemb; ++i) {
                if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0)
                        return &syscall_fmts[i];
        }

        return NULL;
}

/*
 * is_exit: is this "exit" or "exit_group"?
 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
 */
struct syscall {
        struct tep_event    *tp_format;
        int                 nr_args;
        int                 args_size;
        bool                is_exit;
        bool                is_open;
        struct tep_format_field *args;
        const char          *name;
        struct syscall_fmt  *fmt;
        struct syscall_arg_fmt *arg_fmt;
};

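/*
 * Value layout for the trace->syscalls.map BPF map: one entry per syscall
 * id, with 'enabled' toggled so filtering can happen in the kernel instead
 * of copying every event to userspace.
 */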
struct bpf_map_syscall_entry {
        bool    enabled;
};

/*
 * We need this 'calculated' boolean because in some cases we really
 * don't know the duration of a syscall, for instance, when we start
 * a session and some threads are already waiting for a syscall to finish,
 * say 'poll', in which case all we can do is print "( ? )" for the
 * duration and the start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
        double duration = (double)t / NSEC_PER_MSEC;
        size_t printed = fprintf(fp, "(");

        if (!calculated)
                printed += fprintf(fp, "         ");
        else if (duration >= 1.0)
                printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
        else if (duration >= 0.01)
                printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
        else
                printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
        return printed + fprintf(fp, "): ");
}

/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
        u64               entry_time;
        bool              entry_pending;
        unsigned long     nr_events;
        unsigned long     pfmaj, pfmin;
        char              *entry_str;
        double            runtime_ms;
        size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
                unsigned long ptr;
                short int     entry_str_pos;
                bool          pending_open;
                unsigned int  namelen;
                char          *name;
        } filename;
        struct {
                int           max;
                struct file   *table;
        } files;

        struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
        struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

        if (ttrace) {
                ttrace->files.max = -1;
                ttrace->syscall_stats = intlist__new(NULL);
        }

        return ttrace;
}

static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
        struct thread_trace *ttrace;

        if (thread == NULL)
                goto fail;

        if (thread__priv(thread) == NULL)
                thread__set_priv(thread, thread_trace__new());

        if (thread__priv(thread) == NULL)
                goto fail;

        ttrace = thread__priv(thread);
        ++ttrace->nr_events;

        return ttrace;
fail:
        color_fprintf(fp, PERF_COLOR_RED,
                      "WARNING: not enough memory, dropping samples!\n");
        return NULL;
}


void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
                                    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
        struct thread_trace *ttrace = thread__priv(arg->thread);

        ttrace->ret_scnprintf = ret_scnprintf;
}

#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

static const size_t trace__entry_str_size = 2048;

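/*
 * Grow the per-thread fd -> struct file table on demand: realloc it to
 * cover 'fd', zero only the newly added slots and remember the new
 * high-water mark in files.max.
 */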
static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
{
        if (fd > ttrace->files.max) {
                struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));

                if (nfiles == NULL)
                        return NULL;

                if (ttrace->files.max != -1) {
                        memset(nfiles + ttrace->files.max + 1, 0,
                               (fd - ttrace->files.max) * sizeof(struct file));
                } else {
                        memset(nfiles, 0, (fd + 1) * sizeof(struct file));
                }

                ttrace->files.table = nfiles;
                ttrace->files.max   = fd;
        }

        return ttrace->files.table + fd;
}

struct file *thread__files_entry(struct thread *thread, int fd)
{
        return thread_trace__files_entry(thread__priv(thread), fd);
}

static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
        struct thread_trace *ttrace = thread__priv(thread);
        struct file *file = thread_trace__files_entry(ttrace, fd);

        if (file != NULL) {
                struct stat st;
                if (stat(pathname, &st) == 0)
                        file->dev_maj = major(st.st_rdev);
                file->pathname = strdup(pathname);
                if (file->pathname)
                        return 0;
        }

        return -1;
}

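/*
 * Resolve an fd to a pathname by readlink()ing /proc/<pid>/fd/<fd> (or the
 * per-task variant for non-leader threads) and cache the result in the
 * files table; only done for live sessions, see thread__fd_path() below.
 */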
static int thread__read_fd_path(struct thread *thread, int fd)
{
        char linkname[PATH_MAX], pathname[PATH_MAX];
        struct stat st;
        int ret;

        if (thread->pid_ == thread->tid) {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/fd/%d", thread->pid_, fd);
        } else {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
        }

        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
                return -1;

        ret = readlink(linkname, pathname, sizeof(pathname));

        if (ret < 0 || ret > st.st_size)
                return -1;

        pathname[ret] = '\0';
        return trace__set_fd_pathname(thread, fd, pathname);
}

static const char *thread__fd_path(struct thread *thread, int fd,
                                   struct trace *trace)
{
        struct thread_trace *ttrace = thread__priv(thread);

        if (ttrace == NULL)
                return NULL;

        if (fd < 0)
                return NULL;

        if (fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL) {
                if (!trace->live)
                        return NULL;
                ++trace->stats.proc_getname;
                if (thread__read_fd_path(thread, fd))
                        return NULL;
        }

        return ttrace->files.table[fd].pathname;
}

size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
        int fd = arg->val;
        size_t printed = scnprintf(bf, size, "%d", fd);
        const char *path = thread__fd_path(arg->thread, fd, arg->trace);

        if (path)
                printed += scnprintf(bf + printed, size - printed, "<%s>", path);

        return printed;
}

size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
{
        size_t printed = scnprintf(bf, size, "%d", fd);
        struct thread *thread = machine__find_thread(trace->host, pid, pid);

        if (thread) {
                const char *path = thread__fd_path(thread, fd, trace);

                if (path)
                        printed += scnprintf(bf + printed, size - printed, "<%s>", path);

                thread__put(thread);
        }

        return printed;
}

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        int fd = arg->val;
        size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
        struct thread_trace *ttrace = thread__priv(arg->thread);

        if (ttrace && fd >= 0 && fd <= ttrace->files.max)
                zfree(&ttrace->files.table[fd].pathname);

        return printed;
}

static void thread__set_filename_pos(struct thread *thread, const char *bf,
                                     unsigned long ptr)
{
        struct thread_trace *ttrace = thread__priv(thread);

        ttrace->filename.ptr = ptr;
        ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}

static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
{
        struct augmented_arg *augmented_arg = arg->augmented.args;

        return scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
}

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        unsigned long ptr = arg->val;

        if (arg->augmented.args)
                return syscall_arg__scnprintf_augmented_string(arg, bf, size);

        if (!arg->trace->vfs_getname)
                return scnprintf(bf, size, "%#lx", ptr);

        thread__set_filename_pos(arg->thread, bf, ptr);
        return 0;
}

static bool trace__filter_duration(struct trace *trace, double t)
{
        return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
        double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

        return fprintf(fp, "%10.3f ", ts);
}

/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before the tracing
 * session starts, or a sys_enter lost to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
        if (tstamp > 0)
                return __trace__fprintf_tstamp(trace, tstamp, fp);

        return fprintf(fp, "         ? ");
}

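/*
 * Signal handler state: 'done' asks the main event loop to wind down,
 * while 'interrupted' records whether the stop came from SIGINT (^C)
 * rather than, say, the traced workload exiting.
 */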
static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}

static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
{
        size_t printed = 0;

        if (trace->multiple_threads) {
                if (trace->show_comm)
                        printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
                printed += fprintf(fp, "%d ", thread->tid);
        }

        return printed;
}

static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
                                        u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
        size_t printed = 0;

        if (trace->show_tstamp)
                printed = trace__fprintf_tstamp(trace, tstamp, fp);
        if (trace->show_duration)
                printed += fprintf_duration(duration, duration_calculated, fp);
        return printed + trace__fprintf_comm_tid(trace, thread, fp);
}

static int trace__process_event(struct trace *trace, struct machine *machine,
                                union perf_event *event, struct perf_sample *sample)
{
        int ret = 0;

        switch (event->header.type) {
        case PERF_RECORD_LOST:
                color_fprintf(trace->output, PERF_COLOR_RED,
                              "LOST %" PRIu64 " events!\n", event->lost.lost);
                ret = machine__process_lost_event(machine, event, sample);
                break;
        default:
                ret = machine__process_event(machine, event, sample);
                break;
        }

        return ret;
}

static int trace__tool_process(struct perf_tool *tool,
                               union perf_event *event,
                               struct perf_sample *sample,
                               struct machine *machine)
{
        struct trace *trace = container_of(tool, struct trace, tool);
        return trace__process_event(trace, machine, event, sample);
}

static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
        struct machine *machine = vmachine;

        if (machine->kptr_restrict_warned)
                return NULL;

        if (symbol_conf.kptr_restrict) {
                pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
                           "Check /proc/sys/kernel/kptr_restrict.\n\n"
                           "Kernel samples will not be resolved.\n");
                machine->kptr_restrict_warned = true;
                return NULL;
        }

        return machine__resolve_kernel_addr(vmachine, addrp, modp);
}

static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
        int err = symbol__init(NULL);

        if (err)
                return err;

        trace->host = machine__new_host();
        if (trace->host == NULL)
                return -ENOMEM;

        err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
        if (err < 0)
                goto out;

        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
                                            evlist->threads, trace__tool_process, false,
                                            1);
out:
        if (err)
                symbol__exit();

        return err;
}

static void trace__symbols__exit(struct trace *trace)
{
        machine__exit(trace->host);
        trace->host = NULL;

        symbol__exit();
}

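/*
 * Build the per-argument formatter table for a syscall:
 * syscall__alloc_arg_fmts() copies any hand-written formatters from
 * syscall_fmts[], then syscall__set_arg_fmts() fills the gaps
 * heuristically from the tracepoint format: filename-ish "const char *"
 * args get SCA_FILENAME, pointers SCA_PTR, pid_t SCA_PID, umode_t
 * SCA_MODE_T, and integer fields whose name ends in "fd" get SCA_FD.
 */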
static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
{
        int idx;

        if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
                nr_args = sc->fmt->nr_args;

        sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
        if (sc->arg_fmt == NULL)
                return -1;

        for (idx = 0; idx < nr_args; ++idx) {
                if (sc->fmt)
                        sc->arg_fmt[idx] = sc->fmt->arg[idx];
        }

        sc->nr_args = nr_args;
        return 0;
}

static int syscall__set_arg_fmts(struct syscall *sc)
{
        struct tep_format_field *field, *last_field = NULL;
        int idx = 0, len;

        for (field = sc->args; field; field = field->next, ++idx) {
                last_field = field;

                if (sc->fmt && sc->fmt->arg[idx].scnprintf)
                        continue;

                if (strcmp(field->type, "const char *") == 0 &&
                         (strcmp(field->name, "filename") == 0 ||
                          strcmp(field->name, "path") == 0 ||
                          strcmp(field->name, "pathname") == 0))
                        sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
                else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
                        sc->arg_fmt[idx].scnprintf = SCA_PTR;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_fmt[idx].scnprintf = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
                        sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
                else if ((strcmp(field->type, "int") == 0 ||
                          strcmp(field->type, "unsigned int") == 0 ||
                          strcmp(field->type, "long") == 0) &&
                         (len = strlen(field->name)) >= 2 &&
                         strcmp(field->name + len - 2, "fd") == 0) {
                        /*
                         * /sys/kernel/tracing/events/syscalls/sys_enter*
                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
                         * 65 int
                         * 23 unsigned int
                         * 7 unsigned long
                         */
                        sc->arg_fmt[idx].scnprintf = SCA_FD;
                }
        }

        if (last_field)
                sc->args_size = last_field->offset + last_field->size;

        return 0;
}

static int trace__read_syscall_info(struct trace *trace, int id)
{
        char tp_name[128];
        struct syscall *sc;
        const char *name = syscalltbl__name(trace->sctbl, id);

        if (name == NULL)
                return -1;

        if (id > trace->syscalls.max) {
                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

                if (nsyscalls == NULL)
                        return -1;

                if (trace->syscalls.max != -1) {
                        memset(nsyscalls + trace->syscalls.max + 1, 0,
                               (id - trace->syscalls.max) * sizeof(*sc));
                } else {
                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
                }

                trace->syscalls.table = nsyscalls;
                trace->syscalls.max   = id;
        }

        sc = trace->syscalls.table + id;
        sc->name = name;

        sc->fmt  = syscall_fmt__find(sc->name);

        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
        sc->tp_format = trace_event__tp_format("syscalls", tp_name);

        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
        }

        if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
                return -1;

        if (IS_ERR(sc->tp_format))
                return -1;

        sc->args = sc->tp_format->format.fields;
1454        /*
1455         * The first field is the syscall number: '__syscall_nr' on recent
1456         * kernels, 'nr' on older ones. We already know the id, so the
1457         * field is redundant here; skip it when present.
1458         */
1459        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1460                sc->args = sc->args->next;
1461                --sc->nr_args;
1462        }
1463
1464        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1465        sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1466
1467        return syscall__set_arg_fmts(sc);
1468}
1469
1470static int trace__validate_ev_qualifier(struct trace *trace)
1471{
1472        int err = 0, i;
1473        size_t nr_allocated;
1474        struct str_node *pos;
1475
1476        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1477        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1478                                                 sizeof(trace->ev_qualifier_ids.entries[0]));
1479
1480        if (trace->ev_qualifier_ids.entries == NULL) {
1481                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1482                       trace->output);
1483                err = -EINVAL;
1484                goto out;
1485        }
1486
1487        nr_allocated = trace->ev_qualifier_ids.nr;
1488        i = 0;
1489
1490        strlist__for_each_entry(pos, trace->ev_qualifier) {
1491                const char *sc = pos->s;
1492                int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1493
1494                if (id < 0) {
1495                        id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1496                        if (id >= 0)
1497                                goto matches;
1498
1499                        if (err == 0) {
1500                                fputs("Error:\tInvalid syscall ", trace->output);
1501                                err = -EINVAL;
1502                        } else {
1503                                fputs(", ", trace->output);
1504                        }
1505
1506                        fputs(sc, trace->output);
1507                }
1508matches:
1509                trace->ev_qualifier_ids.entries[i++] = id;
1510                if (match_next == -1)
1511                        continue;
1512
1513                while (1) {
1514                        id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1515                        if (id < 0)
1516                                break;
1517                        if (nr_allocated == trace->ev_qualifier_ids.nr) {
1518                                void *entries;
1519
1520                                nr_allocated += 8;
1521                                entries = realloc(trace->ev_qualifier_ids.entries,
1522                                                  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1523                                if (entries == NULL) {
1524                                        err = -ENOMEM;
1525                                        fputs("\nError:\tNot enough memory for parsing\n", trace->output);
1526                                        goto out_free;
1527                                }
1528                                trace->ev_qualifier_ids.entries = entries;
1529                        }
1530                        trace->ev_qualifier_ids.nr++;
1531                        trace->ev_qualifier_ids.entries[i++] = id;
1532                }
1533        }
1534
1535        if (err < 0) {
1536                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1537                      "\nHint:\tand: 'man syscalls'\n", trace->output);
1538out_free:
1539                zfree(&trace->ev_qualifier_ids.entries);
1540                trace->ev_qualifier_ids.nr = 0;
1541        }
1542out:
1543        return err;
1544}
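
    /*
     * Example (illustrative): with '-e open*,close' the qualifier list has
     * two entries; "close" resolves directly via syscalltbl__id(), while
     * "open*" glob-matches open, openat, open_by_handle_at, etc., the ids
     * array growing in chunks of 8 entries as the matches accumulate.
     */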
1545
1546/*
1547 * args is to be interpreted as a series of longs, but we need to handle
1548 * 8-byte unaligned accesses: args points to raw_data within the event,
1549 * and raw_data is not guaranteed to be 8-byte aligned because it is
1550 * preceded by raw_size, which is a u32. So we copy args to a temporary
1551 * variable before reading it, which most notably avoids extended load
1552 * instructions on unaligned addresses.
1553 */
1554unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1555{
1556        unsigned long val;
1557        unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1558
1559        memcpy(&val, p, sizeof(val));
1560        return val;
1561}
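
    /*
     * The memcpy() above is the portable way of doing a possibly-unaligned
     * load: the compiler lowers it to whatever access pattern is safe for
     * the target, rather than a single 8-byte load that may trap or be slow
     * on architectures with strict alignment requirements.
     */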
1562
1563static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1564                                      struct syscall_arg *arg)
1565{
1566        if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1567                return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1568
1569        return scnprintf(bf, size, "arg%d: ", arg->idx);
1570}
1571
1572/*
1573 * Check if the value is in fact zero, i.e. mask whatever needs masking,
1574 * such as the mount 'flags' argument, where a magic flag needs to be
1575 * ignored; see the comment in tools/perf/trace/beauty/mount_flags.c.
1576 */
1577static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val)
1578{
1579        if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val)
1580                return sc->arg_fmt[arg->idx].mask_val(arg, val);
1581
1582        return val;
1583}
1584
1585static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1586                                     struct syscall_arg *arg, unsigned long val)
1587{
1588        if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1589                arg->val = val;
1590                if (sc->arg_fmt[arg->idx].parm)
1591                        arg->parm = sc->arg_fmt[arg->idx].parm;
1592                return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1593        }
1594        return scnprintf(bf, size, "%ld", val);
1595}
1596
1597static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1598                                      unsigned char *args, void *augmented_args, int augmented_args_size,
1599                                      struct trace *trace, struct thread *thread)
1600{
1601        size_t printed = 0;
1602        unsigned long val;
1603        u8 bit = 1;
1604        struct syscall_arg arg = {
1605                .args   = args,
1606                .augmented = {
1607                        .size = augmented_args_size,
1608                        .args = augmented_args,
1609                },
1610                .idx    = 0,
1611                .mask   = 0,
1612                .trace  = trace,
1613                .thread = thread,
1614                .show_string_prefix = trace->show_string_prefix,
1615        };
1616        struct thread_trace *ttrace = thread__priv(thread);
1617
1618        /*
1619         * Things like fcntl will set this in its 'cmd' formatter to pick the
1620         * right formatter for the return value (an fd? file flags?), which is
1621         * not needed for syscalls that always return a given type, say an fd.
1622         */
1623        ttrace->ret_scnprintf = NULL;
1624
1625        if (sc->args != NULL) {
1626                struct tep_format_field *field;
1627
1628                for (field = sc->args; field;
1629                     field = field->next, ++arg.idx, bit <<= 1) {
1630                        if (arg.mask & bit)
1631                                continue;
1632
1633                        val = syscall_arg__val(&arg, arg.idx);
1634                        /*
1635                         * Some syscall args need some mask, most don't and
1636                         * return val untouched.
1637                         */
1638                        val = syscall__mask_val(sc, &arg, val);
1639
1640                        /*
1641                         * Suppress this argument if its value is zero
1642                         * and we don't have a string associated with it
1643                         * in a strarray.
1644                         */
1645                        if (val == 0 &&
1646                            !trace->show_zeros &&
1647                            !(sc->arg_fmt &&
1648                              (sc->arg_fmt[arg.idx].show_zero ||
1649                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1650                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1651                              sc->arg_fmt[arg.idx].parm))
1652                                continue;
1653
1654                        printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
1655
1656                        if (trace->show_arg_names)
1657                                printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
1658
1659                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1660                }
1661        } else if (IS_ERR(sc->tp_format)) {
1662                /*
1663                 * If we managed to read the tracepoint /format file, then we
1664                 * may end up not having any args, like with gettid(), so only
1665                 * print the raw args when we didn't manage to read it.
1666                 */
1667                while (arg.idx < sc->nr_args) {
1668                        if (arg.mask & bit)
1669                                goto next_arg;
1670                        val = syscall_arg__val(&arg, arg.idx);
1671                        if (printed)
1672                                printed += scnprintf(bf + printed, size - printed, ", ");
1673                        printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1674                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1675next_arg:
1676                        ++arg.idx;
1677                        bit <<= 1;
1678                }
1679        }
1680
1681        return printed;
1682}
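
    /*
     * Illustrative result of the formatting above, for an openat entry with
     * argument names enabled (the surrounding "openat(" and ")" come from
     * the callers):
     *
     *   dfd: CWD, filename: "/etc/ld.so.cache", flags: RDONLY|CLOEXEC
     *
     * Zero-valued arguments without an associated strarray are suppressed
     * entirely.
     */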
1683
1684typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1685                                  union perf_event *event,
1686                                  struct perf_sample *sample);
1687
1688static struct syscall *trace__syscall_info(struct trace *trace,
1689                                           struct perf_evsel *evsel, int id)
1690{
1691
1692        if (id < 0) {
1693
1694                /*
1695                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1696                 * before that, leaving at a higher verbosity level till that is
1697                 * explained. Reproduced with plain ftrace with:
1698                 *
1699                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1700                 * grep "NR -1 " /t/trace_pipe
1701                 *
1702                 * After generating some load on the machine.
1703                 */
1704                if (verbose > 1) {
1705                        static u64 n;
1706                        fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1707                                id, perf_evsel__name(evsel), ++n);
1708                }
1709                return NULL;
1710        }
1711
1712        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1713            trace__read_syscall_info(trace, id))
1714                goto out_cant_read;
1715
1716        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1717                goto out_cant_read;
1718
1719        return &trace->syscalls.table[id];
1720
1721out_cant_read:
1722        if (verbose > 0) {
1723                fprintf(trace->output, "Problems reading syscall %d", id);
1724                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1725                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1726                fputs(" information\n", trace->output);
1727        }
1728        return NULL;
1729}
1730
1731static void thread__update_stats(struct thread_trace *ttrace,
1732                                 int id, struct perf_sample *sample)
1733{
1734        struct int_node *inode;
1735        struct stats *stats;
1736        u64 duration = 0;
1737
1738        inode = intlist__findnew(ttrace->syscall_stats, id);
1739        if (inode == NULL)
1740                return;
1741
1742        stats = inode->priv;
1743        if (stats == NULL) {
1744                stats = malloc(sizeof(struct stats));
1745                if (stats == NULL)
1746                        return;
1747                init_stats(stats);
1748                inode->priv = stats;
1749        }
1750
1751        if (ttrace->entry_time && sample->time > ttrace->entry_time)
1752                duration = sample->time - ttrace->entry_time;
1753
1754        update_stats(stats, duration);
1755}
1756
1757static int trace__printf_interrupted_entry(struct trace *trace)
1758{
1759        struct thread_trace *ttrace;
1760        size_t printed;
1761        int len;
1762
1763        if (trace->failure_only || trace->current == NULL)
1764                return 0;
1765
1766        ttrace = thread__priv(trace->current);
1767
1768        if (!ttrace->entry_pending)
1769                return 0;
1770
1771        printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1772        printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);
1773
1774        if (len < trace->args_alignment - 4)
1775                printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
1776
1777        printed += fprintf(trace->output, " ...\n");
1778
1779        ttrace->entry_pending = false;
1780        ++trace->nr_events_printed;
1781
1782        return printed;
1783}
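
    /*
     * Illustrative output: when a thread is still inside a syscall as some
     * other event gets printed, its pending entry is finished off with
     * " ...", e.g.:
     *
     *   1034.926 sleep/2931 nanosleep(rqtp: 0x7ffd4e2b4f70)          ...
     *
     * and the eventual sys_exit shows up as a "... [continued]" line.
     */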
1784
1785static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1786                                 struct perf_sample *sample, struct thread *thread)
1787{
1788        int printed = 0;
1789
1790        if (trace->print_sample) {
1791                double ts = (double)sample->time / NSEC_PER_MSEC;
1792
1793                printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1794                                   perf_evsel__name(evsel), ts,
1795                                   thread__comm_str(thread),
1796                                   sample->pid, sample->tid, sample->cpu);
1797        }
1798
1799        return printed;
1800}
1801
1802static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
1803{
1804        void *augmented_args = NULL;
1805        /*
1806         * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
1807         * and there we get all 6 syscall args plus the tracepoint common fields
1808         * that get calculated at the start and the syscall_nr (another long).
1809         * So we check if that is the case and, if so, look not after
1810         * sc->args_size but after the full raw_syscalls:sys_enter payload,
1811         * which has a fixed size.
1812         *
1813         * We'll revisit this later to pass sc->args_size to the BPF augmenter
1814         * (now tools/perf/examples/bpf/augmented_raw_syscalls.c), so that it
1815         * copies only what we need for each syscall, as happens when we use
1816         * syscalls:sys_enter_NAME, reducing the kernel/userspace traffic to
1817         * just what is needed for each syscall.
1818         */
1819        int args_size = raw_augmented_args_size ?: sc->args_size;
1820
1821        *augmented_args_size = sample->raw_size - args_size;
1822        if (*augmented_args_size > 0)
1823                augmented_args = sample->raw_data + args_size;
1824
1825        return augmented_args;
1826}
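
    /*
     * Sketch (illustrative) of what this computes for the raw_augmented
     * case:
     *
     *   [ fixed raw_syscalls:sys_enter payload ][ augmented data, e.g. a filename ]
     *
     * Subtracting the fixed part's size from raw_size leaves the size of
     * whatever the BPF program appended, if anything.
     */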
1827
1828static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1829                            union perf_event *event __maybe_unused,
1830                            struct perf_sample *sample)
1831{
1832        char *msg;
1833        void *args;
1834        int printed = 0;
1835        struct thread *thread;
1836        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1837        int augmented_args_size = 0;
1838        void *augmented_args = NULL;
1839        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1840        struct thread_trace *ttrace;
1841
1842        if (sc == NULL)
1843                return -1;
1844
1845        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1846        ttrace = thread__trace(thread, trace->output);
1847        if (ttrace == NULL)
1848                goto out_put;
1849
1850        trace__fprintf_sample(trace, evsel, sample, thread);
1851
1852        args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1853
1854        if (ttrace->entry_str == NULL) {
1855                ttrace->entry_str = malloc(trace__entry_str_size);
1856                if (!ttrace->entry_str)
1857                        goto out_put;
1858        }
1859
1860        if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1861                trace__printf_interrupted_entry(trace);
1862        /*
1863         * If this is raw_syscalls.sys_enter, then it always comes with all 6
1864         * possible arguments, even if the syscall being handled, say "openat",
1865         * uses only 4. That breaks the syscall__augmented_args() check, since
1866         * we calculate syscall->args_size from each syscalls:sys_enter_NAME
1867         * tracefs format file: handling openat we would get 6 args where we
1868         * expected just 4, mistakenly treating the extra 2 u64 args as the
1869         * augmented filename. So check here and avoid using augmented args
1870         * when the evsel is the raw_syscalls one.
1871         */
1872        if (evsel != trace->syscalls.events.sys_enter)
1873                augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
1874        ttrace->entry_time = sample->time;
1875        msg = ttrace->entry_str;
1876        printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1877
1878        printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1879                                           args, augmented_args, augmented_args_size, trace, thread);
1880
1881        if (sc->is_exit) {
1882                if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1883                        int alignment = 0;
1884
1885                        trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1886                        printed = fprintf(trace->output, "%s)", ttrace->entry_str);
1887                        if (trace->args_alignment > printed)
1888                                alignment = trace->args_alignment - printed;
1889                        fprintf(trace->output, "%*s= ?\n", alignment, " ");
1890                }
1891        } else {
1892                ttrace->entry_pending = true;
1893                /* See trace__vfs_getname & trace__sys_exit */
1894                ttrace->filename.pending_open = false;
1895        }
1896
1897        if (trace->current != thread) {
1898                thread__put(trace->current);
1899                trace->current = thread__get(thread);
1900        }
1901        err = 0;
1902out_put:
1903        thread__put(thread);
1904        return err;
1905}
1906
1907static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1908                                    struct perf_sample *sample)
1909{
1910        struct thread_trace *ttrace;
1911        struct thread *thread;
1912        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1913        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1914        char msg[1024];
1915        void *args, *augmented_args = NULL;
1916        int augmented_args_size;
1917
1918        if (sc == NULL)
1919                return -1;
1920
1921        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1922        ttrace = thread__trace(thread, trace->output);
1923        /*
1924         * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
1925         * and the rest of the beautifiers touch it via struct syscall_arg.
1926         */
1927        if (ttrace == NULL)
1928                goto out_put;
1929
1930        args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1931        augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
1932        syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
1933        fprintf(trace->output, "%s", msg);
1934        err = 0;
1935out_put:
1936        thread__put(thread);
1937        return err;
1938}
1939
1940static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1941                                    struct perf_sample *sample,
1942                                    struct callchain_cursor *cursor)
1943{
1944        struct addr_location al;
1945        int max_stack = evsel->attr.sample_max_stack ?
1946                        evsel->attr.sample_max_stack :
1947                        trace->max_stack;
1948        int err;
1949
1950        if (machine__resolve(trace->host, &al, sample) < 0)
1951                return -1;
1952
1953        err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
1954        addr_location__put(&al);
1955        return err;
1956}
1957
1958static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1959{
1960        /* TODO: user-configurable print_opts */
1961        const unsigned int print_opts = EVSEL__PRINT_SYM |
1962                                        EVSEL__PRINT_DSO |
1963                                        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1964
1965        return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1966}
1967
1968static const char *errno_to_name(struct perf_evsel *evsel, int err)
1969{
1970        struct perf_env *env = perf_evsel__env(evsel);
1971        const char *arch_name = perf_env__arch(env);
1972
1973        return arch_syscalls__strerrno(arch_name, err);
1974}
1975
1976static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1977                           union perf_event *event __maybe_unused,
1978                           struct perf_sample *sample)
1979{
1980        long ret;
1981        u64 duration = 0;
1982        bool duration_calculated = false;
1983        struct thread *thread;
1984        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
1985        int alignment = trace->args_alignment;
1986        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1987        struct thread_trace *ttrace;
1988
1989        if (sc == NULL)
1990                return -1;
1991
1992        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1993        ttrace = thread__trace(thread, trace->output);
1994        if (ttrace == NULL)
1995                goto out_put;
1996
1997        trace__fprintf_sample(trace, evsel, sample, thread);
1998
1999        if (trace->summary)
2000                thread__update_stats(ttrace, id, sample);
2001
2002        ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2003
2004        if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
2005                trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2006                ttrace->filename.pending_open = false;
2007                ++trace->stats.vfs_getname;
2008        }
2009
2010        if (ttrace->entry_time) {
2011                duration = sample->time - ttrace->entry_time;
2012                if (trace__filter_duration(trace, duration))
2013                        goto out;
2014                duration_calculated = true;
2015        } else if (trace->duration_filter)
2016                goto out;
2017
2018        if (sample->callchain) {
2019                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2020                if (callchain_ret == 0) {
2021                        if (callchain_cursor.nr < trace->min_stack)
2022                                goto out;
2023                        callchain_ret = 1;
2024                }
2025        }
2026
2027        if (trace->summary_only || (ret >= 0 && trace->failure_only))
2028                goto out;
2029
2030        trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
2031
2032        if (ttrace->entry_pending) {
2033                printed = fprintf(trace->output, "%s", ttrace->entry_str);
2034        } else {
2035                printed += fprintf(trace->output, " ... [");
2036                color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2037                printed += 9;
2038                printed += fprintf(trace->output, "]: %s()", sc->name);
2039        }
2040
2041        printed++; /* the closing ')' */
2042
2043        if (alignment > printed)
2044                alignment -= printed;
2045        else
2046                alignment = 0;
2047
2048        fprintf(trace->output, ")%*s= ", alignment, " ");
2049
2050        if (sc->fmt == NULL) {
2051                if (ret < 0)
2052                        goto errno_print;
2053signed_print:
2054                fprintf(trace->output, "%ld", ret);
2055        } else if (ret < 0) {
2056errno_print: {
2057                char bf[STRERR_BUFSIZE];
2058                const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
2059                           *e = errno_to_name(evsel, -ret);
2060
2061                fprintf(trace->output, "-1 %s (%s)", e, emsg);
2062        }
2063        } else if (ret == 0 && sc->fmt->timeout)
2064                fprintf(trace->output, "0 (Timeout)");
2065        else if (ttrace->ret_scnprintf) {
2066                char bf[1024];
2067                struct syscall_arg arg = {
2068                        .val    = ret,
2069                        .thread = thread,
2070                        .trace  = trace,
2071                };
2072                ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
2073                ttrace->ret_scnprintf = NULL;
2074                fprintf(trace->output, "%s", bf);
2075        } else if (sc->fmt->hexret)
2076                fprintf(trace->output, "%#lx", ret);
2077        else if (sc->fmt->errpid) {
2078                struct thread *child = machine__find_thread(trace->host, ret, ret);
2079
2080                if (child != NULL) {
2081                        fprintf(trace->output, "%ld", ret);
2082                        if (child->comm_set)
2083                                fprintf(trace->output, " (%s)", thread__comm_str(child));
2084                        thread__put(child);
2085                }
2086        } else
2087                goto signed_print;
2088
2089        fputc('\n', trace->output);
2090
2091        /*
2092         * For the sake of --max-events, an 'event' is a non-filtered
2093         * sys_enter + sys_exit pair, or any other tracepoint event.
2094         */
2095        if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
2096                interrupted = true;
2097
2098        if (callchain_ret > 0)
2099                trace__fprintf_callchain(trace, sample);
2100        else if (callchain_ret < 0)
2101                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2102out:
2103        ttrace->entry_pending = false;
2104        err = 0;
2105out_put:
2106        thread__put(thread);
2107        return err;
2108}
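
    /*
     * Illustrative renderings of the return value handling above: a failed
     * openat prints "= -1 ENOENT (No such file or directory)", a timeout
     * returning zero for a syscall with sc->fmt->timeout set prints
     * "= 0 (Timeout)", and a hexret syscall such as mmap prints "= 0x7f...".
     */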
2109
2110static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2111                              union perf_event *event __maybe_unused,
2112                              struct perf_sample *sample)
2113{
2114        struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2115        struct thread_trace *ttrace;
2116        size_t filename_len, entry_str_len, to_move;
2117        ssize_t remaining_space;
2118        char *pos;
2119        const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2120
2121        if (!thread)
2122                goto out;
2123
2124        ttrace = thread__priv(thread);
2125        if (!ttrace)
2126                goto out_put;
2127
2128        filename_len = strlen(filename);
2129        if (filename_len == 0)
2130                goto out_put;
2131
2132        if (ttrace->filename.namelen < filename_len) {
2133                char *f = realloc(ttrace->filename.name, filename_len + 1);
2134
2135                if (f == NULL)
2136                        goto out_put;
2137
2138                ttrace->filename.namelen = filename_len;
2139                ttrace->filename.name = f;
2140        }
2141
2142        strcpy(ttrace->filename.name, filename);
2143        ttrace->filename.pending_open = true;
2144
2145        if (!ttrace->filename.ptr)
2146                goto out_put;
2147
2148        entry_str_len = strlen(ttrace->entry_str);
2149        remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2150        if (remaining_space <= 0)
2151                goto out_put;
2152
2153        if (filename_len > (size_t)remaining_space) {
2154                filename += filename_len - remaining_space;
2155                filename_len = remaining_space;
2156        }
2157
2158        to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2159        pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2160        memmove(pos + filename_len, pos, to_move);
2161        memcpy(pos, filename, filename_len);
2162
2163        ttrace->filename.ptr = 0;
2164        ttrace->filename.entry_str_pos = 0;
2165out_put:
2166        thread__put(thread);
2167out:
2168        return 0;
2169}
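
    /*
     * Sketch of the splice above (illustrative): with entry_str holding
     * "openat(dfd: CWD, filename: " and entry_str_pos pointing just past
     * "filename: ", the pathname captured by probe:vfs_getname is inserted
     * right there, truncated from the left when it would not fit into
     * trace__entry_str_size.
     */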
2170
2171static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2172                                     union perf_event *event __maybe_unused,
2173                                     struct perf_sample *sample)
2174{
2175        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2176        double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2177        struct thread *thread = machine__findnew_thread(trace->host,
2178                                                        sample->pid,
2179                                                        sample->tid);
2180        struct thread_trace *ttrace = thread__trace(thread, trace->output);
2181
2182        if (ttrace == NULL)
2183                goto out_dump;
2184
2185        ttrace->runtime_ms += runtime_ms;
2186        trace->runtime_ms += runtime_ms;
2187out_put:
2188        thread__put(thread);
2189        return 0;
2190
2191out_dump:
2192        fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
2193               evsel->name,
2194               perf_evsel__strval(evsel, sample, "comm"),
2195               (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2196               runtime,
2197               perf_evsel__intval(evsel, sample, "vruntime"));
2198        goto out_put;
2199}
2200
2201static int bpf_output__printer(enum binary_printer_ops op,
2202                               unsigned int val, void *extra __maybe_unused, FILE *fp)
2203{
2204        unsigned char ch = (unsigned char)val;
2205
2206        switch (op) {
2207        case BINARY_PRINT_CHAR_DATA:
2208                return fprintf(fp, "%c", isprint(ch) ? ch : '.');
2209        case BINARY_PRINT_DATA_BEGIN:
2210        case BINARY_PRINT_LINE_BEGIN:
2211        case BINARY_PRINT_ADDR:
2212        case BINARY_PRINT_NUM_DATA:
2213        case BINARY_PRINT_NUM_PAD:
2214        case BINARY_PRINT_SEP:
2215        case BINARY_PRINT_CHAR_PAD:
2216        case BINARY_PRINT_LINE_END:
2217        case BINARY_PRINT_DATA_END:
2218        default:
2219                break;
2220        }
2221
2222        return 0;
2223}
2224
2225static void bpf_output__fprintf(struct trace *trace,
2226                                struct perf_sample *sample)
2227{
2228        binary__fprintf(sample->raw_data, sample->raw_size, 8,
2229                        bpf_output__printer, NULL, trace->output);
2230        ++trace->nr_events_printed;
2231}
2232
2233static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2234                                union perf_event *event __maybe_unused,
2235                                struct perf_sample *sample)
2236{
2237        struct thread *thread;
2238        int callchain_ret = 0;
2239        /*
2240         * Check if we called perf_evsel__disable(evsel) due to, for instance,
2241         * this event's max_events having been hit and this is an entry coming
2242         * from the ring buffer that we should discard, since the max events
2243         * have already been considered/printed.
2244         */
2245        if (evsel->disabled)
2246                return 0;
2247
2248        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2249
2250        if (sample->callchain) {
2251                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2252                if (callchain_ret == 0) {
2253                        if (callchain_cursor.nr < trace->min_stack)
2254                                goto out;
2255                        callchain_ret = 1;
2256                }
2257        }
2258
2259        trace__printf_interrupted_entry(trace);
2260        trace__fprintf_tstamp(trace, sample->time, trace->output);
2261
2262        if (trace->trace_syscalls && trace->show_duration)
2263                fprintf(trace->output, "(         ): ");
2264
2265        if (thread)
2266                trace__fprintf_comm_tid(trace, thread, trace->output);
2267
2268        if (evsel == trace->syscalls.events.augmented) {
2269                int id = perf_evsel__sc_tp_uint(evsel, id, sample);
2270                struct syscall *sc = trace__syscall_info(trace, evsel, id);
2271
2272                if (sc) {
2273                        fprintf(trace->output, "%s(", sc->name);
2274                        trace__fprintf_sys_enter(trace, evsel, sample);
2275                        fputc(')', trace->output);
2276                        goto newline;
2277                }
2278
2279                /*
2280                 * XXX: Not having the associated syscall info or not finding/adding
2281                 *      the thread should never happen, but if it does...
2282                 *      fall thru and print it as a bpf_output event.
2283                 */
2284        }
2285
2286        fprintf(trace->output, "%s:", evsel->name);
2287
2288        if (perf_evsel__is_bpf_output(evsel)) {
2289                bpf_output__fprintf(trace, sample);
2290        } else if (evsel->tp_format) {
2291                if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2292                    trace__fprintf_sys_enter(trace, evsel, sample)) {
2293                        event_format__fprintf(evsel->tp_format, sample->cpu,
2294                                              sample->raw_data, sample->raw_size,
2295                                              trace->output);
2296                        ++trace->nr_events_printed;
2297
2298                        if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
2299                                perf_evsel__disable(evsel);
2300                                perf_evsel__close(evsel);
2301                        }
2302                }
2303        }
2304
2305newline:
2306        fprintf(trace->output, "\n");
2307
2308        if (callchain_ret > 0)
2309                trace__fprintf_callchain(trace, sample);
2310        else if (callchain_ret < 0)
2311                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2312out:
2313        thread__put(thread);
2314        return 0;
2315}
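
    /*
     * Illustrative output for a non-syscall tracepoint event handled above:
     *
     *   1034.926 firefox/2931 sched:sched_wakeup: comm=Timer pid=2932 prio=120 target_cpu=002
     */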
2316
2317static void print_location(FILE *f, struct perf_sample *sample,
2318                           struct addr_location *al,
2319                           bool print_dso, bool print_sym)
2320{
2321
2322        if ((verbose > 0 || print_dso) && al->map)
2323                fprintf(f, "%s@", al->map->dso->long_name);
2324
2325        if ((verbose > 0 || print_sym) && al->sym)
2326                fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2327                        al->addr - al->sym->start);
2328        else if (al->map)
2329                fprintf(f, "0x%" PRIx64, al->addr);
2330        else
2331                fprintf(f, "0x%" PRIx64, sample->addr);
2332}
2333
2334static int trace__pgfault(struct trace *trace,
2335                          struct perf_evsel *evsel,
2336                          union perf_event *event __maybe_unused,
2337                          struct perf_sample *sample)
2338{
2339        struct thread *thread;
2340        struct addr_location al;
2341        char map_type = 'd';
2342        struct thread_trace *ttrace;
2343        int err = -1;
2344        int callchain_ret = 0;
2345
2346        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2347
2348        if (sample->callchain) {
2349                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2350                if (callchain_ret == 0) {
2351                        if (callchain_cursor.nr < trace->min_stack)
2352                                goto out_put;
2353                        callchain_ret = 1;
2354                }
2355        }
2356
2357        ttrace = thread__trace(thread, trace->output);
2358        if (ttrace == NULL)
2359                goto out_put;
2360
2361        if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2362                ttrace->pfmaj++;
2363        else
2364                ttrace->pfmin++;
2365
2366        if (trace->summary_only)
2367                goto out;
2368
2369        thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2370
2371        trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2372
2373        fprintf(trace->output, "%sfault [",
2374                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2375                "maj" : "min");
2376
2377        print_location(trace->output, sample, &al, false, true);
2378
2379        fprintf(trace->output, "] => ");
2380
2381        thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2382
2383        /*
2384         * Repeating the identical lookup cannot succeed where the first
2385         * call just failed, so when no map is found for the target address
2386         * simply mark the map type as unknown instead of retrying.
2387         */
2388        if (!al.map)
2389                map_type = '?';
2391
2392        print_location(trace->output, sample, &al, true, false);
2393
2394        fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2395
2396        if (callchain_ret > 0)
2397                trace__fprintf_callchain(trace, sample);
2398        else if (callchain_ret < 0)
2399                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2400
2401        ++trace->nr_events_printed;
2402out:
2403        err = 0;
2404out_put:
2405        thread__put(thread);
2406        return err;
2407}
2408
2409static void trace__set_base_time(struct trace *trace,
2410                                 struct perf_evsel *evsel,
2411                                 struct perf_sample *sample)
2412{
2413        /*
2414         * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2415         * and don't use sample->time unconditionally: we may end up having
2416         * some other event in the future without PERF_SAMPLE_TIME for a
2417         * good reason, i.e. we may not be interested in its timestamps,
2418         * just in it taking place, picking up some piece of information
2419         * when it appears in our event stream (vfs_getname comes to mind).
2420         */
2421        if (trace->base_time == 0 && !trace->full_time &&
2422            (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2423                trace->base_time = sample->time;
2424}
2425
2426static int trace__process_sample(struct perf_tool *tool,
2427                                 union perf_event *event,
2428                                 struct perf_sample *sample,
2429                                 struct perf_evsel *evsel,
2430                                 struct machine *machine __maybe_unused)
2431{
2432        struct trace *trace = container_of(tool, struct trace, tool);
2433        struct thread *thread;
2434        int err = 0;
2435
2436        tracepoint_handler handler = evsel->handler;
2437
2438        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2439        if (thread && thread__is_filtered(thread))
2440                goto out;
2441
2442        trace__set_base_time(trace, evsel, sample);
2443
2444        if (handler) {
2445                ++trace->nr_events;
2446                handler(trace, evsel, event, sample);
2447        }
2448out:
2449        thread__put(thread);
2450        return err;
2451}
2452
2453static int trace__record(struct trace *trace, int argc, const char **argv)
2454{
2455        unsigned int rec_argc, i, j;
2456        const char **rec_argv;
2457        const char * const record_args[] = {
2458                "record",
2459                "-R",
2460                "-m", "1024",
2461                "-c", "1",
2462        };
2463
2464        const char * const sc_args[] = { "-e", };
2465        unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2466        const char * const majpf_args[] = { "-e", "major-faults" };
2467        unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2468        const char * const minpf_args[] = { "-e", "minor-faults" };
2469        unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2470
2471        /* +1 is for the event string below */
2472        rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2473                majpf_args_nr + minpf_args_nr + argc;
2474        rec_argv = calloc(rec_argc + 1, sizeof(char *));
2475
2476        if (rec_argv == NULL)
2477                return -ENOMEM;
2478
2479        j = 0;
2480        for (i = 0; i < ARRAY_SIZE(record_args); i++)
2481                rec_argv[j++] = record_args[i];
2482
2483        if (trace->trace_syscalls) {
2484                for (i = 0; i < sc_args_nr; i++)
2485                        rec_argv[j++] = sc_args[i];
2486
2487                /* event string may be different for older kernels - e.g., RHEL6 */
2488                if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2489                        rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2490                else if (is_valid_tracepoint("syscalls:sys_enter"))
2491                        rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2492                else {
2493                        pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2494                        free(rec_argv);
2495                        return -1;
2496                }
2497        }
2498
2499        if (trace->trace_pgfaults & TRACE_PFMAJ)
2500                for (i = 0; i < majpf_args_nr; i++)
2501                        rec_argv[j++] = majpf_args[i];
2502
2503        if (trace->trace_pgfaults & TRACE_PFMIN)
2504                for (i = 0; i < minpf_args_nr; i++)
2505                        rec_argv[j++] = minpf_args[i];
2506
2507        for (i = 0; i < (unsigned int)argc; i++)
2508                rec_argv[j++] = argv[i];
2509
2510        return cmd_record(j, rec_argv);
2511}
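
    /*
     * Example (illustrative): 'perf trace record -- ls' ends up invoking
     * cmd_record() roughly as:
     *
     *   perf record -R -m 1024 -c 1 -e raw_syscalls:sys_enter,raw_syscalls:sys_exit ls
     */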
2512
2513static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2514
2515static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2516{
2517        bool found = false;
2518        struct perf_evsel *evsel, *tmp;
2519        struct parse_events_error err = { .idx = 0, };
2520        int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2521
2522        if (ret)
2523                return false;
2524
2525        evlist__for_each_entry_safe(evlist, evsel, tmp) {
2526                if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
2527                        continue;
2528
2529                if (perf_evsel__field(evsel, "pathname")) {
2530                        evsel->handler = trace__vfs_getname;
2531                        found = true;
2532                        continue;
2533                }
2534
2535                list_del_init(&evsel->node);
2536                evsel->evlist = NULL;
2537                perf_evsel__delete(evsel);
2538        }
2539
2540        return found;
2541}
2542
2543static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2544{
2545        struct perf_evsel *evsel;
2546        struct perf_event_attr attr = {
2547                .type = PERF_TYPE_SOFTWARE,
2548                .mmap_data = 1,
2549        };
2550
2551        attr.config = config;
2552        attr.sample_period = 1;
2553
2554        event_attr_init(&attr);
2555
2556        evsel = perf_evsel__new(&attr);
2557        if (evsel)
2558                evsel->handler = trace__pgfault;
2559
2560        return evsel;
2561}
2562
2563static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2564{
2565        const u32 type = event->header.type;
2566        struct perf_evsel *evsel;
2567
2568        if (type != PERF_RECORD_SAMPLE) {
2569                trace__process_event(trace, trace->host, event, sample);
2570                return;
2571        }
2572
2573        evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2574        if (evsel == NULL) {
2575                fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2576                return;
2577        }
2578
2579        trace__set_base_time(trace, evsel, sample);
2580
2581        if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2582            sample->raw_data == NULL) {
2583                fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2584                       perf_evsel__name(evsel), sample->tid,
2585                       sample->cpu, sample->raw_size);
2586        } else {
2587                tracepoint_handler handler = evsel->handler;
2588                handler(trace, evsel, event, sample);
2589        }
2590
2591        if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
2592                interrupted = true;
2593}
2594
2595static int trace__add_syscall_newtp(struct trace *trace)
2596{
2597        int ret = -1;
2598        struct perf_evlist *evlist = trace->evlist;
2599        struct perf_evsel *sys_enter, *sys_exit;
2600
2601        sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2602        if (sys_enter == NULL)
2603                goto out;
2604
2605        if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2606                goto out_delete_sys_enter;
2607
2608        sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2609        if (sys_exit == NULL)
2610                goto out_delete_sys_enter;
2611
2612        if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2613                goto out_delete_sys_exit;
2614
2615        perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2616        perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2617
2618        perf_evlist__add(evlist, sys_enter);
2619        perf_evlist__add(evlist, sys_exit);
2620
2621        if (callchain_param.enabled && !trace->kernel_syscallchains) {
2622                /*
2623                 * We're interested only in the user space callchain leading
2624                 * to the syscall, but allow overriding that for debugging
2625                 * reasons using --kernel-syscall-graph.
2626                 */
2627                sys_exit->attr.exclude_callchain_kernel = 1;
2628        }
2629
2630        trace->syscalls.events.sys_enter = sys_enter;
2631        trace->syscalls.events.sys_exit  = sys_exit;
2632
2633        ret = 0;
2634out:
2635        return ret;
2636
2637out_delete_sys_exit:
2638        perf_evsel__delete_priv(sys_exit);
2639out_delete_sys_enter:
2640        perf_evsel__delete_priv(sys_enter);
2641        goto out;
2642}
2643
2644static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
2645{
2646        int err = -1;
2647        struct perf_evsel *sys_exit;
2648        char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2649                                                trace->ev_qualifier_ids.nr,
2650                                                trace->ev_qualifier_ids.entries);
2651
2652        if (filter == NULL)
2653                goto out_enomem;
2654
2655        if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2656                                          filter)) {
2657                sys_exit = trace->syscalls.events.sys_exit;
2658                err = perf_evsel__append_tp_filter(sys_exit, filter);
2659        }
2660
2661        free(filter);
2662out:
2663        return err;
2664out_enomem:
2665        errno = ENOMEM;
2666        goto out;
2667}
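
    /*
     * Example (illustrative): with '-e openat,close' on x86_64 this builds
     * a tracepoint filter string like "id == 257 || id == 3", or the
     * negated "id != 257 && id != 3" form when the qualifier lists syscalls
     * not to trace, and appends it to both raw_syscalls events.
     */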
2668
2669#ifdef HAVE_LIBBPF_SUPPORT
2670static int trace__set_ev_qualifier_bpf_filter(struct trace *trace)
2671{
2672        int fd = bpf_map__fd(trace->syscalls.map);
2673        struct bpf_map_syscall_entry value = {
2674                .enabled = !trace->not_ev_qualifier,
2675        };
2676        int err = 0;
2677        size_t i;
2678
2679        for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) {
2680                int key = trace->ev_qualifier_ids.entries[i];
2681
2682                err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
2683                if (err)
2684                        break;
2685        }
2686
2687        return err;
2688}
2689
2690static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled)
2691{
2692        int fd = bpf_map__fd(trace->syscalls.map);
2693        struct bpf_map_syscall_entry value = {
2694                .enabled = enabled,
2695        };
2696        int err = 0, key;
2697
2698        for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
2699                err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
2700                if (err)
2701                        break;
2702        }
2703
2704        return err;
2705}
2706
2707static int trace__init_syscalls_bpf_map(struct trace *trace)
2708{
2709        bool enabled = true;
2710
2711        if (trace->ev_qualifier_ids.nr)
2712                enabled = trace->not_ev_qualifier;
2713
2714        return __trace__init_syscalls_bpf_map(trace, enabled);
2715}
2716#else
2717static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused)
2718{
2719        return 0;
2720}
2721
2722static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused)
2723{
2724        return 0;
2725}
2726#endif // HAVE_LIBBPF_SUPPORT
2727
2728static int trace__set_ev_qualifier_filter(struct trace *trace)
2729{
2730        if (trace->syscalls.map)
2731                return trace__set_ev_qualifier_bpf_filter(trace);
2732        if (trace->syscalls.events.sys_enter)
2733                return trace__set_ev_qualifier_tp_filter(trace);
2734        return 0;
2735}
2736
2737static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
2738                                    size_t npids __maybe_unused, pid_t *pids __maybe_unused)
2739{
2740        int err = 0;
2741#ifdef HAVE_LIBBPF_SUPPORT
2742        bool value = true;
2743        int map_fd = bpf_map__fd(map);
2744        size_t i;
2745
2746        for (i = 0; i < npids; ++i) {
2747                err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
2748                if (err)
2749                        break;
2750        }
2751#endif
2752        return err;
2753}
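
    /*
     * Filter out the tracer's own pid and, when running over ssh, the sshd
     * ancestor relaying our output: every line we emit would otherwise make
     * sshd do syscalls that we would then trace and print, feeding back
     * into more output indefinitely.
     */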
2754
2755static int trace__set_filter_loop_pids(struct trace *trace)
2756{
2757        unsigned int nr = 1, err;
2758        pid_t pids[32] = {
2759                getpid(),
2760        };
2761        struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2762
2763        while (thread && nr < ARRAY_SIZE(pids)) {
2764                struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2765
2766                if (parent == NULL)
2767                        break;
2768
2769                if (!strcmp(thread__comm_str(parent), "sshd")) {
2770                        pids[nr++] = parent->tid;
2771                        break;
2772                }
2773                thread = parent;
2774        }
2775
2776        err = perf_evlist__set_tp_filter_pids(trace->evlist, nr, pids);
2777        if (!err && trace->filter_pids.map)
2778                err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
2779
2780        return err;
2781}
2782
2783static int trace__set_filter_pids(struct trace *trace)
2784{
2785        int err = 0;
2786        /*
2787         * Better not use !target__has_task() here because we need to cover the
2788         * case where no threads were specified in the command line, but a
2789         * workload was, and in that case we will fill in the thread_map when
2790         * we fork the workload in perf_evlist__prepare_workload.
2791         */
2792        if (trace->filter_pids.nr > 0) {
2793                err = perf_evlist__set_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
2794                                                      trace->filter_pids.entries);
2795                if (!err && trace->filter_pids.map) {
2796                        err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
2797                                                       trace->filter_pids.entries);
2798                }
2799        } else if (thread_map__pid(trace->evlist->threads, 0) == -1) {
2800                err = trace__set_filter_loop_pids(trace);
2801        }
2802
2803        return err;
2804}
2805
2806static int __trace__deliver_event(struct trace *trace, union perf_event *event)
2807{
2808        struct perf_evlist *evlist = trace->evlist;
2809        struct perf_sample sample;
2810        int err;
2811
2812        err = perf_evlist__parse_sample(evlist, event, &sample);
2813        if (err)
2814                fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2815        else
2816                trace__handle_event(trace, event, &sample);
2817
2818        return 0;
2819}
2820
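    /*
     * With --sort-events, samples are queued and flushed only up to one
     * second (NSEC_PER_SEC) behind the newest timestamp seen, leaving a
     * window for late events from other ring buffers to arrive and get
     * sorted into place before being printed.
     */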
2821static int __trace__flush_events(struct trace *trace)
2822{
2823        u64 first = ordered_events__first_time(&trace->oe.data);
2824        u64 flush = trace->oe.last - NSEC_PER_SEC;
2825
2826        /* Is there something to flush? */
2827        if (first && first < flush)
2828                return ordered_events__flush_time(&trace->oe.data, flush);
2829
2830        return 0;
2831}
2832
2833static int trace__flush_events(struct trace *trace)
2834{
2835        return !trace->sort_events ? 0 : __trace__flush_events(trace);
2836}
2837
2838static int trace__deliver_event(struct trace *trace, union perf_event *event)
2839{
2840        int err;
2841
2842        if (!trace->sort_events)
2843                return __trace__deliver_event(trace, event);
2844
2845        err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
2846        if (err && err != -1)
2847                return err;
2848
2849        err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0);
2850        if (err)
2851                return err;
2852
2853        return trace__flush_events(trace);
2854}
2855
2856static int ordered_events__deliver_event(struct ordered_events *oe,
2857                                         struct ordered_event *event)
2858{
2859        struct trace *trace = container_of(oe, struct trace, oe.data);
2860
2861        return __trace__deliver_event(trace, event->event);
2862}
2863
2864static int trace__run(struct trace *trace, int argc, const char **argv)
2865{
2866        struct perf_evlist *evlist = trace->evlist;
2867        struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2868        int err = -1, i;
2869        unsigned long before;
2870        const bool forks = argc > 0;
2871        bool draining = false;
2872
2873        trace->live = true;
2874
2875        if (!trace->raw_augmented_syscalls) {
2876                if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2877                        goto out_error_raw_syscalls;
2878
2879                if (trace->trace_syscalls)
2880                        trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2881        }
2882
2883        if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2884                pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2885                if (pgfault_maj == NULL)
2886                        goto out_error_mem;
2887                perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2888                perf_evlist__add(evlist, pgfault_maj);
2889        }
2890
2891        if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2892                pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2893                if (pgfault_min == NULL)
2894                        goto out_error_mem;
2895                perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2896                perf_evlist__add(evlist, pgfault_min);
2897        }
2898
2899        if (trace->sched &&
2900            perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2901                                   trace__sched_stat_runtime))
2902                goto out_error_sched_stat_runtime;
2903
2904        /*
2905         * If a global cgroup was set, apply it to all the events without an
2906         * explicit cgroup. I.e.:
2907         *
2908         *      trace -G A -e sched:*switch
2909         *
2910         * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2911         * _and_ sched:sched_switch to the 'A' cgroup, while:
2912         *
2913         * trace -e sched:*switch -G A
2914         *
2915         * will only set the sched:sched_switch event to the 'A' cgroup, all the
2916         * other events (raw_syscalls:sys_{enter,exit}, etc) are left without
2917         * a cgroup (on the root cgroup, system wide, etc).
2918         *
2919         * Multiple cgroups:
2920         *
2921         * trace -G A -e sched:*switch -G B
2922         *
2923         * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2924         * to the 'B' cgroup.
2925         *
2926         * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2927         * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2928         */
2929        if (trace->cgroup)
2930                evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2931
2932        err = perf_evlist__create_maps(evlist, &trace->opts.target);
2933        if (err < 0) {
2934                fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2935                goto out_delete_evlist;
2936        }
2937
2938        err = trace__symbols_init(trace, evlist);
2939        if (err < 0) {
2940                fprintf(trace->output, "Problems initializing symbol libraries!\n");
2941                goto out_delete_evlist;
2942        }
2943
2944        perf_evlist__config(evlist, &trace->opts, &callchain_param);
2945
2946        signal(SIGCHLD, sig_handler);
2947        signal(SIGINT, sig_handler);
2948
2949        if (forks) {
2950                err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2951                                                    argv, false, NULL);
2952                if (err < 0) {
2953                        fprintf(trace->output, "Couldn't run the workload!\n");
2954                        goto out_delete_evlist;
2955                }
2956        }
2957
2958        err = perf_evlist__open(evlist);
2959        if (err < 0)
2960                goto out_error_open;
2961
2962        err = bpf__apply_obj_config();
2963        if (err) {
2964                char errbuf[BUFSIZ];
2965
2966                bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2967                pr_err("ERROR: Apply config to BPF failed: %s\n", errbuf);
2969                goto out_error_open;
2970        }
2971
2972        err = trace__set_filter_pids(trace);
2973        if (err < 0)
2974                goto out_error_mem;
2975
2976        if (trace->syscalls.map)
2977                trace__init_syscalls_bpf_map(trace);
2978
2979        if (trace->ev_qualifier_ids.nr > 0) {
2980                err = trace__set_ev_qualifier_filter(trace);
2981                if (err < 0)
2982                        goto out_errno;
2983
2984                if (trace->syscalls.events.sys_exit) {
2985                        pr_debug("event qualifier tracepoint filter: %s\n",
2986                                 trace->syscalls.events.sys_exit->filter);
2987                }
2988        }
2989
2990        err = perf_evlist__apply_filters(evlist, &evsel);
2991        if (err < 0)
2992                goto out_error_apply_filters;
2993
2994        err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2995        if (err < 0)
2996                goto out_error_mmap;
2997
2998        if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2999                perf_evlist__enable(evlist);
3000
3001        if (forks)
3002                perf_evlist__start_workload(evlist);
3003
3004        if (trace->opts.initial_delay) {
3005                usleep(trace->opts.initial_delay * 1000);
3006                perf_evlist__enable(evlist);
3007        }
3008
3009        trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
3010                                  evlist->threads->nr > 1 ||
3011                                  perf_evlist__first(evlist)->attr.inherit;
3012
3013        /*
3014         * Now that we already used evsel->attr to ask the kernel to setup the
3015         * events, lets reuse evsel->attr.sample_max_stack as the limit in
3016         * trace__resolve_callchain(), allowing per-event max-stack settings
3017         * to override an explicitly set --max-stack global setting.
3018         */
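        /*
         * E.g. (illustrative invocation):
         *
         *      perf trace --max-stack 16 -e probe:vfs_getname/max-stack=2/
         *
         * keeps max-stack=2 for the probe while the global 16 applies to
         * the other events that have callchains enabled.
         */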
3019        evlist__for_each_entry(evlist, evsel) {
3020                if (evsel__has_callchain(evsel) &&
3021                    evsel->attr.sample_max_stack == 0)
3022                        evsel->attr.sample_max_stack = trace->max_stack;
3023        }
3024again:
3025        before = trace->nr_events;
3026
3027        for (i = 0; i < evlist->nr_mmaps; i++) {
3028                union perf_event *event;
3029                struct perf_mmap *md;
3030
3031                md = &evlist->mmap[i];
3032                if (perf_mmap__read_init(md) < 0)
3033                        continue;
3034
3035                while ((event = perf_mmap__read_event(md)) != NULL) {
3036                        ++trace->nr_events;
3037
3038                        err = trace__deliver_event(trace, event);
3039                        if (err)
3040                                goto out_disable;
3041
3042                        perf_mmap__consume(md);
3043
3044                        if (interrupted)
3045                                goto out_disable;
3046
3047                        if (done && !draining) {
3048                                perf_evlist__disable(evlist);
3049                                draining = true;
3050                        }
3051                }
3052                perf_mmap__read_done(md);
3053        }
3054
3055        if (trace->nr_events == before) {
3056                int timeout = done ? 100 : -1;
3057
3058                if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
3059                        if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
3060                                draining = true;
3061
3062                        goto again;
3063                } else {
3064                        if (trace__flush_events(trace))
3065                                goto out_disable;
3066                }
3067        } else {
3068                goto again;
3069        }
3070
3071out_disable:
3072        thread__zput(trace->current);
3073
3074        perf_evlist__disable(evlist);
3075
3076        if (trace->sort_events)
3077                ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
3078
3079        if (!err) {
3080                if (trace->summary)
3081                        trace__fprintf_thread_summary(trace, trace->output);
3082
3083                if (trace->show_tool_stats) {
3084                        fprintf(trace->output, "Stats:\n "
3085                                               " vfs_getname : %" PRIu64 "\n"
3086                                               " proc_getname: %" PRIu64 "\n",
3087                                trace->stats.vfs_getname,
3088                                trace->stats.proc_getname);
3089                }
3090        }
3091
3092out_delete_evlist:
3093        trace__symbols__exit(trace);
3094
3095        perf_evlist__delete(evlist);
3096        cgroup__put(trace->cgroup);
3097        trace->evlist = NULL;
3098        trace->live = false;
3099        return err;
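    /*
     * Out of line error handling: errbuf is only needed on the error
     * paths, so it lives in the detached block below, entered only via
     * the gotos above; out_error_mem and out_errno don't use errbuf and
     * thus sit after its closing brace.
     */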
3100{
3101        char errbuf[BUFSIZ];
3102
3103out_error_sched_stat_runtime:
3104        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
3105        goto out_error;
3106
3107out_error_raw_syscalls:
3108        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
3109        goto out_error;
3110
3111out_error_mmap:
3112        perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
3113        goto out_error;
3114
3115out_error_open:
3116        perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
3117
3118out_error:
3119        fprintf(trace->output, "%s\n", errbuf);
3120        goto out_delete_evlist;
3121
3122out_error_apply_filters:
3123        fprintf(trace->output,
3124                "Failed to set filter \"%s\" on event %s with %d (%s)\n",
3125                evsel->filter, perf_evsel__name(evsel), errno,
3126                str_error_r(errno, errbuf, sizeof(errbuf)));
3127        goto out_delete_evlist;
3128}
3129out_error_mem:
3130        fprintf(trace->output, "Not enough memory to run!\n");
3131        goto out_delete_evlist;
3132
3133out_errno:
3134        fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
3135        goto out_delete_evlist;
3136}
3137
3138static int trace__replay(struct trace *trace)
3139{
3140        const struct perf_evsel_str_handler handlers[] = {
3141                { "probe:vfs_getname",       trace__vfs_getname, },
3142        };
3143        struct perf_data data = {
3144                .file      = {
3145                        .path = input_name,
3146                },
3147                .mode      = PERF_DATA_MODE_READ,
3148                .force     = trace->force,
3149        };
3150        struct perf_session *session;
3151        struct perf_evsel *evsel;
3152        int err = -1;
3153
3154        trace->tool.sample        = trace__process_sample;
3155        trace->tool.mmap          = perf_event__process_mmap;
3156        trace->tool.mmap2         = perf_event__process_mmap2;
3157        trace->tool.comm          = perf_event__process_comm;
3158        trace->tool.exit          = perf_event__process_exit;
3159        trace->tool.fork          = perf_event__process_fork;
3160        trace->tool.attr          = perf_event__process_attr;
3161        trace->tool.tracing_data  = perf_event__process_tracing_data;
3162        trace->tool.build_id      = perf_event__process_build_id;
3163        trace->tool.namespaces    = perf_event__process_namespaces;
3164
3165        trace->tool.ordered_events = true;
3166        trace->tool.ordering_requires_timestamps = true;
3167
3168        /* add tid to output */
3169        trace->multiple_threads = true;
3170
3171        session = perf_session__new(&data, false, &trace->tool);
3172        if (session == NULL)
3173                return -1;
3174
3175        if (trace->opts.target.pid)
3176                symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
3177
3178        if (trace->opts.target.tid)
3179                symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
3180
3181        if (symbol__init(&session->header.env) < 0)
3182                goto out;
3183
3184        trace->host = &session->machines.host;
3185
3186        err = perf_session__set_tracepoints_handlers(session, handlers);
3187        if (err)
3188                goto out;
3189
3190        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3191                                                     "raw_syscalls:sys_enter");
3192        /* older kernels have syscalls tp versus raw_syscalls */
3193        if (evsel == NULL)
3194                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3195                                                             "syscalls:sys_enter");
3196
3197        if (evsel &&
3198            (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
3199            perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
3200                pr_err("Error initializing the raw_syscalls:sys_enter event\n");
3201                goto out;
3202        }
3203
3204        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3205                                                     "raw_syscalls:sys_exit");
3206        if (evsel == NULL)
3207                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3208                                                             "syscalls:sys_exit");
3209        if (evsel &&
3210            (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
3211            perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
3212                pr_err("Error initializing the raw_syscalls:sys_exit event\n");
3213                goto out;
3214        }
3215
3216        evlist__for_each_entry(session->evlist, evsel) {
3217                if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
3218                    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
3219                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
3220                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
3221                        evsel->handler = trace__pgfault;
3222        }
3223
3224        setup_pager();
3225
3226        err = perf_session__process_events(session);
3227        if (err)
3228                pr_err("Failed to process events, error %d\n", err);
3230        else if (trace->summary)
3231                trace__fprintf_thread_summary(trace, trace->output);
3232
3233out:
3234        perf_session__delete(session);
3235
3236        return err;
3237}
3238
3239static size_t trace__fprintf_threads_header(FILE *fp)
3240{
3243        return fprintf(fp, "\n Summary of events:\n\n");
3246}
3247
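    /*
     * Re-sort the intlist of per-syscall stats into a rb tree keyed by
     * total time spent (msecs), so the summary below prints the biggest
     * consumers first; the block following the macro fills in each entry
     * from its source intlist node.
     */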
3248DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
3249        struct stats    *stats;
3250        double          msecs;
3251        int             syscall;
3252)
3253{
3254        struct int_node *source = rb_entry(nd, struct int_node, rb_node);
3255        struct stats *stats = source->priv;
3256
3257        entry->syscall = source->i;
3258        entry->stats   = stats;
3259        entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
3260}
3261
3262static size_t thread__dump_stats(struct thread_trace *ttrace,
3263                                 struct trace *trace, FILE *fp)
3264{
3265        size_t printed = 0;
3266        struct syscall *sc;
3267        struct rb_node *nd;
3268        DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
3269
3270        if (syscall_stats == NULL)
3271                return 0;
3272
3273        printed += fprintf(fp, "\n");
3274
3275        printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
3276        printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
3277        printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
3278
3279        resort_rb__for_each_entry(nd, syscall_stats) {
3280                struct stats *stats = syscall_stats_entry->stats;
3281                if (stats) {
3282                        double min = (double)(stats->min) / NSEC_PER_MSEC;
3283                        double max = (double)(stats->max) / NSEC_PER_MSEC;
3284                        double avg = avg_stats(stats);
3285                        double pct;
3286                        u64 n = (u64) stats->n;
3287
3288                        pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
3289                        avg /= NSEC_PER_MSEC;
3290
3291                        sc = &trace->syscalls.table[syscall_stats_entry->syscall];
3292                        printed += fprintf(fp, "   %-15s", sc->name);
3293                        printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
3294                                           n, syscall_stats_entry->msecs, min, avg);
3295                        printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
3296                }
3297        }
3298
3299        resort_rb__delete(syscall_stats);
3300        printed += fprintf(fp, "\n\n");
3301
3302        return printed;
3303}
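    /*
     * Illustrative output (the numbers are made up):
     *
     *   syscall            calls    total       min       avg       max      stddev
     *                               (msec)    (msec)    (msec)    (msec)        (%)
     *   --------------- -------- --------- --------- --------- ---------     ------
     *   poll                  96   985.428     0.000    10.265    30.002      9.23%
     *   read                 101     0.491     0.003     0.005     0.016      5.98%
     */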
3304
3305static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
3306{
3307        size_t printed = 0;
3308        struct thread_trace *ttrace = thread__priv(thread);
3309        double ratio;
3310
3311        if (ttrace == NULL)
3312                return 0;
3313
3314        ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
3315
3316        printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
3317        printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
3318        printed += fprintf(fp, "%.1f%%", ratio);
3319        if (ttrace->pfmaj)
3320                printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
3321        if (ttrace->pfmin)
3322                printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
3323        if (trace->sched)
3324                printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
3325        else if (fputc('\n', fp) != EOF)
3326                ++printed;
3327
3328        printed += thread__dump_stats(ttrace, trace, fp);
3329
3330        return printed;
3331}
3332
3333static unsigned long thread__nr_events(struct thread_trace *ttrace)
3334{
3335        return ttrace ? ttrace->nr_events : 0;
3336}
3337
3338DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
3339        struct thread *thread;
3340)
3341{
3342        entry->thread = rb_entry(nd, struct thread, rb_node);
3343}
3344
3345static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
3346{
3347        size_t printed = trace__fprintf_threads_header(fp);
3348        struct rb_node *nd;
3349        int i;
3350
3351        for (i = 0; i < THREADS__TABLE_SIZE; i++) {
3352                DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
3353
3354                if (threads == NULL) {
3355                        fprintf(fp, "Error sorting output by nr_events!\n");
3356                        return 0;
3357                }
3358
3359                resort_rb__for_each_entry(nd, threads)
3360                        printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
3361
3362                resort_rb__delete(threads);
3363        }
3364        return printed;
3365}
3366
3367static int trace__set_duration(const struct option *opt, const char *str,
3368                               int unset __maybe_unused)
3369{
3370        struct trace *trace = opt->value;
3371
3372        trace->duration_filter = atof(str);
3373        return 0;
3374}
3375
3376static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
3377                                              int unset __maybe_unused)
3378{
3379        int ret = -1;
3380        size_t i;
3381        struct trace *trace = opt->value;
3382        /*
3383         * FIXME: introduce an intarray class, plainly parse the CSV and
3384         * create a { int nr, int entries[] } struct...
3385         */
3386        struct intlist *list = intlist__new(str);
3387
3388        if (list == NULL)
3389                return -1;
3390
3391        i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3392        trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3393
3394        if (trace->filter_pids.entries == NULL)
3395                goto out;
3396
3397        trace->filter_pids.entries[0] = getpid();
3398
3399        for (i = 1; i < trace->filter_pids.nr; ++i)
3400                trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3401
3402        ret = 0;
3403out:
3404        intlist__delete(list);
3405        return ret;
3406}
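    /*
     * E.g. --filter-pids 1022,2957 yields filter_pids.entries =
     * { getpid(), 1022, 2957 }: perf trace always filters itself out, too.
     */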
3407
3408static int trace__open_output(struct trace *trace, const char *filename)
3409{
3410        struct stat st;
3411
3412        if (!stat(filename, &st) && st.st_size) {
3413                char oldname[PATH_MAX];
3414
3415                scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3416                unlink(oldname);
3417                rename(filename, oldname);
3418        }
3419
3420        trace->output = fopen(filename, "w");
3421
3422        return trace->output == NULL ? -errno : 0;
3423}
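    /*
     * An existing, non-empty output file is rotated to "<file>.old" before
     * being recreated, similar to what perf record does with perf.data.
     */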
3424
3425static int parse_pagefaults(const struct option *opt, const char *str,
3426                            int unset __maybe_unused)
3427{
3428        int *trace_pgfaults = opt->value;
3429
3430        if (strcmp(str, "all") == 0)
3431                *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3432        else if (strcmp(str, "maj") == 0)
3433                *trace_pgfaults |= TRACE_PFMAJ;
3434        else if (strcmp(str, "min") == 0)
3435                *trace_pgfaults |= TRACE_PFMIN;
3436        else
3437                return -1;
3438
3439        return 0;
3440}
3441
3442static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3443{
3444        struct perf_evsel *evsel;
3445
3446        evlist__for_each_entry(evlist, evsel)
3447                evsel->handler = handler;
3448}
3449
3450static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
3451{
3452        struct perf_evsel *evsel;
3453
3454        evlist__for_each_entry(evlist, evsel) {
3455                if (evsel->priv || !evsel->tp_format)
3456                        continue;
3457
3458                if (strcmp(evsel->tp_format->system, "syscalls"))
3459                        continue;
3460
3461                if (perf_evsel__init_syscall_tp(evsel))
3462                        return -1;
3463
3464                if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
3465                        struct syscall_tp *sc = evsel->priv;
3466
3467                        if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
3468                                return -1;
3469                } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
3470                        struct syscall_tp *sc = evsel->priv;
3471
3472                        if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
3473                                return -1;
3474                }
3475        }
3476
3477        return 0;
3478}
3479
3480/*
3481 * XXX: Hackish, just splitting the combined -e+--event (syscalls
3482 * (raw_syscalls:sys_{enter,exit}) + events (tracepoints, HW, SW, etc)) to use
3483 * the existing facilities unchanged (trace->ev_qualifier + parse_options()).
3484 *
3485 * It'd be better to introduce a parse_options() variant that would return a
3486 * list with the terms it didn't match to an event...
3487 */
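    /*
     * E.g. (illustrative): '-e open*,close,sched:sched_switch' ends up with
     * lists[1] = "open*,close" feeding trace->ev_qualifier (the syscalls) and
     * lists[0] = "sched:sched_switch" handed over to parse_events_option().
     */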
3488static int trace__parse_events_option(const struct option *opt, const char *str,
3489                                      int unset __maybe_unused)
3490{
3491        struct trace *trace = (struct trace *)opt->value;
3492        const char *s = str;
3493        char *sep = NULL, *lists[2] = { NULL, NULL, };
3494        int len = strlen(str) + 1, err = -1, list, idx;
3495        char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3496        char group_name[PATH_MAX];
3497        struct syscall_fmt *fmt;
3498
3499        if (strace_groups_dir == NULL)
3500                return -1;
3501
3502        if (*s == '!') {
3503                ++s;
3504                trace->not_ev_qualifier = true;
3505        }
3506
3507        while (1) {
3508                if ((sep = strchr(s, ',')) != NULL)
3509                        *sep = '\0';
3510
3511                list = 0;
3512                if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3513                    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3514                        list = 1;
3515                        goto do_concat;
3516                }
3517
3518                fmt = syscall_fmt__find_by_alias(s);
3519                if (fmt != NULL) {
3520                        list = 1;
3521                        s = fmt->name;
3522                } else {
3523                        path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3524                        if (access(group_name, R_OK) == 0)
3525                                list = 1;
3526                }
3527do_concat:
3528                if (lists[list]) {
3529                        sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3530                } else {
3531                        lists[list] = malloc(len);
3532                        if (lists[list] == NULL)
3533                                goto out;
3534                        strcpy(lists[list], s);
3535                }
3536
3537                if (!sep)
3538                        break;
3539
3540                *sep = ',';
3541                s = sep + 1;
3542        }
3543
3544        if (lists[1] != NULL) {
3545                struct strlist_config slist_config = {
3546                        .dirname = strace_groups_dir,
3547                };
3548
3549                trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3550                if (trace->ev_qualifier == NULL) {
3551                        fputs("Not enough memory to parse event qualifier\n", trace->output);
3552                        goto out;
3553                }
3554
3555                if (trace__validate_ev_qualifier(trace))
3556                        goto out;
3557                trace->trace_syscalls = true;
3558        }
3559
3560        err = 0;
3561
3562        if (lists[0]) {
3563                struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3564                                               "event selector. use 'perf list' to list available events",
3565                                               parse_events_option);
3566                err = parse_events_option(&o, lists[0], 0);
3567        }
3568out:
3569        if (sep)
3570                *sep = ',';
3571
3572        return err;
3573}
3574
3575static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3576{
3577        struct trace *trace = opt->value;
3578
3579        if (!list_empty(&trace->evlist->entries))
3580                return parse_cgroups(opt, str, unset);
3581
3582        trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3583
3584        return 0;
3585}
3586
3587static struct bpf_map *bpf__find_map_by_name(const char *name)
3588{
3589        struct bpf_object *obj, *tmp;
3590
3591        bpf_object__for_each_safe(obj, tmp) {
3592                struct bpf_map *map = bpf_object__find_map_by_name(obj, name);
3593                if (map)
3594                        return map;
3596        }
3597
3598        return NULL;
3599}
3600
3601static void trace__set_bpf_map_filtered_pids(struct trace *trace)
3602{
3603        trace->filter_pids.map = bpf__find_map_by_name("pids_filtered");
3604}
3605
3606static void trace__set_bpf_map_syscalls(struct trace *trace)
3607{
3608        trace->syscalls.map = bpf__find_map_by_name("syscalls");
3609}
3610
3611static int trace__config(const char *var, const char *value, void *arg)
3612{
3613        struct trace *trace = arg;
3614        int err = 0;
3615
3616        if (!strcmp(var, "trace.add_events")) {
3617                struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3618                                               "event selector. use 'perf list' to list available events",
3619                                               parse_events_option);
3620                err = parse_events_option(&o, value, 0);
3621        } else if (!strcmp(var, "trace.show_timestamp")) {
3622                trace->show_tstamp = perf_config_bool(var, value);
3623        } else if (!strcmp(var, "trace.show_duration")) {
3624                trace->show_duration = perf_config_bool(var, value);
3625        } else if (!strcmp(var, "trace.show_arg_names")) {
3626                trace->show_arg_names = perf_config_bool(var, value);
3627                if (!trace->show_arg_names)
3628                        trace->show_zeros = true;
3629        } else if (!strcmp(var, "trace.show_zeros")) {
3630                bool new_show_zeros = perf_config_bool(var, value);
3631                if (!trace->show_arg_names && !new_show_zeros) {
3632                        pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
3633                        goto out;
3634                }
3635                trace->show_zeros = new_show_zeros;
3636        } else if (!strcmp(var, "trace.show_prefix")) {
3637                trace->show_string_prefix = perf_config_bool(var, value);
3638        } else if (!strcmp(var, "trace.no_inherit")) {
3639                trace->opts.no_inherit = perf_config_bool(var, value);
3640        } else if (!strcmp(var, "trace.args_alignment")) {
3641                int args_alignment = 0;
3642                if (perf_config_int(&args_alignment, var, value) == 0)
3643                        trace->args_alignment = args_alignment;
3644        }
3645out:
3646        return err;
3647}
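    /*
     * These keys live in a [trace] section of ~/.perfconfig, e.g.
     * (illustrative):
     *
     *   [trace]
     *           add_events = probe:vfs_getname
     *           show_duration = no
     *           args_alignment = 40
     */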
3648
3649int cmd_trace(int argc, const char **argv)
3650{
3651        const char *trace_usage[] = {
3652                "perf trace [<options>] [<command>]",
3653                "perf trace [<options>] -- <command> [<options>]",
3654                "perf trace record [<options>] [<command>]",
3655                "perf trace record [<options>] -- <command> [<options>]",
3656                NULL
3657        };
3658        struct trace trace = {
3659                .syscalls = {
3660                        .max = -1,
3661                },
3662                .opts = {
3663                        .target = {
3664                                .uid       = UINT_MAX,
3665                                .uses_mmap = true,
3666                        },
3667                        .user_freq     = UINT_MAX,
3668                        .user_interval = ULLONG_MAX,
3669                        .no_buffering  = true,
3670                        .mmap_pages    = UINT_MAX,
3671                },
3672                .output = stderr,
3673                .show_comm = true,
3674                .show_tstamp = true,
3675                .show_duration = true,
3676                .show_arg_names = true,
3677                .args_alignment = 70,
3678                .trace_syscalls = false,
3679                .kernel_syscallchains = false,
3680                .max_stack = UINT_MAX,
3681                .max_events = ULONG_MAX,
3682        };
3683        const char *output_name = NULL;
3684        const struct option trace_options[] = {
3685        OPT_CALLBACK('e', "event", &trace, "event",
3686                     "event/syscall selector. use 'perf list' to list available events",
3687                     trace__parse_events_option),
3688        OPT_BOOLEAN(0, "comm", &trace.show_comm,
3689                    "show the thread COMM next to its id"),
3690        OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3691        OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3692                     trace__parse_events_option),
3693        OPT_STRING('o', "output", &output_name, "file", "output file name"),
3694        OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3695        OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3696                    "trace events on existing process id"),
3697        OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3698                    "trace events on existing thread id"),
3699        OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3700                     "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
3701        OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3702                    "system-wide collection from all CPUs"),
3703        OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3704                    "list of cpus to monitor"),
3705        OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3706                    "child tasks do not inherit counters"),
3707        OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3708                     "number of mmap data pages",
3709                     perf_evlist__parse_mmap_pages),
3710        OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3711                   "user to profile"),
3712        OPT_CALLBACK(0, "duration", &trace, "float",
3713                     "show only events with duration > N.M ms",
3714                     trace__set_duration),
3715        OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3716        OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3717        OPT_BOOLEAN('T', "time", &trace.full_time,
3718                    "Show full timestamp, not time relative to first start"),
3719        OPT_BOOLEAN(0, "failure", &trace.failure_only,
3720                    "Show only syscalls that failed"),
3721        OPT_BOOLEAN('s', "summary", &trace.summary_only,
3722                    "Show only syscall summary with statistics"),
3723        OPT_BOOLEAN('S', "with-summary", &trace.summary,
3724                    "Show all syscalls and summary with statistics"),
3725        OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3726                     "Trace pagefaults", parse_pagefaults, "maj"),
3727        OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3728        OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3729        OPT_CALLBACK(0, "call-graph", &trace.opts,
3730                     "record_mode[,record_size]", record_callchain_help,
3731                     &record_parse_callchain_opt),
3732        OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3733                    "Show the kernel callchains on the syscall exit path"),
3734        OPT_ULONG(0, "max-events", &trace.max_events,
3735                "Set the maximum number of events to print, exit after that is reached."),
3736        OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3737                     "Set the minimum stack depth when parsing the callchain, "
3738                     "anything below the specified depth will be ignored."),
3739        OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3740                     "Set the maximum stack depth when parsing the callchain, "
3741                     "anything beyond the specified depth will be ignored. "
3742                     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3743        OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
3744                        "Sort batches of events before processing, use if getting out-of-order events"),
3745        OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3746                        "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3747        OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3748                        "per thread proc mmap processing timeout in ms"),
3749        OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3750                     trace__parse_cgroups),
3751        OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3752                     "ms to wait before starting measurement after program "
3753                     "start"),
3754        OPT_END()
3755        };
3756        bool __maybe_unused max_stack_user_set = true;
3757        bool mmap_pages_user_set = true;
3758        struct perf_evsel *evsel;
3759        const char * const trace_subcommands[] = { "record", NULL };
3760        int err = -1;
3761        char bf[BUFSIZ];
3762
3763        signal(SIGSEGV, sighandler_dump_stack);
3764        signal(SIGFPE, sighandler_dump_stack);
3765
3766        trace.evlist = perf_evlist__new();
3767        trace.sctbl = syscalltbl__new();
3768
3769        if (trace.evlist == NULL || trace.sctbl == NULL) {
3770                pr_err("Not enough memory to run!\n");
3771                err = -ENOMEM;
3772                goto out;
3773        }
3774
3775        err = perf_config(trace__config, &trace);
3776        if (err)
3777                goto out;
3778
3779        argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3780                                 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3781
3782        if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3783                usage_with_options_msg(trace_usage, trace_options,
3784                                       "cgroup monitoring only available in system-wide mode");
3785        }
3786
3787        evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
3788        if (IS_ERR(evsel)) {
3789                bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
3790                pr_err("ERROR: Setting up the __augmented_syscalls__ event failed: %s\n", bf);
3791                goto out;
3792        }
3793
3794        if (evsel) {
3795                trace.syscalls.events.augmented = evsel;
3796                trace__set_bpf_map_filtered_pids(&trace);
3797                trace__set_bpf_map_syscalls(&trace);
3798        }
3799
3800        err = bpf__setup_stdout(trace.evlist);
3801        if (err) {
3802                bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3803                pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3804                goto out;
3805        }
3806
3807        err = -1;
3808
3809        if (trace.trace_pgfaults) {
3810                trace.opts.sample_address = true;
3811                trace.opts.sample_time = true;
3812        }
3813
3814        if (trace.opts.mmap_pages == UINT_MAX)
3815                mmap_pages_user_set = false;
3816
3817        if (trace.max_stack == UINT_MAX) {
3818                trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3819                max_stack_user_set = false;
3820        }
3821
3822#ifdef HAVE_DWARF_UNWIND_SUPPORT
3823        if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3824                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3825        }
3826#endif
3827
3828        if (callchain_param.enabled) {
3829                if (!mmap_pages_user_set && geteuid() == 0)
3830                        trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3831
3832                symbol_conf.use_callchain = true;
3833        }
3834
3835        if (trace.evlist->nr_entries > 0) {
3836                evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3837                if (evlist__set_syscall_tp_fields(trace.evlist)) {
3838                        perror("failed to set syscalls:* tracepoint fields");
3839                        goto out;
3840                }
3841        }
3842
3843        if (trace.sort_events) {
3844                ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
3845                ordered_events__set_copy_on_queue(&trace.oe.data, true);
3846        }
3847
3848        /*
3849         * If we are augmenting syscalls, then combine what we put in the
3850         * __augmented_syscalls__ BPF map with what is in the
3851         * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
3852         * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
3853         *
3854         * We'll switch to looking at two BPF maps, one for sys_enter and the
3855         * other for sys_exit when we start augmenting the sys_exit paths with
3856         * buffers that are being copied from kernel to userspace, think 'read'
3857         * syscall.
3858         */
3859        if (trace.syscalls.events.augmented) {
3860                evlist__for_each_entry(trace.evlist, evsel) {
3861                        bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
3862
3863                        if (raw_syscalls_sys_exit) {
3864                                trace.raw_augmented_syscalls = true;
3865                                goto init_augmented_syscall_tp;
3866                        }
3867
3868                        if (strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_enter") == 0) {
3869                                struct perf_evsel *augmented = trace.syscalls.events.augmented;
3870                                if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) ||
3871                                    perf_evsel__init_augmented_syscall_tp_args(augmented))
3872                                        goto out;
3873                                augmented->handler = trace__sys_enter;
3874                        }
3875
3876                        if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
3877                                struct syscall_tp *sc;
3878init_augmented_syscall_tp:
3879                                if (perf_evsel__init_augmented_syscall_tp(evsel, evsel))
3880                                        goto out;
3881                                sc = evsel->priv;
3882                                /*
3883                                 * For now with BPF raw_augmented we hook into
3884                                 * raw_syscalls:sys_enter and there we get all
3885                                 * 6 syscall args plus the tracepoint common
3886                                 * fields and the syscall_nr (another long).
3887                                 * So we check if that is the case and if so
3888                                 * don't use sc->args_size but always the
3889                                 * full raw_syscalls:sys_enter payload size,
3890                                 * which is fixed.
3891                                 *
3892                                 * We'll revisit this later to pass
3893                                 * s->args_size to the BPF augmenter (now
3894                                 * tools/perf/examples/bpf/augmented_raw_syscalls.c),
3895                                 * so that it copies only what we need for each
3896                                 * syscall, like what happens when we use
3897                                 * syscalls:sys_enter_NAME, so that we reduce
3898                                 * the kernel/userspace traffic to just what is
3899                                 * needed for each syscall.
3900                                 */
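                                /*
                                 * E.g. on a 64-bit system (sizeof(long)
                                 * == 8) with the syscall id right after
                                 * the common tracepoint fields (id.offset
                                 * == 8), this works out to 8 + (1 + 6) * 8
                                 * = 64 bytes: the id plus the six args.
                                 */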
3901                                if (trace.raw_augmented_syscalls)
3902                                        trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
3903                                perf_evsel__init_augmented_syscall_tp_ret(evsel);
3904                                evsel->handler = trace__sys_exit;
3905                        }
3906                }
3907        }
3908
3909        if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3910                return trace__record(&trace, argc-1, &argv[1]);
3911
3912        /* summary_only implies summary option, but don't overwrite summary if set */
3913        if (trace.summary_only)
3914                trace.summary = trace.summary_only;
3915
3916        if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3917            trace.evlist->nr_entries == 0 /* Was -e/--event used? */) {
3918                trace.trace_syscalls = true;
3919        }
3920
3921        if (output_name != NULL) {
3922                err = trace__open_output(&trace, output_name);
3923                if (err < 0) {
3924                        perror("failed to create output file");
3925                        goto out;
3926                }
3927        }
3928
3929        err = target__validate(&trace.opts.target);
3930        if (err) {
3931                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3932                fprintf(trace.output, "%s", bf);
3933                goto out_close;
3934        }
3935
3936        err = target__parse_uid(&trace.opts.target);
3937        if (err) {
3938                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3939                fprintf(trace.output, "%s", bf);
3940                goto out_close;
3941        }
3942
3943        if (!argc && target__none(&trace.opts.target))
3944                trace.opts.target.system_wide = true;
3945
3946        if (input_name)
3947                err = trace__replay(&trace);
3948        else
3949                err = trace__run(&trace, argc, argv);
3950
3951out_close:
3952        if (output_name != NULL)
3953                fclose(trace.output);
3954out:
3955        return err;
3956}
3957