linux/tools/perf/builtin-trace.c
<<
>>
Prefs
// SPDX-License-Identifier: GPL-2.0-only
/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 */
  17
  18#include <traceevent/event-parse.h>
  19#include <api/fs/tracing_path.h>
  20#include <bpf/bpf.h>
  21#include "util/bpf_map.h"
  22#include "util/rlimit.h"
  23#include "builtin.h"
  24#include "util/cgroup.h"
  25#include "util/color.h"
  26#include "util/config.h"
  27#include "util/debug.h"
  28#include "util/env.h"
  29#include "util/event.h"
  30#include "util/evlist.h"
  31#include <subcmd/exec-cmd.h>
  32#include "util/machine.h"
  33#include "util/map.h"
  34#include "util/symbol.h"
  35#include "util/path.h"
  36#include "util/session.h"
  37#include "util/thread.h"
  38#include <subcmd/parse-options.h>
  39#include "util/strlist.h"
  40#include "util/intlist.h"
  41#include "util/thread_map.h"
  42#include "util/stat.h"
  43#include "trace/beauty/beauty.h"
  44#include "trace-event.h"
  45#include "util/parse-events.h"
  46#include "util/bpf-loader.h"
  47#include "callchain.h"
  48#include "print_binary.h"
  49#include "string2.h"
  50#include "syscalltbl.h"
  51#include "rb_resort.h"
  52
  53#include <errno.h>
  54#include <inttypes.h>
  55#include <poll.h>
  56#include <signal.h>
  57#include <stdlib.h>
  58#include <string.h>
  59#include <linux/err.h>
  60#include <linux/filter.h>
  61#include <linux/kernel.h>
  62#include <linux/random.h>
  63#include <linux/stringify.h>
  64#include <linux/time64.h>
  65#include <linux/zalloc.h>
  66#include <fcntl.h>
  67#include <sys/sysmacros.h>
  68
  69#include <linux/ctype.h>
  70
  71#ifndef O_CLOEXEC
  72# define O_CLOEXEC              02000000
  73#endif
  74
  75#ifndef F_LINUX_SPECIFIC_BASE
  76# define F_LINUX_SPECIFIC_BASE  1024
  77#endif
  78
  79struct trace {
  80        struct perf_tool        tool;
  81        struct syscalltbl       *sctbl;
  82        struct {
  83                int             max;
  84                struct syscall  *table;
  85                struct bpf_map  *map;
  86                struct {
  87                        struct perf_evsel *sys_enter,
  88                                          *sys_exit,
  89                                          *augmented;
  90                }               events;
  91        } syscalls;
  92        struct {
  93                struct bpf_map *map;
  94        } dump;
  95        struct record_opts      opts;
  96        struct perf_evlist      *evlist;
  97        struct machine          *host;
  98        struct thread           *current;
  99        struct cgroup           *cgroup;
 100        u64                     base_time;
 101        FILE                    *output;
 102        unsigned long           nr_events;
 103        unsigned long           nr_events_printed;
 104        unsigned long           max_events;
 105        struct strlist          *ev_qualifier;
 106        struct {
 107                size_t          nr;
 108                int             *entries;
 109        }                       ev_qualifier_ids;
 110        struct {
 111                size_t          nr;
 112                pid_t           *entries;
 113                struct bpf_map  *map;
 114        }                       filter_pids;
 115        double                  duration_filter;
 116        double                  runtime_ms;
 117        struct {
 118                u64             vfs_getname,
 119                                proc_getname;
 120        } stats;
 121        unsigned int            max_stack;
 122        unsigned int            min_stack;
 123        int                     raw_augmented_syscalls_args_size;
 124        bool                    raw_augmented_syscalls;
 125        bool                    sort_events;
 126        bool                    not_ev_qualifier;
 127        bool                    live;
 128        bool                    full_time;
 129        bool                    sched;
 130        bool                    multiple_threads;
 131        bool                    summary;
 132        bool                    summary_only;
 133        bool                    failure_only;
 134        bool                    show_comm;
 135        bool                    print_sample;
 136        bool                    show_tool_stats;
 137        bool                    trace_syscalls;
 138        bool                    kernel_syscallchains;
 139        s16                     args_alignment;
 140        bool                    show_tstamp;
 141        bool                    show_duration;
 142        bool                    show_zeros;
 143        bool                    show_arg_names;
 144        bool                    show_string_prefix;
 145        bool                    force;
 146        bool                    vfs_getname;
 147        int                     trace_pgfaults;
 148        struct {
 149                struct ordered_events   data;
 150                u64                     last;
 151        } oe;
 152};
 153
 154struct tp_field {
 155        int offset;
 156        union {
 157                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
 158                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
 159        };
 160};
 161
 162#define TP_UINT_FIELD(bits) \
 163static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
 164{ \
 165        u##bits value; \
 166        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 167        return value;  \
 168}
 169
 170TP_UINT_FIELD(8);
 171TP_UINT_FIELD(16);
 172TP_UINT_FIELD(32);
 173TP_UINT_FIELD(64);
 174
 175#define TP_UINT_FIELD__SWAPPED(bits) \
 176static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
 177{ \
 178        u##bits value; \
 179        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 180        return bswap_##bits(value);\
 181}
 182
 183TP_UINT_FIELD__SWAPPED(16);
 184TP_UINT_FIELD__SWAPPED(32);
 185TP_UINT_FIELD__SWAPPED(64);
 186
 187static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
 188{
 189        field->offset = offset;
 190
 191        switch (size) {
 192        case 1:
 193                field->integer = tp_field__u8;
 194                break;
 195        case 2:
 196                field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
 197                break;
 198        case 4:
 199                field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
 200                break;
 201        case 8:
 202                field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
 203                break;
 204        default:
 205                return -1;
 206        }
 207
 208        return 0;
 209}
 210
 211static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
 212{
 213        return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
 214}
 215
 216static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
 217{
 218        return sample->raw_data + field->offset;
 219}
 220
 221static int __tp_field__init_ptr(struct tp_field *field, int offset)
 222{
 223        field->offset = offset;
 224        field->pointer = tp_field__ptr;
 225        return 0;
 226}
 227
 228static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
 229{
 230        return __tp_field__init_ptr(field, format_field->offset);
 231}
 232
 233struct syscall_tp {
 234        struct tp_field id;
 235        union {
 236                struct tp_field args, ret;
 237        };
 238};
 239
 240static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
 241                                          struct tp_field *field,
 242                                          const char *name)
 243{
 244        struct tep_format_field *format_field = perf_evsel__field(evsel, name);
 245
 246        if (format_field == NULL)
 247                return -1;
 248
 249        return tp_field__init_uint(field, format_field, evsel->needs_swap);
 250}
 251
 252#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
 253        ({ struct syscall_tp *sc = evsel->priv;\
 254           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
 255
 256static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
 257                                         struct tp_field *field,
 258                                         const char *name)
 259{
 260        struct tep_format_field *format_field = perf_evsel__field(evsel, name);
 261
 262        if (format_field == NULL)
 263                return -1;
 264
 265        return tp_field__init_ptr(field, format_field);
 266}
 267
 268#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
 269        ({ struct syscall_tp *sc = evsel->priv;\
 270           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
 271
 272static void perf_evsel__delete_priv(struct perf_evsel *evsel)
 273{
 274        zfree(&evsel->priv);
 275        perf_evsel__delete(evsel);
 276}
 277
 278static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
 279{
 280        struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
 281
 282        if (evsel->priv != NULL) {
 283                if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
 284                    perf_evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
 285                        goto out_delete;
 286                return 0;
 287        }
 288
 289        return -ENOMEM;
 290out_delete:
 291        zfree(&evsel->priv);
 292        return -ENOENT;
 293}
 294
 295static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel, struct perf_evsel *tp)
 296{
 297        struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
 298
 299        if (evsel->priv != NULL) {
 300                struct tep_format_field *syscall_id = perf_evsel__field(tp, "id");
 301                if (syscall_id == NULL)
 302                        syscall_id = perf_evsel__field(tp, "__syscall_nr");
 303                if (syscall_id == NULL)
 304                        goto out_delete;
 305                if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
 306                        goto out_delete;
 307
 308                return 0;
 309        }
 310
 311        return -ENOMEM;
 312out_delete:
 313        zfree(&evsel->priv);
 314        return -EINVAL;
 315}
 316
 317static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
 318{
 319        struct syscall_tp *sc = evsel->priv;
 320
 321        return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
 322}
 323
 324static int perf_evsel__init_augmented_syscall_tp_ret(struct perf_evsel *evsel)
 325{
 326        struct syscall_tp *sc = evsel->priv;
 327
 328        return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
 329}
 330
 331static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
 332{
 333        evsel->priv = malloc(sizeof(struct syscall_tp));
 334        if (evsel->priv != NULL) {
 335                if (perf_evsel__init_sc_tp_uint_field(evsel, id))
 336                        goto out_delete;
 337
 338                evsel->handler = handler;
 339                return 0;
 340        }
 341
 342        return -ENOMEM;
 343
 344out_delete:
 345        zfree(&evsel->priv);
 346        return -ENOENT;
 347}
 348
 349static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
 350{
 351        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
 352
 353        /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
 354        if (IS_ERR(evsel))
 355                evsel = perf_evsel__newtp("syscalls", direction);
 356
 357        if (IS_ERR(evsel))
 358                return NULL;
 359
 360        if (perf_evsel__init_raw_syscall_tp(evsel, handler))
 361                goto out_delete;
 362
 363        return evsel;
 364
 365out_delete:
 366        perf_evsel__delete_priv(evsel);
 367        return NULL;
 368}
 369
 370#define perf_evsel__sc_tp_uint(evsel, name, sample) \
 371        ({ struct syscall_tp *fields = evsel->priv; \
 372           fields->name.integer(&fields->name, sample); })
 373
 374#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
 375        ({ struct syscall_tp *fields = evsel->priv; \
 376           fields->name.pointer(&fields->name, sample); })
 377
 378size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
 379{
 380        int idx = val - sa->offset;
 381
 382        if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
 383                size_t printed = scnprintf(bf, size, intfmt, val);
 384                if (show_prefix)
 385                        printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
 386                return printed;
 387        }
 388
 389        return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
 390}
 391
 392static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 393                                                const char *intfmt,
 394                                                struct syscall_arg *arg)
 395{
 396        return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
 397}
 398
 399static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 400                                              struct syscall_arg *arg)
 401{
 402        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
 403}
 404
 405#define SCA_STRARRAY syscall_arg__scnprintf_strarray
 406
 407size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
 408{
 409        return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
 410}
 411
 412size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
 413{
 414        size_t printed;
 415        int i;
 416
 417        for (i = 0; i < sas->nr_entries; ++i) {
 418                struct strarray *sa = sas->entries[i];
 419                int idx = val - sa->offset;
 420
 421                if (idx >= 0 && idx < sa->nr_entries) {
 422                        if (sa->entries[idx] == NULL)
 423                                break;
 424                        return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
 425                }
 426        }
 427
 428        printed = scnprintf(bf, size, intfmt, val);
 429        if (show_prefix)
 430                printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
 431        return printed;
 432}
 433
 434size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
 435                                        struct syscall_arg *arg)
 436{
 437        return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
 438}
 439
 440#ifndef AT_FDCWD
 441#define AT_FDCWD        -100
 442#endif
 443
 444static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 445                                           struct syscall_arg *arg)
 446{
 447        int fd = arg->val;
 448        const char *prefix = "AT_FD";
 449
 450        if (fd == AT_FDCWD)
 451                return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
 452
 453        return syscall_arg__scnprintf_fd(bf, size, arg);
 454}
 455
 456#define SCA_FDAT syscall_arg__scnprintf_fd_at
 457
 458static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 459                                              struct syscall_arg *arg);
 460
 461#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 462
 463size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
 464{
 465        return scnprintf(bf, size, "%#lx", arg->val);
 466}
 467
 468size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
 469{
 470        if (arg->val == 0)
 471                return scnprintf(bf, size, "NULL");
 472        return syscall_arg__scnprintf_hex(bf, size, arg);
 473}
 474
 475size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
 476{
 477        return scnprintf(bf, size, "%d", arg->val);
 478}
 479
 480size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
 481{
 482        return scnprintf(bf, size, "%ld", arg->val);
 483}
 484
 485static const char *bpf_cmd[] = {
 486        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
 487        "MAP_GET_NEXT_KEY", "PROG_LOAD",
 488};
 489static DEFINE_STRARRAY(bpf_cmd, "BPF_");
 490
 491static const char *fsmount_flags[] = {
 492        [1] = "CLOEXEC",
 493};
 494static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");
 495
 496#include "trace/beauty/generated/fsconfig_arrays.c"
 497
 498static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");
 499
 500static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
 501static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
 502
 503static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
 504static DEFINE_STRARRAY(itimers, "ITIMER_");
 505
 506static const char *keyctl_options[] = {
 507        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
 508        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
 509        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
 510        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
 511        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
 512};
 513static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
 514
 515static const char *whences[] = { "SET", "CUR", "END",
 516#ifdef SEEK_DATA
 517"DATA",
 518#endif
 519#ifdef SEEK_HOLE
 520"HOLE",
 521#endif
 522};
 523static DEFINE_STRARRAY(whences, "SEEK_");
 524
 525static const char *fcntl_cmds[] = {
 526        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
 527        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
 528        "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
 529        "GETOWNER_UIDS",
 530};
 531static DEFINE_STRARRAY(fcntl_cmds, "F_");
 532
 533static const char *fcntl_linux_specific_cmds[] = {
 534        "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
 535        "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
 536        "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
 537};
 538
 539static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);
 540
 541static struct strarray *fcntl_cmds_arrays[] = {
 542        &strarray__fcntl_cmds,
 543        &strarray__fcntl_linux_specific_cmds,
 544};
 545
 546static DEFINE_STRARRAYS(fcntl_cmds_arrays);
 547
 548static const char *rlimit_resources[] = {
 549        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
 550        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
 551        "RTTIME",
 552};
 553static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");
 554
 555static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
 556static DEFINE_STRARRAY(sighow, "SIG_");
 557
 558static const char *clockid[] = {
 559        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
 560        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
 561        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
 562};
 563static DEFINE_STRARRAY(clockid, "CLOCK_");
 564
 565static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
 566                                                 struct syscall_arg *arg)
 567{
 568        bool show_prefix = arg->show_string_prefix;
 569        const char *suffix = "_OK";
 570        size_t printed = 0;
 571        int mode = arg->val;
 572
 573        if (mode == F_OK) /* 0 */
 574                return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
 575#define P_MODE(n) \
 576        if (mode & n##_OK) { \
 577                printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
 578                mode &= ~n##_OK; \
 579        }
 580
 581        P_MODE(R);
 582        P_MODE(W);
 583        P_MODE(X);
 584#undef P_MODE
 585
 586        if (mode)
 587                printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
 588
 589        return printed;
 590}
 591
 592#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
 593
 594static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 595                                              struct syscall_arg *arg);
 596
 597#define SCA_FILENAME syscall_arg__scnprintf_filename
 598
 599static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 600                                                struct syscall_arg *arg)
 601{
 602        bool show_prefix = arg->show_string_prefix;
 603        const char *prefix = "O_";
 604        int printed = 0, flags = arg->val;
 605
 606#define P_FLAG(n) \
 607        if (flags & O_##n) { \
 608                printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
 609                flags &= ~O_##n; \
 610        }
 611
 612        P_FLAG(CLOEXEC);
 613        P_FLAG(NONBLOCK);
 614#undef P_FLAG
 615
 616        if (flags)
 617                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 618
 619        return printed;
 620}
 621
 622#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 623
 624#ifndef GRND_NONBLOCK
 625#define GRND_NONBLOCK   0x0001
 626#endif
 627#ifndef GRND_RANDOM
 628#define GRND_RANDOM     0x0002
 629#endif
 630
 631static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 632                                                   struct syscall_arg *arg)
 633{
 634        bool show_prefix = arg->show_string_prefix;
 635        const char *prefix = "GRND_";
 636        int printed = 0, flags = arg->val;
 637
 638#define P_FLAG(n) \
 639        if (flags & GRND_##n) { \
 640                printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
 641                flags &= ~GRND_##n; \
 642        }
 643
 644        P_FLAG(RANDOM);
 645        P_FLAG(NONBLOCK);
 646#undef P_FLAG
 647
 648        if (flags)
 649                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 650
 651        return printed;
 652}
 653
 654#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 655
 656#define STRARRAY(name, array) \
 657          { .scnprintf  = SCA_STRARRAY, \
 658            .parm       = &strarray__##array, }
 659
 660#define STRARRAY_FLAGS(name, array) \
 661          { .scnprintf  = SCA_STRARRAY_FLAGS, \
 662            .parm       = &strarray__##array, }
 663
 664#include "trace/beauty/arch_errno_names.c"
 665#include "trace/beauty/eventfd.c"
 666#include "trace/beauty/futex_op.c"
 667#include "trace/beauty/futex_val3.c"
 668#include "trace/beauty/mmap.c"
 669#include "trace/beauty/mode_t.c"
 670#include "trace/beauty/msg_flags.c"
 671#include "trace/beauty/open_flags.c"
 672#include "trace/beauty/perf_event_open.c"
 673#include "trace/beauty/pid.c"
 674#include "trace/beauty/sched_policy.c"
 675#include "trace/beauty/seccomp.c"
 676#include "trace/beauty/signum.c"
 677#include "trace/beauty/socket_type.c"
 678#include "trace/beauty/waitid_options.c"
 679
 680struct syscall_arg_fmt {
 681        size_t     (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
 682        unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
 683        void       *parm;
 684        const char *name;
 685        bool       show_zero;
 686};
 687
 688static struct syscall_fmt {
 689        const char *name;
 690        const char *alias;
 691        struct syscall_arg_fmt arg[6];
 692        u8         nr_args;
 693        bool       errpid;
 694        bool       timeout;
 695        bool       hexret;
 696} syscall_fmts[] = {
 697        { .name     = "access",
 698          .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
 699        { .name     = "arch_prctl",
 700          .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
 701                   [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
 702        { .name     = "bind",
 703          .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, },
 704        { .name     = "bpf",
 705          .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
 706        { .name     = "brk",        .hexret = true,
 707          .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
 708        { .name     = "clock_gettime",
 709          .arg = { [0] = STRARRAY(clk_id, clockid), }, },
 710        { .name     = "clone",      .errpid = true, .nr_args = 5,
 711          .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
 712                   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
 713                   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
 714                   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
 715                   [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
 716        { .name     = "close",
 717          .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
 718        { .name     = "connect",
 719          .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, }, },
 720        { .name     = "epoll_ctl",
 721          .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
 722        { .name     = "eventfd2",
 723          .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
 724        { .name     = "fchmodat",
 725          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 726        { .name     = "fchownat",
 727          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 728        { .name     = "fcntl",
 729          .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
 730                           .parm      = &strarrays__fcntl_cmds_arrays,
 731                           .show_zero = true, },
 732                   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
 733        { .name     = "flock",
 734          .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
 735        { .name     = "fsconfig",
 736          .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
 737        { .name     = "fsmount",
 738          .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
 739                   [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
 740        { .name     = "fspick",
 741          .arg = { [0] = { .scnprintf = SCA_FDAT,         /* dfd */ },
 742                   [1] = { .scnprintf = SCA_FILENAME,     /* path */ },
 743                   [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
 744        { .name     = "fstat", .alias = "newfstat", },
 745        { .name     = "fstatat", .alias = "newfstatat", },
 746        { .name     = "futex",
 747          .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
 748                   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
 749        { .name     = "futimesat",
 750          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 751        { .name     = "getitimer",
 752          .arg = { [0] = STRARRAY(which, itimers), }, },
 753        { .name     = "getpid",     .errpid = true, },
 754        { .name     = "getpgid",    .errpid = true, },
 755        { .name     = "getppid",    .errpid = true, },
 756        { .name     = "getrandom",
 757          .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
 758        { .name     = "getrlimit",
 759          .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
 760        { .name     = "gettid",     .errpid = true, },
 761        { .name     = "ioctl",
 762          .arg = {
 763#if defined(__i386__) || defined(__x86_64__)
 764/*
 765 * FIXME: Make this available to all arches.
 766 */
 767                   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
 768                   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 769#else
 770                   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 771#endif
 772        { .name     = "kcmp",       .nr_args = 5,
 773          .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
 774                   [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
 775                   [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
 776                   [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
 777                   [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
 778        { .name     = "keyctl",
 779          .arg = { [0] = STRARRAY(option, keyctl_options), }, },
 780        { .name     = "kill",
 781          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 782        { .name     = "linkat",
 783          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 784        { .name     = "lseek",
 785          .arg = { [2] = STRARRAY(whence, whences), }, },
 786        { .name     = "lstat", .alias = "newlstat", },
 787        { .name     = "madvise",
 788          .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
 789                   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
 790        { .name     = "mkdirat",
 791          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 792        { .name     = "mknodat",
 793          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 794        { .name     = "mmap",       .hexret = true,
 795/* The standard mmap maps to old_mmap on s390x */
 796#if defined(__s390x__)
 797        .alias = "old_mmap",
 798#endif
 799          .arg = { [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
 800                   [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ },
 801                   [5] = { .scnprintf = SCA_HEX,        /* offset */ }, }, },
 802        { .name     = "mount",
 803          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
 804                   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
 805                           .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
 806        { .name     = "move_mount",
 807          .arg = { [0] = { .scnprintf = SCA_FDAT,       /* from_dfd */ },
 808                   [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ },
 809                   [2] = { .scnprintf = SCA_FDAT,       /* to_dfd */ },
 810                   [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ },
 811                   [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
 812        { .name     = "mprotect",
 813          .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
 814                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
 815        { .name     = "mq_unlink",
 816          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
 817        { .name     = "mremap",     .hexret = true,
 818          .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
 819        { .name     = "name_to_handle_at",
 820          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 821        { .name     = "newfstatat",
 822          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 823        { .name     = "open",
 824          .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 825        { .name     = "open_by_handle_at",
 826          .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
 827                   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 828        { .name     = "openat",
 829          .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
 830                   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 831        { .name     = "perf_event_open",
 832          .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
 833                   [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
 834                   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
 835        { .name     = "pipe2",
 836          .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
 837        { .name     = "pkey_alloc",
 838          .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
 839        { .name     = "pkey_free",
 840          .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
 841        { .name     = "pkey_mprotect",
 842          .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
 843                   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
 844                   [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
 845        { .name     = "poll", .timeout = true, },
 846        { .name     = "ppoll", .timeout = true, },
 847        { .name     = "prctl",
 848          .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
 849                   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
 850                   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
 851        { .name     = "pread", .alias = "pread64", },
 852        { .name     = "preadv", .alias = "pread", },
 853        { .name     = "prlimit64",
 854          .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
 855        { .name     = "pwrite", .alias = "pwrite64", },
 856        { .name     = "readlinkat",
 857          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 858        { .name     = "recvfrom",
 859          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 860        { .name     = "recvmmsg",
 861          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 862        { .name     = "recvmsg",
 863          .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 864        { .name     = "renameat",
 865          .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
 866                   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
 867        { .name     = "renameat2",
 868          .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
 869                   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
 870                   [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
 871        { .name     = "rt_sigaction",
 872          .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 873        { .name     = "rt_sigprocmask",
 874          .arg = { [0] = STRARRAY(how, sighow), }, },
 875        { .name     = "rt_sigqueueinfo",
 876          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 877        { .name     = "rt_tgsigqueueinfo",
 878          .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 879        { .name     = "sched_setscheduler",
 880          .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
 881        { .name     = "seccomp",
 882          .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
 883                   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
 884        { .name     = "select", .timeout = true, },
 885        { .name     = "sendmmsg",
 886          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 887        { .name     = "sendmsg",
 888          .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 889        { .name     = "sendto",
 890          .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
 891                   [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
 892        { .name     = "set_tid_address", .errpid = true, },
 893        { .name     = "setitimer",
 894          .arg = { [0] = STRARRAY(which, itimers), }, },
 895        { .name     = "setrlimit",
 896          .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
 897        { .name     = "socket",
 898          .arg = { [0] = STRARRAY(family, socket_families),
 899                   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
 900                   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
 901        { .name     = "socketpair",
 902          .arg = { [0] = STRARRAY(family, socket_families),
 903                   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
 904                   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
 905        { .name     = "stat", .alias = "newstat", },
 906        { .name     = "statx",
 907          .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
 908                   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
 909                   [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
 910        { .name     = "swapoff",
 911          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
 912        { .name     = "swapon",
 913          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
 914        { .name     = "symlinkat",
 915          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 916        { .name     = "sync_file_range",
 917          .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
 918        { .name     = "tgkill",
 919          .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 920        { .name     = "tkill",
 921          .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 922        { .name     = "umount2", .alias = "umount",
 923          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
 924        { .name     = "uname", .alias = "newuname", },
 925        { .name     = "unlinkat",
 926          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 927        { .name     = "utimensat",
 928          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
 929        { .name     = "wait4",      .errpid = true,
 930          .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 931        { .name     = "waitid",     .errpid = true,
 932          .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 933};
 934
 935static int syscall_fmt__cmp(const void *name, const void *fmtp)
 936{
 937        const struct syscall_fmt *fmt = fmtp;
 938        return strcmp(name, fmt->name);
 939}
 940
 941static struct syscall_fmt *syscall_fmt__find(const char *name)
 942{
 943        const int nmemb = ARRAY_SIZE(syscall_fmts);
 944        return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
 945}
 946
 947static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
 948{
 949        int i, nmemb = ARRAY_SIZE(syscall_fmts);
 950
 951        for (i = 0; i < nmemb; ++i) {
 952                if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0)
 953                        return &syscall_fmts[i];
 954        }
 955
 956        return NULL;
 957}
 958
 959/*
 960 * is_exit: is this "exit" or "exit_group"?
 961 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 962 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
 963 */
 964struct syscall {
 965        struct tep_event    *tp_format;
 966        int                 nr_args;
 967        int                 args_size;
 968        bool                is_exit;
 969        bool                is_open;
 970        struct tep_format_field *args;
 971        const char          *name;
 972        struct syscall_fmt  *fmt;
 973        struct syscall_arg_fmt *arg_fmt;
 974};
 975
 976/*
 977 * Must match what is in the BPF program:
 978 *
 979 * tools/perf/examples/bpf/augmented_raw_syscalls.c
 980 */
 981struct bpf_map_syscall_entry {
 982        bool    enabled;
 983        u16     string_args_len[6];
 984};
 985
 986/*
 987 * We need to have this 'calculated' boolean because in some cases we really
 988 * don't know what is the duration of a syscall, for instance, when we start
 989 * a session and some threads are waiting for a syscall to finish, say 'poll',
 990 * in which case all we can do is to print "( ? ) for duration and for the
 991 * start timestamp.
 992 */
 993static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
 994{
 995        double duration = (double)t / NSEC_PER_MSEC;
 996        size_t printed = fprintf(fp, "(");
 997
 998        if (!calculated)
 999                printed += fprintf(fp, "         ");
1000        else if (duration >= 1.0)
1001                printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1002        else if (duration >= 0.01)
1003                printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1004        else
1005                printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1006        return printed + fprintf(fp, "): ");
1007}
1008
1009/**
1010 * filename.ptr: The filename char pointer that will be vfs_getname'd
1011 * filename.entry_str_pos: Where to insert the string translated from
1012 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1013 * ret_scnprintf: syscall args may set this to a different syscall return
1014 *                formatter, for instance, fcntl may return fds, file flags, etc.
1015 */
1016struct thread_trace {
1017        u64               entry_time;
1018        bool              entry_pending;
1019        unsigned long     nr_events;
1020        unsigned long     pfmaj, pfmin;
1021        char              *entry_str;
1022        double            runtime_ms;
1023        size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1024        struct {
1025                unsigned long ptr;
1026                short int     entry_str_pos;
1027                bool          pending_open;
1028                unsigned int  namelen;
1029                char          *name;
1030        } filename;
1031        struct {
1032                int           max;
1033                struct file   *table;
1034        } files;
1035
1036        struct intlist *syscall_stats;
1037};
1038
1039static struct thread_trace *thread_trace__new(void)
1040{
1041        struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1042
1043        if (ttrace) {
1044                ttrace->files.max = -1;
1045                ttrace->syscall_stats = intlist__new(NULL);
1046        }
1047
1048        return ttrace;
1049}
1050
1051static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1052{
1053        struct thread_trace *ttrace;
1054
1055        if (thread == NULL)
1056                goto fail;
1057
1058        if (thread__priv(thread) == NULL)
1059                thread__set_priv(thread, thread_trace__new());
1060
1061        if (thread__priv(thread) == NULL)
1062                goto fail;
1063
1064        ttrace = thread__priv(thread);
1065        ++ttrace->nr_events;
1066
1067        return ttrace;
1068fail:
1069        color_fprintf(fp, PERF_COLOR_RED,
1070                      "WARNING: not enough memory, dropping samples!\n");
1071        return NULL;
1072}
1073
1074
1075void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1076                                    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1077{
1078        struct thread_trace *ttrace = thread__priv(arg->thread);
1079
1080        ttrace->ret_scnprintf = ret_scnprintf;
1081}
1082
/* Bit flags; presumably select tracing of major/minor page faults - confirm at the users */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/* Size in bytes; presumably of the thread_trace->entry_str buffer - confirm at the users */
static const size_t trace__entry_str_size = 2048;
1087
1088static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
1089{
1090        if (fd < 0)
1091                return NULL;
1092
1093        if (fd > ttrace->files.max) {
1094                struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));
1095
1096                if (nfiles == NULL)
1097                        return NULL;
1098
1099                if (ttrace->files.max != -1) {
1100                        memset(nfiles + ttrace->files.max + 1, 0,
1101                               (fd - ttrace->files.max) * sizeof(struct file));
1102                } else {
1103                        memset(nfiles, 0, (fd + 1) * sizeof(struct file));
1104                }
1105
1106                ttrace->files.table = nfiles;
1107                ttrace->files.max   = fd;
1108        }
1109
1110        return ttrace->files.table + fd;
1111}
1112
/*
 * Exported wrapper: look up (growing the table if needed) the fd table slot
 * for 'fd' in the thread_trace attached to 'thread'.
 */
struct file *thread__files_entry(struct thread *thread, int fd)
{
        struct thread_trace *ttrace = thread__priv(thread);

        return thread_trace__files_entry(ttrace, fd);
}
1117
1118static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1119{
1120        struct thread_trace *ttrace = thread__priv(thread);
1121        struct file *file = thread_trace__files_entry(ttrace, fd);
1122
1123        if (file != NULL) {
1124                struct stat st;
1125                if (stat(pathname, &st) == 0)
1126                        file->dev_maj = major(st.st_rdev);
1127                file->pathname = strdup(pathname);
1128                if (file->pathname)
1129                        return 0;
1130        }
1131
1132        return -1;
1133}
1134
1135static int thread__read_fd_path(struct thread *thread, int fd)
1136{
1137        char linkname[PATH_MAX], pathname[PATH_MAX];
1138        struct stat st;
1139        int ret;
1140
1141        if (thread->pid_ == thread->tid) {
1142                scnprintf(linkname, sizeof(linkname),
1143                          "/proc/%d/fd/%d", thread->pid_, fd);
1144        } else {
1145                scnprintf(linkname, sizeof(linkname),
1146                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1147        }
1148
1149        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1150                return -1;
1151
1152        ret = readlink(linkname, pathname, sizeof(pathname));
1153
1154        if (ret < 0 || ret > st.st_size)
1155                return -1;
1156
1157        pathname[ret] = '\0';
1158        return trace__set_fd_pathname(thread, fd, pathname);
1159}
1160
1161static const char *thread__fd_path(struct thread *thread, int fd,
1162                                   struct trace *trace)
1163{
1164        struct thread_trace *ttrace = thread__priv(thread);
1165
1166        if (ttrace == NULL)
1167                return NULL;
1168
1169        if (fd < 0)
1170                return NULL;
1171
1172        if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
1173                if (!trace->live)
1174                        return NULL;
1175                ++trace->stats.proc_getname;
1176                if (thread__read_fd_path(thread, fd))
1177                        return NULL;
1178        }
1179
1180        return ttrace->files.table[fd].pathname;
1181}
1182
1183size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1184{
1185        int fd = arg->val;
1186        size_t printed = scnprintf(bf, size, "%d", fd);
1187        const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1188
1189        if (path)
1190                printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1191
1192        return printed;
1193}
1194
1195size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1196{
1197        size_t printed = scnprintf(bf, size, "%d", fd);
1198        struct thread *thread = machine__find_thread(trace->host, pid, pid);
1199
1200        if (thread) {
1201                const char *path = thread__fd_path(thread, fd, trace);
1202
1203                if (path)
1204                        printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1205
1206                thread__put(thread);
1207        }
1208
1209        return printed;
1210}
1211
1212static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1213                                              struct syscall_arg *arg)
1214{
1215        int fd = arg->val;
1216        size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1217        struct thread_trace *ttrace = thread__priv(arg->thread);
1218
1219        if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1220                zfree(&ttrace->files.table[fd].pathname);
1221
1222        return printed;
1223}
1224
1225static void thread__set_filename_pos(struct thread *thread, const char *bf,
1226                                     unsigned long ptr)
1227{
1228        struct thread_trace *ttrace = thread__priv(thread);
1229
1230        ttrace->filename.ptr = ptr;
1231        ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1232}
1233
1234static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
1235{
1236        struct augmented_arg *augmented_arg = arg->augmented.args;
1237        size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
1238        /*
1239         * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
1240         * we would have two strings, each prefixed by its size.
1241         */
1242        int consumed = sizeof(*augmented_arg) + augmented_arg->size;
1243
1244        arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1245        arg->augmented.size -= consumed;
1246
1247        return printed;
1248}
1249
1250static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1251                                              struct syscall_arg *arg)
1252{
1253        unsigned long ptr = arg->val;
1254
1255        if (arg->augmented.args)
1256                return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1257
1258        if (!arg->trace->vfs_getname)
1259                return scnprintf(bf, size, "%#x", ptr);
1260
1261        thread__set_filename_pos(arg->thread, bf, ptr);
1262        return 0;
1263}
1264
1265static bool trace__filter_duration(struct trace *trace, double t)
1266{
1267        return t < (trace->duration_filter * NSEC_PER_MSEC);
1268}
1269
1270static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1271{
1272        double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1273
1274        return fprintf(fp, "%10.3f ", ts);
1275}
1276
1277/*
1278 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1279 * using ttrace->entry_time for a thread that receives a sys_exit without
1280 * first having received a sys_enter ("poll" issued before tracing session
1281 * starts, lost sys_enter exit due to ring buffer overflow).
1282 */
1283static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1284{
1285        if (tstamp > 0)
1286                return __trace__fprintf_tstamp(trace, tstamp, fp);
1287
1288        return fprintf(fp, "         ? ");
1289}
1290
/*
 * Flags set from the signal handler and polled by the rest of the tool.
 * They are volatile sig_atomic_t because that is the only object type that
 * may portably be written from an asynchronous signal handler; they are still
 * used as plain booleans everywhere else.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Ask for termination on any handled signal and remember whether the request
 * came from SIGINT.
 */
static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}
1299
1300static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1301{
1302        size_t printed = 0;
1303
1304        if (trace->multiple_threads) {
1305                if (trace->show_comm)
1306                        printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1307                printed += fprintf(fp, "%d ", thread->tid);
1308        }
1309
1310        return printed;
1311}
1312
1313static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1314                                        u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1315{
1316        size_t printed = 0;
1317
1318        if (trace->show_tstamp)
1319                printed = trace__fprintf_tstamp(trace, tstamp, fp);
1320        if (trace->show_duration)
1321                printed += fprintf_duration(duration, duration_calculated, fp);
1322        return printed + trace__fprintf_comm_tid(trace, thread, fp);
1323}
1324
1325static int trace__process_event(struct trace *trace, struct machine *machine,
1326                                union perf_event *event, struct perf_sample *sample)
1327{
1328        int ret = 0;
1329
1330        switch (event->header.type) {
1331        case PERF_RECORD_LOST:
1332                color_fprintf(trace->output, PERF_COLOR_RED,
1333                              "LOST %" PRIu64 " events!\n", event->lost.lost);
1334                ret = machine__process_lost_event(machine, event, sample);
1335                break;
1336        default:
1337                ret = machine__process_event(machine, event, sample);
1338                break;
1339        }
1340
1341        return ret;
1342}
1343
1344static int trace__tool_process(struct perf_tool *tool,
1345                               union perf_event *event,
1346                               struct perf_sample *sample,
1347                               struct machine *machine)
1348{
1349        struct trace *trace = container_of(tool, struct trace, tool);
1350        return trace__process_event(trace, machine, event, sample);
1351}
1352
1353static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1354{
1355        struct machine *machine = vmachine;
1356
1357        if (machine->kptr_restrict_warned)
1358                return NULL;
1359
1360        if (symbol_conf.kptr_restrict) {
1361                pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1362                           "Check /proc/sys/kernel/kptr_restrict.\n\n"
1363                           "Kernel samples will not be resolved.\n");
1364                machine->kptr_restrict_warned = true;
1365                return NULL;
1366        }
1367
1368        return machine__resolve_kernel_addr(vmachine, addrp, modp);
1369}
1370
1371static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1372{
1373        int err = symbol__init(NULL);
1374
1375        if (err)
1376                return err;
1377
1378        trace->host = machine__new_host();
1379        if (trace->host == NULL)
1380                return -ENOMEM;
1381
1382        err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1383        if (err < 0)
1384                goto out;
1385
1386        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1387                                            evlist->threads, trace__tool_process, false,
1388                                            1);
1389out:
1390        if (err)
1391                symbol__exit();
1392
1393        return err;
1394}
1395
1396static void trace__symbols__exit(struct trace *trace)
1397{
1398        machine__exit(trace->host);
1399        trace->host = NULL;
1400
1401        symbol__exit();
1402}
1403
1404static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1405{
1406        int idx;
1407
1408        if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1409                nr_args = sc->fmt->nr_args;
1410
1411        sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1412        if (sc->arg_fmt == NULL)
1413                return -1;
1414
1415        for (idx = 0; idx < nr_args; ++idx) {
1416                if (sc->fmt)
1417                        sc->arg_fmt[idx] = sc->fmt->arg[idx];
1418        }
1419
1420        sc->nr_args = nr_args;
1421        return 0;
1422}
1423
1424static int syscall__set_arg_fmts(struct syscall *sc)
1425{
1426        struct tep_format_field *field, *last_field = NULL;
1427        int idx = 0, len;
1428
1429        for (field = sc->args; field; field = field->next, ++idx) {
1430                last_field = field;
1431
1432                if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1433                        continue;
1434
1435                len = strlen(field->name);
1436
1437                if (strcmp(field->type, "const char *") == 0 &&
1438                    ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
1439                     strstr(field->name, "path") != NULL))
1440                        sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1441                else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
1442                        sc->arg_fmt[idx].scnprintf = SCA_PTR;
1443                else if (strcmp(field->type, "pid_t") == 0)
1444                        sc->arg_fmt[idx].scnprintf = SCA_PID;
1445                else if (strcmp(field->type, "umode_t") == 0)
1446                        sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1447                else if ((strcmp(field->type, "int") == 0 ||
1448                          strcmp(field->type, "unsigned int") == 0 ||
1449                          strcmp(field->type, "long") == 0) &&
1450                         len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
1451                        /*
1452                         * /sys/kernel/tracing/events/syscalls/sys_enter*
1453                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1454                         * 65 int
1455                         * 23 unsigned int
1456                         * 7 unsigned long
1457                         */
1458                        sc->arg_fmt[idx].scnprintf = SCA_FD;
1459                }
1460        }
1461
1462        if (last_field)
1463                sc->args_size = last_field->offset + last_field->size;
1464
1465        return 0;
1466}
1467
1468static int trace__read_syscall_info(struct trace *trace, int id)
1469{
1470        char tp_name[128];
1471        struct syscall *sc;
1472        const char *name = syscalltbl__name(trace->sctbl, id);
1473
1474        if (name == NULL)
1475                return -1;
1476
1477        if (id > trace->syscalls.max) {
1478                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1479
1480                if (nsyscalls == NULL)
1481                        return -1;
1482
1483                if (trace->syscalls.max != -1) {
1484                        memset(nsyscalls + trace->syscalls.max + 1, 0,
1485                               (id - trace->syscalls.max) * sizeof(*sc));
1486                } else {
1487                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1488                }
1489
1490                trace->syscalls.table = nsyscalls;
1491                trace->syscalls.max   = id;
1492        }
1493
1494        sc = trace->syscalls.table + id;
1495        sc->name = name;
1496
1497        sc->fmt  = syscall_fmt__find(sc->name);
1498
1499        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1500        sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1501
1502        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1503                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1504                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1505        }
1506
1507        if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1508                return -1;
1509
1510        if (IS_ERR(sc->tp_format))
1511                return -1;
1512
1513        sc->args = sc->tp_format->format.fields;
1514        /*
1515         * We need to check and discard the first variable '__syscall_nr'
1516         * or 'nr' that mean the syscall number. It is needless here.
1517         * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1518         */
1519        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1520                sc->args = sc->args->next;
1521                --sc->nr_args;
1522        }
1523
1524        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1525        sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1526
1527        return syscall__set_arg_fmts(sc);
1528}
1529
1530static int trace__validate_ev_qualifier(struct trace *trace)
1531{
1532        int err = 0;
1533        bool printed_invalid_prefix = false;
1534        struct str_node *pos;
1535        size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);
1536
1537        trace->ev_qualifier_ids.entries = malloc(nr_allocated *
1538                                                 sizeof(trace->ev_qualifier_ids.entries[0]));
1539
1540        if (trace->ev_qualifier_ids.entries == NULL) {
1541                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1542                       trace->output);
1543                err = -EINVAL;
1544                goto out;
1545        }
1546
1547        strlist__for_each_entry(pos, trace->ev_qualifier) {
1548                const char *sc = pos->s;
1549                int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1550
1551                if (id < 0) {
1552                        id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1553                        if (id >= 0)
1554                                goto matches;
1555
1556                        if (!printed_invalid_prefix) {
1557                                pr_debug("Skipping unknown syscalls: ");
1558                                printed_invalid_prefix = true;
1559                        } else {
1560                                pr_debug(", ");
1561                        }
1562
1563                        pr_debug("%s", sc);
1564                        continue;
1565                }
1566matches:
1567                trace->ev_qualifier_ids.entries[nr_used++] = id;
1568                if (match_next == -1)
1569                        continue;
1570
1571                while (1) {
1572                        id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1573                        if (id < 0)
1574                                break;
1575                        if (nr_allocated == nr_used) {
1576                                void *entries;
1577
1578                                nr_allocated += 8;
1579                                entries = realloc(trace->ev_qualifier_ids.entries,
1580                                                  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1581                                if (entries == NULL) {
1582                                        err = -ENOMEM;
1583                                        fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1584                                        goto out_free;
1585                                }
1586                                trace->ev_qualifier_ids.entries = entries;
1587                        }
1588                        trace->ev_qualifier_ids.entries[nr_used++] = id;
1589                }
1590        }
1591
1592        trace->ev_qualifier_ids.nr = nr_used;
1593out:
1594        if (printed_invalid_prefix)
1595                pr_debug("\n");
1596        return err;
1597out_free:
1598        zfree(&trace->ev_qualifier_ids.entries);
1599        trace->ev_qualifier_ids.nr = 0;
1600        goto out;
1601}
1602
1603/*
1604 * args is to be interpreted as a series of longs but we need to handle
1605 * 8-byte unaligned accesses. args points to raw_data within the event
1606 * and raw_data is guaranteed to be 8-byte unaligned because it is
1607 * preceded by raw_size which is a u32. So we need to copy args to a temp
1608 * variable to read it. Most notably this avoids extended load instructions
1609 * on unaligned addresses
1610 */
1611unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1612{
1613        unsigned long val;
1614        unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1615
1616        memcpy(&val, p, sizeof(val));
1617        return val;
1618}
1619
1620static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1621                                      struct syscall_arg *arg)
1622{
1623        if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1624                return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1625
1626        return scnprintf(bf, size, "arg%d: ", arg->idx);
1627}
1628
1629/*
1630 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
1631 * as mount 'flags' argument that needs ignoring some magic flag, see comment
1632 * in tools/perf/trace/beauty/mount_flags.c
1633 */
1634static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val)
1635{
1636        if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val)
1637                return sc->arg_fmt[arg->idx].mask_val(arg, val);
1638
1639        return val;
1640}
1641
1642static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1643                                     struct syscall_arg *arg, unsigned long val)
1644{
1645        if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1646                arg->val = val;
1647                if (sc->arg_fmt[arg->idx].parm)
1648                        arg->parm = sc->arg_fmt[arg->idx].parm;
1649                return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1650        }
1651        return scnprintf(bf, size, "%ld", val);
1652}
1653
1654static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1655                                      unsigned char *args, void *augmented_args, int augmented_args_size,
1656                                      struct trace *trace, struct thread *thread)
1657{
1658        size_t printed = 0;
1659        unsigned long val;
1660        u8 bit = 1;
1661        struct syscall_arg arg = {
1662                .args   = args,
1663                .augmented = {
1664                        .size = augmented_args_size,
1665                        .args = augmented_args,
1666                },
1667                .idx    = 0,
1668                .mask   = 0,
1669                .trace  = trace,
1670                .thread = thread,
1671                .show_string_prefix = trace->show_string_prefix,
1672        };
1673        struct thread_trace *ttrace = thread__priv(thread);
1674
1675        /*
1676         * Things like fcntl will set this in its 'cmd' formatter to pick the
1677         * right formatter for the return value (an fd? file flags?), which is
1678         * not needed for syscalls that always return a given type, say an fd.
1679         */
1680        ttrace->ret_scnprintf = NULL;
1681
1682        if (sc->args != NULL) {
1683                struct tep_format_field *field;
1684
1685                for (field = sc->args; field;
1686                     field = field->next, ++arg.idx, bit <<= 1) {
1687                        if (arg.mask & bit)
1688                                continue;
1689
1690                        val = syscall_arg__val(&arg, arg.idx);
1691                        /*
1692                         * Some syscall args need some mask, most don't and
1693                         * return val untouched.
1694                         */
1695                        val = syscall__mask_val(sc, &arg, val);
1696
1697                        /*
1698                         * Suppress this argument if its value is zero and
1699                         * and we don't have a string associated in an
1700                         * strarray for it.
1701                         */
1702                        if (val == 0 &&
1703                            !trace->show_zeros &&
1704                            !(sc->arg_fmt &&
1705                              (sc->arg_fmt[arg.idx].show_zero ||
1706                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1707                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1708                              sc->arg_fmt[arg.idx].parm))
1709                                continue;
1710
1711                        printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
1712
1713                        if (trace->show_arg_names)
1714                                printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
1715
1716                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1717                }
1718        } else if (IS_ERR(sc->tp_format)) {
1719                /*
1720                 * If we managed to read the tracepoint /format file, then we
1721                 * may end up not having any args, like with gettid(), so only
1722                 * print the raw args when we didn't manage to read it.
1723                 */
1724                while (arg.idx < sc->nr_args) {
1725                        if (arg.mask & bit)
1726                                goto next_arg;
1727                        val = syscall_arg__val(&arg, arg.idx);
1728                        if (printed)
1729                                printed += scnprintf(bf + printed, size - printed, ", ");
1730                        printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1731                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1732next_arg:
1733                        ++arg.idx;
1734                        bit <<= 1;
1735                }
1736        }
1737
1738        return printed;
1739}
1740
/*
 * Callback type for handling one sample of a tracepoint event; it matches the
 * signature of trace__sys_enter(), trace__sys_exit() and the other handlers
 * defined in this file.
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1744
1745static struct syscall *trace__syscall_info(struct trace *trace,
1746                                           struct perf_evsel *evsel, int id)
1747{
1748
1749        if (id < 0) {
1750
1751                /*
1752                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1753                 * before that, leaving at a higher verbosity level till that is
1754                 * explained. Reproduced with plain ftrace with:
1755                 *
1756                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1757                 * grep "NR -1 " /t/trace_pipe
1758                 *
1759                 * After generating some load on the machine.
1760                 */
1761                if (verbose > 1) {
1762                        static u64 n;
1763                        fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1764                                id, perf_evsel__name(evsel), ++n);
1765                }
1766                return NULL;
1767        }
1768
1769        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1770            trace__read_syscall_info(trace, id))
1771                goto out_cant_read;
1772
1773        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1774                goto out_cant_read;
1775
1776        return &trace->syscalls.table[id];
1777
1778out_cant_read:
1779        if (verbose > 0) {
1780                fprintf(trace->output, "Problems reading syscall %d", id);
1781                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1782                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1783                fputs(" information\n", trace->output);
1784        }
1785        return NULL;
1786}
1787
1788static void thread__update_stats(struct thread_trace *ttrace,
1789                                 int id, struct perf_sample *sample)
1790{
1791        struct int_node *inode;
1792        struct stats *stats;
1793        u64 duration = 0;
1794
1795        inode = intlist__findnew(ttrace->syscall_stats, id);
1796        if (inode == NULL)
1797                return;
1798
1799        stats = inode->priv;
1800        if (stats == NULL) {
1801                stats = malloc(sizeof(struct stats));
1802                if (stats == NULL)
1803                        return;
1804                init_stats(stats);
1805                inode->priv = stats;
1806        }
1807
1808        if (ttrace->entry_time && sample->time > ttrace->entry_time)
1809                duration = sample->time - ttrace->entry_time;
1810
1811        update_stats(stats, duration);
1812}
1813
1814static int trace__printf_interrupted_entry(struct trace *trace)
1815{
1816        struct thread_trace *ttrace;
1817        size_t printed;
1818        int len;
1819
1820        if (trace->failure_only || trace->current == NULL)
1821                return 0;
1822
1823        ttrace = thread__priv(trace->current);
1824
1825        if (!ttrace->entry_pending)
1826                return 0;
1827
1828        printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1829        printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);
1830
1831        if (len < trace->args_alignment - 4)
1832                printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
1833
1834        printed += fprintf(trace->output, " ...\n");
1835
1836        ttrace->entry_pending = false;
1837        ++trace->nr_events_printed;
1838
1839        return printed;
1840}
1841
1842static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1843                                 struct perf_sample *sample, struct thread *thread)
1844{
1845        int printed = 0;
1846
1847        if (trace->print_sample) {
1848                double ts = (double)sample->time / NSEC_PER_MSEC;
1849
1850                printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1851                                   perf_evsel__name(evsel), ts,
1852                                   thread__comm_str(thread),
1853                                   sample->pid, sample->tid, sample->cpu);
1854        }
1855
1856        return printed;
1857}
1858
1859static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
1860{
1861        void *augmented_args = NULL;
1862        /*
1863         * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
1864         * and there we get all 6 syscall args plus the tracepoint common fields
1865         * that gets calculated at the start and the syscall_nr (another long).
1866         * So we check if that is the case and if so don't look after the
1867         * sc->args_size but always after the full raw_syscalls:sys_enter payload,
1868         * which is fixed.
1869         *
1870         * We'll revisit this later to pass s->args_size to the BPF augmenter
1871         * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
1872         * copies only what we need for each syscall, like what happens when we
1873         * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
1874         * traffic to just what is needed for each syscall.
1875         */
1876        int args_size = raw_augmented_args_size ?: sc->args_size;
1877
1878        *augmented_args_size = sample->raw_size - args_size;
1879        if (*augmented_args_size > 0)
1880                augmented_args = sample->raw_data + args_size;
1881
1882        return augmented_args;
1883}
1884
1885static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1886                            union perf_event *event __maybe_unused,
1887                            struct perf_sample *sample)
1888{
1889        char *msg;
1890        void *args;
1891        int printed = 0;
1892        struct thread *thread;
1893        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1894        int augmented_args_size = 0;
1895        void *augmented_args = NULL;
1896        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1897        struct thread_trace *ttrace;
1898
1899        if (sc == NULL)
1900                return -1;
1901
1902        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1903        ttrace = thread__trace(thread, trace->output);
1904        if (ttrace == NULL)
1905                goto out_put;
1906
1907        trace__fprintf_sample(trace, evsel, sample, thread);
1908
1909        args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1910
1911        if (ttrace->entry_str == NULL) {
1912                ttrace->entry_str = malloc(trace__entry_str_size);
1913                if (!ttrace->entry_str)
1914                        goto out_put;
1915        }
1916
1917        if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1918                trace__printf_interrupted_entry(trace);
1919        /*
1920         * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
1921         * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
1922         * this breaks syscall__augmented_args() check for augmented args, as we calculate
1923         * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
1924         * so when handling, say the openat syscall, we end up getting 6 args for the
1925         * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
1926         * thinking that the extra 2 u64 args are the augmented filename, so just check
1927         * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
1928         */
1929        if (evsel != trace->syscalls.events.sys_enter)
1930                augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
1931        ttrace->entry_time = sample->time;
1932        msg = ttrace->entry_str;
1933        printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1934
1935        printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1936                                           args, augmented_args, augmented_args_size, trace, thread);
1937
1938        if (sc->is_exit) {
1939                if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1940                        int alignment = 0;
1941
1942                        trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1943                        printed = fprintf(trace->output, "%s)", ttrace->entry_str);
1944                        if (trace->args_alignment > printed)
1945                                alignment = trace->args_alignment - printed;
1946                        fprintf(trace->output, "%*s= ?\n", alignment, " ");
1947                }
1948        } else {
1949                ttrace->entry_pending = true;
1950                /* See trace__vfs_getname & trace__sys_exit */
1951                ttrace->filename.pending_open = false;
1952        }
1953
1954        if (trace->current != thread) {
1955                thread__put(trace->current);
1956                trace->current = thread__get(thread);
1957        }
1958        err = 0;
1959out_put:
1960        thread__put(thread);
1961        return err;
1962}
1963
1964static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1965                                    struct perf_sample *sample)
1966{
1967        struct thread_trace *ttrace;
1968        struct thread *thread;
1969        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1970        struct syscall *sc = trace__syscall_info(trace, evsel, id);
1971        char msg[1024];
1972        void *args, *augmented_args = NULL;
1973        int augmented_args_size;
1974
1975        if (sc == NULL)
1976                return -1;
1977
1978        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1979        ttrace = thread__trace(thread, trace->output);
1980        /*
1981         * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
1982         * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
1983         */
1984        if (ttrace == NULL)
1985                goto out_put;
1986
1987        args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1988        augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
1989        syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
1990        fprintf(trace->output, "%s", msg);
1991        err = 0;
1992out_put:
1993        thread__put(thread);
1994        return err;
1995}
1996
1997static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1998                                    struct perf_sample *sample,
1999                                    struct callchain_cursor *cursor)
2000{
2001        struct addr_location al;
2002        int max_stack = evsel->attr.sample_max_stack ?
2003                        evsel->attr.sample_max_stack :
2004                        trace->max_stack;
2005        int err;
2006
2007        if (machine__resolve(trace->host, &al, sample) < 0)
2008                return -1;
2009
2010        err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
2011        addr_location__put(&al);
2012        return err;
2013}
2014
2015static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
2016{
2017        /* TODO: user-configurable print_opts */
2018        const unsigned int print_opts = EVSEL__PRINT_SYM |
2019                                        EVSEL__PRINT_DSO |
2020                                        EVSEL__PRINT_UNKNOWN_AS_ADDR;
2021
2022        return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
2023}
2024
/*
 * Translate errno value 'err' into its name, using the errno table of the
 * architecture of the environment the event belongs to.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch_name = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch_name, err);
}
2032
2033static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2034                           union perf_event *event __maybe_unused,
2035                           struct perf_sample *sample)
2036{
2037        long ret;
2038        u64 duration = 0;
2039        bool duration_calculated = false;
2040        struct thread *thread;
2041        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
2042        int alignment = trace->args_alignment;
2043        struct syscall *sc = trace__syscall_info(trace, evsel, id);
2044        struct thread_trace *ttrace;
2045
2046        if (sc == NULL)
2047                return -1;
2048
2049        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2050        ttrace = thread__trace(thread, trace->output);
2051        if (ttrace == NULL)
2052                goto out_put;
2053
2054        trace__fprintf_sample(trace, evsel, sample, thread);
2055
2056        if (trace->summary)
2057                thread__update_stats(ttrace, id, sample);
2058
2059        ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2060
2061        if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
2062                trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2063                ttrace->filename.pending_open = false;
2064                ++trace->stats.vfs_getname;
2065        }
2066
2067        if (ttrace->entry_time) {
2068                duration = sample->time - ttrace->entry_time;
2069                if (trace__filter_duration(trace, duration))
2070                        goto out;
2071                duration_calculated = true;
2072        } else if (trace->duration_filter)
2073                goto out;
2074
2075        if (sample->callchain) {
2076                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2077                if (callchain_ret == 0) {
2078                        if (callchain_cursor.nr < trace->min_stack)
2079                                goto out;
2080                        callchain_ret = 1;
2081                }
2082        }
2083
2084        if (trace->summary_only || (ret >= 0 && trace->failure_only))
2085                goto out;
2086
2087        trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
2088
2089        if (ttrace->entry_pending) {
2090                printed = fprintf(trace->output, "%s", ttrace->entry_str);
2091        } else {
2092                printed += fprintf(trace->output, " ... [");
2093                color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2094                printed += 9;
2095                printed += fprintf(trace->output, "]: %s()", sc->name);
2096        }
2097
2098        printed++; /* the closing ')' */
2099
2100        if (alignment > printed)
2101                alignment -= printed;
2102        else
2103                alignment = 0;
2104
2105        fprintf(trace->output, ")%*s= ", alignment, " ");
2106
2107        if (sc->fmt == NULL) {
2108                if (ret < 0)
2109                        goto errno_print;
2110signed_print:
2111                fprintf(trace->output, "%ld", ret);
2112        } else if (ret < 0) {
2113errno_print: {
2114                char bf[STRERR_BUFSIZE];
2115                const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
2116                           *e = errno_to_name(evsel, -ret);
2117
2118                fprintf(trace->output, "-1 %s (%s)", e, emsg);
2119        }
2120        } else if (ret == 0 && sc->fmt->timeout)
2121                fprintf(trace->output, "0 (Timeout)");
2122        else if (ttrace->ret_scnprintf) {
2123                char bf[1024];
2124                struct syscall_arg arg = {
2125                        .val    = ret,
2126                        .thread = thread,
2127                        .trace  = trace,
2128                };
2129                ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
2130                ttrace->ret_scnprintf = NULL;
2131                fprintf(trace->output, "%s", bf);
2132        } else if (sc->fmt->hexret)
2133                fprintf(trace->output, "%#lx", ret);
2134        else if (sc->fmt->errpid) {
2135                struct thread *child = machine__find_thread(trace->host, ret, ret);
2136
2137                if (child != NULL) {
2138                        fprintf(trace->output, "%ld", ret);
2139                        if (child->comm_set)
2140                                fprintf(trace->output, " (%s)", thread__comm_str(child));
2141                        thread__put(child);
2142                }
2143        } else
2144                goto signed_print;
2145
2146        fputc('\n', trace->output);
2147
2148        /*
2149         * We only consider an 'event' for the sake of --max-events a non-filtered
2150         * sys_enter + sys_exit and other tracepoint events.
2151         */
2152        if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
2153                interrupted = true;
2154
2155        if (callchain_ret > 0)
2156                trace__fprintf_callchain(trace, sample);
2157        else if (callchain_ret < 0)
2158                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2159out:
2160        ttrace->entry_pending = false;
2161        err = 0;
2162out_put:
2163        thread__put(thread);
2164        return err;
2165}
2166
2167static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2168                              union perf_event *event __maybe_unused,
2169                              struct perf_sample *sample)
2170{
2171        struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2172        struct thread_trace *ttrace;
2173        size_t filename_len, entry_str_len, to_move;
2174        ssize_t remaining_space;
2175        char *pos;
2176        const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2177
2178        if (!thread)
2179                goto out;
2180
2181        ttrace = thread__priv(thread);
2182        if (!ttrace)
2183                goto out_put;
2184
2185        filename_len = strlen(filename);
2186        if (filename_len == 0)
2187                goto out_put;
2188
2189        if (ttrace->filename.namelen < filename_len) {
2190                char *f = realloc(ttrace->filename.name, filename_len + 1);
2191
2192                if (f == NULL)
2193                        goto out_put;
2194
2195                ttrace->filename.namelen = filename_len;
2196                ttrace->filename.name = f;
2197        }
2198
2199        strcpy(ttrace->filename.name, filename);
2200        ttrace->filename.pending_open = true;
2201
2202        if (!ttrace->filename.ptr)
2203                goto out_put;
2204
2205        entry_str_len = strlen(ttrace->entry_str);
2206        remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2207        if (remaining_space <= 0)
2208                goto out_put;
2209
2210        if (filename_len > (size_t)remaining_space) {
2211                filename += filename_len - remaining_space;
2212                filename_len = remaining_space;
2213        }
2214
2215        to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2216        pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2217        memmove(pos + filename_len, pos, to_move);
2218        memcpy(pos, filename, filename_len);
2219
2220        ttrace->filename.ptr = 0;
2221        ttrace->filename.entry_str_pos = 0;
2222out_put:
2223        thread__put(thread);
2224out:
2225        return 0;
2226}
2227
2228static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2229                                     union perf_event *event __maybe_unused,
2230                                     struct perf_sample *sample)
2231{
2232        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2233        double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2234        struct thread *thread = machine__findnew_thread(trace->host,
2235                                                        sample->pid,
2236                                                        sample->tid);
2237        struct thread_trace *ttrace = thread__trace(thread, trace->output);
2238
2239        if (ttrace == NULL)
2240                goto out_dump;
2241
2242        ttrace->runtime_ms += runtime_ms;
2243        trace->runtime_ms += runtime_ms;
2244out_put:
2245        thread__put(thread);
2246        return 0;
2247
2248out_dump:
2249        fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2250               evsel->name,
2251               perf_evsel__strval(evsel, sample, "comm"),
2252               (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2253               runtime,
2254               perf_evsel__intval(evsel, sample, "vruntime"));
2255        goto out_put;
2256}
2257
2258static int bpf_output__printer(enum binary_printer_ops op,
2259                               unsigned int val, void *extra __maybe_unused, FILE *fp)
2260{
2261        unsigned char ch = (unsigned char)val;
2262
2263        switch (op) {
2264        case BINARY_PRINT_CHAR_DATA:
2265                return fprintf(fp, "%c", isprint(ch) ? ch : '.');
2266        case BINARY_PRINT_DATA_BEGIN:
2267        case BINARY_PRINT_LINE_BEGIN:
2268        case BINARY_PRINT_ADDR:
2269        case BINARY_PRINT_NUM_DATA:
2270        case BINARY_PRINT_NUM_PAD:
2271        case BINARY_PRINT_SEP:
2272        case BINARY_PRINT_CHAR_PAD:
2273        case BINARY_PRINT_LINE_END:
2274        case BINARY_PRINT_DATA_END:
2275        default:
2276                break;
2277        }
2278
2279        return 0;
2280}
2281
2282static void bpf_output__fprintf(struct trace *trace,
2283                                struct perf_sample *sample)
2284{
2285        binary__fprintf(sample->raw_data, sample->raw_size, 8,
2286                        bpf_output__printer, NULL, trace->output);
2287        ++trace->nr_events_printed;
2288}
2289
2290static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2291                                union perf_event *event __maybe_unused,
2292                                struct perf_sample *sample)
2293{
2294        struct thread *thread;
2295        int callchain_ret = 0;
2296        /*
2297         * Check if we called perf_evsel__disable(evsel) due to, for instance,
2298         * this event's max_events having been hit and this is an entry coming
2299         * from the ring buffer that we should discard, since the max events
2300         * have already been considered/printed.
2301         */
2302        if (evsel->disabled)
2303                return 0;
2304
2305        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2306
2307        if (sample->callchain) {
2308                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2309                if (callchain_ret == 0) {
2310                        if (callchain_cursor.nr < trace->min_stack)
2311                                goto out;
2312                        callchain_ret = 1;
2313                }
2314        }
2315
2316        trace__printf_interrupted_entry(trace);
2317        trace__fprintf_tstamp(trace, sample->time, trace->output);
2318
2319        if (trace->trace_syscalls && trace->show_duration)
2320                fprintf(trace->output, "(         ): ");
2321
2322        if (thread)
2323                trace__fprintf_comm_tid(trace, thread, trace->output);
2324
2325        if (evsel == trace->syscalls.events.augmented) {
2326                int id = perf_evsel__sc_tp_uint(evsel, id, sample);
2327                struct syscall *sc = trace__syscall_info(trace, evsel, id);
2328
2329                if (sc) {
2330                        fprintf(trace->output, "%s(", sc->name);
2331                        trace__fprintf_sys_enter(trace, evsel, sample);
2332                        fputc(')', trace->output);
2333                        goto newline;
2334                }
2335
2336                /*
2337                 * XXX: Not having the associated syscall info or not finding/adding
2338                 *      the thread should never happen, but if it does...
2339                 *      fall thru and print it as a bpf_output event.
2340                 */
2341        }
2342
2343        fprintf(trace->output, "%s:", evsel->name);
2344
2345        if (perf_evsel__is_bpf_output(evsel)) {
2346                bpf_output__fprintf(trace, sample);
2347        } else if (evsel->tp_format) {
2348                if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2349                    trace__fprintf_sys_enter(trace, evsel, sample)) {
2350                        event_format__fprintf(evsel->tp_format, sample->cpu,
2351                                              sample->raw_data, sample->raw_size,
2352                                              trace->output);
2353                        ++trace->nr_events_printed;
2354
2355                        if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
2356                                perf_evsel__disable(evsel);
2357                                perf_evsel__close(evsel);
2358                        }
2359                }
2360        }
2361
2362newline:
2363        fprintf(trace->output, "\n");
2364
2365        if (callchain_ret > 0)
2366                trace__fprintf_callchain(trace, sample);
2367        else if (callchain_ret < 0)
2368                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2369out:
2370        thread__put(thread);
2371        return 0;
2372}
2373
2374static void print_location(FILE *f, struct perf_sample *sample,
2375                           struct addr_location *al,
2376                           bool print_dso, bool print_sym)
2377{
2378
2379        if ((verbose > 0 || print_dso) && al->map)
2380                fprintf(f, "%s@", al->map->dso->long_name);
2381
2382        if ((verbose > 0 || print_sym) && al->sym)
2383                fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2384                        al->addr - al->sym->start);
2385        else if (al->map)
2386                fprintf(f, "0x%" PRIx64, al->addr);
2387        else
2388                fprintf(f, "0x%" PRIx64, sample->addr);
2389}
2390
2391static int trace__pgfault(struct trace *trace,
2392                          struct perf_evsel *evsel,
2393                          union perf_event *event __maybe_unused,
2394                          struct perf_sample *sample)
2395{
2396        struct thread *thread;
2397        struct addr_location al;
2398        char map_type = 'd';
2399        struct thread_trace *ttrace;
2400        int err = -1;
2401        int callchain_ret = 0;
2402
2403        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2404
2405        if (sample->callchain) {
2406                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2407                if (callchain_ret == 0) {
2408                        if (callchain_cursor.nr < trace->min_stack)
2409                                goto out_put;
2410                        callchain_ret = 1;
2411                }
2412        }
2413
2414        ttrace = thread__trace(thread, trace->output);
2415        if (ttrace == NULL)
2416                goto out_put;
2417
2418        if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2419                ttrace->pfmaj++;
2420        else
2421                ttrace->pfmin++;
2422
2423        if (trace->summary_only)
2424                goto out;
2425
2426        thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2427
2428        trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2429
2430        fprintf(trace->output, "%sfault [",
2431                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2432                "maj" : "min");
2433
2434        print_location(trace->output, sample, &al, false, true);
2435
2436        fprintf(trace->output, "] => ");
2437
2438        thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2439
2440        if (!al.map) {
2441                thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2442
2443                if (al.map)
2444                        map_type = 'x';
2445                else
2446                        map_type = '?';
2447        }
2448
2449        print_location(trace->output, sample, &al, true, false);
2450
2451        fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2452
2453        if (callchain_ret > 0)
2454                trace__fprintf_callchain(trace, sample);
2455        else if (callchain_ret < 0)
2456                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2457
2458        ++trace->nr_events_printed;
2459out:
2460        err = 0;
2461out_put:
2462        thread__put(thread);
2463        return err;
2464}
2465
2466static void trace__set_base_time(struct trace *trace,
2467                                 struct perf_evsel *evsel,
2468                                 struct perf_sample *sample)
2469{
2470        /*
2471         * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2472         * and don't use sample->time unconditionally, we may end up having
2473         * some other event in the future without PERF_SAMPLE_TIME for good
2474         * reason, i.e. we may not be interested in its timestamps, just in
2475         * it taking place, picking some piece of information when it
2476         * appears in our event stream (vfs_getname comes to mind).
2477         */
2478        if (trace->base_time == 0 && !trace->full_time &&
2479            (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2480                trace->base_time = sample->time;
2481}
2482
2483static int trace__process_sample(struct perf_tool *tool,
2484                                 union perf_event *event,
2485                                 struct perf_sample *sample,
2486                                 struct perf_evsel *evsel,
2487                                 struct machine *machine __maybe_unused)
2488{
2489        struct trace *trace = container_of(tool, struct trace, tool);
2490        struct thread *thread;
2491        int err = 0;
2492
2493        tracepoint_handler handler = evsel->handler;
2494
2495        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2496        if (thread && thread__is_filtered(thread))
2497                goto out;
2498
2499        trace__set_base_time(trace, evsel, sample);
2500
2501        if (handler) {
2502                ++trace->nr_events;
2503                handler(trace, evsel, event, sample);
2504        }
2505out:
2506        thread__put(thread);
2507        return err;
2508}
2509
2510static int trace__record(struct trace *trace, int argc, const char **argv)
2511{
2512        unsigned int rec_argc, i, j;
2513        const char **rec_argv;
2514        const char * const record_args[] = {
2515                "record",
2516                "-R",
2517                "-m", "1024",
2518                "-c", "1",
2519        };
2520
2521        const char * const sc_args[] = { "-e", };
2522        unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2523        const char * const majpf_args[] = { "-e", "major-faults" };
2524        unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2525        const char * const minpf_args[] = { "-e", "minor-faults" };
2526        unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2527
2528        /* +1 is for the event string below */
2529        rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2530                majpf_args_nr + minpf_args_nr + argc;
2531        rec_argv = calloc(rec_argc + 1, sizeof(char *));
2532
2533        if (rec_argv == NULL)
2534                return -ENOMEM;
2535
2536        j = 0;
2537        for (i = 0; i < ARRAY_SIZE(record_args); i++)
2538                rec_argv[j++] = record_args[i];
2539
2540        if (trace->trace_syscalls) {
2541                for (i = 0; i < sc_args_nr; i++)
2542                        rec_argv[j++] = sc_args[i];
2543
2544                /* event string may be different for older kernels - e.g., RHEL6 */
2545                if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2546                        rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2547                else if (is_valid_tracepoint("syscalls:sys_enter"))
2548                        rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2549                else {
2550                        pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2551                        free(rec_argv);
2552                        return -1;
2553                }
2554        }
2555
2556        if (trace->trace_pgfaults & TRACE_PFMAJ)
2557                for (i = 0; i < majpf_args_nr; i++)
2558                        rec_argv[j++] = majpf_args[i];
2559
2560        if (trace->trace_pgfaults & TRACE_PFMIN)
2561                for (i = 0; i < minpf_args_nr; i++)
2562                        rec_argv[j++] = minpf_args[i];
2563
2564        for (i = 0; i < (unsigned int)argc; i++)
2565                rec_argv[j++] = argv[i];
2566
2567        return cmd_record(j, rec_argv);
2568}
2569
2570static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2571
2572static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2573{
2574        bool found = false;
2575        struct perf_evsel *evsel, *tmp;
2576        struct parse_events_error err = { .idx = 0, };
2577        int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2578
2579        if (ret)
2580                return false;
2581
2582        evlist__for_each_entry_safe(evlist, evsel, tmp) {
2583                if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
2584                        continue;
2585
2586                if (perf_evsel__field(evsel, "pathname")) {
2587                        evsel->handler = trace__vfs_getname;
2588                        found = true;
2589                        continue;
2590                }
2591
2592                list_del_init(&evsel->node);
2593                evsel->evlist = NULL;
2594                perf_evsel__delete(evsel);
2595        }
2596
2597        return found;
2598}
2599
2600static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2601{
2602        struct perf_evsel *evsel;
2603        struct perf_event_attr attr = {
2604                .type = PERF_TYPE_SOFTWARE,
2605                .mmap_data = 1,
2606        };
2607
2608        attr.config = config;
2609        attr.sample_period = 1;
2610
2611        event_attr_init(&attr);
2612
2613        evsel = perf_evsel__new(&attr);
2614        if (evsel)
2615                evsel->handler = trace__pgfault;
2616
2617        return evsel;
2618}
2619
2620static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2621{
2622        const u32 type = event->header.type;
2623        struct perf_evsel *evsel;
2624
2625        if (type != PERF_RECORD_SAMPLE) {
2626                trace__process_event(trace, trace->host, event, sample);
2627                return;
2628        }
2629
2630        evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2631        if (evsel == NULL) {
2632                fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2633                return;
2634        }
2635
2636        trace__set_base_time(trace, evsel, sample);
2637
2638        if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2639            sample->raw_data == NULL) {
2640                fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2641                       perf_evsel__name(evsel), sample->tid,
2642                       sample->cpu, sample->raw_size);
2643        } else {
2644                tracepoint_handler handler = evsel->handler;
2645                handler(trace, evsel, event, sample);
2646        }
2647
2648        if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
2649                interrupted = true;
2650}
2651
2652static int trace__add_syscall_newtp(struct trace *trace)
2653{
2654        int ret = -1;
2655        struct perf_evlist *evlist = trace->evlist;
2656        struct perf_evsel *sys_enter, *sys_exit;
2657
2658        sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2659        if (sys_enter == NULL)
2660                goto out;
2661
2662        if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2663                goto out_delete_sys_enter;
2664
2665        sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2666        if (sys_exit == NULL)
2667                goto out_delete_sys_enter;
2668
2669        if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2670                goto out_delete_sys_exit;
2671
2672        perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2673        perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2674
2675        perf_evlist__add(evlist, sys_enter);
2676        perf_evlist__add(evlist, sys_exit);
2677
2678        if (callchain_param.enabled && !trace->kernel_syscallchains) {
2679                /*
2680                 * We're interested only in the user space callchain
2681                 * leading to the syscall, allow overriding that for
2682                 * debugging reasons using --kernel_syscall_callchains
2683                 */
2684                sys_exit->attr.exclude_callchain_kernel = 1;
2685        }
2686
2687        trace->syscalls.events.sys_enter = sys_enter;
2688        trace->syscalls.events.sys_exit  = sys_exit;
2689
2690        ret = 0;
2691out:
2692        return ret;
2693
2694out_delete_sys_exit:
2695        perf_evsel__delete_priv(sys_exit);
2696out_delete_sys_enter:
2697        perf_evsel__delete_priv(sys_enter);
2698        goto out;
2699}
2700
2701static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
2702{
2703        int err = -1;
2704        struct perf_evsel *sys_exit;
2705        char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2706                                                trace->ev_qualifier_ids.nr,
2707                                                trace->ev_qualifier_ids.entries);
2708
2709        if (filter == NULL)
2710                goto out_enomem;
2711
2712        if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2713                                          filter)) {
2714                sys_exit = trace->syscalls.events.sys_exit;
2715                err = perf_evsel__append_tp_filter(sys_exit, filter);
2716        }
2717
2718        free(filter);
2719out:
2720        return err;
2721out_enomem:
2722        errno = ENOMEM;
2723        goto out;
2724}
2725
2726#ifdef HAVE_LIBBPF_SUPPORT
2727static void trace__init_bpf_map_syscall_args(struct trace *trace, int id, struct bpf_map_syscall_entry *entry)
2728{
2729        struct syscall *sc = trace__syscall_info(trace, NULL, id);
2730        int arg = 0;
2731
2732        if (sc == NULL)
2733                goto out;
2734
2735        for (; arg < sc->nr_args; ++arg) {
2736                entry->string_args_len[arg] = 0;
2737                if (sc->arg_fmt[arg].scnprintf == SCA_FILENAME) {
2738                        /* Should be set like strace -s strsize */
2739                        entry->string_args_len[arg] = PATH_MAX;
2740                }
2741        }
2742out:
2743        for (; arg < 6; ++arg)
2744                entry->string_args_len[arg] = 0;
2745}
2746static int trace__set_ev_qualifier_bpf_filter(struct trace *trace)
2747{
2748        int fd = bpf_map__fd(trace->syscalls.map);
2749        struct bpf_map_syscall_entry value = {
2750                .enabled = !trace->not_ev_qualifier,
2751        };
2752        int err = 0;
2753        size_t i;
2754
2755        for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) {
2756                int key = trace->ev_qualifier_ids.entries[i];
2757
2758                if (value.enabled)
2759                        trace__init_bpf_map_syscall_args(trace, key, &value);
2760
2761                err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
2762                if (err)
2763                        break;
2764        }
2765
2766        return err;
2767}
2768
2769static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled)
2770{
2771        int fd = bpf_map__fd(trace->syscalls.map);
2772        struct bpf_map_syscall_entry value = {
2773                .enabled = enabled,
2774        };
2775        int err = 0, key;
2776
2777        for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
2778                if (enabled)
2779                        trace__init_bpf_map_syscall_args(trace, key, &value);
2780
2781                err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
2782                if (err)
2783                        break;
2784        }
2785
2786        return err;
2787}
2788
2789static int trace__init_syscalls_bpf_map(struct trace *trace)
2790{
2791        bool enabled = true;
2792
2793        if (trace->ev_qualifier_ids.nr)
2794                enabled = trace->not_ev_qualifier;
2795
2796        return __trace__init_syscalls_bpf_map(trace, enabled);
2797}
2798#else
/* Stub used when built without HAVE_LIBBPF_SUPPORT: nothing to do, reports success. */
static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused)
{
	return 0;
}
2803
/* Stub used when built without HAVE_LIBBPF_SUPPORT: nothing to do, reports success. */
static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused)
{
	return 0;
}
2808#endif // HAVE_LIBBPF_SUPPORT
2809
2810static int trace__set_ev_qualifier_filter(struct trace *trace)
2811{
2812        if (trace->syscalls.map)
2813                return trace__set_ev_qualifier_bpf_filter(trace);
2814        if (trace->syscalls.events.sys_enter)
2815                return trace__set_ev_qualifier_tp_filter(trace);
2816        return 0;
2817}
2818
2819static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
2820                                    size_t npids __maybe_unused, pid_t *pids __maybe_unused)
2821{
2822        int err = 0;
2823#ifdef HAVE_LIBBPF_SUPPORT
2824        bool value = true;
2825        int map_fd = bpf_map__fd(map);
2826        size_t i;
2827
2828        for (i = 0; i < npids; ++i) {
2829                err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
2830                if (err)
2831                        break;
2832        }
2833#endif
2834        return err;
2835}
2836
2837static int trace__set_filter_loop_pids(struct trace *trace)
2838{
2839        unsigned int nr = 1, err;
2840        pid_t pids[32] = {
2841                getpid(),
2842        };
2843        struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2844
2845        while (thread && nr < ARRAY_SIZE(pids)) {
2846                struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2847
2848                if (parent == NULL)
2849                        break;
2850
2851                if (!strcmp(thread__comm_str(parent), "sshd") ||
2852                    strstarts(thread__comm_str(parent), "gnome-terminal")) {
2853                        pids[nr++] = parent->tid;
2854                        break;
2855                }
2856                thread = parent;
2857        }
2858
2859        err = perf_evlist__set_tp_filter_pids(trace->evlist, nr, pids);
2860        if (!err && trace->filter_pids.map)
2861                err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
2862
2863        return err;
2864}
2865
2866static int trace__set_filter_pids(struct trace *trace)
2867{
2868        int err = 0;
2869        /*
2870         * Better not use !target__has_task() here because we need to cover the
2871         * case where no threads were specified in the command line, but a
2872         * workload was, and in that case we will fill in the thread_map when
2873         * we fork the workload in perf_evlist__prepare_workload.
2874         */
2875        if (trace->filter_pids.nr > 0) {
2876                err = perf_evlist__set_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
2877                                                      trace->filter_pids.entries);
2878                if (!err && trace->filter_pids.map) {
2879                        err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
2880                                                       trace->filter_pids.entries);
2881                }
2882        } else if (thread_map__pid(trace->evlist->threads, 0) == -1) {
2883                err = trace__set_filter_loop_pids(trace);
2884        }
2885
2886        return err;
2887}
2888
2889static int __trace__deliver_event(struct trace *trace, union perf_event *event)
2890{
2891        struct perf_evlist *evlist = trace->evlist;
2892        struct perf_sample sample;
2893        int err;
2894
2895        err = perf_evlist__parse_sample(evlist, event, &sample);
2896        if (err)
2897                fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2898        else
2899                trace__handle_event(trace, event, &sample);
2900
2901        return 0;
2902}
2903
2904static int __trace__flush_events(struct trace *trace)
2905{
2906        u64 first = ordered_events__first_time(&trace->oe.data);
2907        u64 flush = trace->oe.last - NSEC_PER_SEC;
2908
2909        /* Is there some thing to flush.. */
2910        if (first && first < flush)
2911                return ordered_events__flush_time(&trace->oe.data, flush);
2912
2913        return 0;
2914}
2915
2916static int trace__flush_events(struct trace *trace)
2917{
2918        return !trace->sort_events ? 0 : __trace__flush_events(trace);
2919}
2920
2921static int trace__deliver_event(struct trace *trace, union perf_event *event)
2922{
2923        int err;
2924
2925        if (!trace->sort_events)
2926                return __trace__deliver_event(trace, event);
2927
2928        err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
2929        if (err && err != -1)
2930                return err;
2931
2932        err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0);
2933        if (err)
2934                return err;
2935
2936        return trace__flush_events(trace);
2937}
2938
2939static int ordered_events__deliver_event(struct ordered_events *oe,
2940                                         struct ordered_event *event)
2941{
2942        struct trace *trace = container_of(oe, struct trace, oe.data);
2943
2944        return __trace__deliver_event(trace, event->event);
2945}
2946
2947static int trace__run(struct trace *trace, int argc, const char **argv)
2948{
2949        struct perf_evlist *evlist = trace->evlist;
2950        struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2951        int err = -1, i;
2952        unsigned long before;
2953        const bool forks = argc > 0;
2954        bool draining = false;
2955
2956        trace->live = true;
2957
2958        if (!trace->raw_augmented_syscalls) {
2959                if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2960                        goto out_error_raw_syscalls;
2961
2962                if (trace->trace_syscalls)
2963                        trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2964        }
2965
2966        if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2967                pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2968                if (pgfault_maj == NULL)
2969                        goto out_error_mem;
2970                perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2971                perf_evlist__add(evlist, pgfault_maj);
2972        }
2973
2974        if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2975                pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2976                if (pgfault_min == NULL)
2977                        goto out_error_mem;
2978                perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2979                perf_evlist__add(evlist, pgfault_min);
2980        }
2981
2982        if (trace->sched &&
2983            perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2984                                   trace__sched_stat_runtime))
2985                goto out_error_sched_stat_runtime;
2986
2987        /*
2988         * If a global cgroup was set, apply it to all the events without an
2989         * explicit cgroup. I.e.:
2990         *
2991         *      trace -G A -e sched:*switch
2992         *
2993         * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2994         * _and_ sched:sched_switch to the 'A' cgroup, while:
2995         *
2996         * trace -e sched:*switch -G A
2997         *
2998         * will only set the sched:sched_switch event to the 'A' cgroup, all the
2999         * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
3000         * a cgroup (on the root cgroup, sys wide, etc).
3001         *
3002         * Multiple cgroups:
3003         *
3004         * trace -G A -e sched:*switch -G B
3005         *
3006         * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
3007         * to the 'B' cgroup.
3008         *
3009         * evlist__set_default_cgroup() grabs a reference of the passed cgroup
3010         * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
3011         */
3012        if (trace->cgroup)
3013                evlist__set_default_cgroup(trace->evlist, trace->cgroup);
3014
3015        err = perf_evlist__create_maps(evlist, &trace->opts.target);
3016        if (err < 0) {
3017                fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
3018                goto out_delete_evlist;
3019        }
3020
3021        err = trace__symbols_init(trace, evlist);
3022        if (err < 0) {
3023                fprintf(trace->output, "Problems initializing symbol libraries!\n");
3024                goto out_delete_evlist;
3025        }
3026
3027        perf_evlist__config(evlist, &trace->opts, &callchain_param);
3028
3029        signal(SIGCHLD, sig_handler);
3030        signal(SIGINT, sig_handler);
3031
3032        if (forks) {
3033                err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
3034                                                    argv, false, NULL);
3035                if (err < 0) {
3036                        fprintf(trace->output, "Couldn't run the workload!\n");
3037                        goto out_delete_evlist;
3038                }
3039        }
3040
3041        err = perf_evlist__open(evlist);
3042        if (err < 0)
3043                goto out_error_open;
3044
3045        err = bpf__apply_obj_config();
3046        if (err) {
3047                char errbuf[BUFSIZ];
3048
3049                bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
3050                pr_err("ERROR: Apply config to BPF failed: %s\n",
3051                         errbuf);
3052                goto out_error_open;
3053        }
3054
3055        err = trace__set_filter_pids(trace);
3056        if (err < 0)
3057                goto out_error_mem;
3058
3059        if (trace->syscalls.map)
3060                trace__init_syscalls_bpf_map(trace);
3061
3062        if (trace->ev_qualifier_ids.nr > 0) {
3063                err = trace__set_ev_qualifier_filter(trace);
3064                if (err < 0)
3065                        goto out_errno;
3066
3067                if (trace->syscalls.events.sys_exit) {
3068                        pr_debug("event qualifier tracepoint filter: %s\n",
3069                                 trace->syscalls.events.sys_exit->filter);
3070                }
3071        }
3072
3073        err = perf_evlist__apply_filters(evlist, &evsel);
3074        if (err < 0)
3075                goto out_error_apply_filters;
3076
3077        if (trace->dump.map)
3078                bpf_map__fprintf(trace->dump.map, trace->output);
3079
3080        err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
3081        if (err < 0)
3082                goto out_error_mmap;
3083
3084        if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
3085                perf_evlist__enable(evlist);
3086
3087        if (forks)
3088                perf_evlist__start_workload(evlist);
3089
3090        if (trace->opts.initial_delay) {
3091                usleep(trace->opts.initial_delay * 1000);
3092                perf_evlist__enable(evlist);
3093        }
3094
3095        trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
3096                                  evlist->threads->nr > 1 ||
3097                                  perf_evlist__first(evlist)->attr.inherit;
3098
3099        /*
3100         * Now that we already used evsel->attr to ask the kernel to setup the
3101         * events, lets reuse evsel->attr.sample_max_stack as the limit in
3102         * trace__resolve_callchain(), allowing per-event max-stack settings
3103         * to override an explicitly set --max-stack global setting.
3104         */
3105        evlist__for_each_entry(evlist, evsel) {
3106                if (evsel__has_callchain(evsel) &&
3107                    evsel->attr.sample_max_stack == 0)
3108                        evsel->attr.sample_max_stack = trace->max_stack;
3109        }
3110again:
3111        before = trace->nr_events;
3112
3113        for (i = 0; i < evlist->nr_mmaps; i++) {
3114                union perf_event *event;
3115                struct perf_mmap *md;
3116
3117                md = &evlist->mmap[i];
3118                if (perf_mmap__read_init(md) < 0)
3119                        continue;
3120
3121                while ((event = perf_mmap__read_event(md)) != NULL) {
3122                        ++trace->nr_events;
3123
3124                        err = trace__deliver_event(trace, event);
3125                        if (err)
3126                                goto out_disable;
3127
3128                        perf_mmap__consume(md);
3129
3130                        if (interrupted)
3131                                goto out_disable;
3132
3133                        if (done && !draining) {
3134                                perf_evlist__disable(evlist);
3135                                draining = true;
3136                        }
3137                }
3138                perf_mmap__read_done(md);
3139        }
3140
3141        if (trace->nr_events == before) {
3142                int timeout = done ? 100 : -1;
3143
3144                if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
3145                        if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
3146                                draining = true;
3147
3148                        goto again;
3149                } else {
3150                        if (trace__flush_events(trace))
3151                                goto out_disable;
3152                }
3153        } else {
3154                goto again;
3155        }
3156
3157out_disable:
3158        thread__zput(trace->current);
3159
3160        perf_evlist__disable(evlist);
3161
3162        if (trace->sort_events)
3163                ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
3164
3165        if (!err) {
3166                if (trace->summary)
3167                        trace__fprintf_thread_summary(trace, trace->output);
3168
3169                if (trace->show_tool_stats) {
3170                        fprintf(trace->output, "Stats:\n "
3171                                               " vfs_getname : %" PRIu64 "\n"
3172                                               " proc_getname: %" PRIu64 "\n",
3173                                trace->stats.vfs_getname,
3174                                trace->stats.proc_getname);
3175                }
3176        }
3177
3178out_delete_evlist:
3179        trace__symbols__exit(trace);
3180
3181        perf_evlist__delete(evlist);
3182        cgroup__put(trace->cgroup);
3183        trace->evlist = NULL;
3184        trace->live = false;
3185        return err;
3186{
3187        char errbuf[BUFSIZ];
3188
3189out_error_sched_stat_runtime:
3190        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
3191        goto out_error;
3192
3193out_error_raw_syscalls:
3194        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
3195        goto out_error;
3196
3197out_error_mmap:
3198        perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
3199        goto out_error;
3200
3201out_error_open:
3202        perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
3203
3204out_error:
3205        fprintf(trace->output, "%s\n", errbuf);
3206        goto out_delete_evlist;
3207
3208out_error_apply_filters:
3209        fprintf(trace->output,
3210                "Failed to set filter \"%s\" on event %s with %d (%s)\n",
3211                evsel->filter, perf_evsel__name(evsel), errno,
3212                str_error_r(errno, errbuf, sizeof(errbuf)));
3213        goto out_delete_evlist;
3214}
3215out_error_mem:
3216        fprintf(trace->output, "Not enough memory to run!\n");
3217        goto out_delete_evlist;
3218
3219out_errno:
3220        fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
3221        goto out_delete_evlist;
3222}
3223
3224static int trace__replay(struct trace *trace)
3225{
3226        const struct perf_evsel_str_handler handlers[] = {
3227                { "probe:vfs_getname",       trace__vfs_getname, },
3228        };
3229        struct perf_data data = {
3230                .path  = input_name,
3231                .mode  = PERF_DATA_MODE_READ,
3232                .force = trace->force,
3233        };
3234        struct perf_session *session;
3235        struct perf_evsel *evsel;
3236        int err = -1;
3237
3238        trace->tool.sample        = trace__process_sample;
3239        trace->tool.mmap          = perf_event__process_mmap;
3240        trace->tool.mmap2         = perf_event__process_mmap2;
3241        trace->tool.comm          = perf_event__process_comm;
3242        trace->tool.exit          = perf_event__process_exit;
3243        trace->tool.fork          = perf_event__process_fork;
3244        trace->tool.attr          = perf_event__process_attr;
3245        trace->tool.tracing_data  = perf_event__process_tracing_data;
3246        trace->tool.build_id      = perf_event__process_build_id;
3247        trace->tool.namespaces    = perf_event__process_namespaces;
3248
3249        trace->tool.ordered_events = true;
3250        trace->tool.ordering_requires_timestamps = true;
3251
3252        /* add tid to output */
3253        trace->multiple_threads = true;
3254
3255        session = perf_session__new(&data, false, &trace->tool);
3256        if (session == NULL)
3257                return -1;
3258
3259        if (trace->opts.target.pid)
3260                symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
3261
3262        if (trace->opts.target.tid)
3263                symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
3264
3265        if (symbol__init(&session->header.env) < 0)
3266                goto out;
3267
3268        trace->host = &session->machines.host;
3269
3270        err = perf_session__set_tracepoints_handlers(session, handlers);
3271        if (err)
3272                goto out;
3273
3274        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3275                                                     "raw_syscalls:sys_enter");
3276        /* older kernels have syscalls tp versus raw_syscalls */
3277        if (evsel == NULL)
3278                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3279                                                             "syscalls:sys_enter");
3280
3281        if (evsel &&
3282            (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
3283            perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
3284                pr_err("Error during initialize raw_syscalls:sys_enter event\n");
3285                goto out;
3286        }
3287
3288        evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3289                                                     "raw_syscalls:sys_exit");
3290        if (evsel == NULL)
3291                evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3292                                                             "syscalls:sys_exit");
3293        if (evsel &&
3294            (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
3295            perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
3296                pr_err("Error during initialize raw_syscalls:sys_exit event\n");
3297                goto out;
3298        }
3299
3300        evlist__for_each_entry(session->evlist, evsel) {
3301                if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
3302                    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
3303                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
3304                     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
3305                        evsel->handler = trace__pgfault;
3306        }
3307
3308        setup_pager();
3309
3310        err = perf_session__process_events(session);
3311        if (err)
3312                pr_err("Failed to process events, error %d", err);
3313
3314        else if (trace->summary)
3315                trace__fprintf_thread_summary(trace, trace->output);
3316
3317out:
3318        perf_session__delete(session);
3319
3320        return err;
3321}
3322
/*
 * Print the banner that precedes the per thread summary.
 *
 * Returns the value of the fprintf() call converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
3331
3332DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
3333        struct stats    *stats;
3334        double          msecs;
3335        int             syscall;
3336)
3337{
3338        struct int_node *source = rb_entry(nd, struct int_node, rb_node);
3339        struct stats *stats = source->priv;
3340
3341        entry->syscall = source->i;
3342        entry->stats   = stats;
3343        entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
3344}
3345
3346static size_t thread__dump_stats(struct thread_trace *ttrace,
3347                                 struct trace *trace, FILE *fp)
3348{
3349        size_t printed = 0;
3350        struct syscall *sc;
3351        struct rb_node *nd;
3352        DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
3353
3354        if (syscall_stats == NULL)
3355                return 0;
3356
3357        printed += fprintf(fp, "\n");
3358
3359        printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
3360        printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
3361        printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
3362
3363        resort_rb__for_each_entry(nd, syscall_stats) {
3364                struct stats *stats = syscall_stats_entry->stats;
3365                if (stats) {
3366                        double min = (double)(stats->min) / NSEC_PER_MSEC;
3367                        double max = (double)(stats->max) / NSEC_PER_MSEC;
3368                        double avg = avg_stats(stats);
3369                        double pct;
3370                        u64 n = (u64) stats->n;
3371
3372                        pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
3373                        avg /= NSEC_PER_MSEC;
3374
3375                        sc = &trace->syscalls.table[syscall_stats_entry->syscall];
3376                        printed += fprintf(fp, "   %-15s", sc->name);
3377                        printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
3378                                           n, syscall_stats_entry->msecs, min, avg);
3379                        printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
3380                }
3381        }
3382
3383        resort_rb__delete(syscall_stats);
3384        printed += fprintf(fp, "\n\n");
3385
3386        return printed;
3387}
3388
3389static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
3390{
3391        size_t printed = 0;
3392        struct thread_trace *ttrace = thread__priv(thread);
3393        double ratio;
3394
3395        if (ttrace == NULL)
3396                return 0;
3397
3398        ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
3399
3400        printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
3401        printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
3402        printed += fprintf(fp, "%.1f%%", ratio);
3403        if (ttrace->pfmaj)
3404                printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
3405        if (ttrace->pfmin)
3406                printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
3407        if (trace->sched)
3408                printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
3409        else if (fputc('\n', fp) != EOF)
3410                ++printed;
3411
3412        printed += thread__dump_stats(ttrace, trace, fp);
3413
3414        return printed;
3415}
3416
3417static unsigned long thread__nr_events(struct thread_trace *ttrace)
3418{
3419        return ttrace ? ttrace->nr_events : 0;
3420}
3421
3422DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
3423        struct thread *thread;
3424)
3425{
3426        entry->thread = rb_entry(nd, struct thread, rb_node);
3427}
3428
3429static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
3430{
3431        size_t printed = trace__fprintf_threads_header(fp);
3432        struct rb_node *nd;
3433        int i;
3434
3435        for (i = 0; i < THREADS__TABLE_SIZE; i++) {
3436                DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
3437
3438                if (threads == NULL) {
3439                        fprintf(fp, "%s", "Error sorting output by nr_events!\n");
3440                        return 0;
3441                }
3442
3443                resort_rb__for_each_entry(nd, threads)
3444                        printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
3445
3446                resort_rb__delete(threads);
3447        }
3448        return printed;
3449}
3450
3451static int trace__set_duration(const struct option *opt, const char *str,
3452                               int unset __maybe_unused)
3453{
3454        struct trace *trace = opt->value;
3455
3456        trace->duration_filter = atof(str);
3457        return 0;
3458}
3459
3460static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
3461                                              int unset __maybe_unused)
3462{
3463        int ret = -1;
3464        size_t i;
3465        struct trace *trace = opt->value;
3466        /*
3467         * FIXME: introduce a intarray class, plain parse csv and create a
3468         * { int nr, int entries[] } struct...
3469         */
3470        struct intlist *list = intlist__new(str);
3471
3472        if (list == NULL)
3473                return -1;
3474
3475        i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3476        trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3477
3478        if (trace->filter_pids.entries == NULL)
3479                goto out;
3480
3481        trace->filter_pids.entries[0] = getpid();
3482
3483        for (i = 1; i < trace->filter_pids.nr; ++i)
3484                trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3485
3486        intlist__delete(list);
3487        ret = 0;
3488out:
3489        return ret;
3490}
3491
3492static int trace__open_output(struct trace *trace, const char *filename)
3493{
3494        struct stat st;
3495
3496        if (!stat(filename, &st) && st.st_size) {
3497                char oldname[PATH_MAX];
3498
3499                scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3500                unlink(oldname);
3501                rename(filename, oldname);
3502        }
3503
3504        trace->output = fopen(filename, "w");
3505
3506        return trace->output == NULL ? -errno : 0;
3507}
3508
3509static int parse_pagefaults(const struct option *opt, const char *str,
3510                            int unset __maybe_unused)
3511{
3512        int *trace_pgfaults = opt->value;
3513
3514        if (strcmp(str, "all") == 0)
3515                *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3516        else if (strcmp(str, "maj") == 0)
3517                *trace_pgfaults |= TRACE_PFMAJ;
3518        else if (strcmp(str, "min") == 0)
3519                *trace_pgfaults |= TRACE_PFMIN;
3520        else
3521                return -1;
3522
3523        return 0;
3524}
3525
3526static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3527{
3528        struct perf_evsel *evsel;
3529
3530        evlist__for_each_entry(evlist, evsel)
3531                evsel->handler = handler;
3532}
3533
3534static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
3535{
3536        struct perf_evsel *evsel;
3537
3538        evlist__for_each_entry(evlist, evsel) {
3539                if (evsel->priv || !evsel->tp_format)
3540                        continue;
3541
3542                if (strcmp(evsel->tp_format->system, "syscalls"))
3543                        continue;
3544
3545                if (perf_evsel__init_syscall_tp(evsel))
3546                        return -1;
3547
3548                if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
3549                        struct syscall_tp *sc = evsel->priv;
3550
3551                        if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
3552                                return -1;
3553                } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
3554                        struct syscall_tp *sc = evsel->priv;
3555
3556                        if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
3557                                return -1;
3558                }
3559        }
3560
3561        return 0;
3562}
3563
3564/*
3565 * XXX: Hackish, just splitting the combined -e+--event (syscalls
3566 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3567 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3568 *
3569 * It'd be better to introduce a parse_options() variant that would return a
3570 * list with the terms it didn't match to an event...
3571 */
3572static int trace__parse_events_option(const struct option *opt, const char *str,
3573                                      int unset __maybe_unused)
3574{
3575        struct trace *trace = (struct trace *)opt->value;
3576        const char *s = str;
3577        char *sep = NULL, *lists[2] = { NULL, NULL, };
3578        int len = strlen(str) + 1, err = -1, list, idx;
3579        char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3580        char group_name[PATH_MAX];
3581        struct syscall_fmt *fmt;
3582
3583        if (strace_groups_dir == NULL)
3584                return -1;
3585
3586        if (*s == '!') {
3587                ++s;
3588                trace->not_ev_qualifier = true;
3589        }
3590
3591        while (1) {
3592                if ((sep = strchr(s, ',')) != NULL)
3593                        *sep = '\0';
3594
3595                list = 0;
3596                if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3597                    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3598                        list = 1;
3599                        goto do_concat;
3600                }
3601
3602                fmt = syscall_fmt__find_by_alias(s);
3603                if (fmt != NULL) {
3604                        list = 1;
3605                        s = fmt->name;
3606                } else {
3607                        path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3608                        if (access(group_name, R_OK) == 0)
3609                                list = 1;
3610                }
3611do_concat:
3612                if (lists[list]) {
3613                        sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3614                } else {
3615                        lists[list] = malloc(len);
3616                        if (lists[list] == NULL)
3617                                goto out;
3618                        strcpy(lists[list], s);
3619                }
3620
3621                if (!sep)
3622                        break;
3623
3624                *sep = ',';
3625                s = sep + 1;
3626        }
3627
3628        if (lists[1] != NULL) {
3629                struct strlist_config slist_config = {
3630                        .dirname = strace_groups_dir,
3631                };
3632
3633                trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3634                if (trace->ev_qualifier == NULL) {
3635                        fputs("Not enough memory to parse event qualifier", trace->output);
3636                        goto out;
3637                }
3638
3639                if (trace__validate_ev_qualifier(trace))
3640                        goto out;
3641                trace->trace_syscalls = true;
3642        }
3643
3644        err = 0;
3645
3646        if (lists[0]) {
3647                struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3648                                               "event selector. use 'perf list' to list available events",
3649                                               parse_events_option);
3650                err = parse_events_option(&o, lists[0], 0);
3651        }
3652out:
3653        if (sep)
3654                *sep = ',';
3655
3656        return err;
3657}
3658
3659static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3660{
3661        struct trace *trace = opt->value;
3662
3663        if (!list_empty(&trace->evlist->entries))
3664                return parse_cgroups(opt, str, unset);
3665
3666        trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3667
3668        return 0;
3669}
3670
3671static struct bpf_map *bpf__find_map_by_name(const char *name)
3672{
3673        struct bpf_object *obj, *tmp;
3674
3675        bpf_object__for_each_safe(obj, tmp) {
3676                struct bpf_map *map = bpf_object__find_map_by_name(obj, name);
3677                if (map)
3678                        return map;
3679
3680        }
3681
3682        return NULL;
3683}
3684
3685static void trace__set_bpf_map_filtered_pids(struct trace *trace)
3686{
3687        trace->filter_pids.map = bpf__find_map_by_name("pids_filtered");
3688}
3689
3690static void trace__set_bpf_map_syscalls(struct trace *trace)
3691{
3692        trace->syscalls.map = bpf__find_map_by_name("syscalls");
3693}
3694
3695static int trace__config(const char *var, const char *value, void *arg)
3696{
3697        struct trace *trace = arg;
3698        int err = 0;
3699
3700        if (!strcmp(var, "trace.add_events")) {
3701                struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3702                                               "event selector. use 'perf list' to list available events",
3703                                               parse_events_option);
3704                /*
3705                 * We can't propagate parse_event_option() return, as it is 1
3706                 * for failure while perf_config() expects -1.
3707                 */
3708                if (parse_events_option(&o, value, 0))
3709                        err = -1;
3710        } else if (!strcmp(var, "trace.show_timestamp")) {
3711                trace->show_tstamp = perf_config_bool(var, value);
3712        } else if (!strcmp(var, "trace.show_duration")) {
3713                trace->show_duration = perf_config_bool(var, value);
3714        } else if (!strcmp(var, "trace.show_arg_names")) {
3715                trace->show_arg_names = perf_config_bool(var, value);
3716                if (!trace->show_arg_names)
3717                        trace->show_zeros = true;
3718        } else if (!strcmp(var, "trace.show_zeros")) {
3719                bool new_show_zeros = perf_config_bool(var, value);
3720                if (!trace->show_arg_names && !new_show_zeros) {
3721                        pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
3722                        goto out;
3723                }
3724                trace->show_zeros = new_show_zeros;
3725        } else if (!strcmp(var, "trace.show_prefix")) {
3726                trace->show_string_prefix = perf_config_bool(var, value);
3727        } else if (!strcmp(var, "trace.no_inherit")) {
3728                trace->opts.no_inherit = perf_config_bool(var, value);
3729        } else if (!strcmp(var, "trace.args_alignment")) {
3730                int args_alignment = 0;
3731                if (perf_config_int(&args_alignment, var, value) == 0)
3732                        trace->args_alignment = args_alignment;
3733        }
3734out:
3735        return err;
3736}
3737
3738int cmd_trace(int argc, const char **argv)
3739{
3740        const char *trace_usage[] = {
3741                "perf trace [<options>] [<command>]",
3742                "perf trace [<options>] -- <command> [<options>]",
3743                "perf trace record [<options>] [<command>]",
3744                "perf trace record [<options>] -- <command> [<options>]",
3745                NULL
3746        };
3747        struct trace trace = {
3748                .syscalls = {
3749                        . max = -1,
3750                },
3751                .opts = {
3752                        .target = {
3753                                .uid       = UINT_MAX,
3754                                .uses_mmap = true,
3755                        },
3756                        .user_freq     = UINT_MAX,
3757                        .user_interval = ULLONG_MAX,
3758                        .no_buffering  = true,
3759                        .mmap_pages    = UINT_MAX,
3760                },
3761                .output = stderr,
3762                .show_comm = true,
3763                .show_tstamp = true,
3764                .show_duration = true,
3765                .show_arg_names = true,
3766                .args_alignment = 70,
3767                .trace_syscalls = false,
3768                .kernel_syscallchains = false,
3769                .max_stack = UINT_MAX,
3770                .max_events = ULONG_MAX,
3771        };
3772        const char *map_dump_str = NULL;
3773        const char *output_name = NULL;
3774        const struct option trace_options[] = {
3775        OPT_CALLBACK('e', "event", &trace, "event",
3776                     "event/syscall selector. use 'perf list' to list available events",
3777                     trace__parse_events_option),
3778        OPT_BOOLEAN(0, "comm", &trace.show_comm,
3779                    "show the thread COMM next to its id"),
3780        OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3781        OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3782                     trace__parse_events_option),
3783        OPT_STRING('o', "output", &output_name, "file", "output file name"),
3784        OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3785        OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3786                    "trace events on existing process id"),
3787        OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3788                    "trace events on existing thread id"),
3789        OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3790                     "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
3791        OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3792                    "system-wide collection from all CPUs"),
3793        OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3794                    "list of cpus to monitor"),
3795        OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3796                    "child tasks do not inherit counters"),
3797        OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3798                     "number of mmap data pages",
3799                     perf_evlist__parse_mmap_pages),
3800        OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3801                   "user to profile"),
3802        OPT_CALLBACK(0, "duration", &trace, "float",
3803                     "show only events with duration > N.M ms",
3804                     trace__set_duration),
3805#ifdef HAVE_LIBBPF_SUPPORT
3806        OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"),
3807#endif
3808        OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3809        OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3810        OPT_BOOLEAN('T', "time", &trace.full_time,
3811                    "Show full timestamp, not time relative to first start"),
3812        OPT_BOOLEAN(0, "failure", &trace.failure_only,
3813                    "Show only syscalls that failed"),
3814        OPT_BOOLEAN('s', "summary", &trace.summary_only,
3815                    "Show only syscall summary with statistics"),
3816        OPT_BOOLEAN('S', "with-summary", &trace.summary,
3817                    "Show all syscalls and summary with statistics"),
3818        OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3819                     "Trace pagefaults", parse_pagefaults, "maj"),
3820        OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3821        OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3822        OPT_CALLBACK(0, "call-graph", &trace.opts,
3823                     "record_mode[,record_size]", record_callchain_help,
3824                     &record_parse_callchain_opt),
3825        OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3826                    "Show the kernel callchains on the syscall exit path"),
3827        OPT_ULONG(0, "max-events", &trace.max_events,
3828                "Set the maximum number of events to print, exit after that is reached. "),
3829        OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3830                     "Set the minimum stack depth when parsing the callchain, "
3831                     "anything below the specified depth will be ignored."),
3832        OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3833                     "Set the maximum stack depth when parsing the callchain, "
3834                     "anything beyond the specified depth will be ignored. "
3835                     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3836        OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
3837                        "Sort batch of events before processing, use if getting out of order events"),
3838        OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3839                        "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3840        OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3841                        "per thread proc mmap processing timeout in ms"),
3842        OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3843                     trace__parse_cgroups),
3844        OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3845                     "ms to wait before starting measurement after program "
3846                     "start"),
3847        OPT_END()
3848        };
3849        bool __maybe_unused max_stack_user_set = true;
3850        bool mmap_pages_user_set = true;
3851        struct perf_evsel *evsel;
3852        const char * const trace_subcommands[] = { "record", NULL };
3853        int err = -1;
3854        char bf[BUFSIZ];
3855
3856        signal(SIGSEGV, sighandler_dump_stack);
3857        signal(SIGFPE, sighandler_dump_stack);
3858
3859        trace.evlist = perf_evlist__new();
3860        trace.sctbl = syscalltbl__new();
3861
3862        if (trace.evlist == NULL || trace.sctbl == NULL) {
3863                pr_err("Not enough memory to run!\n");
3864                err = -ENOMEM;
3865                goto out;
3866        }
3867
3868        /*
3869         * Parsing .perfconfig may entail creating a BPF event, that may need
3870         * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
3871         * is too small. This affects just this process, not touching the
3872         * global setting. If it fails we'll get something in 'perf trace -v'
3873         * to help diagnose the problem.
3874         */
3875        rlimit__bump_memlock();
3876
3877        err = perf_config(trace__config, &trace);
3878        if (err)
3879                goto out;
3880
3881        argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3882                                 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3883
3884        if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3885                usage_with_options_msg(trace_usage, trace_options,
3886                                       "cgroup monitoring only available in system-wide mode");
3887        }
3888
3889        evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
3890        if (IS_ERR(evsel)) {
3891                bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
3892                pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
3893                goto out;
3894        }
3895
3896        if (evsel) {
3897                trace.syscalls.events.augmented = evsel;
3898                trace__set_bpf_map_filtered_pids(&trace);
3899                trace__set_bpf_map_syscalls(&trace);
3900        }
3901
3902        err = bpf__setup_stdout(trace.evlist);
3903        if (err) {
3904                bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3905                pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3906                goto out;
3907        }
3908
3909        err = -1;
3910
3911        if (map_dump_str) {
3912                trace.dump.map = bpf__find_map_by_name(map_dump_str);
3913                if (trace.dump.map == NULL) {
3914                        pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str);
3915                        goto out;
3916                }
3917        }
3918
3919        if (trace.trace_pgfaults) {
3920                trace.opts.sample_address = true;
3921                trace.opts.sample_time = true;
3922        }
3923
3924        if (trace.opts.mmap_pages == UINT_MAX)
3925                mmap_pages_user_set = false;
3926
3927        if (trace.max_stack == UINT_MAX) {
3928                trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3929                max_stack_user_set = false;
3930        }
3931
3932#ifdef HAVE_DWARF_UNWIND_SUPPORT
3933        if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3934                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3935        }
3936#endif
3937
3938        if (callchain_param.enabled) {
3939                if (!mmap_pages_user_set && geteuid() == 0)
3940                        trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3941
3942                symbol_conf.use_callchain = true;
3943        }
3944
3945        if (trace.evlist->nr_entries > 0) {
3946                evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3947                if (evlist__set_syscall_tp_fields(trace.evlist)) {
3948                        perror("failed to set syscalls:* tracepoint fields");
3949                        goto out;
3950                }
3951        }
3952
3953        if (trace.sort_events) {
3954                ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
3955                ordered_events__set_copy_on_queue(&trace.oe.data, true);
3956        }
3957
3958        /*
3959         * If we are augmenting syscalls, then combine what we put in the
3960         * __augmented_syscalls__ BPF map with what is in the
3961         * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
3962         * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
3963         *
3964         * We'll switch to look at two BPF maps, one for sys_enter and the
3965         * other for sys_exit when we start augmenting the sys_exit paths with
3966         * buffers that are being copied from kernel to userspace, think 'read'
3967         * syscall.
3968         */
3969        if (trace.syscalls.events.augmented) {
3970                evlist__for_each_entry(trace.evlist, evsel) {
3971                        bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
3972
3973                        if (raw_syscalls_sys_exit) {
3974                                trace.raw_augmented_syscalls = true;
3975                                goto init_augmented_syscall_tp;
3976                        }
3977
3978                        if (trace.syscalls.events.augmented->priv == NULL &&
3979                            strstr(perf_evsel__name(evsel), "syscalls:sys_enter")) {
3980                                struct perf_evsel *augmented = trace.syscalls.events.augmented;
3981                                if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) ||
3982                                    perf_evsel__init_augmented_syscall_tp_args(augmented))
3983                                        goto out;
3984                                augmented->handler = trace__sys_enter;
3985                        }
3986
3987                        if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
3988                                struct syscall_tp *sc;
3989init_augmented_syscall_tp:
3990                                if (perf_evsel__init_augmented_syscall_tp(evsel, evsel))
3991                                        goto out;
3992                                sc = evsel->priv;
3993                                /*
3994                                 * For now with BPF raw_augmented we hook into
3995                                 * raw_syscalls:sys_enter and there we get all
3996                                 * 6 syscall args plus the tracepoint common
3997                                 * fields and the syscall_nr (another long).
3998                                 * So we check if that is the case and if so
3999                                 * don't look after the sc->args_size but
4000                                 * always after the full raw_syscalls:sys_enter
4001                                 * payload, which is fixed.
4002                                 *
4003                                 * We'll revisit this later to pass
4004                                 * s->args_size to the BPF augmenter (now
4005                                 * tools/perf/examples/bpf/augmented_raw_syscalls.c),
4006                                 * so that it copies only what we need for each
4007                                 * syscall, like what happens when we use
4008                                 * syscalls:sys_enter_NAME, so that we reduce
4009                                 * the kernel/userspace traffic to just what is
4010                                 * needed for each syscall.
4011                                 */
4012                                if (trace.raw_augmented_syscalls)
4013                                        trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
4014                                perf_evsel__init_augmented_syscall_tp_ret(evsel);
4015                                evsel->handler = trace__sys_exit;
4016                        }
4017                }
4018        }
4019
4020        if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
4021                return trace__record(&trace, argc-1, &argv[1]);
4022
4023        /* summary_only implies summary option, but don't overwrite summary if set */
4024        if (trace.summary_only)
4025                trace.summary = trace.summary_only;
4026
4027        if (!trace.trace_syscalls && !trace.trace_pgfaults &&
4028            trace.evlist->nr_entries == 0 /* Was --events used? */) {
4029                trace.trace_syscalls = true;
4030        }
4031
4032        if (output_name != NULL) {
4033                err = trace__open_output(&trace, output_name);
4034                if (err < 0) {
4035                        perror("failed to create output file");
4036                        goto out;
4037                }
4038        }
4039
4040        err = target__validate(&trace.opts.target);
4041        if (err) {
4042                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
4043                fprintf(trace.output, "%s", bf);
4044                goto out_close;
4045        }
4046
4047        err = target__parse_uid(&trace.opts.target);
4048        if (err) {
4049                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
4050                fprintf(trace.output, "%s", bf);
4051                goto out_close;
4052        }
4053
4054        if (!argc && target__none(&trace.opts.target))
4055                trace.opts.target.system_wide = true;
4056
4057        if (input_name)
4058                err = trace__replay(&trace);
4059        else
4060                err = trace__run(&trace, argc, argv);
4061
4062out_close:
4063        if (output_name != NULL)
4064                fclose(trace.output);
4065out:
4066        return err;
4067}
4068