linux/tools/perf/util/evsel.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
   4 *
   5 * Parts came from builtin-{top,stat,record}.c, see those files for further
   6 * copyright notes.
   7 */
   8
   9#include <byteswap.h>
  10#include <errno.h>
  11#include <inttypes.h>
  12#include <linux/bitops.h>
  13#include <api/fs/fs.h>
  14#include <api/fs/tracing_path.h>
  15#include <traceevent/event-parse.h>
  16#include <linux/hw_breakpoint.h>
  17#include <linux/perf_event.h>
  18#include <linux/compiler.h>
  19#include <linux/err.h>
  20#include <linux/zalloc.h>
  21#include <sys/ioctl.h>
  22#include <sys/resource.h>
  23#include <sys/types.h>
  24#include <dirent.h>
  25#include <stdlib.h>
  26#include <perf/evsel.h>
  27#include "asm/bug.h"
  28#include "callchain.h"
  29#include "cgroup.h"
  30#include "counts.h"
  31#include "event.h"
  32#include "evsel.h"
  33#include "util/env.h"
  34#include "util/evsel_config.h"
  35#include "util/evsel_fprintf.h"
  36#include "evlist.h"
  37#include <perf/cpumap.h>
  38#include "thread_map.h"
  39#include "target.h"
  40#include "perf_regs.h"
  41#include "record.h"
  42#include "debug.h"
  43#include "trace-event.h"
  44#include "stat.h"
  45#include "string2.h"
  46#include "memswap.h"
  47#include "util.h"
  48#include "../perf-sys.h"
  49#include "util/parse-branch-options.h"
  50#include <internal/xyarray.h>
  51#include <internal/lib.h>
  52
  53#include <linux/ctype.h>
  54
  55struct perf_missing_features perf_missing_features;
  56
  57static clockid_t clockid;
  58
  59static int evsel__no_extra_init(struct evsel *evsel __maybe_unused)
  60{
  61        return 0;
  62}
  63
  64void __weak test_attr__ready(void) { }
  65
  66static void evsel__no_extra_fini(struct evsel *evsel __maybe_unused)
  67{
  68}
  69
  70static struct {
  71        size_t  size;
  72        int     (*init)(struct evsel *evsel);
  73        void    (*fini)(struct evsel *evsel);
  74} perf_evsel__object = {
  75        .size = sizeof(struct evsel),
  76        .init = evsel__no_extra_init,
  77        .fini = evsel__no_extra_fini,
  78};
  79
  80int evsel__object_config(size_t object_size, int (*init)(struct evsel *evsel),
  81                         void (*fini)(struct evsel *evsel))
  82{
  83
  84        if (object_size == 0)
  85                goto set_methods;
  86
  87        if (perf_evsel__object.size > object_size)
  88                return -EINVAL;
  89
  90        perf_evsel__object.size = object_size;
  91
  92set_methods:
  93        if (init != NULL)
  94                perf_evsel__object.init = init;
  95
  96        if (fini != NULL)
  97                perf_evsel__object.fini = fini;
  98
  99        return 0;
 100}
 101
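/* Fetch the file descriptor opened for a given (cpu index, thread index) pair. */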
 102#define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))
 103
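/*
 * Every bit set in the fixed part of sample_type adds one u64 to each sample,
 * so the base sample size is simply the number of set bits times sizeof(u64).
 */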
 104int __evsel__sample_size(u64 sample_type)
 105{
 106        u64 mask = sample_type & PERF_SAMPLE_MASK;
 107        int size = 0;
 108        int i;
 109
 110        for (i = 0; i < 64; i++) {
 111                if (mask & (1ULL << i))
 112                        size++;
 113        }
 114
 115        size *= sizeof(u64);
 116
 117        return size;
 118}
 119
 120/**
 121 * __perf_evsel__calc_id_pos - calculate id_pos.
 122 * @sample_type: sample type
 123 *
 124 * This function returns the position of the event id (PERF_SAMPLE_ID or
 125 * PERF_SAMPLE_IDENTIFIER) in a sample event i.e. in the array of struct
 126 * perf_record_sample.
 127 */
 128static int __perf_evsel__calc_id_pos(u64 sample_type)
 129{
 130        int idx = 0;
 131
 132        if (sample_type & PERF_SAMPLE_IDENTIFIER)
 133                return 0;
 134
 135        if (!(sample_type & PERF_SAMPLE_ID))
 136                return -1;
 137
 138        if (sample_type & PERF_SAMPLE_IP)
 139                idx += 1;
 140
 141        if (sample_type & PERF_SAMPLE_TID)
 142                idx += 1;
 143
 144        if (sample_type & PERF_SAMPLE_TIME)
 145                idx += 1;
 146
 147        if (sample_type & PERF_SAMPLE_ADDR)
 148                idx += 1;
 149
 150        return idx;
 151}
 152
 153/**
 154 * __perf_evsel__calc_is_pos - calculate is_pos.
 155 * @sample_type: sample type
 156 *
 157 * This function returns the position (counting backwards) of the event id
 158 * (PERF_SAMPLE_ID or PERF_SAMPLE_IDENTIFIER) in a non-sample event i.e. if
 159 * sample_id_all is used there is an id sample appended to non-sample events.
 160 */
 161static int __perf_evsel__calc_is_pos(u64 sample_type)
 162{
 163        int idx = 1;
 164
 165        if (sample_type & PERF_SAMPLE_IDENTIFIER)
 166                return 1;
 167
 168        if (!(sample_type & PERF_SAMPLE_ID))
 169                return -1;
 170
 171        if (sample_type & PERF_SAMPLE_CPU)
 172                idx += 1;
 173
 174        if (sample_type & PERF_SAMPLE_STREAM_ID)
 175                idx += 1;
 176
 177        return idx;
 178}
 179
 180void evsel__calc_id_pos(struct evsel *evsel)
 181{
 182        evsel->id_pos = __perf_evsel__calc_id_pos(evsel->core.attr.sample_type);
 183        evsel->is_pos = __perf_evsel__calc_is_pos(evsel->core.attr.sample_type);
 184}
 185
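/*
 * Setting or clearing a sample_type bit changes the fixed sample size by one
 * u64 and can move the event ID within the sample, so recalculate id_pos/is_pos.
 */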
 186void __evsel__set_sample_bit(struct evsel *evsel,
 187                                  enum perf_event_sample_format bit)
 188{
 189        if (!(evsel->core.attr.sample_type & bit)) {
 190                evsel->core.attr.sample_type |= bit;
 191                evsel->sample_size += sizeof(u64);
 192                evsel__calc_id_pos(evsel);
 193        }
 194}
 195
 196void __evsel__reset_sample_bit(struct evsel *evsel,
 197                                    enum perf_event_sample_format bit)
 198{
 199        if (evsel->core.attr.sample_type & bit) {
 200                evsel->core.attr.sample_type &= ~bit;
 201                evsel->sample_size -= sizeof(u64);
 202                evsel__calc_id_pos(evsel);
 203        }
 204}
 205
 206void evsel__set_sample_id(struct evsel *evsel,
 207                               bool can_sample_identifier)
 208{
 209        if (can_sample_identifier) {
 210                evsel__reset_sample_bit(evsel, ID);
 211                evsel__set_sample_bit(evsel, IDENTIFIER);
 212        } else {
 213                evsel__set_sample_bit(evsel, ID);
 214        }
 215        evsel->core.attr.read_format |= PERF_FORMAT_ID;
 216}
 217
 218/**
 219 * evsel__is_function_event - Return whether given evsel is a function
 220 * trace event
 221 *
 222 * @evsel - evsel selector to be tested
 223 *
  224 * Return %true if the event is a function trace event
 225 */
 226bool evsel__is_function_event(struct evsel *evsel)
 227{
 228#define FUNCTION_EVENT "ftrace:function"
 229
 230        return evsel->name &&
 231               !strncmp(FUNCTION_EVENT, evsel->name, sizeof(FUNCTION_EVENT));
 232
 233#undef FUNCTION_EVENT
 234}
 235
 236void evsel__init(struct evsel *evsel,
 237                 struct perf_event_attr *attr, int idx)
 238{
 239        perf_evsel__init(&evsel->core, attr);
 240        evsel->idx         = idx;
 241        evsel->tracking    = !idx;
 242        evsel->leader      = evsel;
 243        evsel->unit        = "";
 244        evsel->scale       = 1.0;
 245        evsel->max_events  = ULONG_MAX;
 246        evsel->evlist      = NULL;
 247        evsel->bpf_obj     = NULL;
 248        evsel->bpf_fd      = -1;
 249        INIT_LIST_HEAD(&evsel->config_terms);
 250        perf_evsel__object.init(evsel);
 251        evsel->sample_size = __evsel__sample_size(attr->sample_type);
 252        evsel__calc_id_pos(evsel);
 253        evsel->cmdline_group_boundary = false;
 254        evsel->metric_expr   = NULL;
 255        evsel->metric_name   = NULL;
 256        evsel->metric_events = NULL;
 257        evsel->per_pkg_mask  = NULL;
 258        evsel->collect_stat  = false;
 259        evsel->pmu_name      = NULL;
 260}
 261
 262struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx)
 263{
 264        struct evsel *evsel = zalloc(perf_evsel__object.size);
 265
 266        if (!evsel)
 267                return NULL;
 268        evsel__init(evsel, attr, idx);
 269
 270        if (evsel__is_bpf_output(evsel)) {
 271                evsel->core.attr.sample_type |= (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME |
 272                                            PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD),
 273                evsel->core.attr.sample_period = 1;
 274        }
 275
 276        if (evsel__is_clock(evsel)) {
 277                /*
 278                 * The evsel->unit points to static alias->unit
  279                 * so it's ok to use a static string here.
 280                 */
 281                static const char *unit = "msec";
 282
 283                evsel->unit = unit;
 284                evsel->scale = 1e-6;
 285        }
 286
 287        return evsel;
 288}
 289
 290static bool perf_event_can_profile_kernel(void)
 291{
 292        return perf_event_paranoid_check(1);
 293}
 294
 295struct evsel *evsel__new_cycles(bool precise)
 296{
 297        struct perf_event_attr attr = {
 298                .type   = PERF_TYPE_HARDWARE,
 299                .config = PERF_COUNT_HW_CPU_CYCLES,
 300                .exclude_kernel = !perf_event_can_profile_kernel(),
 301        };
 302        struct evsel *evsel;
 303
 304        event_attr_init(&attr);
 305
 306        if (!precise)
 307                goto new_event;
 308
 309        /*
  310         * Now let the usual logic that sets up the perf_event_attr defaults
  311         * kick in when we return and before perf_evsel__open() is called.
 312         */
 313new_event:
 314        evsel = evsel__new(&attr);
 315        if (evsel == NULL)
 316                goto out;
 317
 318        evsel->precise_max = true;
 319
 320        /* use asprintf() because free(evsel) assumes name is allocated */
 321        if (asprintf(&evsel->name, "cycles%s%s%.*s",
 322                     (attr.precise_ip || attr.exclude_kernel) ? ":" : "",
 323                     attr.exclude_kernel ? "u" : "",
 324                     attr.precise_ip ? attr.precise_ip + 1 : 0, "ppp") < 0)
 325                goto error_free;
 326out:
 327        return evsel;
 328error_free:
 329        evsel__delete(evsel);
 330        evsel = NULL;
 331        goto out;
 332}
 333
 334/*
 335 * Returns pointer with encoded error via <linux/err.h> interface.
 336 */
 337struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx)
 338{
 339        struct evsel *evsel = zalloc(perf_evsel__object.size);
 340        int err = -ENOMEM;
 341
 342        if (evsel == NULL) {
 343                goto out_err;
 344        } else {
 345                struct perf_event_attr attr = {
 346                        .type          = PERF_TYPE_TRACEPOINT,
 347                        .sample_type   = (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME |
 348                                          PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD),
 349                };
 350
 351                if (asprintf(&evsel->name, "%s:%s", sys, name) < 0)
 352                        goto out_free;
 353
 354                evsel->tp_format = trace_event__tp_format(sys, name);
 355                if (IS_ERR(evsel->tp_format)) {
 356                        err = PTR_ERR(evsel->tp_format);
 357                        goto out_free;
 358                }
 359
 360                event_attr_init(&attr);
 361                attr.config = evsel->tp_format->id;
 362                attr.sample_period = 1;
 363                evsel__init(evsel, &attr, idx);
 364        }
 365
 366        return evsel;
 367
 368out_free:
 369        zfree(&evsel->name);
 370        free(evsel);
 371out_err:
 372        return ERR_PTR(err);
 373}
 374
 375const char *evsel__hw_names[PERF_COUNT_HW_MAX] = {
 376        "cycles",
 377        "instructions",
 378        "cache-references",
 379        "cache-misses",
 380        "branches",
 381        "branch-misses",
 382        "bus-cycles",
 383        "stalled-cycles-frontend",
 384        "stalled-cycles-backend",
 385        "ref-cycles",
 386};
 387
 388static const char *__evsel__hw_name(u64 config)
 389{
 390        if (config < PERF_COUNT_HW_MAX && evsel__hw_names[config])
 391                return evsel__hw_names[config];
 392
 393        return "unknown-hardware";
 394}
 395
 396static int perf_evsel__add_modifiers(struct evsel *evsel, char *bf, size_t size)
 397{
 398        int colon = 0, r = 0;
 399        struct perf_event_attr *attr = &evsel->core.attr;
 400        bool exclude_guest_default = false;
 401
 402#define MOD_PRINT(context, mod) do {                                    \
 403                if (!attr->exclude_##context) {                         \
 404                        if (!colon) colon = ++r;                        \
 405                        r += scnprintf(bf + r, size - r, "%c", mod);    \
 406                } } while(0)
 407
 408        if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv) {
 409                MOD_PRINT(kernel, 'k');
 410                MOD_PRINT(user, 'u');
 411                MOD_PRINT(hv, 'h');
 412                exclude_guest_default = true;
 413        }
 414
 415        if (attr->precise_ip) {
 416                if (!colon)
 417                        colon = ++r;
 418                r += scnprintf(bf + r, size - r, "%.*s", attr->precise_ip, "ppp");
 419                exclude_guest_default = true;
 420        }
 421
 422        if (attr->exclude_host || attr->exclude_guest == exclude_guest_default) {
 423                MOD_PRINT(host, 'H');
 424                MOD_PRINT(guest, 'G');
 425        }
 426#undef MOD_PRINT
 427        if (colon)
 428                bf[colon - 1] = ':';
 429        return r;
 430}
 431
 432static int evsel__hw_name(struct evsel *evsel, char *bf, size_t size)
 433{
 434        int r = scnprintf(bf, size, "%s", __evsel__hw_name(evsel->core.attr.config));
 435        return r + perf_evsel__add_modifiers(evsel, bf + r, size - r);
 436}
 437
 438const char *evsel__sw_names[PERF_COUNT_SW_MAX] = {
 439        "cpu-clock",
 440        "task-clock",
 441        "page-faults",
 442        "context-switches",
 443        "cpu-migrations",
 444        "minor-faults",
 445        "major-faults",
 446        "alignment-faults",
 447        "emulation-faults",
 448        "dummy",
 449};
 450
 451static const char *__evsel__sw_name(u64 config)
 452{
 453        if (config < PERF_COUNT_SW_MAX && evsel__sw_names[config])
 454                return evsel__sw_names[config];
 455        return "unknown-software";
 456}
 457
 458static int evsel__sw_name(struct evsel *evsel, char *bf, size_t size)
 459{
 460        int r = scnprintf(bf, size, "%s", __evsel__sw_name(evsel->core.attr.config));
 461        return r + perf_evsel__add_modifiers(evsel, bf + r, size - r);
 462}
 463
 464static int __evsel__bp_name(char *bf, size_t size, u64 addr, u64 type)
 465{
 466        int r;
 467
 468        r = scnprintf(bf, size, "mem:0x%" PRIx64 ":", addr);
 469
 470        if (type & HW_BREAKPOINT_R)
 471                r += scnprintf(bf + r, size - r, "r");
 472
 473        if (type & HW_BREAKPOINT_W)
 474                r += scnprintf(bf + r, size - r, "w");
 475
 476        if (type & HW_BREAKPOINT_X)
 477                r += scnprintf(bf + r, size - r, "x");
 478
 479        return r;
 480}
 481
 482static int evsel__bp_name(struct evsel *evsel, char *bf, size_t size)
 483{
 484        struct perf_event_attr *attr = &evsel->core.attr;
 485        int r = __evsel__bp_name(bf, size, attr->bp_addr, attr->bp_type);
 486        return r + perf_evsel__add_modifiers(evsel, bf + r, size - r);
 487}
 488
 489const char *evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX][EVSEL__MAX_ALIASES] = {
 490 { "L1-dcache", "l1-d",         "l1d",          "L1-data",              },
 491 { "L1-icache", "l1-i",         "l1i",          "L1-instruction",       },
 492 { "LLC",       "L2",                                                   },
 493 { "dTLB",      "d-tlb",        "Data-TLB",                             },
 494 { "iTLB",      "i-tlb",        "Instruction-TLB",                      },
 495 { "branch",    "branches",     "bpu",          "btb",          "bpc",  },
 496 { "node",                                                              },
 497};
 498
 499const char *evsel__hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX][EVSEL__MAX_ALIASES] = {
 500 { "load",      "loads",        "read",                                 },
 501 { "store",     "stores",       "write",                                },
 502 { "prefetch",  "prefetches",   "speculative-read", "speculative-load", },
 503};
 504
 505const char *evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX][EVSEL__MAX_ALIASES] = {
 506 { "refs",      "Reference",    "ops",          "access",               },
 507 { "misses",    "miss",                                                 },
 508};
 509
 510#define C(x)            PERF_COUNT_HW_CACHE_##x
 511#define CACHE_READ      (1 << C(OP_READ))
 512#define CACHE_WRITE     (1 << C(OP_WRITE))
 513#define CACHE_PREFETCH  (1 << C(OP_PREFETCH))
 514#define COP(x)          (1 << x)
 515
 516/*
  517 * cache operation stat
 518 * L1I : Read and prefetch only
 519 * ITLB and BPU : Read-only
 520 */
 521static unsigned long evsel__hw_cache_stat[C(MAX)] = {
 522 [C(L1D)]       = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH),
 523 [C(L1I)]       = (CACHE_READ | CACHE_PREFETCH),
 524 [C(LL)]        = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH),
 525 [C(DTLB)]      = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH),
 526 [C(ITLB)]      = (CACHE_READ),
 527 [C(BPU)]       = (CACHE_READ),
 528 [C(NODE)]      = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH),
 529};
 530
 531bool evsel__is_cache_op_valid(u8 type, u8 op)
 532{
 533        if (evsel__hw_cache_stat[type] & COP(op))
 534                return true;    /* valid */
 535        else
 536                return false;   /* invalid */
 537}
 538
 539int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size)
 540{
 541        if (result) {
 542                return scnprintf(bf, size, "%s-%s-%s", evsel__hw_cache[type][0],
 543                                 evsel__hw_cache_op[op][0],
 544                                 evsel__hw_cache_result[result][0]);
 545        }
 546
 547        return scnprintf(bf, size, "%s-%s", evsel__hw_cache[type][0],
 548                         evsel__hw_cache_op[op][1]);
 549}
 550
 551static int __evsel__hw_cache_name(u64 config, char *bf, size_t size)
 552{
 553        u8 op, result, type = (config >>  0) & 0xff;
 554        const char *err = "unknown-ext-hardware-cache-type";
 555
 556        if (type >= PERF_COUNT_HW_CACHE_MAX)
 557                goto out_err;
 558
 559        op = (config >>  8) & 0xff;
 560        err = "unknown-ext-hardware-cache-op";
 561        if (op >= PERF_COUNT_HW_CACHE_OP_MAX)
 562                goto out_err;
 563
 564        result = (config >> 16) & 0xff;
 565        err = "unknown-ext-hardware-cache-result";
 566        if (result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
 567                goto out_err;
 568
 569        err = "invalid-cache";
 570        if (!evsel__is_cache_op_valid(type, op))
 571                goto out_err;
 572
 573        return __evsel__hw_cache_type_op_res_name(type, op, result, bf, size);
 574out_err:
 575        return scnprintf(bf, size, "%s", err);
 576}
 577
 578static int evsel__hw_cache_name(struct evsel *evsel, char *bf, size_t size)
 579{
 580        int ret = __evsel__hw_cache_name(evsel->core.attr.config, bf, size);
 581        return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret);
 582}
 583
 584static int evsel__raw_name(struct evsel *evsel, char *bf, size_t size)
 585{
 586        int ret = scnprintf(bf, size, "raw 0x%" PRIx64, evsel->core.attr.config);
 587        return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret);
 588}
 589
 590static int evsel__tool_name(char *bf, size_t size)
 591{
 592        int ret = scnprintf(bf, size, "duration_time");
 593        return ret;
 594}
 595
 596const char *evsel__name(struct evsel *evsel)
 597{
 598        char bf[128];
 599
 600        if (!evsel)
 601                goto out_unknown;
 602
 603        if (evsel->name)
 604                return evsel->name;
 605
 606        switch (evsel->core.attr.type) {
 607        case PERF_TYPE_RAW:
 608                evsel__raw_name(evsel, bf, sizeof(bf));
 609                break;
 610
 611        case PERF_TYPE_HARDWARE:
 612                evsel__hw_name(evsel, bf, sizeof(bf));
 613                break;
 614
 615        case PERF_TYPE_HW_CACHE:
 616                evsel__hw_cache_name(evsel, bf, sizeof(bf));
 617                break;
 618
 619        case PERF_TYPE_SOFTWARE:
 620                if (evsel->tool_event)
 621                        evsel__tool_name(bf, sizeof(bf));
 622                else
 623                        evsel__sw_name(evsel, bf, sizeof(bf));
 624                break;
 625
 626        case PERF_TYPE_TRACEPOINT:
 627                scnprintf(bf, sizeof(bf), "%s", "unknown tracepoint");
 628                break;
 629
 630        case PERF_TYPE_BREAKPOINT:
 631                evsel__bp_name(evsel, bf, sizeof(bf));
 632                break;
 633
 634        default:
 635                scnprintf(bf, sizeof(bf), "unknown attr type: %d",
 636                          evsel->core.attr.type);
 637                break;
 638        }
 639
 640        evsel->name = strdup(bf);
 641
 642        if (evsel->name)
 643                return evsel->name;
 644out_unknown:
 645        return "unknown";
 646}
 647
 648const char *evsel__group_name(struct evsel *evsel)
 649{
 650        return evsel->group_name ?: "anon group";
 651}
 652
 653/*
 654 * Returns the group details for the specified leader,
 655 * with following rules.
 656 *
 657 *  For record -e '{cycles,instructions}'
 658 *    'anon group { cycles:u, instructions:u }'
 659 *
 660 *  For record -e 'cycles,instructions' and report --group
 661 *    'cycles:u, instructions:u'
 662 */
 663int evsel__group_desc(struct evsel *evsel, char *buf, size_t size)
 664{
 665        int ret = 0;
 666        struct evsel *pos;
 667        const char *group_name = evsel__group_name(evsel);
 668
 669        if (!evsel->forced_leader)
 670                ret = scnprintf(buf, size, "%s { ", group_name);
 671
 672        ret += scnprintf(buf + ret, size - ret, "%s", evsel__name(evsel));
 673
 674        for_each_group_member(pos, evsel)
 675                ret += scnprintf(buf + ret, size - ret, ", %s", evsel__name(pos));
 676
 677        if (!evsel->forced_leader)
 678                ret += scnprintf(buf + ret, size - ret, " }");
 679
 680        return ret;
 681}
 682
 683static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
 684                                      struct callchain_param *param)
 685{
 686        bool function = evsel__is_function_event(evsel);
 687        struct perf_event_attr *attr = &evsel->core.attr;
 688
 689        evsel__set_sample_bit(evsel, CALLCHAIN);
 690
 691        attr->sample_max_stack = param->max_stack;
 692
 693        if (opts->kernel_callchains)
 694                attr->exclude_callchain_user = 1;
 695        if (opts->user_callchains)
 696                attr->exclude_callchain_kernel = 1;
 697        if (param->record_mode == CALLCHAIN_LBR) {
 698                if (!opts->branch_stack) {
 699                        if (attr->exclude_user) {
 700                                pr_warning("LBR callstack option is only available "
 701                                           "to get user callchain information. "
 702                                           "Falling back to framepointers.\n");
 703                        } else {
 704                                evsel__set_sample_bit(evsel, BRANCH_STACK);
 705                                attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER |
 706                                                        PERF_SAMPLE_BRANCH_CALL_STACK |
 707                                                        PERF_SAMPLE_BRANCH_NO_CYCLES |
 708                                                        PERF_SAMPLE_BRANCH_NO_FLAGS |
 709                                                        PERF_SAMPLE_BRANCH_HW_INDEX;
 710                        }
 711                } else
 712                         pr_warning("Cannot use LBR callstack with branch stack. "
 713                                    "Falling back to framepointers.\n");
 714        }
 715
 716        if (param->record_mode == CALLCHAIN_DWARF) {
 717                if (!function) {
 718                        evsel__set_sample_bit(evsel, REGS_USER);
 719                        evsel__set_sample_bit(evsel, STACK_USER);
 720                        if (opts->sample_user_regs && DWARF_MINIMAL_REGS != PERF_REGS_MASK) {
 721                                attr->sample_regs_user |= DWARF_MINIMAL_REGS;
 722                                pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, "
 723                                           "specifying a subset with --user-regs may render DWARF unwinding unreliable, "
 724                                           "so the minimal registers set (IP, SP) is explicitly forced.\n");
 725                        } else {
 726                                attr->sample_regs_user |= PERF_REGS_MASK;
 727                        }
 728                        attr->sample_stack_user = param->dump_size;
 729                        attr->exclude_callchain_user = 1;
 730                } else {
 731                        pr_info("Cannot use DWARF unwind for function trace event,"
 732                                " falling back to framepointers.\n");
 733                }
 734        }
 735
 736        if (function) {
 737                pr_info("Disabling user space callchains for function trace event.\n");
 738                attr->exclude_callchain_user = 1;
 739        }
 740}
 741
 742void evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
 743                             struct callchain_param *param)
 744{
 745        if (param->enabled)
 746                return __evsel__config_callchain(evsel, opts, param);
 747}
 748
 749static void
 750perf_evsel__reset_callgraph(struct evsel *evsel,
 751                            struct callchain_param *param)
 752{
 753        struct perf_event_attr *attr = &evsel->core.attr;
 754
 755        evsel__reset_sample_bit(evsel, CALLCHAIN);
 756        if (param->record_mode == CALLCHAIN_LBR) {
 757                evsel__reset_sample_bit(evsel, BRANCH_STACK);
 758                attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_USER |
 759                                              PERF_SAMPLE_BRANCH_CALL_STACK |
 760                                              PERF_SAMPLE_BRANCH_HW_INDEX);
 761        }
 762        if (param->record_mode == CALLCHAIN_DWARF) {
 763                evsel__reset_sample_bit(evsel, REGS_USER);
 764                evsel__reset_sample_bit(evsel, STACK_USER);
 765        }
 766}
 767
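/*
 * Apply the per-event config terms collected at parse time (period, freq,
 * call-graph, stack size, branch options, ...), overriding the corresponding
 * settings made from the global record options.
 */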
 768static void evsel__apply_config_terms(struct evsel *evsel,
 769                                      struct record_opts *opts, bool track)
 770{
 771        struct evsel_config_term *term;
 772        struct list_head *config_terms = &evsel->config_terms;
 773        struct perf_event_attr *attr = &evsel->core.attr;
 774        /* callgraph default */
 775        struct callchain_param param = {
 776                .record_mode = callchain_param.record_mode,
 777        };
 778        u32 dump_size = 0;
 779        int max_stack = 0;
 780        const char *callgraph_buf = NULL;
 781
 782        list_for_each_entry(term, config_terms, list) {
 783                switch (term->type) {
 784                case EVSEL__CONFIG_TERM_PERIOD:
 785                        if (!(term->weak && opts->user_interval != ULLONG_MAX)) {
 786                                attr->sample_period = term->val.period;
 787                                attr->freq = 0;
 788                                evsel__reset_sample_bit(evsel, PERIOD);
 789                        }
 790                        break;
 791                case EVSEL__CONFIG_TERM_FREQ:
 792                        if (!(term->weak && opts->user_freq != UINT_MAX)) {
 793                                attr->sample_freq = term->val.freq;
 794                                attr->freq = 1;
 795                                evsel__set_sample_bit(evsel, PERIOD);
 796                        }
 797                        break;
 798                case EVSEL__CONFIG_TERM_TIME:
 799                        if (term->val.time)
 800                                evsel__set_sample_bit(evsel, TIME);
 801                        else
 802                                evsel__reset_sample_bit(evsel, TIME);
 803                        break;
 804                case EVSEL__CONFIG_TERM_CALLGRAPH:
 805                        callgraph_buf = term->val.str;
 806                        break;
 807                case EVSEL__CONFIG_TERM_BRANCH:
 808                        if (term->val.str && strcmp(term->val.str, "no")) {
 809                                evsel__set_sample_bit(evsel, BRANCH_STACK);
 810                                parse_branch_str(term->val.str,
 811                                                 &attr->branch_sample_type);
 812                        } else
 813                                evsel__reset_sample_bit(evsel, BRANCH_STACK);
 814                        break;
 815                case EVSEL__CONFIG_TERM_STACK_USER:
 816                        dump_size = term->val.stack_user;
 817                        break;
 818                case EVSEL__CONFIG_TERM_MAX_STACK:
 819                        max_stack = term->val.max_stack;
 820                        break;
 821                case EVSEL__CONFIG_TERM_MAX_EVENTS:
 822                        evsel->max_events = term->val.max_events;
 823                        break;
 824                case EVSEL__CONFIG_TERM_INHERIT:
 825                        /*
  826                         * attr->inherit should have already been set by
  827                         * evsel__config. If the user explicitly set
  828                         * inherit using config terms, override the global
  829                         * opt->no_inherit setting.
 830                         */
 831                        attr->inherit = term->val.inherit ? 1 : 0;
 832                        break;
 833                case EVSEL__CONFIG_TERM_OVERWRITE:
 834                        attr->write_backward = term->val.overwrite ? 1 : 0;
 835                        break;
 836                case EVSEL__CONFIG_TERM_DRV_CFG:
 837                        break;
 838                case EVSEL__CONFIG_TERM_PERCORE:
 839                        break;
 840                case EVSEL__CONFIG_TERM_AUX_OUTPUT:
 841                        attr->aux_output = term->val.aux_output ? 1 : 0;
 842                        break;
 843                case EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE:
 844                        /* Already applied by auxtrace */
 845                        break;
 846                case EVSEL__CONFIG_TERM_CFG_CHG:
 847                        break;
 848                default:
 849                        break;
 850                }
 851        }
 852
 853        /* User explicitly set per-event callgraph, clear the old setting and reset. */
 854        if ((callgraph_buf != NULL) || (dump_size > 0) || max_stack) {
 855                bool sample_address = false;
 856
 857                if (max_stack) {
 858                        param.max_stack = max_stack;
 859                        if (callgraph_buf == NULL)
 860                                callgraph_buf = "fp";
 861                }
 862
 863                /* parse callgraph parameters */
 864                if (callgraph_buf != NULL) {
 865                        if (!strcmp(callgraph_buf, "no")) {
 866                                param.enabled = false;
 867                                param.record_mode = CALLCHAIN_NONE;
 868                        } else {
 869                                param.enabled = true;
 870                                if (parse_callchain_record(callgraph_buf, &param)) {
 871                                        pr_err("per-event callgraph setting for %s failed. "
 872                                               "Apply callgraph global setting for it\n",
 873                                               evsel->name);
 874                                        return;
 875                                }
 876                                if (param.record_mode == CALLCHAIN_DWARF)
 877                                        sample_address = true;
 878                        }
 879                }
 880                if (dump_size > 0) {
 881                        dump_size = round_up(dump_size, sizeof(u64));
 882                        param.dump_size = dump_size;
 883                }
 884
 885                /* If global callgraph set, clear it */
 886                if (callchain_param.enabled)
 887                        perf_evsel__reset_callgraph(evsel, &callchain_param);
 888
 889                /* set perf-event callgraph */
 890                if (param.enabled) {
 891                        if (sample_address) {
 892                                evsel__set_sample_bit(evsel, ADDR);
 893                                evsel__set_sample_bit(evsel, DATA_SRC);
 894                                evsel->core.attr.mmap_data = track;
 895                        }
 896                        evsel__config_callchain(evsel, opts, &param);
 897                }
 898        }
 899}
 900
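/* Return the last config term of the given type attached to the evsel, or NULL. */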
 901struct evsel_config_term *__evsel__get_config_term(struct evsel *evsel, enum evsel_term_type type)
 902{
 903        struct evsel_config_term *term, *found_term = NULL;
 904
 905        list_for_each_entry(term, &evsel->config_terms, list) {
 906                if (term->type == type)
 907                        found_term = term;
 908        }
 909
 910        return found_term;
 911}
 912
 913/*
 914 * The enable_on_exec/disabled value strategy:
 915 *
 916 *  1) For any type of traced program:
 917 *    - all independent events and group leaders are disabled
 918 *    - all group members are enabled
 919 *
 920 *     Group members are ruled by group leaders. They need to
 921 *     be enabled, because the group scheduling relies on that.
 922 *
 923 *  2) For traced programs executed by perf:
 924 *     - all independent events and group leaders have
 925 *       enable_on_exec set
 926 *     - we don't specifically enable or disable any event during
 927 *       the record command
 928 *
 929 *     Independent events and group leaders are initially disabled
 930 *     and get enabled by exec. Group members are ruled by group
 931 *     leaders as stated in 1).
 932 *
 933 *  3) For traced programs attached by perf (pid/tid):
 934 *     - we specifically enable or disable all events during
 935 *       the record command
 936 *
  937 *     When attaching events to an already running traced program we
 938 *     enable/disable events specifically, as there's no
 939 *     initial traced exec call.
 940 */
 941void evsel__config(struct evsel *evsel, struct record_opts *opts,
 942                   struct callchain_param *callchain)
 943{
 944        struct evsel *leader = evsel->leader;
 945        struct perf_event_attr *attr = &evsel->core.attr;
 946        int track = evsel->tracking;
 947        bool per_cpu = opts->target.default_per_cpu && !opts->target.per_thread;
 948
 949        attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1;
 950        attr->inherit       = !opts->no_inherit;
 951        attr->write_backward = opts->overwrite ? 1 : 0;
 952
 953        evsel__set_sample_bit(evsel, IP);
 954        evsel__set_sample_bit(evsel, TID);
 955
 956        if (evsel->sample_read) {
 957                evsel__set_sample_bit(evsel, READ);
 958
 959                /*
  960                 * We need ID even in the case of a single event, because
  961                 * PERF_SAMPLE_READ processes ID specific data.
 962                 */
 963                evsel__set_sample_id(evsel, false);
 964
 965                /*
  966                 * Apply the group format only if we belong to a group
  967                 * with more than one member.
 968                 */
 969                if (leader->core.nr_members > 1) {
 970                        attr->read_format |= PERF_FORMAT_GROUP;
 971                        attr->inherit = 0;
 972                }
 973        }
 974
 975        /*
  976         * We give some events a default interval, but keep
  977         * it a weak assumption overridable by the user.
 978         */
 979        if (!attr->sample_period) {
 980                if (opts->freq) {
 981                        attr->freq              = 1;
 982                        attr->sample_freq       = opts->freq;
 983                } else {
 984                        attr->sample_period = opts->default_interval;
 985                }
 986        }
 987        /*
 988         * If attr->freq was set (here or earlier), ask for period
 989         * to be sampled.
 990         */
 991        if (attr->freq)
 992                evsel__set_sample_bit(evsel, PERIOD);
 993
 994        if (opts->no_samples)
 995                attr->sample_freq = 0;
 996
 997        if (opts->inherit_stat) {
 998                evsel->core.attr.read_format |=
 999                        PERF_FORMAT_TOTAL_TIME_ENABLED |
1000                        PERF_FORMAT_TOTAL_TIME_RUNNING |
1001                        PERF_FORMAT_ID;
1002                attr->inherit_stat = 1;
1003        }
1004
1005        if (opts->sample_address) {
1006                evsel__set_sample_bit(evsel, ADDR);
1007                attr->mmap_data = track;
1008        }
1009
1010        /*
 1011         * We don't allow user space callchains for the function trace
 1012         * event, due to issues with page faults while tracing the page
 1013         * fault handler, and its overall trickiness.
1014         */
1015        if (evsel__is_function_event(evsel))
1016                evsel->core.attr.exclude_callchain_user = 1;
1017
1018        if (callchain && callchain->enabled && !evsel->no_aux_samples)
1019                evsel__config_callchain(evsel, opts, callchain);
1020
1021        if (opts->sample_intr_regs && !evsel->no_aux_samples &&
1022            !evsel__is_dummy_event(evsel)) {
1023                attr->sample_regs_intr = opts->sample_intr_regs;
1024                evsel__set_sample_bit(evsel, REGS_INTR);
1025        }
1026
1027        if (opts->sample_user_regs && !evsel->no_aux_samples &&
1028            !evsel__is_dummy_event(evsel)) {
1029                attr->sample_regs_user |= opts->sample_user_regs;
1030                evsel__set_sample_bit(evsel, REGS_USER);
1031        }
1032
1033        if (target__has_cpu(&opts->target) || opts->sample_cpu)
1034                evsel__set_sample_bit(evsel, CPU);
1035
1036        /*
 1037         * When the user has explicitly disabled time, don't force it here.
1038         */
1039        if (opts->sample_time &&
1040            (!perf_missing_features.sample_id_all &&
1041            (!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu ||
1042             opts->sample_time_set)))
1043                evsel__set_sample_bit(evsel, TIME);
1044
1045        if (opts->raw_samples && !evsel->no_aux_samples) {
1046                evsel__set_sample_bit(evsel, TIME);
1047                evsel__set_sample_bit(evsel, RAW);
1048                evsel__set_sample_bit(evsel, CPU);
1049        }
1050
1051        if (opts->sample_address)
1052                evsel__set_sample_bit(evsel, DATA_SRC);
1053
1054        if (opts->sample_phys_addr)
1055                evsel__set_sample_bit(evsel, PHYS_ADDR);
1056
1057        if (opts->no_buffering) {
1058                attr->watermark = 0;
1059                attr->wakeup_events = 1;
1060        }
1061        if (opts->branch_stack && !evsel->no_aux_samples) {
1062                evsel__set_sample_bit(evsel, BRANCH_STACK);
1063                attr->branch_sample_type = opts->branch_stack;
1064        }
1065
1066        if (opts->sample_weight)
1067                evsel__set_sample_bit(evsel, WEIGHT);
1068
1069        attr->task  = track;
1070        attr->mmap  = track;
1071        attr->mmap2 = track && !perf_missing_features.mmap2;
1072        attr->comm  = track;
1073        /*
1074         * ksymbol is tracked separately with text poke because it needs to be
1075         * system wide and enabled immediately.
1076         */
1077        if (!opts->text_poke)
1078                attr->ksymbol = track && !perf_missing_features.ksymbol;
1079        attr->bpf_event = track && !opts->no_bpf_event && !perf_missing_features.bpf;
1080
1081        if (opts->record_namespaces)
1082                attr->namespaces  = track;
1083
1084        if (opts->record_cgroup) {
1085                attr->cgroup = track && !perf_missing_features.cgroup;
1086                evsel__set_sample_bit(evsel, CGROUP);
1087        }
1088
1089        if (opts->record_switch_events)
1090                attr->context_switch = track;
1091
1092        if (opts->sample_transaction)
1093                evsel__set_sample_bit(evsel, TRANSACTION);
1094
1095        if (opts->running_time) {
1096                evsel->core.attr.read_format |=
1097                        PERF_FORMAT_TOTAL_TIME_ENABLED |
1098                        PERF_FORMAT_TOTAL_TIME_RUNNING;
1099        }
1100
1101        /*
1102         * XXX see the function comment above
1103         *
1104         * Disabling only independent events or group leaders,
1105         * keeping group members enabled.
1106         */
1107        if (evsel__is_group_leader(evsel))
1108                attr->disabled = 1;
1109
1110        /*
1111         * Setting enable_on_exec for independent events and
 1112         * group leaders for traced programs executed by perf.
1113         */
1114        if (target__none(&opts->target) && evsel__is_group_leader(evsel) &&
1115            !opts->initial_delay)
1116                attr->enable_on_exec = 1;
1117
1118        if (evsel->immediate) {
1119                attr->disabled = 0;
1120                attr->enable_on_exec = 0;
1121        }
1122
1123        clockid = opts->clockid;
1124        if (opts->use_clockid) {
1125                attr->use_clockid = 1;
1126                attr->clockid = opts->clockid;
1127        }
1128
1129        if (evsel->precise_max)
1130                attr->precise_ip = 3;
1131
1132        if (opts->all_user) {
1133                attr->exclude_kernel = 1;
1134                attr->exclude_user   = 0;
1135        }
1136
1137        if (opts->all_kernel) {
1138                attr->exclude_kernel = 0;
1139                attr->exclude_user   = 1;
1140        }
1141
1142        if (evsel->core.own_cpus || evsel->unit)
1143                evsel->core.attr.read_format |= PERF_FORMAT_ID;
1144
1145        /*
 1146         * Apply event specific term settings;
 1147         * they override any global configuration.
1148         */
1149        evsel__apply_config_terms(evsel, opts, track);
1150
1151        evsel->ignore_missing_thread = opts->ignore_missing_thread;
1152
 1153        /* The --period option takes precedence. */
1154        if (opts->period_set) {
1155                if (opts->period)
1156                        evsel__set_sample_bit(evsel, PERIOD);
1157                else
1158                        evsel__reset_sample_bit(evsel, PERIOD);
1159        }
1160
1161        /*
1162         * A dummy event never triggers any actual counter and therefore
1163         * cannot be used with branch_stack.
1164         *
1165         * For initial_delay, a dummy event is added implicitly.
 1166         * The software event will error out with -EOPNOTSUPP
 1167         * if the BRANCH_STACK bit is set.
1168         */
1169        if (evsel__is_dummy_event(evsel))
1170                evsel__reset_sample_bit(evsel, BRANCH_STACK);
1171}
1172
1173int evsel__set_filter(struct evsel *evsel, const char *filter)
1174{
1175        char *new_filter = strdup(filter);
1176
1177        if (new_filter != NULL) {
1178                free(evsel->filter);
1179                evsel->filter = new_filter;
1180                return 0;
1181        }
1182
1183        return -1;
1184}
1185
1186static int evsel__append_filter(struct evsel *evsel, const char *fmt, const char *filter)
1187{
1188        char *new_filter;
1189
1190        if (evsel->filter == NULL)
1191                return evsel__set_filter(evsel, filter);
1192
1193        if (asprintf(&new_filter, fmt, evsel->filter, filter) > 0) {
1194                free(evsel->filter);
1195                evsel->filter = new_filter;
1196                return 0;
1197        }
1198
1199        return -1;
1200}
1201
1202int evsel__append_tp_filter(struct evsel *evsel, const char *filter)
1203{
1204        return evsel__append_filter(evsel, "(%s) && (%s)", filter);
1205}
1206
1207int evsel__append_addr_filter(struct evsel *evsel, const char *filter)
1208{
1209        return evsel__append_filter(evsel, "%s,%s", filter);
1210}
1211
1212/* Caller has to clear disabled after going through all CPUs. */
1213int evsel__enable_cpu(struct evsel *evsel, int cpu)
1214{
1215        return perf_evsel__enable_cpu(&evsel->core, cpu);
1216}
1217
1218int evsel__enable(struct evsel *evsel)
1219{
1220        int err = perf_evsel__enable(&evsel->core);
1221
1222        if (!err)
1223                evsel->disabled = false;
1224        return err;
1225}
1226
1227/* Caller has to set disabled after going through all CPUs. */
1228int evsel__disable_cpu(struct evsel *evsel, int cpu)
1229{
1230        return perf_evsel__disable_cpu(&evsel->core, cpu);
1231}
1232
1233int evsel__disable(struct evsel *evsel)
1234{
1235        int err = perf_evsel__disable(&evsel->core);
1236        /*
 1237         * We mark it disabled here so that tools that disable an event can
 1238         * ignore events after they disable it. I.e. the ring buffer may
 1239         * already have a few more events queued up before the kernel got the
 1240         * stop request.
1241         */
1242        if (!err)
1243                evsel->disabled = true;
1244
1245        return err;
1246}
1247
1248static void evsel__free_config_terms(struct evsel *evsel)
1249{
1250        struct evsel_config_term *term, *h;
1251
1252        list_for_each_entry_safe(term, h, &evsel->config_terms, list) {
1253                list_del_init(&term->list);
1254                if (term->free_str)
1255                        zfree(&term->val.str);
1256                free(term);
1257        }
1258}
1259
1260void evsel__exit(struct evsel *evsel)
1261{
1262        assert(list_empty(&evsel->core.node));
1263        assert(evsel->evlist == NULL);
1264        evsel__free_counts(evsel);
1265        perf_evsel__free_fd(&evsel->core);
1266        perf_evsel__free_id(&evsel->core);
1267        evsel__free_config_terms(evsel);
1268        cgroup__put(evsel->cgrp);
1269        perf_cpu_map__put(evsel->core.cpus);
1270        perf_cpu_map__put(evsel->core.own_cpus);
1271        perf_thread_map__put(evsel->core.threads);
1272        zfree(&evsel->group_name);
1273        zfree(&evsel->name);
1274        zfree(&evsel->pmu_name);
1275        zfree(&evsel->per_pkg_mask);
1276        zfree(&evsel->metric_events);
1277        perf_evsel__object.fini(evsel);
1278}
1279
1280void evsel__delete(struct evsel *evsel)
1281{
1282        evsel__exit(evsel);
1283        free(evsel);
1284}
1285
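/*
 * Turn the counter values just read into deltas against the previously saved
 * raw counts, and stash the new raw values for the next read.
 */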
1286void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread,
1287                           struct perf_counts_values *count)
1288{
1289        struct perf_counts_values tmp;
1290
1291        if (!evsel->prev_raw_counts)
1292                return;
1293
1294        if (cpu == -1) {
1295                tmp = evsel->prev_raw_counts->aggr;
1296                evsel->prev_raw_counts->aggr = *count;
1297        } else {
1298                tmp = *perf_counts(evsel->prev_raw_counts, cpu, thread);
1299                *perf_counts(evsel->prev_raw_counts, cpu, thread) = *count;
1300        }
1301
1302        count->val = count->val - tmp.val;
1303        count->ena = count->ena - tmp.ena;
1304        count->run = count->run - tmp.run;
1305}
1306
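/*
 * Compensate for multiplexing: if the event ran for less time than it was
 * enabled, scale val by ena/run. *pscaled is set to -1 if the event never ran,
 * 1 if the value was scaled and 0 otherwise.
 */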
1307void perf_counts_values__scale(struct perf_counts_values *count,
1308                               bool scale, s8 *pscaled)
1309{
1310        s8 scaled = 0;
1311
1312        if (scale) {
1313                if (count->run == 0) {
1314                        scaled = -1;
1315                        count->val = 0;
1316                } else if (count->run < count->ena) {
1317                        scaled = 1;
1318                        count->val = (u64)((double) count->val * count->ena / count->run);
1319                }
1320        }
1321
1322        if (pscaled)
1323                *pscaled = scaled;
1324}
1325
1326static int evsel__read_one(struct evsel *evsel, int cpu, int thread)
1327{
1328        struct perf_counts_values *count = perf_counts(evsel->counts, cpu, thread);
1329
1330        return perf_evsel__read(&evsel->core, cpu, thread, count);
1331}
1332
1333static void
1334perf_evsel__set_count(struct evsel *counter, int cpu, int thread,
1335                      u64 val, u64 ena, u64 run)
1336{
1337        struct perf_counts_values *count;
1338
1339        count = perf_counts(counter->counts, cpu, thread);
1340
1341        count->val    = val;
1342        count->ena    = ena;
1343        count->run    = run;
1344
1345        perf_counts__set_loaded(counter->counts, cpu, thread, true);
1346}
1347
1348static int
1349perf_evsel__process_group_data(struct evsel *leader,
1350                               int cpu, int thread, u64 *data)
1351{
1352        u64 read_format = leader->core.attr.read_format;
1353        struct sample_read_value *v;
1354        u64 nr, ena = 0, run = 0, i;
1355
1356        nr = *data++;
1357
1358        if (nr != (u64) leader->core.nr_members)
1359                return -EINVAL;
1360
1361        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1362                ena = *data++;
1363
1364        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1365                run = *data++;
1366
1367        v = (struct sample_read_value *) data;
1368
1369        perf_evsel__set_count(leader, cpu, thread,
1370                              v[0].value, ena, run);
1371
1372        for (i = 1; i < nr; i++) {
1373                struct evsel *counter;
1374
1375                counter = perf_evlist__id2evsel(leader->evlist, v[i].id);
1376                if (!counter)
1377                        return -EINVAL;
1378
1379                perf_evsel__set_count(counter, cpu, thread,
1380                                      v[i].value, ena, run);
1381        }
1382
1383        return 0;
1384}
1385
1386static int evsel__read_group(struct evsel *leader, int cpu, int thread)
1387{
1388        struct perf_stat_evsel *ps = leader->stats;
1389        u64 read_format = leader->core.attr.read_format;
1390        int size = perf_evsel__read_size(&leader->core);
1391        u64 *data = ps->group_data;
1392
1393        if (!(read_format & PERF_FORMAT_ID))
1394                return -EINVAL;
1395
1396        if (!evsel__is_group_leader(leader))
1397                return -EINVAL;
1398
1399        if (!data) {
1400                data = zalloc(size);
1401                if (!data)
1402                        return -ENOMEM;
1403
1404                ps->group_data = data;
1405        }
1406
1407        if (FD(leader, cpu, thread) < 0)
1408                return -EINVAL;
1409
1410        if (readn(FD(leader, cpu, thread), data, size) <= 0)
1411                return -errno;
1412
1413        return perf_evsel__process_group_data(leader, cpu, thread, data);
1414}
1415
1416int evsel__read_counter(struct evsel *evsel, int cpu, int thread)
1417{
1418        u64 read_format = evsel->core.attr.read_format;
1419
1420        if (read_format & PERF_FORMAT_GROUP)
1421                return evsel__read_group(evsel, cpu, thread);
1422
1423        return evsel__read_one(evsel, cpu, thread);
1424}
1425
1426int __evsel__read_on_cpu(struct evsel *evsel, int cpu, int thread, bool scale)
1427{
1428        struct perf_counts_values count;
1429        size_t nv = scale ? 3 : 1;
1430
1431        if (FD(evsel, cpu, thread) < 0)
1432                return -EINVAL;
1433
1434        if (evsel->counts == NULL && evsel__alloc_counts(evsel, cpu + 1, thread + 1) < 0)
1435                return -ENOMEM;
1436
1437        if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) <= 0)
1438                return -errno;
1439
1440        evsel__compute_deltas(evsel, cpu, thread, &count);
1441        perf_counts_values__scale(&count, scale, NULL);
1442        *perf_counts(evsel->counts, cpu, thread) = count;
1443        return 0;
1444}
1445
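/*
 * Group members pass the fd of their already opened group leader as group_fd
 * to sys_perf_event_open(); group leaders (and independent events) pass -1.
 */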
1446static int get_group_fd(struct evsel *evsel, int cpu, int thread)
1447{
1448        struct evsel *leader = evsel->leader;
1449        int fd;
1450
1451        if (evsel__is_group_leader(evsel))
1452                return -1;
1453
1454        /*
 1455         * The leader must already be processed/open;
 1456         * if not, it's a bug.
1457         */
1458        BUG_ON(!leader->core.fd);
1459
1460        fd = FD(leader, cpu, thread);
1461        BUG_ON(fd == -1);
1462
1463        return fd;
1464}
1465
1466static void perf_evsel__remove_fd(struct evsel *pos,
1467                                  int nr_cpus, int nr_threads,
1468                                  int thread_idx)
1469{
1470        for (int cpu = 0; cpu < nr_cpus; cpu++)
1471                for (int thread = thread_idx; thread < nr_threads - 1; thread++)
1472                        FD(pos, cpu, thread) = FD(pos, cpu, thread + 1);
1473}
1474
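/*
 * Drop the fd slot of a thread that failed to open by shifting the remaining
 * fds left, for every evsel opened so far. For the evsel currently being
 * opened only the CPUs before cpu_idx have valid fds.
 */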
1475static int update_fds(struct evsel *evsel,
1476                      int nr_cpus, int cpu_idx,
1477                      int nr_threads, int thread_idx)
1478{
1479        struct evsel *pos;
1480
1481        if (cpu_idx >= nr_cpus || thread_idx >= nr_threads)
1482                return -EINVAL;
1483
1484        evlist__for_each_entry(evsel->evlist, pos) {
1485                nr_cpus = pos != evsel ? nr_cpus : cpu_idx;
1486
1487                perf_evsel__remove_fd(pos, nr_cpus, nr_threads, thread_idx);
1488
1489                /*
 1490                 * Since the fds for the next evsel have not been created,
 1491                 * there is no need to iterate over the whole event list.
1492                 */
1493                if (pos == evsel)
1494                        break;
1495        }
1496        return 0;
1497}
1498
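/*
 * If opening failed with -ESRCH because a target thread exited, and the user
 * asked for it (evsel->ignore_missing_thread), drop that thread from the
 * thread map and its fd slots, and tell the caller the failure can be ignored.
 */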
1499static bool ignore_missing_thread(struct evsel *evsel,
1500                                  int nr_cpus, int cpu,
1501                                  struct perf_thread_map *threads,
1502                                  int thread, int err)
1503{
1504        pid_t ignore_pid = perf_thread_map__pid(threads, thread);
1505
1506        if (!evsel->ignore_missing_thread)
1507                return false;
1508
1509        /* The system wide setup does not work with threads. */
1510        if (evsel->core.system_wide)
1511                return false;
1512
 1513        /* -ESRCH is the perf event syscall errno for pids that are not found. */
1514        if (err != -ESRCH)
1515                return false;
1516
1517        /* If there's only one thread, let it fail. */
1518        if (threads->nr == 1)
1519                return false;
1520
1521        /*
 1522         * We should remove the fd for the missing thread first,
 1523         * because thread_map__remove() will decrease threads->nr.
1524         */
1525        if (update_fds(evsel, nr_cpus, cpu, threads->nr, thread))
1526                return false;
1527
1528        if (thread_map__remove(threads, thread))
1529                return false;
1530
1531        pr_warning("WARNING: Ignored open failure for pid %d\n",
1532                   ignore_pid);
1533        return true;
1534}
1535
1536static int __open_attr__fprintf(FILE *fp, const char *name, const char *val,
1537                                void *priv __maybe_unused)
1538{
1539        return fprintf(fp, "  %-32s %s\n", name, val);
1540}
1541
1542static void display_attr(struct perf_event_attr *attr)
1543{
1544        if (verbose >= 2 || debug_peo_args) {
1545                fprintf(stderr, "%.60s\n", graph_dotted_line);
1546                fprintf(stderr, "perf_event_attr:\n");
1547                perf_event_attr__fprintf(stderr, attr, __open_attr__fprintf, NULL);
1548                fprintf(stderr, "%.60s\n", graph_dotted_line);
1549        }
1550}
1551
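/*
 * Wrapper around sys_perf_event_open() that, when evsel->precise_max is set,
 * retries with a lower precise_ip on every failure until the open succeeds
 * or precise_ip reaches zero, at which point the original value is restored
 * and the error is left to the regular fallback handling.
 */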
1552static int perf_event_open(struct evsel *evsel,
1553                           pid_t pid, int cpu, int group_fd,
1554                           unsigned long flags)
1555{
1556        int precise_ip = evsel->core.attr.precise_ip;
1557        int fd;
1558
1559        while (1) {
1560                pr_debug2_peo("sys_perf_event_open: pid %d  cpu %d  group_fd %d  flags %#lx",
1561                          pid, cpu, group_fd, flags);
1562
1563                fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, group_fd, flags);
1564                if (fd >= 0)
1565                        break;
1566
1567                /* Do not try less precise if not requested. */
1568                if (!evsel->precise_max)
1569                        break;
1570
1571                /*
1572                 * We tried all the precise_ip values, and it's
1573                 * still failing, so leave it to standard fallback.
1574                 */
1575                if (!evsel->core.attr.precise_ip) {
1576                        evsel->core.attr.precise_ip = precise_ip;
1577                        break;
1578                }
1579
1580                pr_debug2_peo("\nsys_perf_event_open failed, error %d\n", -ENOTSUP);
1581                evsel->core.attr.precise_ip--;
1582                pr_debug2_peo("decreasing precise_ip by one (%d)\n", evsel->core.attr.precise_ip);
1583                display_attr(&evsel->core.attr);
1584        }
1585
1586        return fd;
1587}
1588
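/*
 * Open the event on every (cpu, thread) pair in [start_cpu, end_cpu), storing
 * the resulting fds in evsel->core.fd.  Handles group fds, cgroup opens,
 * raising RLIMIT_NOFILE on EMFILE, and switching off perf_event_attr features
 * the running kernel does not support before retrying.
 */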
1589static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
1590                struct perf_thread_map *threads,
1591                int start_cpu, int end_cpu)
1592{
1593        int cpu, thread, nthreads;
1594        unsigned long flags = PERF_FLAG_FD_CLOEXEC;
1595        int pid = -1, err, old_errno;
1596        enum { NO_CHANGE, SET_TO_MAX, INCREASED_MAX } set_rlimit = NO_CHANGE;
1597
1598        if ((perf_missing_features.write_backward && evsel->core.attr.write_backward) ||
1599            (perf_missing_features.aux_output     && evsel->core.attr.aux_output))
1600                return -EINVAL;
1601
1602        if (cpus == NULL) {
1603                static struct perf_cpu_map *empty_cpu_map;
1604
1605                if (empty_cpu_map == NULL) {
1606                        empty_cpu_map = perf_cpu_map__dummy_new();
1607                        if (empty_cpu_map == NULL)
1608                                return -ENOMEM;
1609                }
1610
1611                cpus = empty_cpu_map;
1612        }
1613
1614        if (threads == NULL) {
1615                static struct perf_thread_map *empty_thread_map;
1616
1617                if (empty_thread_map == NULL) {
1618                        empty_thread_map = thread_map__new_by_tid(-1);
1619                        if (empty_thread_map == NULL)
1620                                return -ENOMEM;
1621                }
1622
1623                threads = empty_thread_map;
1624        }
1625
1626        if (evsel->core.system_wide)
1627                nthreads = 1;
1628        else
1629                nthreads = threads->nr;
1630
1631        if (evsel->core.fd == NULL &&
1632            perf_evsel__alloc_fd(&evsel->core, cpus->nr, nthreads) < 0)
1633                return -ENOMEM;
1634
1635        if (evsel->cgrp) {
1636                flags |= PERF_FLAG_PID_CGROUP;
1637                pid = evsel->cgrp->fd;
1638        }
1639
1640fallback_missing_features:
1641        if (perf_missing_features.clockid_wrong)
1642                evsel->core.attr.clockid = CLOCK_MONOTONIC; /* should always work */
1643        if (perf_missing_features.clockid) {
1644                evsel->core.attr.use_clockid = 0;
1645                evsel->core.attr.clockid = 0;
1646        }
1647        if (perf_missing_features.cloexec)
1648                flags &= ~(unsigned long)PERF_FLAG_FD_CLOEXEC;
1649        if (perf_missing_features.mmap2)
1650                evsel->core.attr.mmap2 = 0;
1651        if (perf_missing_features.exclude_guest)
1652                evsel->core.attr.exclude_guest = evsel->core.attr.exclude_host = 0;
1653        if (perf_missing_features.lbr_flags)
1654                evsel->core.attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS |
1655                                     PERF_SAMPLE_BRANCH_NO_CYCLES);
1656        if (perf_missing_features.group_read && evsel->core.attr.inherit)
1657                evsel->core.attr.read_format &= ~(PERF_FORMAT_GROUP|PERF_FORMAT_ID);
1658        if (perf_missing_features.ksymbol)
1659                evsel->core.attr.ksymbol = 0;
1660        if (perf_missing_features.bpf)
1661                evsel->core.attr.bpf_event = 0;
1662        if (perf_missing_features.branch_hw_idx)
1663                evsel->core.attr.branch_sample_type &= ~PERF_SAMPLE_BRANCH_HW_INDEX;
1664retry_sample_id:
1665        if (perf_missing_features.sample_id_all)
1666                evsel->core.attr.sample_id_all = 0;
1667
1668        display_attr(&evsel->core.attr);
1669
1670        for (cpu = start_cpu; cpu < end_cpu; cpu++) {
1671
1672                for (thread = 0; thread < nthreads; thread++) {
1673                        int fd, group_fd;
1674
1675                        if (!evsel->cgrp && !evsel->core.system_wide)
1676                                pid = perf_thread_map__pid(threads, thread);
1677
1678                        group_fd = get_group_fd(evsel, cpu, thread);
1679retry_open:
1680                        test_attr__ready();
1681
1682                        fd = perf_event_open(evsel, pid, cpus->map[cpu],
1683                                             group_fd, flags);
1684
1685                        FD(evsel, cpu, thread) = fd;
1686
1687                        if (fd < 0) {
1688                                err = -errno;
1689
1690                                if (ignore_missing_thread(evsel, cpus->nr, cpu, threads, thread, err)) {
1691                                        /*
1692                                         * We just removed 1 thread, so take a step
1693                                         * back on thread index and lower the upper
1694                                         * nthreads limit.
1695                                         */
1696                                        nthreads--;
1697                                        thread--;
1698
1699                        /* ... and pretend that nothing has happened. */
1700                                        err = 0;
1701                                        continue;
1702                                }
1703
1704                                pr_debug2_peo("\nsys_perf_event_open failed, error %d\n",
1705                                          err);
1706                                goto try_fallback;
1707                        }
1708
1709                        pr_debug2_peo(" = %d\n", fd);
1710
1711                        if (evsel->bpf_fd >= 0) {
1712                                int evt_fd = fd;
1713                                int bpf_fd = evsel->bpf_fd;
1714
1715                                err = ioctl(evt_fd,
1716                                            PERF_EVENT_IOC_SET_BPF,
1717                                            bpf_fd);
1718                                if (err && errno != EEXIST) {
1719                                        pr_err("failed to attach bpf fd %d: %s\n",
1720                                               bpf_fd, strerror(errno));
1721                                        err = -EINVAL;
1722                                        goto out_close;
1723                                }
1724                        }
1725
1726                        set_rlimit = NO_CHANGE;
1727
1728                        /*
1729                         * If we succeeded but had to kill clockid, fail and
1730                         * have evsel__open_strerror() print us a nice error.
1731                         */
1732                        if (perf_missing_features.clockid ||
1733                            perf_missing_features.clockid_wrong) {
1734                                err = -EINVAL;
1735                                goto out_close;
1736                        }
1737                }
1738        }
1739
1740        return 0;
1741
1742try_fallback:
1743        /*
1744         * perf stat needs between 5 and 22 fds per CPU. When we run out
1745         * of them, try to increase the limits.
1746         */
1747        if (err == -EMFILE && set_rlimit < INCREASED_MAX) {
1748                struct rlimit l;
1749
1750                old_errno = errno;
1751                if (getrlimit(RLIMIT_NOFILE, &l) == 0) {
1752                        if (set_rlimit == NO_CHANGE)
1753                                l.rlim_cur = l.rlim_max;
1754                        else {
1755                                l.rlim_cur = l.rlim_max + 1000;
1756                                l.rlim_max = l.rlim_cur;
1757                        }
1758                        if (setrlimit(RLIMIT_NOFILE, &l) == 0) {
1759                                set_rlimit++;
1760                                errno = old_errno;
1761                                goto retry_open;
1762                        }
1763                }
1764                errno = old_errno;
1765        }
1766
1767        if (err != -EINVAL || cpu > 0 || thread > 0)
1768                goto out_close;
1769
1770        /*
1771         * Must probe features in the order they were added to the
1772         * perf_event_attr interface.
1773         */
1774        if (!perf_missing_features.cgroup && evsel->core.attr.cgroup) {
1775                perf_missing_features.cgroup = true;
1776                pr_debug2_peo("Kernel has no cgroup sampling support, bailing out\n");
1777                goto out_close;
1778        } else if (!perf_missing_features.branch_hw_idx &&
1779            (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)) {
1780                perf_missing_features.branch_hw_idx = true;
1781                pr_debug2("switching off branch HW index support\n");
1782                goto fallback_missing_features;
1783        } else if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) {
1784                perf_missing_features.aux_output = true;
1785                pr_debug2_peo("Kernel has no attr.aux_output support, bailing out\n");
1786                goto out_close;
1787        } else if (!perf_missing_features.bpf && evsel->core.attr.bpf_event) {
1788                perf_missing_features.bpf = true;
1789                pr_debug2_peo("switching off bpf_event\n");
1790                goto fallback_missing_features;
1791        } else if (!perf_missing_features.ksymbol && evsel->core.attr.ksymbol) {
1792                perf_missing_features.ksymbol = true;
1793                pr_debug2_peo("switching off ksymbol\n");
1794                goto fallback_missing_features;
1795        } else if (!perf_missing_features.write_backward && evsel->core.attr.write_backward) {
1796                perf_missing_features.write_backward = true;
1797                pr_debug2_peo("switching off write_backward\n");
1798                goto out_close;
1799        } else if (!perf_missing_features.clockid_wrong && evsel->core.attr.use_clockid) {
1800                perf_missing_features.clockid_wrong = true;
1801                pr_debug2_peo("switching off clockid\n");
1802                goto fallback_missing_features;
1803        } else if (!perf_missing_features.clockid && evsel->core.attr.use_clockid) {
1804                perf_missing_features.clockid = true;
1805                pr_debug2_peo("switching off use_clockid\n");
1806                goto fallback_missing_features;
1807        } else if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) {
1808                perf_missing_features.cloexec = true;
1809                pr_debug2_peo("switching off cloexec flag\n");
1810                goto fallback_missing_features;
1811        } else if (!perf_missing_features.mmap2 && evsel->core.attr.mmap2) {
1812                perf_missing_features.mmap2 = true;
1813                pr_debug2_peo("switching off mmap2\n");
1814                goto fallback_missing_features;
1815        } else if (!perf_missing_features.exclude_guest &&
1816                   (evsel->core.attr.exclude_guest || evsel->core.attr.exclude_host)) {
1817                perf_missing_features.exclude_guest = true;
1818                pr_debug2_peo("switching off exclude_guest, exclude_host\n");
1819                goto fallback_missing_features;
1820        } else if (!perf_missing_features.sample_id_all) {
1821                perf_missing_features.sample_id_all = true;
1822                pr_debug2_peo("switching off sample_id_all\n");
1823                goto retry_sample_id;
1824        } else if (!perf_missing_features.lbr_flags &&
1825                        (evsel->core.attr.branch_sample_type &
1826                         (PERF_SAMPLE_BRANCH_NO_CYCLES |
1827                          PERF_SAMPLE_BRANCH_NO_FLAGS))) {
1828                perf_missing_features.lbr_flags = true;
1829                pr_debug2_peo("switching off branch sample type no (cycles/flags)\n");
1830                goto fallback_missing_features;
1831        } else if (!perf_missing_features.group_read &&
1832                    evsel->core.attr.inherit &&
1833                   (evsel->core.attr.read_format & PERF_FORMAT_GROUP) &&
1834                   evsel__is_group_leader(evsel)) {
1835                perf_missing_features.group_read = true;
1836                pr_debug2_peo("switching off group read\n");
1837                goto fallback_missing_features;
1838        }
1839out_close:
1840        if (err)
1841                threads->err_thread = thread;
1842
1843        old_errno = errno;
1844        do {
1845                while (--thread >= 0) {
1846                        if (FD(evsel, cpu, thread) >= 0)
1847                                close(FD(evsel, cpu, thread));
1848                        FD(evsel, cpu, thread) = -1;
1849                }
1850                thread = nthreads;
1851        } while (--cpu >= 0);
1852        errno = old_errno;
1853        return err;
1854}
1855
1856int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
1857                struct perf_thread_map *threads)
1858{
1859        return evsel__open_cpu(evsel, cpus, threads, 0, cpus ? cpus->nr : 1);
1860}
1861
1862void evsel__close(struct evsel *evsel)
1863{
1864        perf_evsel__close(&evsel->core);
1865        perf_evsel__free_id(&evsel->core);
1866}
1867
1868int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu)
1869{
1870        if (cpu == -1)
1871                return evsel__open_cpu(evsel, cpus, NULL, 0,
1872                                        cpus ? cpus->nr : 1);
1873
1874        return evsel__open_cpu(evsel, cpus, NULL, cpu, cpu + 1);
1875}
1876
1877int evsel__open_per_thread(struct evsel *evsel, struct perf_thread_map *threads)
1878{
1879        return evsel__open(evsel, NULL, threads);
1880}
1881
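/*
 * Parse the sample_id_all trailer appended to non-sample events, walking
 * backwards from the end of the record.
 */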
1882static int perf_evsel__parse_id_sample(const struct evsel *evsel,
1883                                       const union perf_event *event,
1884                                       struct perf_sample *sample)
1885{
1886        u64 type = evsel->core.attr.sample_type;
1887        const __u64 *array = event->sample.array;
1888        bool swapped = evsel->needs_swap;
1889        union u64_swap u;
1890
1891        array += ((event->header.size -
1892                   sizeof(event->header)) / sizeof(u64)) - 1;
1893
1894        if (type & PERF_SAMPLE_IDENTIFIER) {
1895                sample->id = *array;
1896                array--;
1897        }
1898
1899        if (type & PERF_SAMPLE_CPU) {
1900                u.val64 = *array;
1901                if (swapped) {
1902                        /* undo swap of u64, then swap on individual u32s */
1903                        u.val64 = bswap_64(u.val64);
1904                        u.val32[0] = bswap_32(u.val32[0]);
1905                }
1906
1907                sample->cpu = u.val32[0];
1908                array--;
1909        }
1910
1911        if (type & PERF_SAMPLE_STREAM_ID) {
1912                sample->stream_id = *array;
1913                array--;
1914        }
1915
1916        if (type & PERF_SAMPLE_ID) {
1917                sample->id = *array;
1918                array--;
1919        }
1920
1921        if (type & PERF_SAMPLE_TIME) {
1922                sample->time = *array;
1923                array--;
1924        }
1925
1926        if (type & PERF_SAMPLE_TID) {
1927                u.val64 = *array;
1928                if (swapped) {
1929                        /* undo swap of u64, then swap on individual u32s */
1930                        u.val64 = bswap_64(u.val64);
1931                        u.val32[0] = bswap_32(u.val32[0]);
1932                        u.val32[1] = bswap_32(u.val32[1]);
1933                }
1934
1935                sample->pid = u.val32[0];
1936                sample->tid = u.val32[1];
1937                array--;
1938        }
1939
1940        return 0;
1941}
1942
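/*
 * Bounds checks used while parsing samples: a field of @size bytes at
 * @offset must not be larger than the event's maximum size nor run past
 * @endp, the end of the event record.
 */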
1943static inline bool overflow(const void *endp, u16 max_size, const void *offset,
1944                            u64 size)
1945{
1946        return size > max_size || offset + size > endp;
1947}
1948
1949#define OVERFLOW_CHECK(offset, size, max_size)                          \
1950        do {                                                            \
1951                if (overflow(endp, (max_size), (offset), (size)))       \
1952                        return -EFAULT;                                 \
1953        } while (0)
1954
1955#define OVERFLOW_CHECK_u64(offset) \
1956        OVERFLOW_CHECK(offset, sizeof(u64), sizeof(u64))
1957
1958static int
1959perf_event__check_size(union perf_event *event, unsigned int sample_size)
1960{
1961        /*
1962         * The evsel's sample_size is based on PERF_SAMPLE_MASK, which includes
1963         * up to PERF_SAMPLE_PERIOD.  After that, overflow() must be used to
1964         * check that the format does not go past the end of the event.
1965         */
1966        if (sample_size + sizeof(event->header) > event->header.size)
1967                return -EFAULT;
1968
1969        return 0;
1970}
1971
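/*
 * Parse a PERF_RECORD_SAMPLE (or the sample_id_all trailer of other record
 * types) into @data.  Fields are consumed in the order of their
 * PERF_SAMPLE_* bits in the evsel's sample_type, with OVERFLOW_CHECK()
 * guarding the variable-sized ones.
 */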
1972int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
1973                        struct perf_sample *data)
1974{
1975        u64 type = evsel->core.attr.sample_type;
1976        bool swapped = evsel->needs_swap;
1977        const __u64 *array;
1978        u16 max_size = event->header.size;
1979        const void *endp = (void *)event + max_size;
1980        u64 sz;
1981
1982        /*
1983         * used for cross-endian analysis. See git commit 65014ab3
1984         * for why this goofiness is needed.
1985         */
1986        union u64_swap u;
1987
1988        memset(data, 0, sizeof(*data));
1989        data->cpu = data->pid = data->tid = -1;
1990        data->stream_id = data->id = data->time = -1ULL;
1991        data->period = evsel->core.attr.sample_period;
1992        data->cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
1993        data->misc    = event->header.misc;
1994        data->id = -1ULL;
1995        data->data_src = PERF_MEM_DATA_SRC_NONE;
1996
1997        if (event->header.type != PERF_RECORD_SAMPLE) {
1998                if (!evsel->core.attr.sample_id_all)
1999                        return 0;
2000                return perf_evsel__parse_id_sample(evsel, event, data);
2001        }
2002
2003        array = event->sample.array;
2004
2005        if (perf_event__check_size(event, evsel->sample_size))
2006                return -EFAULT;
2007
2008        if (type & PERF_SAMPLE_IDENTIFIER) {
2009                data->id = *array;
2010                array++;
2011        }
2012
2013        if (type & PERF_SAMPLE_IP) {
2014                data->ip = *array;
2015                array++;
2016        }
2017
2018        if (type & PERF_SAMPLE_TID) {
2019                u.val64 = *array;
2020                if (swapped) {
2021                        /* undo swap of u64, then swap on individual u32s */
2022                        u.val64 = bswap_64(u.val64);
2023                        u.val32[0] = bswap_32(u.val32[0]);
2024                        u.val32[1] = bswap_32(u.val32[1]);
2025                }
2026
2027                data->pid = u.val32[0];
2028                data->tid = u.val32[1];
2029                array++;
2030        }
2031
2032        if (type & PERF_SAMPLE_TIME) {
2033                data->time = *array;
2034                array++;
2035        }
2036
2037        if (type & PERF_SAMPLE_ADDR) {
2038                data->addr = *array;
2039                array++;
2040        }
2041
2042        if (type & PERF_SAMPLE_ID) {
2043                data->id = *array;
2044                array++;
2045        }
2046
2047        if (type & PERF_SAMPLE_STREAM_ID) {
2048                data->stream_id = *array;
2049                array++;
2050        }
2051
2052        if (type & PERF_SAMPLE_CPU) {
2053
2054                u.val64 = *array;
2055                if (swapped) {
2056                        /* undo swap of u64, then swap on individual u32s */
2057                        u.val64 = bswap_64(u.val64);
2058                        u.val32[0] = bswap_32(u.val32[0]);
2059                }
2060
2061                data->cpu = u.val32[0];
2062                array++;
2063        }
2064
2065        if (type & PERF_SAMPLE_PERIOD) {
2066                data->period = *array;
2067                array++;
2068        }
2069
2070        if (type & PERF_SAMPLE_READ) {
2071                u64 read_format = evsel->core.attr.read_format;
2072
2073                OVERFLOW_CHECK_u64(array);
2074                if (read_format & PERF_FORMAT_GROUP)
2075                        data->read.group.nr = *array;
2076                else
2077                        data->read.one.value = *array;
2078
2079                array++;
2080
2081                if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2082                        OVERFLOW_CHECK_u64(array);
2083                        data->read.time_enabled = *array;
2084                        array++;
2085                }
2086
2087                if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2088                        OVERFLOW_CHECK_u64(array);
2089                        data->read.time_running = *array;
2090                        array++;
2091                }
2092
2093                /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */
2094                if (read_format & PERF_FORMAT_GROUP) {
2095                        const u64 max_group_nr = UINT64_MAX /
2096                                        sizeof(struct sample_read_value);
2097
2098                        if (data->read.group.nr > max_group_nr)
2099                                return -EFAULT;
2100                        sz = data->read.group.nr *
2101                             sizeof(struct sample_read_value);
2102                        OVERFLOW_CHECK(array, sz, max_size);
2103                        data->read.group.values =
2104                                        (struct sample_read_value *)array;
2105                        array = (void *)array + sz;
2106                } else {
2107                        OVERFLOW_CHECK_u64(array);
2108                        data->read.one.id = *array;
2109                        array++;
2110                }
2111        }
2112
2113        if (type & PERF_SAMPLE_CALLCHAIN) {
2114                const u64 max_callchain_nr = UINT64_MAX / sizeof(u64);
2115
2116                OVERFLOW_CHECK_u64(array);
2117                data->callchain = (struct ip_callchain *)array++;
2118                if (data->callchain->nr > max_callchain_nr)
2119                        return -EFAULT;
2120                sz = data->callchain->nr * sizeof(u64);
2121                OVERFLOW_CHECK(array, sz, max_size);
2122                array = (void *)array + sz;
2123        }
2124
2125        if (type & PERF_SAMPLE_RAW) {
2126                OVERFLOW_CHECK_u64(array);
2127                u.val64 = *array;
2128
2129                /*
2130                 * Undo swap of u64, then swap on individual u32s to
2131                 * get the size of the raw area and undo all of the
2132                 * swap. The pevent interface handles endianness by
2133                 * itself.
2134                 */
2135                if (swapped) {
2136                        u.val64 = bswap_64(u.val64);
2137                        u.val32[0] = bswap_32(u.val32[0]);
2138                        u.val32[1] = bswap_32(u.val32[1]);
2139                }
2140                data->raw_size = u.val32[0];
2141
2142                /*
2143                 * The raw data is aligned on 64 bits, including the
2144                 * u32 size, so it's safe to use mem_bswap_64.
2145                 */
2146                if (swapped)
2147                        mem_bswap_64((void *) array, data->raw_size);
2148
2149                array = (void *)array + sizeof(u32);
2150
2151                OVERFLOW_CHECK(array, data->raw_size, max_size);
2152                data->raw_data = (void *)array;
2153                array = (void *)array + data->raw_size;
2154        }
2155
2156        if (type & PERF_SAMPLE_BRANCH_STACK) {
2157                const u64 max_branch_nr = UINT64_MAX /
2158                                          sizeof(struct branch_entry);
2159
2160                OVERFLOW_CHECK_u64(array);
2161                data->branch_stack = (struct branch_stack *)array++;
2162
2163                if (data->branch_stack->nr > max_branch_nr)
2164                        return -EFAULT;
2165
2166                sz = data->branch_stack->nr * sizeof(struct branch_entry);
2167                if (evsel__has_branch_hw_idx(evsel))
2168                        sz += sizeof(u64);
2169                else
2170                        data->no_hw_idx = true;
2171                OVERFLOW_CHECK(array, sz, max_size);
2172                array = (void *)array + sz;
2173        }
2174
2175        if (type & PERF_SAMPLE_REGS_USER) {
2176                OVERFLOW_CHECK_u64(array);
2177                data->user_regs.abi = *array;
2178                array++;
2179
2180                if (data->user_regs.abi) {
2181                        u64 mask = evsel->core.attr.sample_regs_user;
2182
2183                        sz = hweight64(mask) * sizeof(u64);
2184                        OVERFLOW_CHECK(array, sz, max_size);
2185                        data->user_regs.mask = mask;
2186                        data->user_regs.regs = (u64 *)array;
2187                        array = (void *)array + sz;
2188                }
2189        }
2190
2191        if (type & PERF_SAMPLE_STACK_USER) {
2192                OVERFLOW_CHECK_u64(array);
2193                sz = *array++;
2194
2195                data->user_stack.offset = ((char *)(array - 1)
2196                                          - (char *) event);
2197
2198                if (!sz) {
2199                        data->user_stack.size = 0;
2200                } else {
2201                        OVERFLOW_CHECK(array, sz, max_size);
2202                        data->user_stack.data = (char *)array;
2203                        array = (void *)array + sz;
2204                        OVERFLOW_CHECK_u64(array);
2205                        data->user_stack.size = *array++;
2206                        if (WARN_ONCE(data->user_stack.size > sz,
2207                                      "user stack dump failure\n"))
2208                                return -EFAULT;
2209                }
2210        }
2211
2212        if (type & PERF_SAMPLE_WEIGHT) {
2213                OVERFLOW_CHECK_u64(array);
2214                data->weight = *array;
2215                array++;
2216        }
2217
2218        if (type & PERF_SAMPLE_DATA_SRC) {
2219                OVERFLOW_CHECK_u64(array);
2220                data->data_src = *array;
2221                array++;
2222        }
2223
2224        if (type & PERF_SAMPLE_TRANSACTION) {
2225                OVERFLOW_CHECK_u64(array);
2226                data->transaction = *array;
2227                array++;
2228        }
2229
2230        data->intr_regs.abi = PERF_SAMPLE_REGS_ABI_NONE;
2231        if (type & PERF_SAMPLE_REGS_INTR) {
2232                OVERFLOW_CHECK_u64(array);
2233                data->intr_regs.abi = *array;
2234                array++;
2235
2236                if (data->intr_regs.abi != PERF_SAMPLE_REGS_ABI_NONE) {
2237                        u64 mask = evsel->core.attr.sample_regs_intr;
2238
2239                        sz = hweight64(mask) * sizeof(u64);
2240                        OVERFLOW_CHECK(array, sz, max_size);
2241                        data->intr_regs.mask = mask;
2242                        data->intr_regs.regs = (u64 *)array;
2243                        array = (void *)array + sz;
2244                }
2245        }
2246
2247        data->phys_addr = 0;
2248        if (type & PERF_SAMPLE_PHYS_ADDR) {
2249                data->phys_addr = *array;
2250                array++;
2251        }
2252
2253        data->cgroup = 0;
2254        if (type & PERF_SAMPLE_CGROUP) {
2255                data->cgroup = *array;
2256                array++;
2257        }
2258
2259        if (type & PERF_SAMPLE_AUX) {
2260                OVERFLOW_CHECK_u64(array);
2261                sz = *array++;
2262
2263                OVERFLOW_CHECK(array, sz, max_size);
2264                /* Undo swap of data */
2265                if (swapped)
2266                        mem_bswap_64((char *)array, sz);
2267                data->aux_sample.size = sz;
2268                data->aux_sample.data = (char *)array;
2269                array = (void *)array + sz;
2270        }
2271
2272        return 0;
2273}
2274
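/*
 * Fast path that extracts only the timestamp from an event, skipping the
 * few fixed-size fields that can precede PERF_SAMPLE_TIME instead of
 * parsing the whole sample.
 */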
2275int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event,
2276                                  u64 *timestamp)
2277{
2278        u64 type = evsel->core.attr.sample_type;
2279        const __u64 *array;
2280
2281        if (!(type & PERF_SAMPLE_TIME))
2282                return -1;
2283
2284        if (event->header.type != PERF_RECORD_SAMPLE) {
2285                struct perf_sample data = {
2286                        .time = -1ULL,
2287                };
2288
2289                if (!evsel->core.attr.sample_id_all)
2290                        return -1;
2291                if (perf_evsel__parse_id_sample(evsel, event, &data))
2292                        return -1;
2293
2294                *timestamp = data.time;
2295                return 0;
2296        }
2297
2298        array = event->sample.array;
2299
2300        if (perf_event__check_size(event, evsel->sample_size))
2301                return -EFAULT;
2302
2303        if (type & PERF_SAMPLE_IDENTIFIER)
2304                array++;
2305
2306        if (type & PERF_SAMPLE_IP)
2307                array++;
2308
2309        if (type & PERF_SAMPLE_TID)
2310                array++;
2311
2312        if (type & PERF_SAMPLE_TIME)
2313                *timestamp = *array;
2314
2315        return 0;
2316}
2317
2318struct tep_format_field *evsel__field(struct evsel *evsel, const char *name)
2319{
2320        return tep_find_field(evsel->tp_format, name);
2321}
2322
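/*
 * Return a pointer to a tracepoint field inside the sample's raw data.
 * For dynamic fields, the word stored at the field's offset carries the
 * real data offset in its low 16 bits.
 */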
2323void *evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, const char *name)
2324{
2325        struct tep_format_field *field = evsel__field(evsel, name);
2326        int offset;
2327
2328        if (!field)
2329                return NULL;
2330
2331        offset = field->offset;
2332
2333        if (field->flags & TEP_FIELD_IS_DYNAMIC) {
2334                offset = *(int *)(sample->raw_data + field->offset);
2335                offset &= 0xffff;
2336        }
2337
2338        return sample->raw_data + offset;
2339}
2340
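/*
 * Read a 1, 2, 4 or 8 byte integer field from the sample's raw data,
 * byte-swapping it when the sample comes from the other endianness.
 */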
2341u64 format_field__intval(struct tep_format_field *field, struct perf_sample *sample,
2342                         bool needs_swap)
2343{
2344        u64 value;
2345        void *ptr = sample->raw_data + field->offset;
2346
2347        switch (field->size) {
2348        case 1:
2349                return *(u8 *)ptr;
2350        case 2:
2351                value = *(u16 *)ptr;
2352                break;
2353        case 4:
2354                value = *(u32 *)ptr;
2355                break;
2356        case 8:
2357                memcpy(&value, ptr, sizeof(u64));
2358                break;
2359        default:
2360                return 0;
2361        }
2362
2363        if (!needs_swap)
2364                return value;
2365
2366        switch (field->size) {
2367        case 2:
2368                return bswap_16(value);
2369        case 4:
2370                return bswap_32(value);
2371        case 8:
2372                return bswap_64(value);
2373        default:
2374                return 0;
2375        }
2376
2377        return 0;
2378}
2379
2380u64 evsel__intval(struct evsel *evsel, struct perf_sample *sample, const char *name)
2381{
2382        struct tep_format_field *field = evsel__field(evsel, name);
2383
2384        if (!field)
2385                return 0;
2386
2387        return format_field__intval(field, sample, evsel->needs_swap);
2388}
2389
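/*
 * Try to rewrite the event into something the current system can open after
 * a failure: fall back from the cycles hardware event to the cpu-clock
 * software event, or append the 'u' modifier and exclude kernel/hypervisor
 * samples when perf_event_paranoid forbids them.  Returns true when a
 * fallback was applied.
 */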
2390bool evsel__fallback(struct evsel *evsel, int err, char *msg, size_t msgsize)
2391{
2392        int paranoid;
2393
2394        if ((err == ENOENT || err == ENXIO || err == ENODEV) &&
2395            evsel->core.attr.type   == PERF_TYPE_HARDWARE &&
2396            evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES) {
2397                /*
2398                 * If it's cycles, then fall back to the hrtimer-based
2399                 * cpu-clock-tick sw counter, which is always available even
2400                 * without PMU support.
2401                 *
2402                 * PPC returns ENXIO until 2.6.37 (behavior changed with commit
2403                 * b0a873e).
2404                 */
2405                scnprintf(msg, msgsize, "%s",
2406"The cycles event is not supported, trying to fall back to cpu-clock-ticks");
2407
2408                evsel->core.attr.type   = PERF_TYPE_SOFTWARE;
2409                evsel->core.attr.config = PERF_COUNT_SW_CPU_CLOCK;
2410
2411                zfree(&evsel->name);
2412                return true;
2413        } else if (err == EACCES && !evsel->core.attr.exclude_kernel &&
2414                   (paranoid = perf_event_paranoid()) > 1) {
2415                const char *name = evsel__name(evsel);
2416                char *new_name;
2417                const char *sep = ":";
2418
2419                /* If the event has exclude_user set, then don't exclude the kernel. */
2420                if (evsel->core.attr.exclude_user)
2421                        return false;
2422
2423                /* Is the separator already in the name? */
2424                if (strchr(name, '/') ||
2425                    (strchr(name, ':') && !evsel->is_libpfm_event))
2426                        sep = "";
2427
2428                if (asprintf(&new_name, "%s%su", name, sep) < 0)
2429                        return false;
2430
2431                if (evsel->name)
2432                        free(evsel->name);
2433                evsel->name = new_name;
2434                scnprintf(msg, msgsize, "kernel.perf_event_paranoid=%d, trying "
2435                          "to fall back to excluding kernel and hypervisor "
2436                          "samples", paranoid);
2437                evsel->core.attr.exclude_kernel = 1;
2438                evsel->core.attr.exclude_hv     = 1;
2439
2440                return true;
2441        }
2442
2443        return false;
2444}
2445
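/*
 * Scan /proc and return true if some running process has a comm starting
 * with @name.
 */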
2446static bool find_process(const char *name)
2447{
2448        size_t len = strlen(name);
2449        DIR *dir;
2450        struct dirent *d;
2451        int ret = -1;
2452
2453        dir = opendir(procfs__mountpoint());
2454        if (!dir)
2455                return false;
2456
2457        /* Walk through the directory. */
2458        while (ret && (d = readdir(dir)) != NULL) {
2459                char path[PATH_MAX];
2460                char *data;
2461                size_t size;
2462
2463                if ((d->d_type != DT_DIR) ||
2464                     !strcmp(".", d->d_name) ||
2465                     !strcmp("..", d->d_name))
2466                        continue;
2467
2468                scnprintf(path, sizeof(path), "%s/%s/comm",
2469                          procfs__mountpoint(), d->d_name);
2470
2471                if (filename__read_str(path, &data, &size))
2472                        continue;
2473
2474                ret = strncmp(name, data, len);
2475                free(data);
2476        }
2477
2478        closedir(dir);
2479        return ret ? false : true;
2480}
2481
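/*
 * Format a human readable explanation for a failed perf_event_open() into
 * @msg, adding event and errno specific hints where the cause is known.
 */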
2482int evsel__open_strerror(struct evsel *evsel, struct target *target,
2483                         int err, char *msg, size_t size)
2484{
2485        char sbuf[STRERR_BUFSIZE];
2486        int printed = 0, enforced = 0;
2487
2488        switch (err) {
2489        case EPERM:
2490        case EACCES:
2491                printed += scnprintf(msg + printed, size - printed,
2492                        "Access to performance monitoring and observability operations is limited.\n");
2493
2494                if (!sysfs__read_int("fs/selinux/enforce", &enforced)) {
2495                        if (enforced) {
2496                                printed += scnprintf(msg + printed, size - printed,
2497                                        "Enforced MAC policy settings (SELinux) can limit access to performance\n"
2498                                        "monitoring and observability operations. Inspect system audit records for\n"
2499                                        "more perf_event access control information and to adjust the policy.\n");
2500                        }
2501                }
2502
2503                if (err == EPERM)
2504                        printed += scnprintf(msg, size,
2505                                "No permission to enable %s event.\n\n", evsel__name(evsel));
2506
2507                return scnprintf(msg + printed, size - printed,
2508                 "Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open\n"
2509                 "access to performance monitoring and observability operations for processes\n"
2510                 "without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability.\n"
2511                 "More information can be found at 'Perf events and tool security' document:\n"
2512                 "https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html\n"
2513                 "perf_event_paranoid setting is %d:\n"
2514                 "  -1: Allow use of (almost) all events by all users\n"
2515                 "      Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK\n"
2516                 ">= 0: Disallow raw and ftrace function tracepoint access\n"
2517                 ">= 1: Disallow CPU event access\n"
2518                 ">= 2: Disallow kernel profiling\n"
2519                 "To make the adjusted perf_event_paranoid setting permanent preserve it\n"
2520                 "in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)",
2521                 perf_event_paranoid());
2522        case ENOENT:
2523                return scnprintf(msg, size, "The %s event is not supported.", evsel__name(evsel));
2524        case EMFILE:
2525                return scnprintf(msg, size, "%s",
2526                         "Too many events are opened.\n"
2527                         "Probably the maximum number of open file descriptors has been reached.\n"
2528                         "Hint: Try again after reducing the number of events.\n"
2529                         "Hint: Try increasing the limit with 'ulimit -n <limit>'");
2530        case ENOMEM:
2531                if (evsel__has_callchain(evsel) &&
2532                    access("/proc/sys/kernel/perf_event_max_stack", F_OK) == 0)
2533                        return scnprintf(msg, size,
2534                                         "Not enough memory to setup event with callchain.\n"
2535                                         "Hint: Try tweaking /proc/sys/kernel/perf_event_max_stack\n"
2536                                         "Hint: Current value: %d", sysctl__max_stack());
2537                break;
2538        case ENODEV:
2539                if (target->cpu_list)
2540                        return scnprintf(msg, size, "%s",
2541         "No such device - did you specify an out-of-range profile CPU?");
2542                break;
2543        case EOPNOTSUPP:
2544                if (evsel->core.attr.aux_output)
2545                        return scnprintf(msg, size,
2546        "%s: PMU Hardware doesn't support 'aux_output' feature",
2547                                         evsel__name(evsel));
2548                if (evsel->core.attr.sample_period != 0)
2549                        return scnprintf(msg, size,
2550        "%s: PMU Hardware doesn't support sampling/overflow-interrupts. Try 'perf stat'",
2551                                         evsel__name(evsel));
2552                if (evsel->core.attr.precise_ip)
2553                        return scnprintf(msg, size, "%s",
2554        "\'precise\' request may not be supported. Try removing 'p' modifier.");
2555#if defined(__i386__) || defined(__x86_64__)
2556                if (evsel->core.attr.type == PERF_TYPE_HARDWARE)
2557                        return scnprintf(msg, size, "%s",
2558        "No hardware sampling interrupt available.\n");
2559#endif
2560                break;
2561        case EBUSY:
2562                if (find_process("oprofiled"))
2563                        return scnprintf(msg, size,
2564        "The PMU counters are busy/taken by another profiler.\n"
2565        "We found the oprofile daemon running, please stop it and try again.");
2566                break;
2567        case EINVAL:
2568                if (evsel->core.attr.write_backward && perf_missing_features.write_backward)
2569                        return scnprintf(msg, size, "Reading from overwrite event is not supported by this kernel.");
2570                if (perf_missing_features.clockid)
2571                        return scnprintf(msg, size, "clockid feature not supported.");
2572                if (perf_missing_features.clockid_wrong)
2573                        return scnprintf(msg, size, "wrong clockid (%d).", clockid);
2574                if (perf_missing_features.aux_output)
2575                        return scnprintf(msg, size, "The 'aux_output' feature is not supported, update the kernel.");
2576                break;
2577        default:
2578                break;
2579        }
2580
2581        return scnprintf(msg, size,
2582        "The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n"
2583        "/bin/dmesg | grep -i perf may provide additional information.\n",
2584                         err, str_error_r(err, sbuf, sizeof(sbuf)), evsel__name(evsel));
2585}
2586
2587struct perf_env *evsel__env(struct evsel *evsel)
2588{
2589        if (evsel && evsel->evlist)
2590                return evsel->evlist->env;
2591        return &perf_env;
2592}
2593
2594static int store_evsel_ids(struct evsel *evsel, struct evlist *evlist)
2595{
2596        int cpu, thread;
2597
2598        for (cpu = 0; cpu < xyarray__max_x(evsel->core.fd); cpu++) {
2599                for (thread = 0; thread < xyarray__max_y(evsel->core.fd);
2600                     thread++) {
2601                        int fd = FD(evsel, cpu, thread);
2602
2603                        if (perf_evlist__id_add_fd(&evlist->core, &evsel->core,
2604                                                   cpu, thread, fd) < 0)
2605                                return -1;
2606                }
2607        }
2608
2609        return 0;
2610}
2611
2612int evsel__store_ids(struct evsel *evsel, struct evlist *evlist)
2613{
2614        struct perf_cpu_map *cpus = evsel->core.cpus;
2615        struct perf_thread_map *threads = evsel->core.threads;
2616
2617        if (perf_evsel__alloc_id(&evsel->core, cpus->nr, threads->nr))
2618                return -ENOMEM;
2619
2620        return store_evsel_ids(evsel, evlist);
2621}
2622