linux/kernel/events/core.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Performance events core code:
   4 *
   5 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   6 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   7 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
   8 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   9 */
  10
  11#include <linux/fs.h>
  12#include <linux/mm.h>
  13#include <linux/cpu.h>
  14#include <linux/smp.h>
  15#include <linux/idr.h>
  16#include <linux/file.h>
  17#include <linux/poll.h>
  18#include <linux/slab.h>
  19#include <linux/hash.h>
  20#include <linux/tick.h>
  21#include <linux/sysfs.h>
  22#include <linux/dcache.h>
  23#include <linux/percpu.h>
  24#include <linux/ptrace.h>
  25#include <linux/reboot.h>
  26#include <linux/vmstat.h>
  27#include <linux/device.h>
  28#include <linux/export.h>
  29#include <linux/vmalloc.h>
  30#include <linux/hardirq.h>
  31#include <linux/rculist.h>
  32#include <linux/uaccess.h>
  33#include <linux/syscalls.h>
  34#include <linux/anon_inodes.h>
  35#include <linux/kernel_stat.h>
  36#include <linux/cgroup.h>
  37#include <linux/perf_event.h>
  38#include <linux/trace_events.h>
  39#include <linux/hw_breakpoint.h>
  40#include <linux/mm_types.h>
  41#include <linux/module.h>
  42#include <linux/mman.h>
  43#include <linux/compat.h>
  44#include <linux/bpf.h>
  45#include <linux/filter.h>
  46#include <linux/namei.h>
  47#include <linux/parser.h>
  48#include <linux/sched/clock.h>
  49#include <linux/sched/mm.h>
  50#include <linux/proc_ns.h>
  51#include <linux/mount.h>
  52
  53#include "internal.h"
  54
  55#include <asm/irq_regs.h>
  56
  57typedef int (*remote_function_f)(void *);
  58
  59struct remote_function_call {
  60        struct task_struct      *p;
  61        remote_function_f       func;
  62        void                    *info;
  63        int                     ret;
  64};
  65
  66static void remote_function(void *data)
  67{
  68        struct remote_function_call *tfc = data;
  69        struct task_struct *p = tfc->p;
  70
  71        if (p) {
  72                /* @p moved to another CPU: leave tfc->ret == -EAGAIN so the caller retries */
  73                if (task_cpu(p) != smp_processor_id())
  74                        return;
  75
  76                /*
  77                 * Now that we're on right CPU with IRQs disabled, we can test
  78                 * if we hit the right task without races.
  79                 */
  80
  81                tfc->ret = -ESRCH; /* No such (running) process */
  82                if (p != current)
  83                        return;
  84        }
  85
  86        tfc->ret = tfc->func(tfc->info);
  87}
  88
  89/**
  90 * task_function_call - call a function on the cpu on which a task runs
  91 * @p:          the task to evaluate
  92 * @func:       the function to be called
  93 * @info:       the function call argument
  94 *
  95 * Calls the function @func when the task is currently running. This might
  96 * be on the current CPU, which just calls the function directly.
  97 *
  98 * returns: @func return value, or
  99 *          -ESRCH  - when the process isn't running
 100 *          -EAGAIN - when the process moved away
 101 */
 102static int
 103task_function_call(struct task_struct *p, remote_function_f func, void *info)
 104{
 105        struct remote_function_call data = {
 106                .p      = p,
 107                .func   = func,
 108                .info   = info,
 109                .ret    = -EAGAIN,
 110        };
 111        int ret;
 112
 113        do {
 114                ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
 115                if (!ret)
 116                        ret = data.ret;
 117        } while (ret == -EAGAIN);
 118
 119        return ret;
 120}
 121
 122/**
 123 * cpu_function_call - call a function on a given cpu
     * @cpu:        the cpu on which to call @func
 124 * @func:       the function to be called
 125 * @info:       the function call argument
 126 *
 127 * Calls the function @func on the remote cpu.
 128 *
 129 * returns: @func return value or -ENXIO when the cpu is offline
 130 */
 131static int cpu_function_call(int cpu, remote_function_f func, void *info)
 132{
 133        struct remote_function_call data = {
 134                .p      = NULL,
 135                .func   = func,
 136                .info   = info,
 137                .ret    = -ENXIO, /* No such CPU */
 138        };
 139
 140        smp_call_function_single(cpu, remote_function, &data, 1);
 141
 142        return data.ret;
 143}
 144
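    /* Return this CPU's instance of the cpu context backing @ctx's PMU. */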
 145static inline struct perf_cpu_context *
 146__get_cpu_context(struct perf_event_context *ctx)
 147{
 148        return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 149}
 150
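    /*
     * Lock the CPU context and, if given, the task context nested inside it;
     * perf_ctx_unlock() below releases them in the opposite order.
     */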
 151static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 152                          struct perf_event_context *ctx)
 153{
 154        raw_spin_lock(&cpuctx->ctx.lock);
 155        if (ctx)
 156                raw_spin_lock(&ctx->lock);
 157}
 158
 159static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 160                            struct perf_event_context *ctx)
 161{
 162        if (ctx)
 163                raw_spin_unlock(&ctx->lock);
 164        raw_spin_unlock(&cpuctx->ctx.lock);
 165}
 166
 167#define TASK_TOMBSTONE ((void *)-1L)
 168
 169static bool is_kernel_event(struct perf_event *event)
 170{
 171        return READ_ONCE(event->owner) == TASK_TOMBSTONE;
 172}
 173
 174/*
 175 * On task ctx scheduling...
 176 *
 177 * When !ctx->nr_events a task context will not be scheduled. This means
 178 * we can disable the scheduler hooks (for performance) without leaving
 179 * pending task ctx state.
 180 *
 181 * This however results in two special cases:
 182 *
 183 *  - removing the last event from a task ctx; this is relatively
 184 *    straightforward and is done in __perf_remove_from_context.
 185 *
 186 *  - adding the first event to a task ctx; this is tricky because we cannot
 187 *    rely on ctx->is_active and therefore cannot use event_function_call().
 188 *    See perf_install_in_context().
 189 *
 190 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 191 */
 192
 193typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
 194                        struct perf_event_context *, void *);
 195
 196struct event_function_struct {
 197        struct perf_event *event;
 198        event_f func;
 199        void *data;
 200};
 201
 202static int event_function(void *info)
 203{
 204        struct event_function_struct *efs = info;
 205        struct perf_event *event = efs->event;
 206        struct perf_event_context *ctx = event->ctx;
 207        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 208        struct perf_event_context *task_ctx = cpuctx->task_ctx;
 209        int ret = 0;
 210
 211        lockdep_assert_irqs_disabled();
 212
 213        perf_ctx_lock(cpuctx, task_ctx);
 214        /*
 215         * Since we do the IPI call without holding ctx->lock things can have
 216         * changed, double check we hit the task we set out to hit.
 217         */
 218        if (ctx->task) {
 219                if (ctx->task != current) {
 220                        ret = -ESRCH;
 221                        goto unlock;
 222                }
 223
 224                /*
 225                 * We only use event_function_call() on established contexts,
 226                 * and event_function() is only ever called when active (or
 227                 * rather, we'll have bailed in task_function_call() or the
 228                 * above ctx->task != current test), therefore we must have
 229                 * ctx->is_active here.
 230                 */
 231                WARN_ON_ONCE(!ctx->is_active);
 232                /*
 233                 * And since we have ctx->is_active, cpuctx->task_ctx must
 234                 * match.
 235                 */
 236                WARN_ON_ONCE(task_ctx != ctx);
 237        } else {
 238                WARN_ON_ONCE(&cpuctx->ctx != ctx);
 239        }
 240
 241        efs->func(event, cpuctx, ctx, efs->data);
 242unlock:
 243        perf_ctx_unlock(cpuctx, task_ctx);
 244
 245        return ret;
 246}
 247
 248static void event_function_call(struct perf_event *event, event_f func, void *data)
 249{
 250        struct perf_event_context *ctx = event->ctx;
 251        struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 252        struct event_function_struct efs = {
 253                .event = event,
 254                .func = func,
 255                .data = data,
 256        };
 257
 258        if (!event->parent) {
 259                /*
 260                 * If this is a !child event, we must hold ctx::mutex to
 261                 * stabilize the event->ctx relation. See
 262                 * perf_event_ctx_lock().
 263                 */
 264                lockdep_assert_held(&ctx->mutex);
 265        }
 266
 267        if (!task) {
 268                cpu_function_call(event->cpu, event_function, &efs);
 269                return;
 270        }
 271
 272        if (task == TASK_TOMBSTONE)
 273                return;
 274
 275again:
 276        if (!task_function_call(task, event_function, &efs))
 277                return;
 278
 279        raw_spin_lock_irq(&ctx->lock);
 280        /*
 281         * Reload the task pointer, it might have been changed by
 282         * a concurrent perf_event_context_sched_out().
 283         */
 284        task = ctx->task;
 285        if (task == TASK_TOMBSTONE) {
 286                raw_spin_unlock_irq(&ctx->lock);
 287                return;
 288        }
 289        if (ctx->is_active) {
 290                raw_spin_unlock_irq(&ctx->lock);
 291                goto again;
 292        }
 293        func(event, NULL, ctx, data);
 294        raw_spin_unlock_irq(&ctx->lock);
 295}
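
    /*
     * Illustrative example (the real callers appear later in this file):
     * perf_event_disable() routes its per-event work through this helper,
     *
     *	event_function_call(event, __perf_event_disable, NULL);
     *
     * so that __perf_event_disable() runs with IRQs disabled and ctx->lock
     * held, on the CPU that currently owns the context.
     */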
 296
 297/*
 298 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 299 * are already disabled and we're on the right CPU.
 300 */
 301static void event_function_local(struct perf_event *event, event_f func, void *data)
 302{
 303        struct perf_event_context *ctx = event->ctx;
 304        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 305        struct task_struct *task = READ_ONCE(ctx->task);
 306        struct perf_event_context *task_ctx = NULL;
 307
 308        lockdep_assert_irqs_disabled();
 309
 310        if (task) {
 311                if (task == TASK_TOMBSTONE)
 312                        return;
 313
 314                task_ctx = ctx;
 315        }
 316
 317        perf_ctx_lock(cpuctx, task_ctx);
 318
 319        task = ctx->task;
 320        if (task == TASK_TOMBSTONE)
 321                goto unlock;
 322
 323        if (task) {
 324                /*
 325                 * We must be either inactive or active and the right task,
 326                 * otherwise we're screwed, since we cannot IPI to somewhere
 327                 * else.
 328                 */
 329                if (ctx->is_active) {
 330                        if (WARN_ON_ONCE(task != current))
 331                                goto unlock;
 332
 333                        if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
 334                                goto unlock;
 335                }
 336        } else {
 337                WARN_ON_ONCE(&cpuctx->ctx != ctx);
 338        }
 339
 340        func(event, cpuctx, ctx, data);
 341unlock:
 342        perf_ctx_unlock(cpuctx, task_ctx);
 343}
 344
 345#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 346                       PERF_FLAG_FD_OUTPUT  |\
 347                       PERF_FLAG_PID_CGROUP |\
 348                       PERF_FLAG_FD_CLOEXEC)
 349
 350/*
 351 * branch priv levels that need permission checks
 352 */
 353#define PERF_SAMPLE_BRANCH_PERM_PLM \
 354        (PERF_SAMPLE_BRANCH_KERNEL |\
 355         PERF_SAMPLE_BRANCH_HV)
 356
 357enum event_type_t {
 358        EVENT_FLEXIBLE = 0x1,
 359        EVENT_PINNED = 0x2,
 360        EVENT_TIME = 0x4,
 361        /* see ctx_resched() for details */
 362        EVENT_CPU = 0x8,
 363        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 364};
 365
 366/*
 367 * perf_sched_events : >0 events exist
 368 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 369 */
 370
 371static void perf_sched_delayed(struct work_struct *work);
 372DEFINE_STATIC_KEY_FALSE(perf_sched_events);
 373static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
 374static DEFINE_MUTEX(perf_sched_mutex);
 375static atomic_t perf_sched_count;
 376
 377static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 378static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 379static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 380
 381static atomic_t nr_mmap_events __read_mostly;
 382static atomic_t nr_comm_events __read_mostly;
 383static atomic_t nr_namespaces_events __read_mostly;
 384static atomic_t nr_task_events __read_mostly;
 385static atomic_t nr_freq_events __read_mostly;
 386static atomic_t nr_switch_events __read_mostly;
 387static atomic_t nr_ksymbol_events __read_mostly;
 388static atomic_t nr_bpf_events __read_mostly;
 389
 390static LIST_HEAD(pmus);
 391static DEFINE_MUTEX(pmus_lock);
 392static struct srcu_struct pmus_srcu;
 393static cpumask_var_t perf_online_mask;
 394
 395/*
 396 * perf event paranoia level:
 397 *  -1 - not paranoid at all
 398 *   0 - disallow raw tracepoint access for unpriv
 399 *   1 - disallow cpu events for unpriv
 400 *   2 - disallow kernel profiling for unpriv
 401 */
 402int sysctl_perf_event_paranoid __read_mostly = 2;
 403
 404/* Minimum for 512 kiB + 1 user control page */
 405int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 406
 407/*
 408 * max perf event sample rate
 409 */
 410#define DEFAULT_MAX_SAMPLE_RATE         100000
 411#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
 412#define DEFAULT_CPU_TIME_MAX_PERCENT    25
 413
 414int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 415
 416static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 417static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
 418
 419static int perf_sample_allowed_ns __read_mostly =
 420        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 421
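    /*
     * Recompute perf_sample_allowed_ns: the slice of each sample period that
     * sampling may consume, as dictated by sysctl_perf_cpu_time_max_percent.
     */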
 422static void update_perf_cpu_limits(void)
 423{
 424        u64 tmp = perf_sample_period_ns;
 425
 426        tmp *= sysctl_perf_cpu_time_max_percent;
 427        tmp = div_u64(tmp, 100);
 428        if (!tmp)
 429                tmp = 1;
 430
 431        WRITE_ONCE(perf_sample_allowed_ns, tmp);
 432}
 433
 434static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 435
 436int perf_proc_update_handler(struct ctl_table *table, int write,
 437                void __user *buffer, size_t *lenp,
 438                loff_t *ppos)
 439{
 440        int ret;
 441        int perf_cpu = sysctl_perf_cpu_time_max_percent;
 442        /*
 443         * If throttling is disabled don't allow the write:
 444         */
 445        if (write && (perf_cpu == 100 || perf_cpu == 0))
 446                return -EINVAL;
 447
 448        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 449        if (ret || !write)
 450                return ret;
 451
 452        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 453        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 454        update_perf_cpu_limits();
 455
 456        return 0;
 457}
 458
 459int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 460
 461int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 462                                void __user *buffer, size_t *lenp,
 463                                loff_t *ppos)
 464{
 465        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 466
 467        if (ret || !write)
 468                return ret;
 469
 470        if (sysctl_perf_cpu_time_max_percent == 100 ||
 471            sysctl_perf_cpu_time_max_percent == 0) {
 472                printk(KERN_WARNING
 473                       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
 474                WRITE_ONCE(perf_sample_allowed_ns, 0);
 475        } else {
 476                update_perf_cpu_limits();
 477        }
 478
 479        return 0;
 480}
 481
 482/*
 483 * perf samples are done in some very critical code paths (NMIs).
 484 * If they take too much CPU time, the system can lock up and not
 485 * get any real work done.  This will drop the sample rate when
 486 * we detect that events are taking too long.
 487 */
 488#define NR_ACCUMULATED_SAMPLES 128
 489static DEFINE_PER_CPU(u64, running_sample_length);
 490
 491static u64 __report_avg;
 492static u64 __report_allowed;
 493
 494static void perf_duration_warn(struct irq_work *w)
 495{
 496        printk_ratelimited(KERN_INFO
 497                "perf: interrupt took too long (%lld > %lld), lowering "
 498                "kernel.perf_event_max_sample_rate to %d\n",
 499                __report_avg, __report_allowed,
 500                sysctl_perf_event_sample_rate);
 501}
 502
 503static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 504
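    /*
     * Called with the time a sampling interrupt took to service. Maintains a
     * decaying per-CPU average of that cost and, once the average exceeds
     * perf_sample_allowed_ns, lowers the maximum sample rate and reports it
     * via IRQ work (this path can run in NMI context).
     */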
 505void perf_sample_event_took(u64 sample_len_ns)
 506{
 507        u64 max_len = READ_ONCE(perf_sample_allowed_ns);
 508        u64 running_len;
 509        u64 avg_len;
 510        u32 max;
 511
 512        if (max_len == 0)
 513                return;
 514
 515        /* Decay the counter by 1 average sample. */
 516        running_len = __this_cpu_read(running_sample_length);
 517        running_len -= running_len/NR_ACCUMULATED_SAMPLES;
 518        running_len += sample_len_ns;
 519        __this_cpu_write(running_sample_length, running_len);
 520
 521        /*
 522         * Note: this will be biased artificially low until we have
 523         * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
 524         * from having to maintain a count.
 525         */
 526        avg_len = running_len/NR_ACCUMULATED_SAMPLES;
 527        if (avg_len <= max_len)
 528                return;
 529
 530        __report_avg = avg_len;
 531        __report_allowed = max_len;
 532
 533        /*
 534         * Compute a throttle threshold 25% below the current duration.
 535         */
 536        avg_len += avg_len / 4;
 537        max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
 538        if (avg_len < max)
 539                max /= (u32)avg_len;
 540        else
 541                max = 1;
 542
 543        WRITE_ONCE(perf_sample_allowed_ns, avg_len);
 544        WRITE_ONCE(max_samples_per_tick, max);
 545
 546        sysctl_perf_event_sample_rate = max * HZ;
 547        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 548
 549        if (!irq_work_queue(&perf_duration_work)) {
 550                early_printk("perf: interrupt took too long (%lld > %lld), lowering "
 551                             "kernel.perf_event_max_sample_rate to %d\n",
 552                             __report_avg, __report_allowed,
 553                             sysctl_perf_event_sample_rate);
 554        }
 555}
 556
 557static atomic64_t perf_event_id;
 558
 559static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 560                              enum event_type_t event_type);
 561
 562static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 563                             enum event_type_t event_type,
 564                             struct task_struct *task);
 565
 566static void update_context_time(struct perf_event_context *ctx);
 567static u64 perf_event_time(struct perf_event *event);
 568
 569void __weak perf_event_print_debug(void)        { }
 570
 571extern __weak const char *perf_pmu_name(void)
 572{
 573        return "pmu";
 574}
 575
 576static inline u64 perf_clock(void)
 577{
 578        return local_clock();
 579}
 580
 581static inline u64 perf_event_clock(struct perf_event *event)
 582{
 583        return event->clock();
 584}
 585
 586/*
 587 * State based event timekeeping...
 588 *
 589 * The basic idea is to use event->state to determine which (if any) time
 590 * fields to increment with the current delta. This means we only need to
 591 * update timestamps when we change state or when they are explicitly requested
 592 * (read).
 593 *
 594 * Event groups make things a little more complicated, but not terribly so. The
 595 * rules for a group are that if the group leader is OFF the entire group is
 596 * OFF, irrespective of what the group member states are. This results in
 597 * __perf_effective_state().
 598 *
 599 * A further ramification is that when a group leader flips between OFF and
 600 * !OFF, we need to update all group member times.
 601 *
 602 *
 603 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 604 * need to make sure the relevant context time is updated before we try and
 605 * update our timestamps.
 606 */
 607
 608static __always_inline enum perf_event_state
 609__perf_effective_state(struct perf_event *event)
 610{
 611        struct perf_event *leader = event->group_leader;
 612
 613        if (leader->state <= PERF_EVENT_STATE_OFF)
 614                return leader->state;
 615
 616        return event->state;
 617}
 618
 619static __always_inline void
 620__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
 621{
 622        enum perf_event_state state = __perf_effective_state(event);
 623        u64 delta = now - event->tstamp;
 624
 625        *enabled = event->total_time_enabled;
 626        if (state >= PERF_EVENT_STATE_INACTIVE)
 627                *enabled += delta;
 628
 629        *running = event->total_time_running;
 630        if (state >= PERF_EVENT_STATE_ACTIVE)
 631                *running += delta;
 632}
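
    /*
     * Worked example of the accounting above (illustrative numbers): an event
     * enabled at t=100 that becomes ACTIVE at t=150 and is read at t=200
     * reports total_time_enabled = 100 and total_time_running = 50; the
     * timestamps only need touching at state changes and at read time.
     */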
 633
 634static void perf_event_update_time(struct perf_event *event)
 635{
 636        u64 now = perf_event_time(event);
 637
 638        __perf_update_times(event, now, &event->total_time_enabled,
 639                                        &event->total_time_running);
 640        event->tstamp = now;
 641}
 642
 643static void perf_event_update_sibling_time(struct perf_event *leader)
 644{
 645        struct perf_event *sibling;
 646
 647        for_each_sibling_event(sibling, leader)
 648                perf_event_update_time(sibling);
 649}
 650
 651static void
 652perf_event_set_state(struct perf_event *event, enum perf_event_state state)
 653{
 654        if (event->state == state)
 655                return;
 656
 657        perf_event_update_time(event);
 658        /*
 659         * If a group leader gets enabled/disabled all its siblings
 660         * are affected too.
 661         */
 662        if ((event->state < 0) ^ (state < 0))
 663                perf_event_update_sibling_time(event);
 664
 665        WRITE_ONCE(event->state, state);
 666}
 667
 668#ifdef CONFIG_CGROUP_PERF
 669
 670static inline bool
 671perf_cgroup_match(struct perf_event *event)
 672{
 673        struct perf_event_context *ctx = event->ctx;
 674        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 675
 676        /* @event doesn't care about cgroup */
 677        if (!event->cgrp)
 678                return true;
 679
 680        /* wants specific cgroup scope but @cpuctx isn't associated with any */
 681        if (!cpuctx->cgrp)
 682                return false;
 683
 684        /*
 685         * Cgroup scoping is recursive.  An event enabled for a cgroup is
 686         * also enabled for all its descendant cgroups.  If @cpuctx's
 687         * cgroup is a descendant of @event's (the test covers identity
 688         * case), it's a match.
 689         */
 690        return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
 691                                    event->cgrp->css.cgroup);
 692}
 693
 694static inline void perf_detach_cgroup(struct perf_event *event)
 695{
 696        css_put(&event->cgrp->css);
 697        event->cgrp = NULL;
 698}
 699
 700static inline int is_cgroup_event(struct perf_event *event)
 701{
 702        return event->cgrp != NULL;
 703}
 704
 705static inline u64 perf_cgroup_event_time(struct perf_event *event)
 706{
 707        struct perf_cgroup_info *t;
 708
 709        t = per_cpu_ptr(event->cgrp->info, event->cpu);
 710        return t->time;
 711}
 712
 713static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 714{
 715        struct perf_cgroup_info *info;
 716        u64 now;
 717
 718        now = perf_clock();
 719
 720        info = this_cpu_ptr(cgrp->info);
 721
 722        info->time += now - info->timestamp;
 723        info->timestamp = now;
 724}
 725
 726static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 727{
 728        struct perf_cgroup *cgrp = cpuctx->cgrp;
 729        struct cgroup_subsys_state *css;
 730
 731        if (cgrp) {
 732                for (css = &cgrp->css; css; css = css->parent) {
 733                        cgrp = container_of(css, struct perf_cgroup, css);
 734                        __update_cgrp_time(cgrp);
 735                }
 736        }
 737}
 738
 739static inline void update_cgrp_time_from_event(struct perf_event *event)
 740{
 741        struct perf_cgroup *cgrp;
 742
 743        /*
 744         * ensure we access cgroup data only when needed and
 745         * when we know the cgroup is pinned (css_get)
 746         */
 747        if (!is_cgroup_event(event))
 748                return;
 749
 750        cgrp = perf_cgroup_from_task(current, event->ctx);
 751        /*
 752         * Do not update time when cgroup is not active
 753         */
 754        if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 755                __update_cgrp_time(event->cgrp);
 756}
 757
 758static inline void
 759perf_cgroup_set_timestamp(struct task_struct *task,
 760                          struct perf_event_context *ctx)
 761{
 762        struct perf_cgroup *cgrp;
 763        struct perf_cgroup_info *info;
 764        struct cgroup_subsys_state *css;
 765
 766        /*
 767         * ctx->lock held by caller
 768         * ensure we do not access cgroup data
 769         * unless we have the cgroup pinned (css_get)
 770         */
 771        if (!task || !ctx->nr_cgroups)
 772                return;
 773
 774        cgrp = perf_cgroup_from_task(task, ctx);
 775
 776        for (css = &cgrp->css; css; css = css->parent) {
 777                cgrp = container_of(css, struct perf_cgroup, css);
 778                info = this_cpu_ptr(cgrp->info);
 779                info->timestamp = ctx->timestamp;
 780        }
 781}
 782
 783static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 784
 785#define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
 786#define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
 787
 788/*
 789 * reschedule events based on the cgroup constraint of task.
 790 *
 791 * mode SWOUT : schedule out everything
 792 * mode SWIN : schedule in based on cgroup for next
 793 */
 794static void perf_cgroup_switch(struct task_struct *task, int mode)
 795{
 796        struct perf_cpu_context *cpuctx;
 797        struct list_head *list;
 798        unsigned long flags;
 799
 800        /*
 801         * Disable interrupts and preemption to keep this CPU's
 802         * cgrp_cpuctx_entry from changing under us.
 803         */
 804        local_irq_save(flags);
 805
 806        list = this_cpu_ptr(&cgrp_cpuctx_list);
 807        list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
 808                WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 809
 810                perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 811                perf_pmu_disable(cpuctx->ctx.pmu);
 812
 813                if (mode & PERF_CGROUP_SWOUT) {
 814                        cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 815                        /*
 816                         * must not be done before ctxswout due
 817                         * to event_filter_match() in event_sched_out()
 818                         */
 819                        cpuctx->cgrp = NULL;
 820                }
 821
 822                if (mode & PERF_CGROUP_SWIN) {
 823                        WARN_ON_ONCE(cpuctx->cgrp);
 824                        /*
 825                         * Set cgrp before the ctxsw in, so that
 826                         * event_filter_match() does not have to pass
 827                         * the task around.
 828                         * We pass cpuctx->ctx to perf_cgroup_from_task()
 829                         * because cgroup events are only per-cpu.
 830                         */
 831                        cpuctx->cgrp = perf_cgroup_from_task(task,
 832                                                             &cpuctx->ctx);
 833                        cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 834                }
 835                perf_pmu_enable(cpuctx->ctx.pmu);
 836                perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 837        }
 838
 839        local_irq_restore(flags);
 840}
 841
 842static inline void perf_cgroup_sched_out(struct task_struct *task,
 843                                         struct task_struct *next)
 844{
 845        struct perf_cgroup *cgrp1;
 846        struct perf_cgroup *cgrp2 = NULL;
 847
 848        rcu_read_lock();
 849        /*
 850         * we come here when we know perf_cgroup_events > 0
 851         * we do not need to pass the ctx here because we know
 852         * we are holding the rcu lock
 853         */
 854        cgrp1 = perf_cgroup_from_task(task, NULL);
 855        cgrp2 = perf_cgroup_from_task(next, NULL);
 856
 857        /*
 858         * only schedule out current cgroup events if we know
 859         * that we are switching to a different cgroup. Otherwise,
 860                 * do not touch the cgroup events.
 861         */
 862        if (cgrp1 != cgrp2)
 863                perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 864
 865        rcu_read_unlock();
 866}
 867
 868static inline void perf_cgroup_sched_in(struct task_struct *prev,
 869                                        struct task_struct *task)
 870{
 871        struct perf_cgroup *cgrp1;
 872        struct perf_cgroup *cgrp2 = NULL;
 873
 874        rcu_read_lock();
 875        /*
 876         * we come here when we know perf_cgroup_events > 0
 877         * we do not need to pass the ctx here because we know
 878         * we are holding the rcu lock
 879         */
 880        cgrp1 = perf_cgroup_from_task(task, NULL);
 881        cgrp2 = perf_cgroup_from_task(prev, NULL);
 882
 883        /*
 884         * only need to schedule in cgroup events if we are changing
 885         * cgroup during ctxsw. Cgroup events were not scheduled
 886         * out at ctxsw-out time if that was not the case.
 887         */
 888        if (cgrp1 != cgrp2)
 889                perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 890
 891        rcu_read_unlock();
 892}
 893
 894static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 895                                      struct perf_event_attr *attr,
 896                                      struct perf_event *group_leader)
 897{
 898        struct perf_cgroup *cgrp;
 899        struct cgroup_subsys_state *css;
 900        struct fd f = fdget(fd);
 901        int ret = 0;
 902
 903        if (!f.file)
 904                return -EBADF;
 905
 906        css = css_tryget_online_from_dir(f.file->f_path.dentry,
 907                                         &perf_event_cgrp_subsys);
 908        if (IS_ERR(css)) {
 909                ret = PTR_ERR(css);
 910                goto out;
 911        }
 912
 913        cgrp = container_of(css, struct perf_cgroup, css);
 914        event->cgrp = cgrp;
 915
 916        /*
 917         * all events in a group must monitor
 918         * the same cgroup because a task belongs
 919         * to only one perf cgroup at a time
 920         */
 921        if (group_leader && group_leader->cgrp != cgrp) {
 922                perf_detach_cgroup(event);
 923                ret = -EINVAL;
 924        }
 925out:
 926        fdput(f);
 927        return ret;
 928}
 929
 930static inline void
 931perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 932{
 933        struct perf_cgroup_info *t;
 934        t = per_cpu_ptr(event->cgrp->info, event->cpu);
 935        event->shadow_ctx_time = now - t->timestamp;
 936}
 937
 938/*
 939 * Update cpuctx->cgrp so that it is set when first cgroup event is added and
 940 * cleared when last cgroup event is removed.
 941 */
 942static inline void
 943list_update_cgroup_event(struct perf_event *event,
 944                         struct perf_event_context *ctx, bool add)
 945{
 946        struct perf_cpu_context *cpuctx;
 947        struct list_head *cpuctx_entry;
 948
 949        if (!is_cgroup_event(event))
 950                return;
 951
 952        /*
 953         * Because cgroup events are always per-cpu events,
 954         * this will always be called from the right CPU.
 955         */
 956        cpuctx = __get_cpu_context(ctx);
 957
 958        /*
 959         * Since setting cpuctx->cgrp is conditional on the current @cgrp
 960         * matching the event's cgroup, we must do this for every new event,
 961         * because if the first would mismatch, the second would not try again
 962         * and we would leave cpuctx->cgrp unset.
 963         */
 964        if (add && !cpuctx->cgrp) {
 965                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 966
 967                if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 968                        cpuctx->cgrp = cgrp;
 969        }
 970
 971        if (add && ctx->nr_cgroups++)
 972                return;
 973        else if (!add && --ctx->nr_cgroups)
 974                return;
 975
 976        /* no cgroup running */
 977        if (!add)
 978                cpuctx->cgrp = NULL;
 979
 980        cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
 981        if (add)
 982                list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
 983        else
 984                list_del(cpuctx_entry);
 985}
 986
 987#else /* !CONFIG_CGROUP_PERF */
 988
 989static inline bool
 990perf_cgroup_match(struct perf_event *event)
 991{
 992        return true;
 993}
 994
 995static inline void perf_detach_cgroup(struct perf_event *event)
 996{}
 997
 998static inline int is_cgroup_event(struct perf_event *event)
 999{
1000        return 0;
1001}
1002
1003static inline void update_cgrp_time_from_event(struct perf_event *event)
1004{
1005}
1006
1007static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1008{
1009}
1010
1011static inline void perf_cgroup_sched_out(struct task_struct *task,
1012                                         struct task_struct *next)
1013{
1014}
1015
1016static inline void perf_cgroup_sched_in(struct task_struct *prev,
1017                                        struct task_struct *task)
1018{
1019}
1020
1021static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1022                                      struct perf_event_attr *attr,
1023                                      struct perf_event *group_leader)
1024{
1025        return -EINVAL;
1026}
1027
1028static inline void
1029perf_cgroup_set_timestamp(struct task_struct *task,
1030                          struct perf_event_context *ctx)
1031{
1032}
1033
1034void
1035perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1036{
1037}
1038
1039static inline void
1040perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1041{
1042}
1043
1044static inline u64 perf_cgroup_event_time(struct perf_event *event)
1045{
1046        return 0;
1047}
1048
1049static inline void
1050list_update_cgroup_event(struct perf_event *event,
1051                         struct perf_event_context *ctx, bool add)
1052{
1053}
1054
1055#endif
1056
1057/*
1058 * set default to be dependent on timer tick just
1059 * like original code
1060 */
1061#define PERF_CPU_HRTIMER (1000 / HZ)
1062/*
1063 * function must be called with interrupts disabled
1064 */
1065static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1066{
1067        struct perf_cpu_context *cpuctx;
1068        bool rotations;
1069
1070        lockdep_assert_irqs_disabled();
1071
1072        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1073        rotations = perf_rotate_context(cpuctx);
1074
1075        raw_spin_lock(&cpuctx->hrtimer_lock);
1076        if (rotations)
1077                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1078        else
1079                cpuctx->hrtimer_active = 0;
1080        raw_spin_unlock(&cpuctx->hrtimer_lock);
1081
1082        return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1083}
1084
1085static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1086{
1087        struct hrtimer *timer = &cpuctx->hrtimer;
1088        struct pmu *pmu = cpuctx->ctx.pmu;
1089        u64 interval;
1090
1091        /* no multiplexing needed for SW PMU */
1092        if (pmu->task_ctx_nr == perf_sw_context)
1093                return;
1094
1095        /*
1096         * check default is sane, if not set then force to
1097         * default interval (1/tick)
1098         */
1099        interval = pmu->hrtimer_interval_ms;
1100        if (interval < 1)
1101                interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1102
1103        cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1104
1105        raw_spin_lock_init(&cpuctx->hrtimer_lock);
1106        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1107        timer->function = perf_mux_hrtimer_handler;
1108}
1109
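    /* (Re)arm the multiplexing hrtimer if it is not already active. */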
1110static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1111{
1112        struct hrtimer *timer = &cpuctx->hrtimer;
1113        struct pmu *pmu = cpuctx->ctx.pmu;
1114        unsigned long flags;
1115
1116        /* not for SW PMU */
1117        if (pmu->task_ctx_nr == perf_sw_context)
1118                return 0;
1119
1120        raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1121        if (!cpuctx->hrtimer_active) {
1122                cpuctx->hrtimer_active = 1;
1123                hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1124                hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1125        }
1126        raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1127
1128        return 0;
1129}
1130
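    /*
     * perf_pmu_disable()/perf_pmu_enable() nest: the PMU is only touched when
     * the per-CPU disable count transitions between zero and non-zero.
     */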
1131void perf_pmu_disable(struct pmu *pmu)
1132{
1133        int *count = this_cpu_ptr(pmu->pmu_disable_count);
1134        if (!(*count)++)
1135                pmu->pmu_disable(pmu);
1136}
1137
1138void perf_pmu_enable(struct pmu *pmu)
1139{
1140        int *count = this_cpu_ptr(pmu->pmu_disable_count);
1141        if (!--(*count))
1142                pmu->pmu_enable(pmu);
1143}
1144
1145static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1146
1147/*
1148 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1149 * perf_event_task_tick() are fully serialized because they're strictly cpu
1150 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1151 * disabled, while perf_event_task_tick is called from IRQ context.
1152 */
1153static void perf_event_ctx_activate(struct perf_event_context *ctx)
1154{
1155        struct list_head *head = this_cpu_ptr(&active_ctx_list);
1156
1157        lockdep_assert_irqs_disabled();
1158
1159        WARN_ON(!list_empty(&ctx->active_ctx_list));
1160
1161        list_add(&ctx->active_ctx_list, head);
1162}
1163
1164static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1165{
1166        lockdep_assert_irqs_disabled();
1167
1168        WARN_ON(list_empty(&ctx->active_ctx_list));
1169
1170        list_del_init(&ctx->active_ctx_list);
1171}
1172
1173static void get_ctx(struct perf_event_context *ctx)
1174{
1175        refcount_inc(&ctx->refcount);
1176}
1177
1178static void free_ctx(struct rcu_head *head)
1179{
1180        struct perf_event_context *ctx;
1181
1182        ctx = container_of(head, struct perf_event_context, rcu_head);
1183        kfree(ctx->task_ctx_data);
1184        kfree(ctx);
1185}
1186
1187static void put_ctx(struct perf_event_context *ctx)
1188{
1189        if (refcount_dec_and_test(&ctx->refcount)) {
1190                if (ctx->parent_ctx)
1191                        put_ctx(ctx->parent_ctx);
1192                if (ctx->task && ctx->task != TASK_TOMBSTONE)
1193                        put_task_struct(ctx->task);
1194                call_rcu(&ctx->rcu_head, free_ctx);
1195        }
1196}
1197
1198/*
1199 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1200 * perf_pmu_migrate_context() we need some magic.
1201 *
1202 * Those places that change perf_event::ctx will hold both
1203 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1204 *
1205 * Lock ordering is by mutex address. There are two other sites where
1206 * perf_event_context::mutex nests and those are:
1207 *
1208 *  - perf_event_exit_task_context()    [ child , 0 ]
1209 *      perf_event_exit_event()
1210 *        put_event()                   [ parent, 1 ]
1211 *
1212 *  - perf_event_init_context()         [ parent, 0 ]
1213 *      inherit_task_group()
1214 *        inherit_group()
1215 *          inherit_event()
1216 *            perf_event_alloc()
1217 *              perf_init_event()
1218 *                perf_try_init_event() [ child , 1 ]
1219 *
1220 * While it appears there is an obvious deadlock here -- the parent and child
1221 * nesting levels are inverted between the two -- this is in fact safe because
1222 * life-time rules separate them. That is, an exiting task cannot fork, and a
1223 * spawning task cannot (yet) exit.
1224 *
1225 * But remember that these are parent<->child context relations, and
1226 * migration does not affect children, therefore these two orderings should not
1227 * interact.
1228 *
1229 * The change in perf_event::ctx does not affect children (as claimed above)
1230 * because the sys_perf_event_open() case will install a new event and break
1231 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1232 * concerned with cpuctx and that doesn't have children.
1233 *
1234 * The places that change perf_event::ctx will issue:
1235 *
1236 *   perf_remove_from_context();
1237 *   synchronize_rcu();
1238 *   perf_install_in_context();
1239 *
1240 * to effect the change. The remove_from_context() + synchronize_rcu() should
1241 * quiesce the event, after which we can install it in the new location. This
1242 * means that only external vectors (perf_fops, prctl) can perturb the event
1243 * while in transit. Therefore all such accessors should also acquire
1244 * perf_event_context::mutex to serialize against this.
1245 *
1246 * However; because event->ctx can change while we're waiting to acquire
1247 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1248 * function.
1249 *
1250 * Lock order:
1251 *    cred_guard_mutex
1252 *      task_struct::perf_event_mutex
1253 *        perf_event_context::mutex
1254 *          perf_event::child_mutex;
1255 *            perf_event_context::lock
1256 *          perf_event::mmap_mutex
1257 *          mmap_sem
1258 *            perf_addr_filters_head::lock
1259 *
1260 *    cpu_hotplug_lock
1261 *      pmus_lock
1262 *        cpuctx->mutex / perf_event_context::mutex
1263 */
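    /*
     * Pin and lock event->ctx: take a reference, acquire ctx->mutex, and
     * retry if the event migrated to a different context in the meantime.
     */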
1264static struct perf_event_context *
1265perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1266{
1267        struct perf_event_context *ctx;
1268
1269again:
1270        rcu_read_lock();
1271        ctx = READ_ONCE(event->ctx);
1272        if (!refcount_inc_not_zero(&ctx->refcount)) {
1273                rcu_read_unlock();
1274                goto again;
1275        }
1276        rcu_read_unlock();
1277
1278        mutex_lock_nested(&ctx->mutex, nesting);
1279        if (event->ctx != ctx) {
1280                mutex_unlock(&ctx->mutex);
1281                put_ctx(ctx);
1282                goto again;
1283        }
1284
1285        return ctx;
1286}
1287
1288static inline struct perf_event_context *
1289perf_event_ctx_lock(struct perf_event *event)
1290{
1291        return perf_event_ctx_lock_nested(event, 0);
1292}
1293
1294static void perf_event_ctx_unlock(struct perf_event *event,
1295                                  struct perf_event_context *ctx)
1296{
1297        mutex_unlock(&ctx->mutex);
1298        put_ctx(ctx);
1299}
1300
1301/*
1302 * This must be done under the ctx->lock, so as to serialize against
1303 * context_equiv(); therefore we cannot call put_ctx() since that might end up
1304 * taking scheduler related locks, and ctx->lock nests inside those.
1305 */
1306static __must_check struct perf_event_context *
1307unclone_ctx(struct perf_event_context *ctx)
1308{
1309        struct perf_event_context *parent_ctx = ctx->parent_ctx;
1310
1311        lockdep_assert_held(&ctx->lock);
1312
1313        if (parent_ctx)
1314                ctx->parent_ctx = NULL;
1315        ctx->generation++;
1316
1317        return parent_ctx;
1318}
1319
1320static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1321                                enum pid_type type)
1322{
1323        u32 nr;
1324        /*
1325         * only top level events have the pid namespace they were created in
1326         */
1327        if (event->parent)
1328                event = event->parent;
1329
1330        nr = __task_pid_nr_ns(p, type, event->ns);
1331        /* avoid -1 if it is idle thread or runs in another ns */
1332        if (!nr && !pid_alive(p))
1333                nr = -1;
1334        return nr;
1335}
1336
1337static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1338{
1339        return perf_event_pid_type(event, p, PIDTYPE_TGID);
1340}
1341
1342static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1343{
1344        return perf_event_pid_type(event, p, PIDTYPE_PID);
1345}
1346
1347/*
1348 * If we inherit events we want to return the parent event id
1349 * to userspace.
1350 */
1351static u64 primary_event_id(struct perf_event *event)
1352{
1353        u64 id = event->id;
1354
1355        if (event->parent)
1356                id = event->parent->id;
1357
1358        return id;
1359}
1360
1361/*
1362 * Get the perf_event_context for a task and lock it.
1363 *
1364 * This has to cope with the fact that until it is locked,
1365 * the context could get moved to another task.
1366 */
1367static struct perf_event_context *
1368perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1369{
1370        struct perf_event_context *ctx;
1371
1372retry:
1373        /*
1374         * One of the few rules of preemptible RCU is that one cannot do
1375         * rcu_read_unlock() while holding a scheduler (or nested) lock when
1376         * part of the read side critical section was irqs-enabled -- see
1377         * rcu_read_unlock_special().
1378         *
1379         * Since ctx->lock nests under rq->lock we must ensure the entire read
1380         * side critical section has interrupts disabled.
1381         */
1382        local_irq_save(*flags);
1383        rcu_read_lock();
1384        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1385        if (ctx) {
1386                /*
1387                 * If this context is a clone of another, it might
1388                 * get swapped for another underneath us by
1389                 * perf_event_task_sched_out, though the
1390                 * rcu_read_lock() protects us from any context
1391                 * getting freed.  Lock the context and check if it
1392                 * got swapped before we could get the lock, and retry
1393                 * if so.  If we locked the right context, then it
1394                 * can't get swapped on us any more.
1395                 */
1396                raw_spin_lock(&ctx->lock);
1397                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1398                        raw_spin_unlock(&ctx->lock);
1399                        rcu_read_unlock();
1400                        local_irq_restore(*flags);
1401                        goto retry;
1402                }
1403
1404                if (ctx->task == TASK_TOMBSTONE ||
1405                    !refcount_inc_not_zero(&ctx->refcount)) {
1406                        raw_spin_unlock(&ctx->lock);
1407                        ctx = NULL;
1408                } else {
1409                        WARN_ON_ONCE(ctx->task != task);
1410                }
1411        }
1412        rcu_read_unlock();
1413        if (!ctx)
1414                local_irq_restore(*flags);
1415        return ctx;
1416}
1417
1418/*
1419 * Get the context for a task and increment its pin_count so it
1420 * can't get swapped to another task.  This also increments its
1421 * reference count so that the context can't get freed.
1422 */
1423static struct perf_event_context *
1424perf_pin_task_context(struct task_struct *task, int ctxn)
1425{
1426        struct perf_event_context *ctx;
1427        unsigned long flags;
1428
1429        ctx = perf_lock_task_context(task, ctxn, &flags);
1430        if (ctx) {
1431                ++ctx->pin_count;
1432                raw_spin_unlock_irqrestore(&ctx->lock, flags);
1433        }
1434        return ctx;
1435}
1436
1437static void perf_unpin_context(struct perf_event_context *ctx)
1438{
1439        unsigned long flags;
1440
1441        raw_spin_lock_irqsave(&ctx->lock, flags);
1442        --ctx->pin_count;
1443        raw_spin_unlock_irqrestore(&ctx->lock, flags);
1444}
1445
1446/*
1447 * Update the record of the current time in a context.
1448 */
1449static void update_context_time(struct perf_event_context *ctx)
1450{
1451        u64 now = perf_clock();
1452
1453        ctx->time += now - ctx->timestamp;
1454        ctx->timestamp = now;
1455}
1456
1457static u64 perf_event_time(struct perf_event *event)
1458{
1459        struct perf_event_context *ctx = event->ctx;
1460
1461        if (is_cgroup_event(event))
1462                return perf_cgroup_event_time(event);
1463
1464        return ctx ? ctx->time : 0;
1465}
1466
1467static enum event_type_t get_event_type(struct perf_event *event)
1468{
1469        struct perf_event_context *ctx = event->ctx;
1470        enum event_type_t event_type;
1471
1472        lockdep_assert_held(&ctx->lock);
1473
1474        /*
1475         * It's 'group type', really, because if our group leader is
1476         * pinned, so are we.
1477         */
1478        if (event->group_leader != event)
1479                event = event->group_leader;
1480
1481        event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1482        if (!ctx->task)
1483                event_type |= EVENT_CPU;
1484
1485        return event_type;
1486}
1487
1488/*
1489 * Helper function to initialize event group nodes.
1490 */
1491static void init_event_group(struct perf_event *event)
1492{
1493        RB_CLEAR_NODE(&event->group_node);
1494        event->group_index = 0;
1495}
1496
1497/*
1498 * Extract pinned or flexible groups from the context
1499 * based on event attrs bits.
1500 */
1501static struct perf_event_groups *
1502get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1503{
1504        if (event->attr.pinned)
1505                return &ctx->pinned_groups;
1506        else
1507                return &ctx->flexible_groups;
1508}
1509
1510/*
1511 * Helper function to initialize a perf_event_groups tree.
1512 */
1513static void perf_event_groups_init(struct perf_event_groups *groups)
1514{
1515        groups->tree = RB_ROOT;
1516        groups->index = 0;
1517}
1518
1519/*
1520 * Compare function for event groups;
1521 *
1522 * Implements complex key that first sorts by CPU and then by virtual index
1523 * which provides ordering when rotating groups for the same CPU.
1524 */
1525static bool
1526perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1527{
1528        if (left->cpu < right->cpu)
1529                return true;
1530        if (left->cpu > right->cpu)
1531                return false;
1532
1533        if (left->group_index < right->group_index)
1534                return true;
1535        if (left->group_index > right->group_index)
1536                return false;
1537
1538        return false;
1539}
1540
1541/*
1542 * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
1543 * key (see perf_event_groups_less). This places it last inside the CPU
1544 * subtree.
1545 */
1546static void
1547perf_event_groups_insert(struct perf_event_groups *groups,
1548                         struct perf_event *event)
1549{
1550        struct perf_event *node_event;
1551        struct rb_node *parent;
1552        struct rb_node **node;
1553
1554        event->group_index = ++groups->index;
1555
1556        node = &groups->tree.rb_node;
1557        parent = *node;
1558
1559        while (*node) {
1560                parent = *node;
1561                node_event = container_of(*node, struct perf_event, group_node);
1562
1563                if (perf_event_groups_less(event, node_event))
1564                        node = &parent->rb_left;
1565                else
1566                        node = &parent->rb_right;
1567        }
1568
1569        rb_link_node(&event->group_node, parent, node);
1570        rb_insert_color(&event->group_node, &groups->tree);
1571}
1572
1573/*
1574 * Helper function to insert event into the pinned or flexible groups.
1575 */
1576static void
1577add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1578{
1579        struct perf_event_groups *groups;
1580
1581        groups = get_event_groups(event, ctx);
1582        perf_event_groups_insert(groups, event);
1583}
1584
1585/*
1586 * Delete a group from a tree.
1587 */
1588static void
1589perf_event_groups_delete(struct perf_event_groups *groups,
1590                         struct perf_event *event)
1591{
1592        WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1593                     RB_EMPTY_ROOT(&groups->tree));
1594
1595        rb_erase(&event->group_node, &groups->tree);
1596        init_event_group(event);
1597}
1598
1599/*
1600 * Helper function to delete event from its groups.
1601 */
1602static void
1603del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1604{
1605        struct perf_event_groups *groups;
1606
1607        groups = get_event_groups(event, ctx);
1608        perf_event_groups_delete(groups, event);
1609}
1610
1611/*
1612 * Get the leftmost event in the @cpu subtree.
1613 */
1614static struct perf_event *
1615perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1616{
1617        struct perf_event *node_event = NULL, *match = NULL;
1618        struct rb_node *node = groups->tree.rb_node;
1619
1620        while (node) {
1621                node_event = container_of(node, struct perf_event, group_node);
1622
1623                if (cpu < node_event->cpu) {
1624                        node = node->rb_left;
1625                } else if (cpu > node_event->cpu) {
1626                        node = node->rb_right;
1627                } else {
1628                        match = node_event;
1629                        node = node->rb_left;
1630                }
1631        }
1632
1633        return match;
1634}
1635
1636/*
1637 * Like rb_entry_next_safe() for the @cpu subtree.
1638 */
1639static struct perf_event *
1640perf_event_groups_next(struct perf_event *event)
1641{
1642        struct perf_event *next;
1643
1644        next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1645        if (next && next->cpu == event->cpu)
1646                return next;
1647
1648        return NULL;
1649}
1650
1651/*
1652 * Iterate through the whole groups tree.
1653 */
1654#define perf_event_groups_for_each(event, groups)                       \
1655        for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
1656                                typeof(*event), group_node); event;     \
1657                event = rb_entry_safe(rb_next(&event->group_node),      \
1658                                typeof(*event), group_node))
1659
1660/*
1661 * Add an event to the lists for its context.
1662 * Must be called with ctx->mutex and ctx->lock held.
1663 */
1664static void
1665list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1666{
1667        lockdep_assert_held(&ctx->lock);
1668
1669        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1670        event->attach_state |= PERF_ATTACH_CONTEXT;
1671
1672        event->tstamp = perf_event_time(event);
1673
1674        /*
1675         * If we're a standalone event or group leader, we go to the context
1676         * list; grouped events are kept attached to the group so that
1677         * perf_group_detach can, at all times, locate all siblings.
1678         */
1679        if (event->group_leader == event) {
1680                event->group_caps = event->event_caps;
1681                add_event_to_groups(event, ctx);
1682        }
1683
1684        list_update_cgroup_event(event, ctx, true);
1685
1686        list_add_rcu(&event->event_entry, &ctx->event_list);
1687        ctx->nr_events++;
1688        if (event->attr.inherit_stat)
1689                ctx->nr_stat++;
1690
1691        ctx->generation++;
1692}
1693
1694/*
1695 * Initialize event state based on the perf_event_attr::disabled.
1696 */
1697static inline void perf_event__state_init(struct perf_event *event)
1698{
1699        event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1700                                              PERF_EVENT_STATE_INACTIVE;
1701}
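/*
 * Userspace view of the above (a minimal sketch, not part of this kernel
 * code; the helper name is made up): opening an event with attr.disabled = 1
 * leaves it OFF until it is explicitly enabled, e.g. via the
 * PERF_EVENT_IOC_ENABLE ioctl.
 *
 *      #include <linux/perf_event.h>
 *      #include <string.h>
 *      #include <sys/ioctl.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      static int open_disabled_instructions_counter(void)
 *      {
 *              struct perf_event_attr attr;
 *              int fd;
 *
 *              memset(&attr, 0, sizeof(attr));
 *              attr.type = PERF_TYPE_HARDWARE;
 *              attr.size = sizeof(attr);
 *              attr.config = PERF_COUNT_HW_INSTRUCTIONS;
 *              attr.disabled = 1;      // starts in PERF_EVENT_STATE_OFF
 *
 *              // measure the current task on any CPU, no group, no flags
 *              fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *              if (fd >= 0)
 *                      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);    // OFF -> INACTIVE
 *              return fd;
 *      }
 */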
1702
1703static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1704{
1705        int entry = sizeof(u64); /* value */
1706        int size = 0;
1707        int nr = 1;
1708
1709        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1710                size += sizeof(u64);
1711
1712        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1713                size += sizeof(u64);
1714
1715        if (event->attr.read_format & PERF_FORMAT_ID)
1716                entry += sizeof(u64);
1717
1718        if (event->attr.read_format & PERF_FORMAT_GROUP) {
1719                nr += nr_siblings;
1720                size += sizeof(u64);
1721        }
1722
1723        size += entry * nr;
1724        event->read_size = size;
1725}
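/*
 * For reference, a sketch of the read() layout this size corresponds to, as
 * documented for perf_event_open(2) (not a structure used by this file).
 * With PERF_FORMAT_GROUP every sibling contributes one entry and
 * PERF_FORMAT_ID widens each entry by one u64:
 *
 *      struct read_format {
 *              u64 nr;                 // if PERF_FORMAT_GROUP
 *              u64 time_enabled;       // if PERF_FORMAT_TOTAL_TIME_ENABLED
 *              u64 time_running;       // if PERF_FORMAT_TOTAL_TIME_RUNNING
 *              struct {
 *                      u64 value;
 *                      u64 id;         // if PERF_FORMAT_ID
 *              } values[];             // nr entries
 *      };
 *
 * e.g. GROUP | ID with 3 siblings: entry = 16, nr = 4, so
 * read_size = 8 + 16 * 4 = 72 bytes.
 */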
1726
1727static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1728{
1729        struct perf_sample_data *data;
1730        u16 size = 0;
1731
1732        if (sample_type & PERF_SAMPLE_IP)
1733                size += sizeof(data->ip);
1734
1735        if (sample_type & PERF_SAMPLE_ADDR)
1736                size += sizeof(data->addr);
1737
1738        if (sample_type & PERF_SAMPLE_PERIOD)
1739                size += sizeof(data->period);
1740
1741        if (sample_type & PERF_SAMPLE_WEIGHT)
1742                size += sizeof(data->weight);
1743
1744        if (sample_type & PERF_SAMPLE_READ)
1745                size += event->read_size;
1746
1747        if (sample_type & PERF_SAMPLE_DATA_SRC)
1748                size += sizeof(data->data_src.val);
1749
1750        if (sample_type & PERF_SAMPLE_TRANSACTION)
1751                size += sizeof(data->txn);
1752
1753        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1754                size += sizeof(data->phys_addr);
1755
1756        event->header_size = size;
1757}
1758
1759/*
1760 * Called at perf_event creation and when events are attached/detached from a
1761 * group.
1762 */
1763static void perf_event__header_size(struct perf_event *event)
1764{
1765        __perf_event_read_size(event,
1766                               event->group_leader->nr_siblings);
1767        __perf_event_header_size(event, event->attr.sample_type);
1768}
1769
1770static void perf_event__id_header_size(struct perf_event *event)
1771{
1772        struct perf_sample_data *data;
1773        u64 sample_type = event->attr.sample_type;
1774        u16 size = 0;
1775
1776        if (sample_type & PERF_SAMPLE_TID)
1777                size += sizeof(data->tid_entry);
1778
1779        if (sample_type & PERF_SAMPLE_TIME)
1780                size += sizeof(data->time);
1781
1782        if (sample_type & PERF_SAMPLE_IDENTIFIER)
1783                size += sizeof(data->id);
1784
1785        if (sample_type & PERF_SAMPLE_ID)
1786                size += sizeof(data->id);
1787
1788        if (sample_type & PERF_SAMPLE_STREAM_ID)
1789                size += sizeof(data->stream_id);
1790
1791        if (sample_type & PERF_SAMPLE_CPU)
1792                size += sizeof(data->cpu_entry);
1793
1794        event->id_header_size = size;
1795}
1796
1797static bool perf_event_validate_size(struct perf_event *event)
1798{
1799        /*
1800         * The values computed here will be over-written when we actually
1801         * attach the event.
1802         */
1803        __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1804        __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1805        perf_event__id_header_size(event);
1806
1807        /*
1808         * Sum the lot; it must not exceed the 64k limit we have on records. The
1809         * 16k check below is conservative, allowing for callchains and variable fields.
1810         */
1811        if (event->read_size + event->header_size +
1812            event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1813                return false;
1814
1815        return true;
1816}
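/*
 * Rough sizing note (illustrative): with PERF_FORMAT_GROUP | PERF_FORMAT_ID
 * each additional sibling adds 16 bytes to read_size, so a single group on
 * the order of a thousand events runs into the conservative 16k check above
 * and further siblings are refused at creation time.
 */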
1817
1818static void perf_group_attach(struct perf_event *event)
1819{
1820        struct perf_event *group_leader = event->group_leader, *pos;
1821
1822        lockdep_assert_held(&event->ctx->lock);
1823
1824        /*
1825         * We can have double attach due to group movement in perf_event_open.
1826         */
1827        if (event->attach_state & PERF_ATTACH_GROUP)
1828                return;
1829
1830        event->attach_state |= PERF_ATTACH_GROUP;
1831
1832        if (group_leader == event)
1833                return;
1834
1835        WARN_ON_ONCE(group_leader->ctx != event->ctx);
1836
1837        group_leader->group_caps &= event->event_caps;
1838
1839        list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1840        group_leader->nr_siblings++;
1841
1842        perf_event__header_size(group_leader);
1843
1844        for_each_sibling_event(pos, group_leader)
1845                perf_event__header_size(pos);
1846}
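/*
 * Userspace view of the leader/sibling relationship recorded above (a
 * minimal sketch, not part of this kernel code): a sibling is created by
 * passing the leader's fd as the group_fd argument of perf_event_open().
 *
 *      #include <linux/perf_event.h>
 *      #include <string.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      static int open_counter(__u32 type, __u64 config, int group_fd)
 *      {
 *              struct perf_event_attr attr;
 *
 *              memset(&attr, 0, sizeof(attr));
 *              attr.type = type;
 *              attr.size = sizeof(attr);
 *              attr.config = config;
 *              // current task, any CPU
 *              return syscall(__NR_perf_event_open, &attr, 0, -1, group_fd, 0);
 *      }
 *
 *      // leader first (group_fd == -1), then siblings attach to its group:
 *      //   int leader  = open_counter(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, -1);
 *      //   int sibling = open_counter(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, leader);
 */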
1847
1848/*
1849 * Remove an event from the lists for its context.
1850 * Must be called with ctx->mutex and ctx->lock held.
1851 */
1852static void
1853list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1854{
1855        WARN_ON_ONCE(event->ctx != ctx);
1856        lockdep_assert_held(&ctx->lock);
1857
1858        /*
1859         * We can have double detach due to exit/hot-unplug + close.
1860         */
1861        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1862                return;
1863
1864        event->attach_state &= ~PERF_ATTACH_CONTEXT;
1865
1866        list_update_cgroup_event(event, ctx, false);
1867
1868        ctx->nr_events--;
1869        if (event->attr.inherit_stat)
1870                ctx->nr_stat--;
1871
1872        list_del_rcu(&event->event_entry);
1873
1874        if (event->group_leader == event)
1875                del_event_from_groups(event, ctx);
1876
1877        /*
1878         * If the event was in error state, then keep it
1879         * that way; otherwise bogus counts will be
1880         * returned on read(). The only way to get out
1881         * of error state is by explicitly re-enabling
1882         * the event.
1883         */
1884        if (event->state > PERF_EVENT_STATE_OFF)
1885                perf_event_set_state(event, PERF_EVENT_STATE_OFF);
1886
1887        ctx->generation++;
1888}
1889
1890static void perf_group_detach(struct perf_event *event)
1891{
1892        struct perf_event *sibling, *tmp;
1893        struct perf_event_context *ctx = event->ctx;
1894
1895        lockdep_assert_held(&ctx->lock);
1896
1897        /*
1898         * We can have double detach due to exit/hot-unplug + close.
1899         */
1900        if (!(event->attach_state & PERF_ATTACH_GROUP))
1901                return;
1902
1903        event->attach_state &= ~PERF_ATTACH_GROUP;
1904
1905        /*
1906         * If this is a sibling, remove it from its group.
1907         */
1908        if (event->group_leader != event) {
1909                list_del_init(&event->sibling_list);
1910                event->group_leader->nr_siblings--;
1911                goto out;
1912        }
1913
1914        /*
1915         * If this was a group event with sibling events then
1916         * upgrade the siblings to singleton events by adding them
1917         * to whatever list we are on.
1918         */
1919        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
1920
1921                sibling->group_leader = sibling;
1922                list_del_init(&sibling->sibling_list);
1923
1924                /* Inherit group flags from the previous leader */
1925                sibling->group_caps = event->group_caps;
1926
1927                if (!RB_EMPTY_NODE(&event->group_node)) {
1928                        add_event_to_groups(sibling, event->ctx);
1929
1930                        if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1931                                struct list_head *list = sibling->attr.pinned ?
1932                                        &ctx->pinned_active : &ctx->flexible_active;
1933
1934                                list_add_tail(&sibling->active_list, list);
1935                        }
1936                }
1937
1938                WARN_ON_ONCE(sibling->ctx != event->ctx);
1939        }
1940
1941out:
1942        perf_event__header_size(event->group_leader);
1943
1944        for_each_sibling_event(tmp, event->group_leader)
1945                perf_event__header_size(tmp);
1946}
1947
1948static bool is_orphaned_event(struct perf_event *event)
1949{
1950        return event->state == PERF_EVENT_STATE_DEAD;
1951}
1952
1953static inline int __pmu_filter_match(struct perf_event *event)
1954{
1955        struct pmu *pmu = event->pmu;
1956        return pmu->filter_match ? pmu->filter_match(event) : 1;
1957}
1958
1959/*
1960 * Check whether we should attempt to schedule an event group based on
1961 * PMU-specific filtering. An event group can consist of HW and SW events,
1962 * potentially with a SW leader, so we must check all the filters to
1963 * determine whether a group is schedulable.
1964 */
1965static inline int pmu_filter_match(struct perf_event *event)
1966{
1967        struct perf_event *sibling;
1968
1969        if (!__pmu_filter_match(event))
1970                return 0;
1971
1972        for_each_sibling_event(sibling, event) {
1973                if (!__pmu_filter_match(sibling))
1974                        return 0;
1975        }
1976
1977        return 1;
1978}
1979
1980static inline int
1981event_filter_match(struct perf_event *event)
1982{
1983        return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1984               perf_cgroup_match(event) && pmu_filter_match(event);
1985}
1986
1987static void
1988event_sched_out(struct perf_event *event,
1989                  struct perf_cpu_context *cpuctx,
1990                  struct perf_event_context *ctx)
1991{
1992        enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
1993
1994        WARN_ON_ONCE(event->ctx != ctx);
1995        lockdep_assert_held(&ctx->lock);
1996
1997        if (event->state != PERF_EVENT_STATE_ACTIVE)
1998                return;
1999
2000        /*
2001         * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2002         * we can schedule events _OUT_ individually through things like
2003         * __perf_remove_from_context().
2004         */
2005        list_del_init(&event->active_list);
2006
2007        perf_pmu_disable(event->pmu);
2008
2009        event->pmu->del(event, 0);
2010        event->oncpu = -1;
2011
2012        if (READ_ONCE(event->pending_disable) >= 0) {
2013                WRITE_ONCE(event->pending_disable, -1);
2014                state = PERF_EVENT_STATE_OFF;
2015        }
2016        perf_event_set_state(event, state);
2017
2018        if (!is_software_event(event))
2019                cpuctx->active_oncpu--;
2020        if (!--ctx->nr_active)
2021                perf_event_ctx_deactivate(ctx);
2022        if (event->attr.freq && event->attr.sample_freq)
2023                ctx->nr_freq--;
2024        if (event->attr.exclusive || !cpuctx->active_oncpu)
2025                cpuctx->exclusive = 0;
2026
2027        perf_pmu_enable(event->pmu);
2028}
2029
2030static void
2031group_sched_out(struct perf_event *group_event,
2032                struct perf_cpu_context *cpuctx,
2033                struct perf_event_context *ctx)
2034{
2035        struct perf_event *event;
2036
2037        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2038                return;
2039
2040        perf_pmu_disable(ctx->pmu);
2041
2042        event_sched_out(group_event, cpuctx, ctx);
2043
2044        /*
2045         * Schedule out siblings (if any):
2046         */
2047        for_each_sibling_event(event, group_event)
2048                event_sched_out(event, cpuctx, ctx);
2049
2050        perf_pmu_enable(ctx->pmu);
2051
2052        if (group_event->attr.exclusive)
2053                cpuctx->exclusive = 0;
2054}
2055
2056#define DETACH_GROUP    0x01UL
2057
2058/*
2059 * Cross CPU call to remove a performance event
2060 *
2061 * We disable the event on the hardware level first. After that we
2062 * remove it from the context list.
2063 */
2064static void
2065__perf_remove_from_context(struct perf_event *event,
2066                           struct perf_cpu_context *cpuctx,
2067                           struct perf_event_context *ctx,
2068                           void *info)
2069{
2070        unsigned long flags = (unsigned long)info;
2071
2072        if (ctx->is_active & EVENT_TIME) {
2073                update_context_time(ctx);
2074                update_cgrp_time_from_cpuctx(cpuctx);
2075        }
2076
2077        event_sched_out(event, cpuctx, ctx);
2078        if (flags & DETACH_GROUP)
2079                perf_group_detach(event);
2080        list_del_event(event, ctx);
2081
2082        if (!ctx->nr_events && ctx->is_active) {
2083                ctx->is_active = 0;
2084                if (ctx->task) {
2085                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2086                        cpuctx->task_ctx = NULL;
2087                }
2088        }
2089}
2090
2091/*
2092 * Remove the event from a task's (or a CPU's) list of events.
2093 *
2094 * If event->ctx is a cloned context, callers must make sure that
2095 * every task struct that event->ctx->task could possibly point to
2096 * remains valid.  This is OK when called from perf_release since
2097 * that only calls us on the top-level context, which can't be a clone.
2098 * When called from perf_event_exit_task, it's OK because the
2099 * context has been detached from its task.
2100 */
2101static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2102{
2103        struct perf_event_context *ctx = event->ctx;
2104
2105        lockdep_assert_held(&ctx->mutex);
2106
2107        event_function_call(event, __perf_remove_from_context, (void *)flags);
2108
2109        /*
2110         * The above event_function_call() can NO-OP when it hits
2111         * TASK_TOMBSTONE. In that case we must already have been detached
2112         * from the context (by perf_event_exit_event()) but the grouping
2113         * might still be intact.
2114         */
2115        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
2116        if ((flags & DETACH_GROUP) &&
2117            (event->attach_state & PERF_ATTACH_GROUP)) {
2118                /*
2119                 * Since in that case we cannot possibly be scheduled, simply
2120                 * detach now.
2121                 */
2122                raw_spin_lock_irq(&ctx->lock);
2123                perf_group_detach(event);
2124                raw_spin_unlock_irq(&ctx->lock);
2125        }
2126}
2127
2128/*
2129 * Cross CPU call to disable a performance event
2130 */
2131static void __perf_event_disable(struct perf_event *event,
2132                                 struct perf_cpu_context *cpuctx,
2133                                 struct perf_event_context *ctx,
2134                                 void *info)
2135{
2136        if (event->state < PERF_EVENT_STATE_INACTIVE)
2137                return;
2138
2139        if (ctx->is_active & EVENT_TIME) {
2140                update_context_time(ctx);
2141                update_cgrp_time_from_event(event);
2142        }
2143
2144        if (event == event->group_leader)
2145                group_sched_out(event, cpuctx, ctx);
2146        else
2147                event_sched_out(event, cpuctx, ctx);
2148
2149        perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2150}
2151
2152/*
2153 * Disable an event.
2154 *
2155 * If event->ctx is a cloned context, callers must make sure that
2156 * every task struct that event->ctx->task could possibly point to
2157 * remains valid.  This condition is satisfied when called through
2158 * perf_event_for_each_child or perf_event_for_each because they
2159 * hold the top-level event's child_mutex, so any descendant that
2160 * goes to exit will block in perf_event_exit_event().
2161 *
2162 * When called from perf_pending_event it's OK because event->ctx
2163 * is the current context on this CPU and preemption is disabled,
2164 * hence we can't get into perf_event_task_sched_out for this context.
2165 */
2166static void _perf_event_disable(struct perf_event *event)
2167{
2168        struct perf_event_context *ctx = event->ctx;
2169
2170        raw_spin_lock_irq(&ctx->lock);
2171        if (event->state <= PERF_EVENT_STATE_OFF) {
2172                raw_spin_unlock_irq(&ctx->lock);
2173                return;
2174        }
2175        raw_spin_unlock_irq(&ctx->lock);
2176
2177        event_function_call(event, __perf_event_disable, NULL);
2178}
2179
2180void perf_event_disable_local(struct perf_event *event)
2181{
2182        event_function_local(event, __perf_event_disable, NULL);
2183}
2184
2185/*
2186 * Strictly speaking kernel users cannot create groups and therefore this
2187 * interface does not need the perf_event_ctx_lock() magic.
2188 */
2189void perf_event_disable(struct perf_event *event)
2190{
2191        struct perf_event_context *ctx;
2192
2193        ctx = perf_event_ctx_lock(event);
2194        _perf_event_disable(event);
2195        perf_event_ctx_unlock(event, ctx);
2196}
2197EXPORT_SYMBOL_GPL(perf_event_disable);
2198
2199void perf_event_disable_inatomic(struct perf_event *event)
2200{
2201        WRITE_ONCE(event->pending_disable, smp_processor_id());
2202        /* can fail, see perf_pending_event_disable() */
2203        irq_work_queue(&event->pending);
2204}
2205
2206static void perf_set_shadow_time(struct perf_event *event,
2207                                 struct perf_event_context *ctx)
2208{
2209        /*
2210         * use the correct time source for the time snapshot
2211         *
2212         * We could get by without this by leveraging the
2213         * fact that to get to this function, the caller
2214         * has most likely already called update_context_time()
2215         * and update_cgrp_time_xx() and thus both timestamps
2216         * are identical (or very close). Given that tstamp is
2217         * already adjusted for cgroup, we could say that:
2218         *    tstamp - ctx->timestamp
2219         * is equivalent to
2220         *    tstamp - cgrp->timestamp.
2221         *
2222         * Then, in perf_output_read(), the calculation would
2223         * work with no changes because:
2224         * - event is guaranteed scheduled in
2225         * - it is not scheduled out in between
2226         * - thus the timestamp would be the same
2227         *
2228         * But this is a bit hairy.
2229         *
2230         * So instead, we have an explicit cgroup call to remain
2231         * within the time source all along. We believe it
2232         * is cleaner and simpler to understand.
2233         */
2234        if (is_cgroup_event(event))
2235                perf_cgroup_set_shadow_time(event, event->tstamp);
2236        else
2237                event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2238}
2239
2240#define MAX_INTERRUPTS (~0ULL)
2241
2242static void perf_log_throttle(struct perf_event *event, int enable);
2243static void perf_log_itrace_start(struct perf_event *event);
2244
2245static int
2246event_sched_in(struct perf_event *event,
2247                 struct perf_cpu_context *cpuctx,
2248                 struct perf_event_context *ctx)
2249{
2250        int ret = 0;
2251
2252        lockdep_assert_held(&ctx->lock);
2253
2254        if (event->state <= PERF_EVENT_STATE_OFF)
2255                return 0;
2256
2257        WRITE_ONCE(event->oncpu, smp_processor_id());
2258        /*
2259         * Order event::oncpu write to happen before the ACTIVE state is
2260         * visible. This allows perf_event_{stop,read}() to observe the correct
2261         * ->oncpu if it sees ACTIVE.
2262         */
2263        smp_wmb();
2264        perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2265
2266        /*
2267         * Unthrottle events: since we just got scheduled in, we might have
2268         * missed several ticks already, and for a heavily scheduling task
2269         * there is little guarantee it'll get a tick in a timely manner.
2270         */
2271        if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2272                perf_log_throttle(event, 1);
2273                event->hw.interrupts = 0;
2274        }
2275
2276        perf_pmu_disable(event->pmu);
2277
2278        perf_set_shadow_time(event, ctx);
2279
2280        perf_log_itrace_start(event);
2281
2282        if (event->pmu->add(event, PERF_EF_START)) {
2283                perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2284                event->oncpu = -1;
2285                ret = -EAGAIN;
2286                goto out;
2287        }
2288
2289        if (!is_software_event(event))
2290                cpuctx->active_oncpu++;
2291        if (!ctx->nr_active++)
2292                perf_event_ctx_activate(ctx);
2293        if (event->attr.freq && event->attr.sample_freq)
2294                ctx->nr_freq++;
2295
2296        if (event->attr.exclusive)
2297                cpuctx->exclusive = 1;
2298
2299out:
2300        perf_pmu_enable(event->pmu);
2301
2302        return ret;
2303}
2304
2305static int
2306group_sched_in(struct perf_event *group_event,
2307               struct perf_cpu_context *cpuctx,
2308               struct perf_event_context *ctx)
2309{
2310        struct perf_event *event, *partial_group = NULL;
2311        struct pmu *pmu = ctx->pmu;
2312
2313        if (group_event->state == PERF_EVENT_STATE_OFF)
2314                return 0;
2315
2316        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2317
2318        if (event_sched_in(group_event, cpuctx, ctx)) {
2319                pmu->cancel_txn(pmu);
2320                perf_mux_hrtimer_restart(cpuctx);
2321                return -EAGAIN;
2322        }
2323
2324        /*
2325         * Schedule in siblings as one group (if any):
2326         */
2327        for_each_sibling_event(event, group_event) {
2328                if (event_sched_in(event, cpuctx, ctx)) {
2329                        partial_group = event;
2330                        goto group_error;
2331                }
2332        }
2333
2334        if (!pmu->commit_txn(pmu))
2335                return 0;
2336
2337group_error:
2338        /*
2339         * Groups can be scheduled in as one unit only, so undo any
2340         * partial group before returning:
2341         * The events up to the failed event are scheduled out normally.
2342         */
2343        for_each_sibling_event(event, group_event) {
2344                if (event == partial_group)
2345                        break;
2346
2347                event_sched_out(event, cpuctx, ctx);
2348        }
2349        event_sched_out(group_event, cpuctx, ctx);
2350
2351        pmu->cancel_txn(pmu);
2352
2353        perf_mux_hrtimer_restart(cpuctx);
2354
2355        return -EAGAIN;
2356}
2357
2358/*
2359 * Work out whether we can put this event group on the CPU now.
2360 */
2361static int group_can_go_on(struct perf_event *event,
2362                           struct perf_cpu_context *cpuctx,
2363                           int can_add_hw)
2364{
2365        /*
2366         * Groups consisting entirely of software events can always go on.
2367         */
2368        if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2369                return 1;
2370        /*
2371         * If an exclusive group is already on, no other hardware
2372         * events can go on.
2373         */
2374        if (cpuctx->exclusive)
2375                return 0;
2376        /*
2377         * If this group is exclusive and there are already
2378         * events on the CPU, it can't go on.
2379         */
2380        if (event->attr.exclusive && cpuctx->active_oncpu)
2381                return 0;
2382        /*
2383         * Otherwise, try to add it if all previous groups were able
2384         * to go on.
2385         */
2386        return can_add_hw;
2387}
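/*
 * Note on attr.exclusive (illustrative): an event group whose leader was
 * opened with
 *
 *      attr.exclusive = 1;
 *
 * asks to be the only group using the PMU counters while it is scheduled,
 * which is what the cpuctx->exclusive and active_oncpu checks above enforce.
 */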
2388
2389static void add_event_to_ctx(struct perf_event *event,
2390                               struct perf_event_context *ctx)
2391{
2392        list_add_event(event, ctx);
2393        perf_group_attach(event);
2394}
2395
2396static void ctx_sched_out(struct perf_event_context *ctx,
2397                          struct perf_cpu_context *cpuctx,
2398                          enum event_type_t event_type);
2399static void
2400ctx_sched_in(struct perf_event_context *ctx,
2401             struct perf_cpu_context *cpuctx,
2402             enum event_type_t event_type,
2403             struct task_struct *task);
2404
2405static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2406                               struct perf_event_context *ctx,
2407                               enum event_type_t event_type)
2408{
2409        if (!cpuctx->task_ctx)
2410                return;
2411
2412        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2413                return;
2414
2415        ctx_sched_out(ctx, cpuctx, event_type);
2416}
2417
2418static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2419                                struct perf_event_context *ctx,
2420                                struct task_struct *task)
2421{
2422        cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2423        if (ctx)
2424                ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2425        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2426        if (ctx)
2427                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2428}
2429
2430/*
2431 * We want to maintain the following priority of scheduling:
2432 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2433 *  - task pinned (EVENT_PINNED)
2434 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2435 *  - task flexible (EVENT_FLEXIBLE).
2436 *
2437 * In order to avoid unscheduling and scheduling back in everything every
2438 * time an event is added, only do it for the groups of equal priority and
2439 * below.
2440 *
2441 * This can be called after a batch operation on task events, in which case
2442 * event_type is a bit mask of the types of events involved. For CPU events,
2443 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2444 */
2445static void ctx_resched(struct perf_cpu_context *cpuctx,
2446                        struct perf_event_context *task_ctx,
2447                        enum event_type_t event_type)
2448{
2449        enum event_type_t ctx_event_type;
2450        bool cpu_event = !!(event_type & EVENT_CPU);
2451
2452        /*
2453         * If pinned groups are involved, flexible groups also need to be
2454         * scheduled out.
2455         */
2456        if (event_type & EVENT_PINNED)
2457                event_type |= EVENT_FLEXIBLE;
2458
2459        ctx_event_type = event_type & EVENT_ALL;
2460
2461        perf_pmu_disable(cpuctx->ctx.pmu);
2462        if (task_ctx)
2463                task_ctx_sched_out(cpuctx, task_ctx, event_type);
2464
2465        /*
2466         * Decide which cpu ctx groups to schedule out based on the types
2467         * of events that caused rescheduling:
2468         *  - EVENT_CPU: schedule out corresponding groups;
2469         *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2470         *  - otherwise, do nothing more.
2471         */
2472        if (cpu_event)
2473                cpu_ctx_sched_out(cpuctx, ctx_event_type);
2474        else if (ctx_event_type & EVENT_PINNED)
2475                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2476
2477        perf_event_sched_in(cpuctx, task_ctx, current);
2478        perf_pmu_enable(cpuctx->ctx.pmu);
2479}
2480
2481void perf_pmu_resched(struct pmu *pmu)
2482{
2483        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2484        struct perf_event_context *task_ctx = cpuctx->task_ctx;
2485
2486        perf_ctx_lock(cpuctx, task_ctx);
2487        ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2488        perf_ctx_unlock(cpuctx, task_ctx);
2489}
2490
2491/*
2492 * Cross CPU call to install and enable a performance event
2493 *
2494 * Very similar to remote_function() + event_function() but cannot assume that
2495 * things like ctx->is_active and cpuctx->task_ctx are set.
2496 */
2497static int  __perf_install_in_context(void *info)
2498{
2499        struct perf_event *event = info;
2500        struct perf_event_context *ctx = event->ctx;
2501        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2502        struct perf_event_context *task_ctx = cpuctx->task_ctx;
2503        bool reprogram = true;
2504        int ret = 0;
2505
2506        raw_spin_lock(&cpuctx->ctx.lock);
2507        if (ctx->task) {
2508                raw_spin_lock(&ctx->lock);
2509                task_ctx = ctx;
2510
2511                reprogram = (ctx->task == current);
2512
2513                /*
2514                 * If the task is running, it must be running on this CPU,
2515                 * otherwise we cannot reprogram things.
2516                 *
2517                 * If it's not running, we don't care; ctx->lock will
2518                 * serialize against it becoming runnable.
2519                 */
2520                if (task_curr(ctx->task) && !reprogram) {
2521                        ret = -ESRCH;
2522                        goto unlock;
2523                }
2524
2525                WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2526        } else if (task_ctx) {
2527                raw_spin_lock(&task_ctx->lock);
2528        }
2529
2530#ifdef CONFIG_CGROUP_PERF
2531        if (is_cgroup_event(event)) {
2532                /*
2533                 * If the current cgroup doesn't match the event's
2534                 * cgroup, we should not try to schedule it.
2535                 */
2536                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2537                reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2538                                        event->cgrp->css.cgroup);
2539        }
2540#endif
2541
2542        if (reprogram) {
2543                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2544                add_event_to_ctx(event, ctx);
2545                ctx_resched(cpuctx, task_ctx, get_event_type(event));
2546        } else {
2547                add_event_to_ctx(event, ctx);
2548        }
2549
2550unlock:
2551        perf_ctx_unlock(cpuctx, task_ctx);
2552
2553        return ret;
2554}
2555
2556static bool exclusive_event_installable(struct perf_event *event,
2557                                        struct perf_event_context *ctx);
2558
2559/*
2560 * Attach a performance event to a context.
2561 *
2562 * Very similar to event_function_call, see comment there.
2563 */
2564static void
2565perf_install_in_context(struct perf_event_context *ctx,
2566                        struct perf_event *event,
2567                        int cpu)
2568{
2569        struct task_struct *task = READ_ONCE(ctx->task);
2570
2571        lockdep_assert_held(&ctx->mutex);
2572
2573        WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2574
2575        if (event->cpu != -1)
2576                event->cpu = cpu;
2577
2578        /*
2579         * Ensures that if we can observe event->ctx, both the event and ctx
2580         * will be 'complete'. See perf_iterate_sb_cpu().
2581         */
2582        smp_store_release(&event->ctx, ctx);
2583
2584        if (!task) {
2585                cpu_function_call(cpu, __perf_install_in_context, event);
2586                return;
2587        }
2588
2589        /*
2590         * Should not happen; we validate that the ctx is still alive before calling.
2591         */
2592        if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2593                return;
2594
2595        /*
2596         * Installing events is tricky because we cannot rely on ctx->is_active
2597         * to be set in case this is the nr_events 0 -> 1 transition.
2598         *
2599         * Instead we use task_curr(), which tells us if the task is running.
2600         * However, since we use task_curr() outside of rq::lock, we can race
2601         * against the actual state. This means the result can be wrong.
2602         *
2603         * If we get a false positive, we retry, this is harmless.
2604         *
2605         * If we get a false negative, things are complicated. If we are after
2606         * perf_event_context_sched_in() ctx::lock will serialize us, and the
2607         * value must be correct. If we're before, it doesn't matter since
2608         * perf_event_context_sched_in() will program the counter.
2609         *
2610         * However, this hinges on the remote context switch having observed
2611         * our task->perf_event_ctxp[] store, such that it will in fact take
2612         * ctx::lock in perf_event_context_sched_in().
2613         *
2614         * We do this by task_function_call(): if the IPI fails to hit the task,
2615         * we know any future context switch of the task must see the
2616         * perf_event_ctxp[] store.
2617         */
2618
2619        /*
2620         * This smp_mb() orders the task->perf_event_ctxp[] store with the
2621         * task_cpu() load, such that if the IPI then does not find the task
2622         * running, a future context switch of that task must observe the
2623         * store.
2624         */
2625        smp_mb();
2626again:
2627        if (!task_function_call(task, __perf_install_in_context, event))
2628                return;
2629
2630        raw_spin_lock_irq(&ctx->lock);
2631        task = ctx->task;
2632        if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2633                /*
2634                 * Cannot happen because we already checked above (which also
2635                 * cannot happen), and we hold ctx->mutex, which serializes us
2636                 * against perf_event_exit_task_context().
2637                 */
2638                raw_spin_unlock_irq(&ctx->lock);
2639                return;
2640        }
2641        /*
2642         * If the task is not running, ctx->lock will avoid it becoming so,
2643         * thus we can safely install the event.
2644         */
2645        if (task_curr(task)) {
2646                raw_spin_unlock_irq(&ctx->lock);
2647                goto again;
2648        }
2649        add_event_to_ctx(event, ctx);
2650        raw_spin_unlock_irq(&ctx->lock);
2651}
2652
2653/*
2654 * Cross CPU call to enable a performance event
2655 */
2656static void __perf_event_enable(struct perf_event *event,
2657                                struct perf_cpu_context *cpuctx,
2658                                struct perf_event_context *ctx,
2659                                void *info)
2660{
2661        struct perf_event *leader = event->group_leader;
2662        struct perf_event_context *task_ctx;
2663
2664        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2665            event->state <= PERF_EVENT_STATE_ERROR)
2666                return;
2667
2668        if (ctx->is_active)
2669                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2670
2671        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2672
2673        if (!ctx->is_active)
2674                return;
2675
2676        if (!event_filter_match(event)) {
2677                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2678                return;
2679        }
2680
2681        /*
2682         * If the event is in a group and isn't the group leader,
2683         * then don't put it on unless the group is on.
2684         */
2685        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2686                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2687                return;
2688        }
2689
2690        task_ctx = cpuctx->task_ctx;
2691        if (ctx->task)
2692                WARN_ON_ONCE(task_ctx != ctx);
2693
2694        ctx_resched(cpuctx, task_ctx, get_event_type(event));
2695}
2696
2697/*
2698 * Enable an event.
2699 *
2700 * If event->ctx is a cloned context, callers must make sure that
2701 * every task struct that event->ctx->task could possibly point to
2702 * remains valid.  This condition is satisfied when called through
2703 * perf_event_for_each_child or perf_event_for_each as described
2704 * for perf_event_disable.
2705 */
2706static void _perf_event_enable(struct perf_event *event)
2707{
2708        struct perf_event_context *ctx = event->ctx;
2709
2710        raw_spin_lock_irq(&ctx->lock);
2711        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2712            event->state <  PERF_EVENT_STATE_ERROR) {
2713                raw_spin_unlock_irq(&ctx->lock);
2714                return;
2715        }
2716
2717        /*
2718         * If the event is in error state, clear that first.
2719         *
2720         * That way, if we see the event in error state below, we know that it
2721         * has gone back into error state, as distinct from the task having
2722         * been scheduled away before the cross-call arrived.
2723         */
2724        if (event->state == PERF_EVENT_STATE_ERROR)
2725                event->state = PERF_EVENT_STATE_OFF;
2726        raw_spin_unlock_irq(&ctx->lock);
2727
2728        event_function_call(event, __perf_event_enable, NULL);
2729}
2730
2731/*
2732 * See perf_event_disable();
2733 */
2734void perf_event_enable(struct perf_event *event)
2735{
2736        struct perf_event_context *ctx;
2737
2738        ctx = perf_event_ctx_lock(event);
2739        _perf_event_enable(event);
2740        perf_event_ctx_unlock(event, ctx);
2741}
2742EXPORT_SYMBOL_GPL(perf_event_enable);
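/*
 * Userspace view (a minimal sketch, not part of this kernel code): the same
 * _perf_event_enable()/_perf_event_disable() helpers also back the
 * corresponding ioctls on an event fd:
 *
 *      // stop counting, read a consistent value, then resume:
 *      //   ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *      //   read(fd, &value, sizeof(value));
 *      //   ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 */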
2743
2744struct stop_event_data {
2745        struct perf_event       *event;
2746        unsigned int            restart;
2747};
2748
2749static int __perf_event_stop(void *info)
2750{
2751        struct stop_event_data *sd = info;
2752        struct perf_event *event = sd->event;
2753
2754        /* if it's already INACTIVE, do nothing */
2755        if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2756                return 0;
2757
2758        /* matches smp_wmb() in event_sched_in() */
2759        smp_rmb();
2760
2761        /*
2762         * There is a window with interrupts enabled before we get here,
2763         * so we need to check again lest we try to stop another CPU's event.
2764         */
2765        if (READ_ONCE(event->oncpu) != smp_processor_id())
2766                return -EAGAIN;
2767
2768        event->pmu->stop(event, PERF_EF_UPDATE);
2769
2770        /*
2771         * May race with the actual stop (through perf_pmu_output_stop()),
2772         * but it is only used for events with AUX ring buffer, and such
2773         * events will refuse to restart because of rb::aux_mmap_count==0,
2774         * see comments in perf_aux_output_begin().
2775         *
2776         * Since this is happening on an event-local CPU, no trace is lost
2777         * while restarting.
2778         */
2779        if (sd->restart)
2780                event->pmu->start(event, 0);
2781
2782        return 0;
2783}
2784
2785static int perf_event_stop(struct perf_event *event, int restart)
2786{
2787        struct stop_event_data sd = {
2788                .event          = event,
2789                .restart        = restart,
2790        };
2791        int ret = 0;
2792
2793        do {
2794                if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2795                        return 0;
2796
2797                /* matches smp_wmb() in event_sched_in() */
2798                smp_rmb();
2799
2800                /*
2801                 * We only want to restart ACTIVE events, so if the event goes
2802                 * inactive here (event->oncpu==-1), there's nothing more to do;
2803                 * fall through with ret==-ENXIO.
2804                 */
2805                ret = cpu_function_call(READ_ONCE(event->oncpu),
2806                                        __perf_event_stop, &sd);
2807        } while (ret == -EAGAIN);
2808
2809        return ret;
2810}
2811
2812/*
2813 * In order to contain the amount of racy and tricky code in the address
2814 * filter configuration management, it is a two-part process:
2815 *
2816 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2817 *      we update the addresses of corresponding vmas in
2818 *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
2819 * (p2) when an event is scheduled in (pmu::add), it calls
2820 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2821 *      if the generation has changed since the previous call.
2822 *
2823 * If (p1) happens while the event is active, we restart it to force (p2).
2824 *
2825 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2826 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
2827 *     ioctl;
2828 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2829 *     registered mapping, called for every new mmap(), with mm::mmap_sem down
2830 *     for reading;
2831 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2832 *     of exec.
2833 */
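/*
 * Userspace view (a hedged sketch, not part of this kernel code): new
 * filters arrive via the PERF_EVENT_IOC_SET_FILTER ioctl as a string, for
 * example limiting instruction tracing to a range inside a binary (the path
 * below is made up):
 *
 *      //   const char *f = "filter 0x1000/0x2000@/usr/bin/example";
 *      //   ioctl(fd, PERF_EVENT_IOC_SET_FILTER, f);
 *
 * which is what ends up in (1) above; later mmap()/exec() activity in the
 * traced task triggers (2) and (3).
 */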
2834void perf_event_addr_filters_sync(struct perf_event *event)
2835{
2836        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2837
2838        if (!has_addr_filter(event))
2839                return;
2840
2841        raw_spin_lock(&ifh->lock);
2842        if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2843                event->pmu->addr_filters_sync(event);
2844                event->hw.addr_filters_gen = event->addr_filters_gen;
2845        }
2846        raw_spin_unlock(&ifh->lock);
2847}
2848EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2849
2850static int _perf_event_refresh(struct perf_event *event, int refresh)
2851{
2852        /*
2853         * not supported on inherited events
2854         */
2855        if (event->attr.inherit || !is_sampling_event(event))
2856                return -EINVAL;
2857
2858        atomic_add(refresh, &event->event_limit);
2859        _perf_event_enable(event);
2860
2861        return 0;
2862}
2863
2864/*
2865 * See perf_event_disable()
2866 */
2867int perf_event_refresh(struct perf_event *event, int refresh)
2868{
2869        struct perf_event_context *ctx;
2870        int ret;
2871
2872        ctx = perf_event_ctx_lock(event);
2873        ret = _perf_event_refresh(event, refresh);
2874        perf_event_ctx_unlock(event, ctx);
2875
2876        return ret;
2877}
2878EXPORT_SYMBOL_GPL(perf_event_refresh);
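/*
 * Userspace view (a minimal sketch, not part of this kernel code): the
 * refresh count added above backs PERF_EVENT_IOC_REFRESH, which arms a
 * sampling event for a limited number of overflows, typically together with
 * an overflow signal handler:
 *
 *      // take the next 3 overflow notifications, then auto-disable:
 *      //   ioctl(fd, PERF_EVENT_IOC_REFRESH, 3);
 */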
2879
2880static int perf_event_modify_breakpoint(struct perf_event *bp,
2881                                         struct perf_event_attr *attr)
2882{
2883        int err;
2884
2885        _perf_event_disable(bp);
2886
2887        err = modify_user_hw_breakpoint_check(bp, attr, true);
2888
2889        if (!bp->attr.disabled)
2890                _perf_event_enable(bp);
2891
2892        return err;
2893}
2894
2895static int perf_event_modify_attr(struct perf_event *event,
2896                                  struct perf_event_attr *attr)
2897{
2898        if (event->attr.type != attr->type)
2899                return -EINVAL;
2900
2901        switch (event->attr.type) {
2902        case PERF_TYPE_BREAKPOINT:
2903                return perf_event_modify_breakpoint(event, attr);
2904        default:
2905                /* Placeholder for future additions. */
2906                return -EOPNOTSUPP;
2907        }
2908}
2909
2910static void ctx_sched_out(struct perf_event_context *ctx,
2911                          struct perf_cpu_context *cpuctx,
2912                          enum event_type_t event_type)
2913{
2914        struct perf_event *event, *tmp;
2915        int is_active = ctx->is_active;
2916
2917        lockdep_assert_held(&ctx->lock);
2918
2919        if (likely(!ctx->nr_events)) {
2920                /*
2921                 * See __perf_remove_from_context().
2922                 */
2923                WARN_ON_ONCE(ctx->is_active);
2924                if (ctx->task)
2925                        WARN_ON_ONCE(cpuctx->task_ctx);
2926                return;
2927        }
2928
2929        ctx->is_active &= ~event_type;
2930        if (!(ctx->is_active & EVENT_ALL))
2931                ctx->is_active = 0;
2932
2933        if (ctx->task) {
2934                WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2935                if (!ctx->is_active)
2936                        cpuctx->task_ctx = NULL;
2937        }
2938
2939        /*
2940         * Always update time if it was set; not only when it changes.
2941         * Otherwise we can 'forget' to update time for any but the last
2942         * context we sched out. For example:
2943         *
2944         *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2945         *   ctx_sched_out(.event_type = EVENT_PINNED)
2946         *
2947         * would only update time for the pinned events.
2948         */
2949        if (is_active & EVENT_TIME) {
2950                /* update (and stop) ctx time */
2951                update_context_time(ctx);
2952                update_cgrp_time_from_cpuctx(cpuctx);
2953        }
2954
2955        is_active ^= ctx->is_active; /* changed bits */
2956
2957        if (!ctx->nr_active || !(is_active & EVENT_ALL))
2958                return;
2959
2960        /*
2961         * If we had been multiplexing, no rotations are necessary now that no
2962         * events are active.
2963         */
2964        ctx->rotate_necessary = 0;
2965
2966        perf_pmu_disable(ctx->pmu);
2967        if (is_active & EVENT_PINNED) {
2968                list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
2969                        group_sched_out(event, cpuctx, ctx);
2970        }
2971
2972        if (is_active & EVENT_FLEXIBLE) {
2973                list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
2974                        group_sched_out(event, cpuctx, ctx);
2975        }
2976        perf_pmu_enable(ctx->pmu);
2977}
2978
2979/*
2980 * Test whether two contexts are equivalent, i.e. whether they have both been
2981 * cloned from the same version of the same context.
2982 *
2983 * Equivalence is measured using a generation number in the context that is
2984 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2985 * and list_del_event().
2986 */
2987static int context_equiv(struct perf_event_context *ctx1,
2988                         struct perf_event_context *ctx2)
2989{
2990        lockdep_assert_held(&ctx1->lock);
2991        lockdep_assert_held(&ctx2->lock);
2992
2993        /* Pinning disables the swap optimization */
2994        if (ctx1->pin_count || ctx2->pin_count)
2995                return 0;
2996
2997        /* If ctx1 is the parent of ctx2 */
2998        if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2999                return 1;
3000
3001        /* If ctx2 is the parent of ctx1 */
3002        if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3003                return 1;
3004
3005        /*
3006         * If ctx1 and ctx2 have the same parent; we flatten the parent
3007         * hierarchy, see perf_event_init_context().
3008         */
3009        if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3010                        ctx1->parent_gen == ctx2->parent_gen)
3011                return 1;
3012
3013        /* Unmatched */
3014        return 0;
3015}
3016
3017static void __perf_event_sync_stat(struct perf_event *event,
3018                                     struct perf_event *next_event)
3019{
3020        u64 value;
3021
3022        if (!event->attr.inherit_stat)
3023                return;
3024
3025        /*
3026         * Update the event value; we cannot use perf_event_read()
3027         * because we're in the middle of a context switch and have IRQs
3028         * disabled, which upsets smp_call_function_single(). However,
3029         * we know the event must be on the current CPU, so we
3030         * don't need to use it.
3031         */
3032        if (event->state == PERF_EVENT_STATE_ACTIVE)
3033                event->pmu->read(event);
3034
3035        perf_event_update_time(event);
3036
3037        /*
3038         * In order to keep per-task stats reliable we need to flip the event
3039         * values when we flip the contexts.
3040         */
3041        value = local64_read(&next_event->count);
3042        value = local64_xchg(&event->count, value);
3043        local64_set(&next_event->count, value);
3044
3045        swap(event->total_time_enabled, next_event->total_time_enabled);
3046        swap(event->total_time_running, next_event->total_time_running);
3047
3048        /*
3049         * Since we swizzled the values, update the user visible data too.
3050         */
3051        perf_event_update_userpage(event);
3052        perf_event_update_userpage(next_event);
3053}
3054
3055static void perf_event_sync_stat(struct perf_event_context *ctx,
3056                                   struct perf_event_context *next_ctx)
3057{
3058        struct perf_event *event, *next_event;
3059
3060        if (!ctx->nr_stat)
3061                return;
3062
3063        update_context_time(ctx);
3064
3065        event = list_first_entry(&ctx->event_list,
3066                                   struct perf_event, event_entry);
3067
3068        next_event = list_first_entry(&next_ctx->event_list,
3069                                        struct perf_event, event_entry);
3070
3071        while (&event->event_entry != &ctx->event_list &&
3072               &next_event->event_entry != &next_ctx->event_list) {
3073
3074                __perf_event_sync_stat(event, next_event);
3075
3076                event = list_next_entry(event, event_entry);
3077                next_event = list_next_entry(next_event, event_entry);
3078        }
3079}
3080
3081static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3082                                         struct task_struct *next)
3083{
3084        struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3085        struct perf_event_context *next_ctx;
3086        struct perf_event_context *parent, *next_parent;
3087        struct perf_cpu_context *cpuctx;
3088        int do_switch = 1;
3089
3090        if (likely(!ctx))
3091                return;
3092
3093        cpuctx = __get_cpu_context(ctx);
3094        if (!cpuctx->task_ctx)
3095                return;
3096
3097        rcu_read_lock();
3098        next_ctx = next->perf_event_ctxp[ctxn];
3099        if (!next_ctx)
3100                goto unlock;
3101
3102        parent = rcu_dereference(ctx->parent_ctx);
3103        next_parent = rcu_dereference(next_ctx->parent_ctx);
3104
3105        /* If neither context has a parent context, they cannot be clones. */
3106        if (!parent && !next_parent)
3107                goto unlock;
3108
3109        if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3110                /*
3111                 * Looks like the two contexts are clones, so we might be
3112                 * able to optimize the context switch.  We lock both
3113                 * contexts and check that they are clones under the
3114                 * lock (including re-checking that neither has been
3115                 * uncloned in the meantime).  It doesn't matter which
3116                 * order we take the locks because no other cpu could
3117                 * be trying to lock both of these tasks.
3118                 */
3119                raw_spin_lock(&ctx->lock);
3120                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3121                if (context_equiv(ctx, next_ctx)) {
3122                        WRITE_ONCE(ctx->task, next);
3123                        WRITE_ONCE(next_ctx->task, task);
3124
3125                        swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3126
3127                        /*
3128                         * RCU_INIT_POINTER here is safe because we've not
3129                         * modified the ctx and the above modification of
3130                         * ctx->task and ctx->task_ctx_data are immaterial
3131                         * since those values are always verified under
3132                         * ctx->lock which we're now holding.
3133                         */
3134                        RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3135                        RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3136
3137                        do_switch = 0;
3138
3139                        perf_event_sync_stat(ctx, next_ctx);
3140                }
3141                raw_spin_unlock(&next_ctx->lock);
3142                raw_spin_unlock(&ctx->lock);
3143        }
3144unlock:
3145        rcu_read_unlock();
3146
3147        if (do_switch) {
3148                raw_spin_lock(&ctx->lock);
3149                task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3150                raw_spin_unlock(&ctx->lock);
3151        }
3152}
3153
3154static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3155
3156void perf_sched_cb_dec(struct pmu *pmu)
3157{
3158        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3159
3160        this_cpu_dec(perf_sched_cb_usages);
3161
3162        if (!--cpuctx->sched_cb_usage)
3163                list_del(&cpuctx->sched_cb_entry);
3164}
3165
3166
3167void perf_sched_cb_inc(struct pmu *pmu)
3168{
3169        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3170
3171        if (!cpuctx->sched_cb_usage++)
3172                list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3173
3174        this_cpu_inc(perf_sched_cb_usages);
3175}
3176
3177/*
3178 * This function provides the context switch callback to the lower code
3179 * layer. It is invoked ONLY when the context switch callback is enabled.
3180 *
3181 * This callback is relevant even to per-cpu events; for example multi event
3182 * PEBS requires this to provide PID/TID information. This requires we flush
3183 * all queued PEBS records before we context switch to a new task.
3184 */
3185static void perf_pmu_sched_task(struct task_struct *prev,
3186                                struct task_struct *next,
3187                                bool sched_in)
3188{
3189        struct perf_cpu_context *cpuctx;
3190        struct pmu *pmu;
3191
3192        if (prev == next)
3193                return;
3194
3195        list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3196                pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3197
3198                if (WARN_ON_ONCE(!pmu->sched_task))
3199                        continue;
3200
3201                perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3202                perf_pmu_disable(pmu);
3203
3204                pmu->sched_task(cpuctx->task_ctx, sched_in);
3205
3206                perf_pmu_enable(pmu);
3207                perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3208        }
3209}
3210
3211static void perf_event_switch(struct task_struct *task,
3212                              struct task_struct *next_prev, bool sched_in);
3213
3214#define for_each_task_context_nr(ctxn)                                  \
3215        for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3216
3217/*
3218 * Called from scheduler to remove the events of the current task,
3219 * with interrupts disabled.
3220 *
3221 * We stop each event and update the event value in event->count.
3222 *
3223 * This does not protect us against NMI, but disable()
3224 * sets the disabled bit in the control field of event _before_
3225 * accessing the event control register. If an NMI hits, then it will
3226 * not restart the event.
3227 */
3228void __perf_event_task_sched_out(struct task_struct *task,
3229                                 struct task_struct *next)
3230{
3231        int ctxn;
3232
3233        if (__this_cpu_read(perf_sched_cb_usages))
3234                perf_pmu_sched_task(task, next, false);
3235
3236        if (atomic_read(&nr_switch_events))
3237                perf_event_switch(task, next, false);
3238
3239        for_each_task_context_nr(ctxn)
3240                perf_event_context_sched_out(task, ctxn, next);
3241
3242        /*
3243         * If cgroup events exist on this CPU, then we need
3244         * to check if we have to switch out PMU state.
3245         * Cgroup events are in system-wide mode only.
3246         */
3247        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3248                perf_cgroup_sched_out(task, next);
3249}
3250
3251/*
3252 * Called with IRQs disabled
3253 */
3254static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3255                              enum event_type_t event_type)
3256{
3257        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3258}
3259
3260static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3261                              int (*func)(struct perf_event *, void *), void *data)
3262{
3263        struct perf_event **evt, *evt1, *evt2;
3264        int ret;
3265
3266        evt1 = perf_event_groups_first(groups, -1);
3267        evt2 = perf_event_groups_first(groups, cpu);
3268
3269        while (evt1 || evt2) {
3270                if (evt1 && evt2) {
3271                        if (evt1->group_index < evt2->group_index)
3272                                evt = &evt1;
3273                        else
3274                                evt = &evt2;
3275                } else if (evt1) {
3276                        evt = &evt1;
3277                } else {
3278                        evt = &evt2;
3279                }
3280
3281                ret = func(*evt, data);
3282                if (ret)
3283                        return ret;
3284
3285                *evt = perf_event_groups_next(*evt);
3286        }
3287
3288        return 0;
3289}
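
/*
 * For example (illustrative values): if the "any CPU" (cpu == -1) sub-tree
 * holds groups with group_index 1 and 4, and the CPU-local sub-tree holds
 * group_index 2 and 3, the merge above visits them as 1, 2, 3, 4, i.e. in
 * ascending group_index (insertion) order across both sub-trees.
 */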
3290
3291struct sched_in_data {
3292        struct perf_event_context *ctx;
3293        struct perf_cpu_context *cpuctx;
3294        int can_add_hw;
3295};
3296
3297static int pinned_sched_in(struct perf_event *event, void *data)
3298{
3299        struct sched_in_data *sid = data;
3300
3301        if (event->state <= PERF_EVENT_STATE_OFF)
3302                return 0;
3303
3304        if (!event_filter_match(event))
3305                return 0;
3306
3307        if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3308                if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3309                        list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3310        }
3311
3312        /*
3313         * If this pinned group hasn't been scheduled,
3314         * put it in error state.
3315         */
3316        if (event->state == PERF_EVENT_STATE_INACTIVE)
3317                perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3318
3319        return 0;
3320}
3321
3322static int flexible_sched_in(struct perf_event *event, void *data)
3323{
3324        struct sched_in_data *sid = data;
3325
3326        if (event->state <= PERF_EVENT_STATE_OFF)
3327                return 0;
3328
3329        if (!event_filter_match(event))
3330                return 0;
3331
3332        if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3333                int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
3334                if (ret) {
3335                        sid->can_add_hw = 0;
3336                        sid->ctx->rotate_necessary = 1;
3337                        return 0;
3338                }
3339                list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3340        }
3341
3342        return 0;
3343}
3344
3345static void
3346ctx_pinned_sched_in(struct perf_event_context *ctx,
3347                    struct perf_cpu_context *cpuctx)
3348{
3349        struct sched_in_data sid = {
3350                .ctx = ctx,
3351                .cpuctx = cpuctx,
3352                .can_add_hw = 1,
3353        };
3354
3355        visit_groups_merge(&ctx->pinned_groups,
3356                           smp_processor_id(),
3357                           pinned_sched_in, &sid);
3358}
3359
3360static void
3361ctx_flexible_sched_in(struct perf_event_context *ctx,
3362                      struct perf_cpu_context *cpuctx)
3363{
3364        struct sched_in_data sid = {
3365                .ctx = ctx,
3366                .cpuctx = cpuctx,
3367                .can_add_hw = 1,
3368        };
3369
3370        visit_groups_merge(&ctx->flexible_groups,
3371                           smp_processor_id(),
3372                           flexible_sched_in, &sid);
3373}
3374
3375static void
3376ctx_sched_in(struct perf_event_context *ctx,
3377             struct perf_cpu_context *cpuctx,
3378             enum event_type_t event_type,
3379             struct task_struct *task)
3380{
3381        int is_active = ctx->is_active;
3382        u64 now;
3383
3384        lockdep_assert_held(&ctx->lock);
3385
3386        if (likely(!ctx->nr_events))
3387                return;
3388
3389        ctx->is_active |= (event_type | EVENT_TIME);
3390        if (ctx->task) {
3391                if (!is_active)
3392                        cpuctx->task_ctx = ctx;
3393                else
3394                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3395        }
3396
3397        is_active ^= ctx->is_active; /* changed bits */
3398
3399        if (is_active & EVENT_TIME) {
3400                /* start ctx time */
3401                now = perf_clock();
3402                ctx->timestamp = now;
3403                perf_cgroup_set_timestamp(task, ctx);
3404        }
3405
3406        /*
3407         * First go through the list and put on any pinned groups
3408         * in order to give them the best chance of going on.
3409         */
3410        if (is_active & EVENT_PINNED)
3411                ctx_pinned_sched_in(ctx, cpuctx);
3412
3413        /* Then walk through the lower prio flexible groups */
3414        if (is_active & EVENT_FLEXIBLE)
3415                ctx_flexible_sched_in(ctx, cpuctx);
3416}
3417
3418static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3419                             enum event_type_t event_type,
3420                             struct task_struct *task)
3421{
3422        struct perf_event_context *ctx = &cpuctx->ctx;
3423
3424        ctx_sched_in(ctx, cpuctx, event_type, task);
3425}
3426
3427static void perf_event_context_sched_in(struct perf_event_context *ctx,
3428                                        struct task_struct *task)
3429{
3430        struct perf_cpu_context *cpuctx;
3431
3432        cpuctx = __get_cpu_context(ctx);
3433        if (cpuctx->task_ctx == ctx)
3434                return;
3435
3436        perf_ctx_lock(cpuctx, ctx);
3437        /*
3438         * We must check ctx->nr_events while holding ctx->lock, such
3439         * that we serialize against perf_install_in_context().
3440         */
3441        if (!ctx->nr_events)
3442                goto unlock;
3443
3444        perf_pmu_disable(ctx->pmu);
3445        /*
3446         * We want to keep the following priority order:
3447         * cpu pinned (that don't need to move), task pinned,
3448         * cpu flexible, task flexible.
3449         *
3450         * However, if task's ctx is not carrying any pinned
3451         * events, no need to flip the cpuctx's events around.
3452         */
3453        if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3454                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3455        perf_event_sched_in(cpuctx, ctx, task);
3456        perf_pmu_enable(ctx->pmu);
3457
3458unlock:
3459        perf_ctx_unlock(cpuctx, ctx);
3460}
3461
3462/*
3463 * Called from scheduler to add the events of the current task
3464 * with interrupts disabled.
3465 *
3466 * We restore the event value and then enable it.
3467 *
3468 * This does not protect us against NMI, but enable()
3469 * sets the enabled bit in the control field of event _before_
3470 * accessing the event control register. If an NMI hits, then it will
3471 * keep the event running.
3472 */
3473void __perf_event_task_sched_in(struct task_struct *prev,
3474                                struct task_struct *task)
3475{
3476        struct perf_event_context *ctx;
3477        int ctxn;
3478
3479        /*
3480         * If cgroup events exist on this CPU, then we need to check if we have
3481         * to switch in PMU state; cgroup event are system-wide mode only.
3482         *
3483         * Since cgroup events are CPU events, we must schedule these in before
3484         * we schedule in the task events.
3485         */
3486        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3487                perf_cgroup_sched_in(prev, task);
3488
3489        for_each_task_context_nr(ctxn) {
3490                ctx = task->perf_event_ctxp[ctxn];
3491                if (likely(!ctx))
3492                        continue;
3493
3494                perf_event_context_sched_in(ctx, task);
3495        }
3496
3497        if (atomic_read(&nr_switch_events))
3498                perf_event_switch(task, prev, true);
3499
3500        if (__this_cpu_read(perf_sched_cb_usages))
3501                perf_pmu_sched_task(prev, task, true);
3502}
3503
3504static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3505{
3506        u64 frequency = event->attr.sample_freq;
3507        u64 sec = NSEC_PER_SEC;
3508        u64 divisor, dividend;
3509
3510        int count_fls, nsec_fls, frequency_fls, sec_fls;
3511
3512        count_fls = fls64(count);
3513        nsec_fls = fls64(nsec);
3514        frequency_fls = fls64(frequency);
3515        sec_fls = 30;
3516
3517        /*
3518         * We got @count in @nsec, with a target of sample_freq HZ
3519         * the target period becomes:
3520         *
3521         *             @count * 10^9
3522         * period = -------------------
3523         *          @nsec * sample_freq
3524         *
3525         */
3526
3527        /*
3528         * Reduce accuracy by one bit such that @a and @b converge
3529         * to a similar magnitude.
3530         */
3531#define REDUCE_FLS(a, b)                \
3532do {                                    \
3533        if (a##_fls > b##_fls) {        \
3534                a >>= 1;                \
3535                a##_fls--;              \
3536        } else {                        \
3537                b >>= 1;                \
3538                b##_fls--;              \
3539        }                               \
3540} while (0)
3541
3542        /*
3543         * Reduce accuracy until either term fits in a u64, then proceed with
3544         * the other, so that finally we can do a u64/u64 division.
3545         */
3546        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3547                REDUCE_FLS(nsec, frequency);
3548                REDUCE_FLS(sec, count);
3549        }
3550
3551        if (count_fls + sec_fls > 64) {
3552                divisor = nsec * frequency;
3553
3554                while (count_fls + sec_fls > 64) {
3555                        REDUCE_FLS(count, sec);
3556                        divisor >>= 1;
3557                }
3558
3559                dividend = count * sec;
3560        } else {
3561                dividend = count * sec;
3562
3563                while (nsec_fls + frequency_fls > 64) {
3564                        REDUCE_FLS(nsec, frequency);
3565                        dividend >>= 1;
3566                }
3567
3568                divisor = nsec * frequency;
3569        }
3570
3571        if (!divisor)
3572                return dividend;
3573
3574        return div64_u64(dividend, divisor);
3575}
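
/*
 * Worked example (illustrative numbers): with count = 1,000,000 events
 * observed over nsec = 10,000,000 ns (10 ms) and sample_freq = 1000 Hz:
 *
 *	period = (1,000,000 * 10^9) / (10,000,000 * 1000) = 100,000
 *
 * i.e. the event fires roughly 10^8 times per second, so sampling every
 * 100,000 events yields the requested ~1000 samples per second.
 */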
3576
3577static DEFINE_PER_CPU(int, perf_throttled_count);
3578static DEFINE_PER_CPU(u64, perf_throttled_seq);
3579
3580static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3581{
3582        struct hw_perf_event *hwc = &event->hw;
3583        s64 period, sample_period;
3584        s64 delta;
3585
3586        period = perf_calculate_period(event, nsec, count);
3587
3588        delta = (s64)(period - hwc->sample_period);
3589        delta = (delta + 7) / 8; /* low pass filter */
3590
3591        sample_period = hwc->sample_period + delta;
3592
3593        if (!sample_period)
3594                sample_period = 1;
3595
3596        hwc->sample_period = sample_period;
3597
3598        if (local64_read(&hwc->period_left) > 8*sample_period) {
3599                if (disable)
3600                        event->pmu->stop(event, PERF_EF_UPDATE);
3601
3602                local64_set(&hwc->period_left, 0);
3603
3604                if (disable)
3605                        event->pmu->start(event, PERF_EF_RELOAD);
3606        }
3607}
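
/*
 * Worked example (illustrative numbers): if the current sample_period is
 * 50,000 and perf_calculate_period() now asks for 100,000, then
 *
 *	delta = (100,000 - 50,000 + 7) / 8 = 6,250
 *
 * so sample_period only moves to 56,250 on this pass; repeated adjustments
 * converge on the target, keeping one noisy interval from swinging the
 * period wildly.
 */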
3608
3609/*
3610 * combine freq adjustment with unthrottling to avoid two passes over the
3611 * events. At the same time, make sure that having freq events does not change
3612 * the rate of unthrottling as that would introduce bias.
3613 */
3614static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3615                                           int needs_unthr)
3616{
3617        struct perf_event *event;
3618        struct hw_perf_event *hwc;
3619        u64 now, period = TICK_NSEC;
3620        s64 delta;
3621
3622        /*
3623         * We only need to iterate over all events if:
3624         * - the context has events in frequency mode (needs freq adjust), or
3625         * - there are events to unthrottle on this CPU
3626         */
3627        if (!(ctx->nr_freq || needs_unthr))
3628                return;
3629
3630        raw_spin_lock(&ctx->lock);
3631        perf_pmu_disable(ctx->pmu);
3632
3633        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3634                if (event->state != PERF_EVENT_STATE_ACTIVE)
3635                        continue;
3636
3637                if (!event_filter_match(event))
3638                        continue;
3639
3640                perf_pmu_disable(event->pmu);
3641
3642                hwc = &event->hw;
3643
3644                if (hwc->interrupts == MAX_INTERRUPTS) {
3645                        hwc->interrupts = 0;
3646                        perf_log_throttle(event, 1);
3647                        event->pmu->start(event, 0);
3648                }
3649
3650                if (!event->attr.freq || !event->attr.sample_freq)
3651                        goto next;
3652
3653                /*
3654                 * stop the event and update event->count
3655                 */
3656                event->pmu->stop(event, PERF_EF_UPDATE);
3657
3658                now = local64_read(&event->count);
3659                delta = now - hwc->freq_count_stamp;
3660                hwc->freq_count_stamp = now;
3661
3662                /*
3663                 * Restart the event and reload only if the value has
3664                 * changed.
3665                 * We have already stopped the event, so pass
3666                 * disable == false to perf_adjust_period() to avoid
3667                 * stopping it twice.
3668                 */
3669                if (delta > 0)
3670                        perf_adjust_period(event, period, delta, false);
3671
3672                event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3673        next:
3674                perf_pmu_enable(event->pmu);
3675        }
3676
3677        perf_pmu_enable(ctx->pmu);
3678        raw_spin_unlock(&ctx->lock);
3679}
3680
3681/*
3682 * Move @event to the tail of the @ctx's eligible events.
3683 */
3684static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
3685{
3686        /*
3687         * Rotate the first entry last of non-pinned groups. Rotation might be
3688         * disabled by the inheritance code.
3689         */
3690        if (ctx->rotate_disable)
3691                return;
3692
3693        perf_event_groups_delete(&ctx->flexible_groups, event);
3694        perf_event_groups_insert(&ctx->flexible_groups, event);
3695}
3696
3697static inline struct perf_event *
3698ctx_first_active(struct perf_event_context *ctx)
3699{
3700        return list_first_entry_or_null(&ctx->flexible_active,
3701                                        struct perf_event, active_list);
3702}
3703
3704static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3705{
3706        struct perf_event *cpu_event = NULL, *task_event = NULL;
3707        struct perf_event_context *task_ctx = NULL;
3708        int cpu_rotate, task_rotate;
3709
3710        /*
3711         * Since we run this from IRQ context, nobody can install new
3712         * events, thus the event count values are stable.
3713         */
3714
3715        cpu_rotate = cpuctx->ctx.rotate_necessary;
3716        task_ctx = cpuctx->task_ctx;
3717        task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
3718
3719        if (!(cpu_rotate || task_rotate))
3720                return false;
3721
3722        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3723        perf_pmu_disable(cpuctx->ctx.pmu);
3724
3725        if (task_rotate)
3726                task_event = ctx_first_active(task_ctx);
3727        if (cpu_rotate)
3728                cpu_event = ctx_first_active(&cpuctx->ctx);
3729
3730        /*
3731         * As per the order given at ctx_resched(), first 'pop' the task's
3732         * flexible events and then, if needed, the CPU's flexible events.
3733         */
3734        if (task_event || (task_ctx && cpu_event))
3735                ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
3736        if (cpu_event)
3737                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3738
3739        if (task_event)
3740                rotate_ctx(task_ctx, task_event);
3741        if (cpu_event)
3742                rotate_ctx(&cpuctx->ctx, cpu_event);
3743
3744        perf_event_sched_in(cpuctx, task_ctx, current);
3745
3746        perf_pmu_enable(cpuctx->ctx.pmu);
3747        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3748
3749        return true;
3750}
3751
3752void perf_event_task_tick(void)
3753{
3754        struct list_head *head = this_cpu_ptr(&active_ctx_list);
3755        struct perf_event_context *ctx, *tmp;
3756        int throttled;
3757
3758        lockdep_assert_irqs_disabled();
3759
3760        __this_cpu_inc(perf_throttled_seq);
3761        throttled = __this_cpu_xchg(perf_throttled_count, 0);
3762        tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3763
3764        list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3765                perf_adjust_freq_unthr_context(ctx, throttled);
3766}
3767
3768static int event_enable_on_exec(struct perf_event *event,
3769                                struct perf_event_context *ctx)
3770{
3771        if (!event->attr.enable_on_exec)
3772                return 0;
3773
3774        event->attr.enable_on_exec = 0;
3775        if (event->state >= PERF_EVENT_STATE_INACTIVE)
3776                return 0;
3777
3778        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
3779
3780        return 1;
3781}
3782
3783/*
3784 * Enable all of a task's events that have been marked enable-on-exec.
3785 * This expects task == current.
3786 */
3787static void perf_event_enable_on_exec(int ctxn)
3788{
3789        struct perf_event_context *ctx, *clone_ctx = NULL;
3790        enum event_type_t event_type = 0;
3791        struct perf_cpu_context *cpuctx;
3792        struct perf_event *event;
3793        unsigned long flags;
3794        int enabled = 0;
3795
3796        local_irq_save(flags);
3797        ctx = current->perf_event_ctxp[ctxn];
3798        if (!ctx || !ctx->nr_events)
3799                goto out;
3800
3801        cpuctx = __get_cpu_context(ctx);
3802        perf_ctx_lock(cpuctx, ctx);
3803        ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3804        list_for_each_entry(event, &ctx->event_list, event_entry) {
3805                enabled |= event_enable_on_exec(event, ctx);
3806                event_type |= get_event_type(event);
3807        }
3808
3809        /*
3810         * Unclone and reschedule this context if we enabled any event.
3811         */
3812        if (enabled) {
3813                clone_ctx = unclone_ctx(ctx);
3814                ctx_resched(cpuctx, ctx, event_type);
3815        } else {
3816                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3817        }
3818        perf_ctx_unlock(cpuctx, ctx);
3819
3820out:
3821        local_irq_restore(flags);
3822
3823        if (clone_ctx)
3824                put_ctx(clone_ctx);
3825}
3826
3827struct perf_read_data {
3828        struct perf_event *event;
3829        bool group;
3830        int ret;
3831};
3832
3833static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3834{
3835        u16 local_pkg, event_pkg;
3836
3837        if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3838                int local_cpu = smp_processor_id();
3839
3840                event_pkg = topology_physical_package_id(event_cpu);
3841                local_pkg = topology_physical_package_id(local_cpu);
3842
3843                if (event_pkg == local_pkg)
3844                        return local_cpu;
3845        }
3846
3847        return event_cpu;
3848}
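
/*
 * For example, PMUs whose counters are per package rather than per CPU
 * (x86 uncore is one such case) can set PERF_EV_CAP_READ_ACTIVE_PKG so that
 * a read issued from any CPU in the same package is satisfied locally,
 * sparing a cross-package IPI in perf_event_read() below.
 */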
3849
3850/*
3851 * Cross CPU call to read the hardware event
3852 */
3853static void __perf_event_read(void *info)
3854{
3855        struct perf_read_data *data = info;
3856        struct perf_event *sub, *event = data->event;
3857        struct perf_event_context *ctx = event->ctx;
3858        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3859        struct pmu *pmu = event->pmu;
3860
3861        /*
3862         * If this is a task context, we need to check whether it is
3863         * the current task context of this CPU.  If not, it has been
3864         * scheduled out before the smp call arrived.  In that case
3865         * event->count would have been updated to a recent sample
3866         * when the event was scheduled out.
3867         */
3868        if (ctx->task && cpuctx->task_ctx != ctx)
3869                return;
3870
3871        raw_spin_lock(&ctx->lock);
3872        if (ctx->is_active & EVENT_TIME) {
3873                update_context_time(ctx);
3874                update_cgrp_time_from_event(event);
3875        }
3876
3877        perf_event_update_time(event);
3878        if (data->group)
3879                perf_event_update_sibling_time(event);
3880
3881        if (event->state != PERF_EVENT_STATE_ACTIVE)
3882                goto unlock;
3883
3884        if (!data->group) {
3885                pmu->read(event);
3886                data->ret = 0;
3887                goto unlock;
3888        }
3889
3890        pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3891
3892        pmu->read(event);
3893
3894        for_each_sibling_event(sub, event) {
3895                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3896                        /*
3897                         * Use the sibling's PMU rather than @event's, since the
3898                         * sibling could be on a different (e.g. software) PMU.
3899                         */
3900                        sub->pmu->read(sub);
3901                }
3902        }
3903
3904        data->ret = pmu->commit_txn(pmu);
3905
3906unlock:
3907        raw_spin_unlock(&ctx->lock);
3908}
3909
3910static inline u64 perf_event_count(struct perf_event *event)
3911{
3912        return local64_read(&event->count) + atomic64_read(&event->child_count);
3913}
3914
3915/*
3916 * NMI-safe method to read a local event, that is an event
3917 * that:
3918 *   - is either for the current task, or for this CPU
3919 *   - does not have inherit set, because inherited task events
3920 *     will not be local and we cannot read them atomically
3921 *   - must not have a pmu::count method
3922 */
3923int perf_event_read_local(struct perf_event *event, u64 *value,
3924                          u64 *enabled, u64 *running)
3925{
3926        unsigned long flags;
3927        int ret = 0;
3928
3929        /*
3930         * Disabling interrupts avoids all counter scheduling (context
3931         * switches, timer based rotation and IPIs).
3932         */
3933        local_irq_save(flags);
3934
3935        /*
3936         * It must not be an event with inherit set, we cannot read
3937         * all child counters from atomic context.
3938         */
3939        if (event->attr.inherit) {
3940                ret = -EOPNOTSUPP;
3941                goto out;
3942        }
3943
3944        /* If this is a per-task event, it must be for current */
3945        if ((event->attach_state & PERF_ATTACH_TASK) &&
3946            event->hw.target != current) {
3947                ret = -EINVAL;
3948                goto out;
3949        }
3950
3951        /* If this is a per-CPU event, it must be for this CPU */
3952        if (!(event->attach_state & PERF_ATTACH_TASK) &&
3953            event->cpu != smp_processor_id()) {
3954                ret = -EINVAL;
3955                goto out;
3956        }
3957
3958        /* If this is a pinned event it must be running on this CPU */
3959        if (event->attr.pinned && event->oncpu != smp_processor_id()) {
3960                ret = -EBUSY;
3961                goto out;
3962        }
3963
3964        /*
3965         * If the event is currently on this CPU, it's either a per-task event,
3966         * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
3967         * oncpu == -1).
3968         */
3969        if (event->oncpu == smp_processor_id())
3970                event->pmu->read(event);
3971
3972        *value = local64_read(&event->count);
3973        if (enabled || running) {
3974                u64 now = event->shadow_ctx_time + perf_clock();
3975                u64 __enabled, __running;
3976
3977                __perf_update_times(event, now, &__enabled, &__running);
3978                if (enabled)
3979                        *enabled = __enabled;
3980                if (running)
3981                        *running = __running;
3982        }
3983out:
3984        local_irq_restore(flags);
3985
3986        return ret;
3987}
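
/*
 * A minimal usage sketch (the function below is hypothetical): a caller in
 * atomic or NMI context, such as a BPF helper, can read the current value
 * of a local event without taking any locks. Only @value is mandatory; the
 * enabled/running pointers may be NULL if the times are not needed.
 */
static int __maybe_unused example_read_local(struct perf_event *event, u64 *value)
{
	u64 enabled, running;

	/* 0 on success, or -EINVAL/-EOPNOTSUPP/-EBUSY as checked above */
	return perf_event_read_local(event, value, &enabled, &running);
}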
3988
3989static int perf_event_read(struct perf_event *event, bool group)
3990{
3991        enum perf_event_state state = READ_ONCE(event->state);
3992        int event_cpu, ret = 0;
3993
3994        /*
3995         * If event is enabled and currently active on a CPU, update the
3996         * value in the event structure:
3997         */
3998again:
3999        if (state == PERF_EVENT_STATE_ACTIVE) {
4000                struct perf_read_data data;
4001
4002                /*
4003                 * Orders the ->state and ->oncpu loads such that if we see
4004                 * ACTIVE we must also see the right ->oncpu.
4005                 *
4006                 * Matches the smp_wmb() from event_sched_in().
4007                 */
4008                smp_rmb();
4009
4010                event_cpu = READ_ONCE(event->oncpu);
4011                if ((unsigned)event_cpu >= nr_cpu_ids)
4012                        return 0;
4013
4014                data = (struct perf_read_data){
4015                        .event = event,
4016                        .group = group,
4017                        .ret = 0,
4018                };
4019
4020                preempt_disable();
4021                event_cpu = __perf_event_read_cpu(event, event_cpu);
4022
4023                /*
4024                 * Purposely ignore the smp_call_function_single() return
4025                 * value.
4026                 *
4027                 * If event_cpu isn't a valid CPU it means the event got
4028                 * scheduled out and that will have updated the event count.
4029                 *
4030                 * Therefore, either way, we'll have an up-to-date event count
4031                 * after this.
4032                 */
4033                (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4034                preempt_enable();
4035                ret = data.ret;
4036
4037        } else if (state == PERF_EVENT_STATE_INACTIVE) {
4038                struct perf_event_context *ctx = event->ctx;
4039                unsigned long flags;
4040
4041                raw_spin_lock_irqsave(&ctx->lock, flags);
4042                state = event->state;
4043                if (state != PERF_EVENT_STATE_INACTIVE) {
4044                        raw_spin_unlock_irqrestore(&ctx->lock, flags);
4045                        goto again;
4046                }
4047
4048                /*
4049                 * We may read while the context is not active (e.g., the
4050                 * thread is blocked); in that case we cannot update context time.
4051                 */
4052                if (ctx->is_active & EVENT_TIME) {
4053                        update_context_time(ctx);
4054                        update_cgrp_time_from_event(event);
4055                }
4056
4057                perf_event_update_time(event);
4058                if (group)
4059                        perf_event_update_sibling_time(event);
4060                raw_spin_unlock_irqrestore(&ctx->lock, flags);
4061        }
4062
4063        return ret;
4064}
4065
4066/*
4067 * Initialize the perf_event context in a task_struct:
4068 */
4069static void __perf_event_init_context(struct perf_event_context *ctx)
4070{
4071        raw_spin_lock_init(&ctx->lock);
4072        mutex_init(&ctx->mutex);
4073        INIT_LIST_HEAD(&ctx->active_ctx_list);
4074        perf_event_groups_init(&ctx->pinned_groups);
4075        perf_event_groups_init(&ctx->flexible_groups);
4076        INIT_LIST_HEAD(&ctx->event_list);
4077        INIT_LIST_HEAD(&ctx->pinned_active);
4078        INIT_LIST_HEAD(&ctx->flexible_active);
4079        refcount_set(&ctx->refcount, 1);
4080}
4081
4082static struct perf_event_context *
4083alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4084{
4085        struct perf_event_context *ctx;
4086
4087        ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4088        if (!ctx)
4089                return NULL;
4090
4091        __perf_event_init_context(ctx);
4092        if (task) {
4093                ctx->task = task;
4094                get_task_struct(task);
4095        }
4096        ctx->pmu = pmu;
4097
4098        return ctx;
4099}
4100
4101static struct task_struct *
4102find_lively_task_by_vpid(pid_t vpid)
4103{
4104        struct task_struct *task;
4105
4106        rcu_read_lock();
4107        if (!vpid)
4108                task = current;
4109        else
4110                task = find_task_by_vpid(vpid);
4111        if (task)
4112                get_task_struct(task);
4113        rcu_read_unlock();
4114
4115        if (!task)
4116                return ERR_PTR(-ESRCH);
4117
4118        return task;
4119}
4120
4121/*
4122 * Returns a matching context with refcount and pincount.
4123 */
4124static struct perf_event_context *
4125find_get_context(struct pmu *pmu, struct task_struct *task,
4126                struct perf_event *event)
4127{
4128        struct perf_event_context *ctx, *clone_ctx = NULL;
4129        struct perf_cpu_context *cpuctx;
4130        void *task_ctx_data = NULL;
4131        unsigned long flags;
4132        int ctxn, err;
4133        int cpu = event->cpu;
4134
4135        if (!task) {
4136                /* Must be root to operate on a CPU event: */
4137                if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
4138                        return ERR_PTR(-EACCES);
4139
4140                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4141                ctx = &cpuctx->ctx;
4142                get_ctx(ctx);
4143                ++ctx->pin_count;
4144
4145                return ctx;
4146        }
4147
4148        err = -EINVAL;
4149        ctxn = pmu->task_ctx_nr;
4150        if (ctxn < 0)
4151                goto errout;
4152
4153        if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4154                task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4155                if (!task_ctx_data) {
4156                        err = -ENOMEM;
4157                        goto errout;
4158                }
4159        }
4160
4161retry:
4162        ctx = perf_lock_task_context(task, ctxn, &flags);
4163        if (ctx) {
4164                clone_ctx = unclone_ctx(ctx);
4165                ++ctx->pin_count;
4166
4167                if (task_ctx_data && !ctx->task_ctx_data) {
4168                        ctx->task_ctx_data = task_ctx_data;
4169                        task_ctx_data = NULL;
4170                }
4171                raw_spin_unlock_irqrestore(&ctx->lock, flags);
4172
4173                if (clone_ctx)
4174                        put_ctx(clone_ctx);
4175        } else {
4176                ctx = alloc_perf_context(pmu, task);
4177                err = -ENOMEM;
4178                if (!ctx)
4179                        goto errout;
4180
4181                if (task_ctx_data) {
4182                        ctx->task_ctx_data = task_ctx_data;
4183                        task_ctx_data = NULL;
4184                }
4185
4186                err = 0;
4187                mutex_lock(&task->perf_event_mutex);
4188                /*
4189                 * If it has already passed perf_event_exit_task(),
4190                 * we must see PF_EXITING; it takes this mutex too.
4191                 */
4192                if (task->flags & PF_EXITING)
4193                        err = -ESRCH;
4194                else if (task->perf_event_ctxp[ctxn])
4195                        err = -EAGAIN;
4196                else {
4197                        get_ctx(ctx);
4198                        ++ctx->pin_count;
4199                        rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4200                }
4201                mutex_unlock(&task->perf_event_mutex);
4202
4203                if (unlikely(err)) {
4204                        put_ctx(ctx);
4205
4206                        if (err == -EAGAIN)
4207                                goto retry;
4208                        goto errout;
4209                }
4210        }
4211
4212        kfree(task_ctx_data);
4213        return ctx;
4214
4215errout:
4216        kfree(task_ctx_data);
4217        return ERR_PTR(err);
4218}
4219
4220static void perf_event_free_filter(struct perf_event *event);
4221static void perf_event_free_bpf_prog(struct perf_event *event);
4222
4223static void free_event_rcu(struct rcu_head *head)
4224{
4225        struct perf_event *event;
4226
4227        event = container_of(head, struct perf_event, rcu_head);
4228        if (event->ns)
4229                put_pid_ns(event->ns);
4230        perf_event_free_filter(event);
4231        kfree(event);
4232}
4233
4234static void ring_buffer_attach(struct perf_event *event,
4235                               struct ring_buffer *rb);
4236
4237static void detach_sb_event(struct perf_event *event)
4238{
4239        struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4240
4241        raw_spin_lock(&pel->lock);
4242        list_del_rcu(&event->sb_list);
4243        raw_spin_unlock(&pel->lock);
4244}
4245
4246static bool is_sb_event(struct perf_event *event)
4247{
4248        struct perf_event_attr *attr = &event->attr;
4249
4250        if (event->parent)
4251                return false;
4252
4253        if (event->attach_state & PERF_ATTACH_TASK)
4254                return false;
4255
4256        if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4257            attr->comm || attr->comm_exec ||
4258            attr->task || attr->ksymbol ||
4259            attr->context_switch ||
4260            attr->bpf_event)
4261                return true;
4262        return false;
4263}
4264
4265static void unaccount_pmu_sb_event(struct perf_event *event)
4266{
4267        if (is_sb_event(event))
4268                detach_sb_event(event);
4269}
4270
4271static void unaccount_event_cpu(struct perf_event *event, int cpu)
4272{
4273        if (event->parent)
4274                return;
4275
4276        if (is_cgroup_event(event))
4277                atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4278}
4279
4280#ifdef CONFIG_NO_HZ_FULL
4281static DEFINE_SPINLOCK(nr_freq_lock);
4282#endif
4283
4284static void unaccount_freq_event_nohz(void)
4285{
4286#ifdef CONFIG_NO_HZ_FULL
4287        spin_lock(&nr_freq_lock);
4288        if (atomic_dec_and_test(&nr_freq_events))
4289                tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4290        spin_unlock(&nr_freq_lock);
4291#endif
4292}
4293
4294static void unaccount_freq_event(void)
4295{
4296        if (tick_nohz_full_enabled())
4297                unaccount_freq_event_nohz();
4298        else
4299                atomic_dec(&nr_freq_events);
4300}
4301
4302static void unaccount_event(struct perf_event *event)
4303{
4304        bool dec = false;
4305
4306        if (event->parent)
4307                return;
4308
4309        if (event->attach_state & PERF_ATTACH_TASK)
4310                dec = true;
4311        if (event->attr.mmap || event->attr.mmap_data)
4312                atomic_dec(&nr_mmap_events);
4313        if (event->attr.comm)
4314                atomic_dec(&nr_comm_events);
4315        if (event->attr.namespaces)
4316                atomic_dec(&nr_namespaces_events);
4317        if (event->attr.task)
4318                atomic_dec(&nr_task_events);
4319        if (event->attr.freq)
4320                unaccount_freq_event();
4321        if (event->attr.context_switch) {
4322                dec = true;
4323                atomic_dec(&nr_switch_events);
4324        }
4325        if (is_cgroup_event(event))
4326                dec = true;
4327        if (has_branch_stack(event))
4328                dec = true;
4329        if (event->attr.ksymbol)
4330                atomic_dec(&nr_ksymbol_events);
4331        if (event->attr.bpf_event)
4332                atomic_dec(&nr_bpf_events);
4333
4334        if (dec) {
4335                if (!atomic_add_unless(&perf_sched_count, -1, 1))
4336                        schedule_delayed_work(&perf_sched_work, HZ);
4337        }
4338
4339        unaccount_event_cpu(event, event->cpu);
4340
4341        unaccount_pmu_sb_event(event);
4342}
4343
4344static void perf_sched_delayed(struct work_struct *work)
4345{
4346        mutex_lock(&perf_sched_mutex);
4347        if (atomic_dec_and_test(&perf_sched_count))
4348                static_branch_disable(&perf_sched_events);
4349        mutex_unlock(&perf_sched_mutex);
4350}
4351
4352/*
4353 * The following implement mutual exclusion of events on "exclusive" pmus
4354 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4355 * at a time, so we disallow creating events that might conflict, namely:
4356 *
4357 *  1) cpu-wide events in the presence of per-task events,
4358 *  2) per-task events in the presence of cpu-wide events,
4359 *  3) two matching events on the same context.
4360 *
4361 * The former two cases are handled in the allocation path (perf_event_alloc(),
4362 * _free_event()), the latter -- before the first perf_install_in_context().
4363 */
4364static int exclusive_event_init(struct perf_event *event)
4365{
4366        struct pmu *pmu = event->pmu;
4367
4368        if (!is_exclusive_pmu(pmu))
4369                return 0;
4370
4371        /*
4372         * Prevent co-existence of per-task and cpu-wide events on the
4373         * same exclusive pmu.
4374         *
4375         * Negative pmu::exclusive_cnt means there are cpu-wide
4376         * events on this "exclusive" pmu, positive means there are
4377         * per-task events.
4378         *
4379         * Since this is called in perf_event_alloc() path, event::ctx
4380         * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4381         * to mean "per-task event", because unlike other attach states it
4382         * never gets cleared.
4383         */
4384        if (event->attach_state & PERF_ATTACH_TASK) {
4385                if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4386                        return -EBUSY;
4387        } else {
4388                if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4389                        return -EBUSY;
4390        }
4391
4392        return 0;
4393}
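
/*
 * For example: the first cpu-wide event on an exclusive pmu drives
 * pmu->exclusive_cnt from 0 to -1 via atomic_dec_unless_positive(); a
 * per-task event created afterwards then fails atomic_inc_unless_negative()
 * and its creation is rejected with -EBUSY (and vice versa). Only one
 * flavour of event can exist on such a pmu at any given time.
 */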
4394
4395static void exclusive_event_destroy(struct perf_event *event)
4396{
4397        struct pmu *pmu = event->pmu;
4398
4399        if (!is_exclusive_pmu(pmu))
4400                return;
4401
4402        /* see comment in exclusive_event_init() */
4403        if (event->attach_state & PERF_ATTACH_TASK)
4404                atomic_dec(&pmu->exclusive_cnt);
4405        else
4406                atomic_inc(&pmu->exclusive_cnt);
4407}
4408
4409static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4410{
4411        if ((e1->pmu == e2->pmu) &&
4412            (e1->cpu == e2->cpu ||
4413             e1->cpu == -1 ||
4414             e2->cpu == -1))
4415                return true;
4416        return false;
4417}
4418
4419static bool exclusive_event_installable(struct perf_event *event,
4420                                        struct perf_event_context *ctx)
4421{
4422        struct perf_event *iter_event;
4423        struct pmu *pmu = event->pmu;
4424
4425        lockdep_assert_held(&ctx->mutex);
4426
4427        if (!is_exclusive_pmu(pmu))
4428                return true;
4429
4430        list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4431                if (exclusive_event_match(iter_event, event))
4432                        return false;
4433        }
4434
4435        return true;
4436}
4437
4438static void perf_addr_filters_splice(struct perf_event *event,
4439                                       struct list_head *head);
4440
4441static void _free_event(struct perf_event *event)
4442{
4443        irq_work_sync(&event->pending);
4444
4445        unaccount_event(event);
4446
4447        if (event->rb) {
4448                /*
4449                 * Can happen when we close an event with re-directed output.
4450                 *
4451                 * Since we have a 0 refcount, perf_mmap_close() will skip
4452                 * over us; possibly making our ring_buffer_put() the last.
4453                 */
4454                mutex_lock(&event->mmap_mutex);
4455                ring_buffer_attach(event, NULL);
4456                mutex_unlock(&event->mmap_mutex);
4457        }
4458
4459        if (is_cgroup_event(event))
4460                perf_detach_cgroup(event);
4461
4462        if (!event->parent) {
4463                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4464                        put_callchain_buffers();
4465        }
4466
4467        perf_event_free_bpf_prog(event);
4468        perf_addr_filters_splice(event, NULL);
4469        kfree(event->addr_filter_ranges);
4470
4471        if (event->destroy)
4472                event->destroy(event);
4473
4474        /*
4475         * Must be after ->destroy(), due to uprobe_perf_close() using
4476         * hw.target.
4477         */
4478        if (event->hw.target)
4479                put_task_struct(event->hw.target);
4480
4481        /*
4482         * perf_event_free_task() relies on put_ctx() being 'last', in particular
4483         * all task references must be cleaned up.
4484         */
4485        if (event->ctx)
4486                put_ctx(event->ctx);
4487
4488        exclusive_event_destroy(event);
4489        module_put(event->pmu->module);
4490
4491        call_rcu(&event->rcu_head, free_event_rcu);
4492}
4493
4494/*
4495 * Used to free events which have a known refcount of 1, such as in error
4496 * paths where the event isn't exposed yet, and for inherited events.
4497 */
4498static void free_event(struct perf_event *event)
4499{
4500        if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4501                                "unexpected event refcount: %ld; ptr=%p\n",
4502                                atomic_long_read(&event->refcount), event)) {
4503                /* leak to avoid use-after-free */
4504                return;
4505        }
4506
4507        _free_event(event);
4508}
4509
4510/*
4511 * Remove user event from the owner task.
4512 */
4513static void perf_remove_from_owner(struct perf_event *event)
4514{
4515        struct task_struct *owner;
4516
4517        rcu_read_lock();
4518        /*
4519         * Matches the smp_store_release() in perf_event_exit_task(). If we
4520         * observe !owner, it means the list deletion is complete and we can
4521         * indeed free this event, otherwise we need to serialize on
4522         * owner->perf_event_mutex.
4523         */
4524        owner = READ_ONCE(event->owner);
4525        if (owner) {
4526                /*
4527                 * Since delayed_put_task_struct() also drops the last
4528                 * task reference we can safely take a new reference
4529                 * while holding the rcu_read_lock().
4530                 */
4531                get_task_struct(owner);
4532        }
4533        rcu_read_unlock();
4534
4535        if (owner) {
4536                /*
4537                 * If we're here through perf_event_exit_task() we're already
4538                 * holding ctx->mutex which would be an inversion wrt. the
4539                 * normal lock order.
4540                 *
4541                 * However, we can safely take this lock because it's the child
4542                 * ctx->mutex.
4543                 */
4544                mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4545
4546                /*
4547                 * We have to re-check the event->owner field, if it is cleared
4548                 * we raced with perf_event_exit_task(), acquiring the mutex
4549                 * ensured they're done, and we can proceed with freeing the
4550                 * event.
4551                 */
4552                if (event->owner) {
4553                        list_del_init(&event->owner_entry);
4554                        smp_store_release(&event->owner, NULL);
4555                }
4556                mutex_unlock(&owner->perf_event_mutex);
4557                put_task_struct(owner);
4558        }
4559}
4560
4561static void put_event(struct perf_event *event)
4562{
4563        if (!atomic_long_dec_and_test(&event->refcount))
4564                return;
4565
4566        _free_event(event);
4567}
4568
4569/*
4570 * Kill an event dead; while event:refcount will preserve the event
4571 * object, it will not preserve its functionality. Once the last 'user'
4572 * gives up the object, we'll destroy the thing.
4573 */
4574int perf_event_release_kernel(struct perf_event *event)
4575{
4576        struct perf_event_context *ctx = event->ctx;
4577        struct perf_event *child, *tmp;
4578        LIST_HEAD(free_list);
4579
4580        /*
4581         * If we got here through err_file: fput(event_file); we will not have
4582         * attached to a context yet.
4583         */
4584        if (!ctx) {
4585                WARN_ON_ONCE(event->attach_state &
4586                                (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4587                goto no_ctx;
4588        }
4589
4590        if (!is_kernel_event(event))
4591                perf_remove_from_owner(event);
4592
4593        ctx = perf_event_ctx_lock(event);
4594        WARN_ON_ONCE(ctx->parent_ctx);
4595        perf_remove_from_context(event, DETACH_GROUP);
4596
4597        raw_spin_lock_irq(&ctx->lock);
4598        /*
4599         * Mark this event as STATE_DEAD; there is no external reference to it
4600         * anymore.
4601         *
4602         * Anybody acquiring event->child_mutex after the below loop _must_
4603         * also see this, most importantly inherit_event() which will avoid
4604         * placing more children on the list.
4605         *
4606         * Thus this guarantees that we will in fact observe and kill _ALL_
4607         * child events.
4608         */
4609        event->state = PERF_EVENT_STATE_DEAD;
4610        raw_spin_unlock_irq(&ctx->lock);
4611
4612        perf_event_ctx_unlock(event, ctx);
4613
4614again:
4615        mutex_lock(&event->child_mutex);
4616        list_for_each_entry(child, &event->child_list, child_list) {
4617
4618                /*
4619                 * Cannot change, child events are not migrated, see the
4620                 * comment with perf_event_ctx_lock_nested().
4621                 */
4622                ctx = READ_ONCE(child->ctx);
4623                /*
4624                 * Since child_mutex nests inside ctx::mutex, we must jump
4625                 * through hoops. We start by grabbing a reference on the ctx.
4626                 *
4627                 * Since the event cannot get freed while we hold the
4628                 * child_mutex, the context must also exist and have a !0
4629                 * reference count.
4630                 */
4631                get_ctx(ctx);
4632
4633                /*
4634                 * Now that we have a ctx ref, we can drop child_mutex, and
4635                 * acquire ctx::mutex without fear of it going away. Then we
4636                 * can re-acquire child_mutex.
4637                 */
4638                mutex_unlock(&event->child_mutex);
4639                mutex_lock(&ctx->mutex);
4640                mutex_lock(&event->child_mutex);
4641
4642                /*
4643                 * Now that we hold ctx::mutex and child_mutex, revalidate our
4644                 * state, if child is still the first entry, it didn't get freed
4645                 * and we can continue doing so.
4646                 */
4647                tmp = list_first_entry_or_null(&event->child_list,
4648                                               struct perf_event, child_list);
4649                if (tmp == child) {
4650                        perf_remove_from_context(child, DETACH_GROUP);
4651                        list_move(&child->child_list, &free_list);
4652                        /*
4653                         * This matches the refcount bump in inherit_event();
4654                         * this can't be the last reference.
4655                         */
4656                        put_event(event);
4657                }
4658
4659                mutex_unlock(&event->child_mutex);
4660                mutex_unlock(&ctx->mutex);
4661                put_ctx(ctx);
4662                goto again;
4663        }
4664        mutex_unlock(&event->child_mutex);
4665
4666        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
4667                void *var = &child->ctx->refcount;
4668
4669                list_del(&child->child_list);
4670                free_event(child);
4671
4672                /*
4673                 * Wake any perf_event_free_task() waiting for this event to be
4674                 * freed.
4675                 */
4676                smp_mb(); /* pairs with wait_var_event() */
4677                wake_up_var(var);
4678        }
4679
4680no_ctx:
4681        put_event(event); /* Must be the 'last' reference */
4682        return 0;
4683}
4684EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4685
4686/*
4687 * Called when the last reference to the file is gone.
4688 */
4689static int perf_release(struct inode *inode, struct file *file)
4690{
4691        perf_event_release_kernel(file->private_data);
4692        return 0;
4693}
4694
4695static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4696{
4697        struct perf_event *child;
4698        u64 total = 0;
4699
4700        *enabled = 0;
4701        *running = 0;
4702
4703        mutex_lock(&event->child_mutex);
4704
4705        (void)perf_event_read(event, false);
4706        total += perf_event_count(event);
4707
4708        *enabled += event->total_time_enabled +
4709                        atomic64_read(&event->child_total_time_enabled);
4710        *running += event->total_time_running +
4711                        atomic64_read(&event->child_total_time_running);
4712
4713        list_for_each_entry(child, &event->child_list, child_list) {
4714                (void)perf_event_read(child, false);
4715                total += perf_event_count(child);
4716                *enabled += child->total_time_enabled;
4717                *running += child->total_time_running;
4718        }
4719        mutex_unlock(&event->child_mutex);
4720
4721        return total;
4722}
4723
4724u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4725{
4726        struct perf_event_context *ctx;
4727        u64 count;
4728
4729        ctx = perf_event_ctx_lock(event);
4730        count = __perf_event_read_value(event, enabled, running);
4731        perf_event_ctx_unlock(event, ctx);
4732
4733        return count;
4734}
4735EXPORT_SYMBOL_GPL(perf_event_read_value);
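
/*
 * A minimal in-kernel usage sketch (the function below is hypothetical):
 * for a counter set up elsewhere, e.g. with
 * perf_event_create_kernel_counter(), read the accumulated value together
 * with the enabled/running times and scale for multiplexing (overflow of
 * the intermediate product is ignored here for simplicity).
 */
static u64 __maybe_unused example_read_scaled(struct perf_event *event)
{
	u64 enabled, running, count;

	count = perf_event_read_value(event, &enabled, &running);
	if (running && running < enabled)
		count = div64_u64(count * enabled, running);

	return count;
}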
4736
4737static int __perf_read_group_add(struct perf_event *leader,
4738                                        u64 read_format, u64 *values)
4739{
4740        struct perf_event_context *ctx = leader->ctx;
4741        struct perf_event *sub;
4742        unsigned long flags;
4743        int n = 1; /* skip @nr */
4744        int ret;
4745
4746        ret = perf_event_read(leader, true);
4747        if (ret)
4748                return ret;
4749
4750        raw_spin_lock_irqsave(&ctx->lock, flags);
4751
4752        /*
4753         * Since we co-schedule groups, {enabled,running} times of siblings
4754         * will be identical to those of the leader, so we only publish one
4755         * set.
4756         */
4757        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4758                values[n++] += leader->total_time_enabled +
4759                        atomic64_read(&leader->child_total_time_enabled);
4760        }
4761
4762        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4763                values[n++] += leader->total_time_running +
4764                        atomic64_read(&leader->child_total_time_running);
4765        }
4766
4767        /*
4768         * Write {count,id} tuples for every sibling.
4769         */
4770        values[n++] += perf_event_count(leader);
4771        if (read_format & PERF_FORMAT_ID)
4772                values[n++] = primary_event_id(leader);
4773
4774        for_each_sibling_event(sub, leader) {
4775                values[n++] += perf_event_count(sub);
4776                if (read_format & PERF_FORMAT_ID)
4777                        values[n++] = primary_event_id(sub);
4778        }
4779
4780        raw_spin_unlock_irqrestore(&ctx->lock, flags);
4781        return 0;
4782}
4783
4784static int perf_read_group(struct perf_event *event,
4785                                   u64 read_format, char __user *buf)
4786{
4787        struct perf_event *leader = event->group_leader, *child;
4788        struct perf_event_context *ctx = leader->ctx;
4789        int ret;
4790        u64 *values;
4791
4792        lockdep_assert_held(&ctx->mutex);
4793
4794        values = kzalloc(event->read_size, GFP_KERNEL);
4795        if (!values)
4796                return -ENOMEM;
4797
4798        values[0] = 1 + leader->nr_siblings;
4799
4800        /*
4801         * By locking the child_mutex of the leader we effectively
4802         * lock the child list of all siblings. XXX: explain how.
4803         */
4804        mutex_lock(&leader->child_mutex);
4805
4806        ret = __perf_read_group_add(leader, read_format, values);
4807        if (ret)
4808                goto unlock;
4809
4810        list_for_each_entry(child, &leader->child_list, child_list) {
4811                ret = __perf_read_group_add(child, read_format, values);
4812                if (ret)
4813                        goto unlock;
4814        }
4815
4816        mutex_unlock(&leader->child_mutex);
4817
4818        ret = event->read_size;
4819        if (copy_to_user(buf, values, event->read_size))
4820                ret = -EFAULT;
4821        goto out;
4822
4823unlock:
4824        mutex_unlock(&leader->child_mutex);
4825out:
4826        kfree(values);
4827        return ret;
4828}
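
/*
 * Example layout (illustrative): for a leader with one sibling and
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID, the buffer copied to
 * userspace above is laid out as:
 *
 *	{ nr = 2, time_enabled, time_running,
 *	  leader_count, leader_id, sibling_count, sibling_id }
 */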
4829
4830static int perf_read_one(struct perf_event *event,
4831                                 u64 read_format, char __user *buf)
4832{
4833        u64 enabled, running;
4834        u64 values[4];
4835        int n = 0;
4836
4837        values[n++] = __perf_event_read_value(event, &enabled, &running);
4838        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4839                values[n++] = enabled;
4840        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4841                values[n++] = running;
4842        if (read_format & PERF_FORMAT_ID)
4843                values[n++] = primary_event_id(event);
4844
4845        if (copy_to_user(buf, values, n * sizeof(u64)))
4846                return -EFAULT;
4847
4848        return n * sizeof(u64);
4849}
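
/*
 * Illustrative sketch (not kernel code): the u64 layout that perf_read_one()
 * and perf_read_group() copy to user space, as seen by a read() on the event
 * fd. Field presence depends on attr.read_format; see the struct read_format
 * description in include/uapi/linux/perf_event.h.
 *
 *	!PERF_FORMAT_GROUP:
 *		u64 value;
 *		{ u64 time_enabled; }	&& PERF_FORMAT_TOTAL_TIME_ENABLED
 *		{ u64 time_running; }	&& PERF_FORMAT_TOTAL_TIME_RUNNING
 *		{ u64 id; }		&& PERF_FORMAT_ID
 *
 *	PERF_FORMAT_GROUP:
 *		u64 nr;
 *		{ u64 time_enabled; }	&& PERF_FORMAT_TOTAL_TIME_ENABLED
 *		{ u64 time_running; }	&& PERF_FORMAT_TOTAL_TIME_RUNNING
 *		{ u64 value;
 *		  { u64 id; }		&& PERF_FORMAT_ID
 *		} cntr[nr];		-- leader first, then each sibling
 */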
4850
4851static bool is_event_hup(struct perf_event *event)
4852{
4853        bool no_children;
4854
4855        if (event->state > PERF_EVENT_STATE_EXIT)
4856                return false;
4857
4858        mutex_lock(&event->child_mutex);
4859        no_children = list_empty(&event->child_list);
4860        mutex_unlock(&event->child_mutex);
4861        return no_children;
4862}
4863
4864/*
4865 * Read the performance event - simple non-blocking version for now
4866 */
4867static ssize_t
4868__perf_read(struct perf_event *event, char __user *buf, size_t count)
4869{
4870        u64 read_format = event->attr.read_format;
4871        int ret;
4872
4873        /*
4874         * Return end-of-file for a read on an event that is in
4875         * error state (i.e. because it was pinned but it couldn't be
4876         * scheduled on to the CPU at some point).
4877         */
4878        if (event->state == PERF_EVENT_STATE_ERROR)
4879                return 0;
4880
4881        if (count < event->read_size)
4882                return -ENOSPC;
4883
4884        WARN_ON_ONCE(event->ctx->parent_ctx);
4885        if (read_format & PERF_FORMAT_GROUP)
4886                ret = perf_read_group(event, read_format, buf);
4887        else
4888                ret = perf_read_one(event, read_format, buf);
4889
4890        return ret;
4891}
4892
4893static ssize_t
4894perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4895{
4896        struct perf_event *event = file->private_data;
4897        struct perf_event_context *ctx;
4898        int ret;
4899
4900        ctx = perf_event_ctx_lock(event);
4901        ret = __perf_read(event, buf, count);
4902        perf_event_ctx_unlock(event, ctx);
4903
4904        return ret;
4905}
4906
4907static __poll_t perf_poll(struct file *file, poll_table *wait)
4908{
4909        struct perf_event *event = file->private_data;
4910        struct ring_buffer *rb;
4911        __poll_t events = EPOLLHUP;
4912
4913        poll_wait(file, &event->waitq, wait);
4914
4915        if (is_event_hup(event))
4916                return events;
4917
4918        /*
4919         * Pin the event->rb by taking event->mmap_mutex; otherwise
4920         * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4921         */
4922        mutex_lock(&event->mmap_mutex);
4923        rb = event->rb;
4924        if (rb)
4925                events = atomic_xchg(&rb->poll, 0);
4926        mutex_unlock(&event->mmap_mutex);
4927        return events;
4928}
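
/*
 * Illustrative user-space sketch (assumptions, not kernel code): waiting for
 * ring-buffer data with poll(). POLLIN is signalled according to
 * attr.wakeup_events / attr.wakeup_watermark; POLLHUP means the event (and
 * all of its inherited children) has exited.
 *
 *	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLIN)
 *			;	// consume new data from the mmap'ed buffer
 *		if (pfd.revents & POLLHUP)
 *			;	// no more data will be produced
 *	}
 */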
4929
4930static void _perf_event_reset(struct perf_event *event)
4931{
4932        (void)perf_event_read(event, false);
4933        local64_set(&event->count, 0);
4934        perf_event_update_userpage(event);
4935}
4936
4937/*
4938 * Holding the top-level event's child_mutex means that any
4939 * descendant process that has inherited this event will block
4940 * in perf_event_exit_event() if it goes to exit, thus satisfying the
4941 * task existence requirements of perf_event_enable/disable.
4942 */
4943static void perf_event_for_each_child(struct perf_event *event,
4944                                        void (*func)(struct perf_event *))
4945{
4946        struct perf_event *child;
4947
4948        WARN_ON_ONCE(event->ctx->parent_ctx);
4949
4950        mutex_lock(&event->child_mutex);
4951        func(event);
4952        list_for_each_entry(child, &event->child_list, child_list)
4953                func(child);
4954        mutex_unlock(&event->child_mutex);
4955}
4956
4957static void perf_event_for_each(struct perf_event *event,
4958                                  void (*func)(struct perf_event *))
4959{
4960        struct perf_event_context *ctx = event->ctx;
4961        struct perf_event *sibling;
4962
4963        lockdep_assert_held(&ctx->mutex);
4964
4965        event = event->group_leader;
4966
4967        perf_event_for_each_child(event, func);
4968        for_each_sibling_event(sibling, event)
4969                perf_event_for_each_child(sibling, func);
4970}
4971
4972static void __perf_event_period(struct perf_event *event,
4973                                struct perf_cpu_context *cpuctx,
4974                                struct perf_event_context *ctx,
4975                                void *info)
4976{
4977        u64 value = *((u64 *)info);
4978        bool active;
4979
4980        if (event->attr.freq) {
4981                event->attr.sample_freq = value;
4982        } else {
4983                event->attr.sample_period = value;
4984                event->hw.sample_period = value;
4985        }
4986
4987        active = (event->state == PERF_EVENT_STATE_ACTIVE);
4988        if (active) {
4989                perf_pmu_disable(ctx->pmu);
4990                /*
4991                 * We could be throttled; unthrottle now to avoid the tick
4992                 * trying to unthrottle while we already re-started the event.
4993                 */
4994                if (event->hw.interrupts == MAX_INTERRUPTS) {
4995                        event->hw.interrupts = 0;
4996                        perf_log_throttle(event, 1);
4997                }
4998                event->pmu->stop(event, PERF_EF_UPDATE);
4999        }
5000
5001        local64_set(&event->hw.period_left, 0);
5002
5003        if (active) {
5004                event->pmu->start(event, PERF_EF_RELOAD);
5005                perf_pmu_enable(ctx->pmu);
5006        }
5007}
5008
5009static int perf_event_check_period(struct perf_event *event, u64 value)
5010{
5011        return event->pmu->check_period(event, value);
5012}
5013
5014static int perf_event_period(struct perf_event *event, u64 __user *arg)
5015{
5016        u64 value;
5017
5018        if (!is_sampling_event(event))
5019                return -EINVAL;
5020
5021        if (copy_from_user(&value, arg, sizeof(value)))
5022                return -EFAULT;
5023
5024        if (!value)
5025                return -EINVAL;
5026
5027        if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5028                return -EINVAL;
5029
5030        if (perf_event_check_period(event, value))
5031                return -EINVAL;
5032
5033        if (!event->attr.freq && (value & (1ULL << 63)))
5034                return -EINVAL;
5035
5036        event_function_call(event, __perf_event_period, &value);
5037
5038        return 0;
5039}
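
/*
 * Illustrative user-space sketch (assumptions, not kernel code): updating the
 * sample period of a running event. The ioctl argument is a pointer to a u64;
 * for a freq-based event (attr.freq == 1) the value is interpreted as
 * sample_freq instead of sample_period.
 *
 *	u64 period = 100000;
 *
 *	if (ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &period))
 *		perror("PERF_EVENT_IOC_PERIOD");
 */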
5040
5041static const struct file_operations perf_fops;
5042
5043static inline int perf_fget_light(int fd, struct fd *p)
5044{
5045        struct fd f = fdget(fd);
5046        if (!f.file)
5047                return -EBADF;
5048
5049        if (f.file->f_op != &perf_fops) {
5050                fdput(f);
5051                return -EBADF;
5052        }
5053        *p = f;
5054        return 0;
5055}
5056
5057static int perf_event_set_output(struct perf_event *event,
5058                                 struct perf_event *output_event);
5059static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5060static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5061static int perf_copy_attr(struct perf_event_attr __user *uattr,
5062                          struct perf_event_attr *attr);
5063
5064static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5065{
5066        void (*func)(struct perf_event *);
5067        u32 flags = arg;
5068
5069        switch (cmd) {
5070        case PERF_EVENT_IOC_ENABLE:
5071                func = _perf_event_enable;
5072                break;
5073        case PERF_EVENT_IOC_DISABLE:
5074                func = _perf_event_disable;
5075                break;
5076        case PERF_EVENT_IOC_RESET:
5077                func = _perf_event_reset;
5078                break;
5079
5080        case PERF_EVENT_IOC_REFRESH:
5081                return _perf_event_refresh(event, arg);
5082
5083        case PERF_EVENT_IOC_PERIOD:
5084                return perf_event_period(event, (u64 __user *)arg);
5085
5086        case PERF_EVENT_IOC_ID:
5087        {
5088                u64 id = primary_event_id(event);
5089
5090                if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5091                        return -EFAULT;
5092                return 0;
5093        }
5094
5095        case PERF_EVENT_IOC_SET_OUTPUT:
5096        {
5097                int ret;
5098                if (arg != -1) {
5099                        struct perf_event *output_event;
5100                        struct fd output;
5101                        ret = perf_fget_light(arg, &output);
5102                        if (ret)
5103                                return ret;
5104                        output_event = output.file->private_data;
5105                        ret = perf_event_set_output(event, output_event);
5106                        fdput(output);
5107                } else {
5108                        ret = perf_event_set_output(event, NULL);
5109                }
5110                return ret;
5111        }
5112
5113        case PERF_EVENT_IOC_SET_FILTER:
5114                return perf_event_set_filter(event, (void __user *)arg);
5115
5116        case PERF_EVENT_IOC_SET_BPF:
5117                return perf_event_set_bpf_prog(event, arg);
5118
5119        case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5120                struct ring_buffer *rb;
5121
5122                rcu_read_lock();
5123                rb = rcu_dereference(event->rb);
5124                if (!rb || !rb->nr_pages) {
5125                        rcu_read_unlock();
5126                        return -EINVAL;
5127                }
5128                rb_toggle_paused(rb, !!arg);
5129                rcu_read_unlock();
5130                return 0;
5131        }
5132
5133        case PERF_EVENT_IOC_QUERY_BPF:
5134                return perf_event_query_prog_array(event, (void __user *)arg);
5135
5136        case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5137                struct perf_event_attr new_attr;
5138                int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5139                                         &new_attr);
5140
5141                if (err)
5142                        return err;
5143
5144                return perf_event_modify_attr(event,  &new_attr);
5145        }
5146        default:
5147                return -ENOTTY;
5148        }
5149
5150        if (flags & PERF_IOC_FLAG_GROUP)
5151                perf_event_for_each(event, func);
5152        else
5153                perf_event_for_each_child(event, func);
5154
5155        return 0;
5156}
5157
5158static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5159{
5160        struct perf_event *event = file->private_data;
5161        struct perf_event_context *ctx;
5162        long ret;
5163
5164        ctx = perf_event_ctx_lock(event);
5165        ret = _perf_ioctl(event, cmd, arg);
5166        perf_event_ctx_unlock(event, ctx);
5167
5168        return ret;
5169}
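
/*
 * Illustrative user-space sketch (assumptions, not kernel code): typical
 * ioctl usage against a perf event fd. PERF_IOC_FLAG_GROUP applies
 * enable/disable/reset to the whole group led by this event.
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET,  PERF_IOC_FLAG_GROUP);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 *	// ...workload...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
 *
 *	// Redirect this event's output into another event's ring buffer,
 *	// or detach again with -1:
 *	ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, other_fd);
 *	ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, -1);
 */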
5170
5171#ifdef CONFIG_COMPAT
5172static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5173                                unsigned long arg)
5174{
5175        switch (_IOC_NR(cmd)) {
5176        case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5177        case _IOC_NR(PERF_EVENT_IOC_ID):
5178        case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5179        case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5180                /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
5181                if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5182                        cmd &= ~IOCSIZE_MASK;
5183                        cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5184                }
5185                break;
5186        }
5187        return perf_ioctl(file, cmd, arg);
5188}
5189#else
5190# define perf_compat_ioctl NULL
5191#endif
5192
5193int perf_event_task_enable(void)
5194{
5195        struct perf_event_context *ctx;
5196        struct perf_event *event;
5197
5198        mutex_lock(&current->perf_event_mutex);
5199        list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5200                ctx = perf_event_ctx_lock(event);
5201                perf_event_for_each_child(event, _perf_event_enable);
5202                perf_event_ctx_unlock(event, ctx);
5203        }
5204        mutex_unlock(&current->perf_event_mutex);
5205
5206        return 0;
5207}
5208
5209int perf_event_task_disable(void)
5210{
5211        struct perf_event_context *ctx;
5212        struct perf_event *event;
5213
5214        mutex_lock(&current->perf_event_mutex);
5215        list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5216                ctx = perf_event_ctx_lock(event);
5217                perf_event_for_each_child(event, _perf_event_disable);
5218                perf_event_ctx_unlock(event, ctx);
5219        }
5220        mutex_unlock(&current->perf_event_mutex);
5221
5222        return 0;
5223}
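
/*
 * Illustrative user-space sketch (assumptions, not kernel code): the two
 * helpers above back the prctl() interface, which toggles all counters
 * owned by the calling task.
 *
 *	prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0);
 *	// ...section that should not be measured...
 *	prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0);
 */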
5224
5225static int perf_event_index(struct perf_event *event)
5226{
5227        if (event->hw.state & PERF_HES_STOPPED)
5228                return 0;
5229
5230        if (event->state != PERF_EVENT_STATE_ACTIVE)
5231                return 0;
5232
5233        return event->pmu->event_idx(event);
5234}
5235
5236static void calc_timer_values(struct perf_event *event,
5237                                u64 *now,
5238                                u64 *enabled,
5239                                u64 *running)
5240{
5241        u64 ctx_time;
5242
5243        *now = perf_clock();
5244        ctx_time = event->shadow_ctx_time + *now;
5245        __perf_update_times(event, ctx_time, enabled, running);
5246}
5247
5248static void perf_event_init_userpage(struct perf_event *event)
5249{
5250        struct perf_event_mmap_page *userpg;
5251        struct ring_buffer *rb;
5252
5253        rcu_read_lock();
5254        rb = rcu_dereference(event->rb);
5255        if (!rb)
5256                goto unlock;
5257
5258        userpg = rb->user_page;
5259
5260        /* Allow new userspace to detect that bit 0 is deprecated */
5261        userpg->cap_bit0_is_deprecated = 1;
5262        userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5263        userpg->data_offset = PAGE_SIZE;
5264        userpg->data_size = perf_data_size(rb);
5265
5266unlock:
5267        rcu_read_unlock();
5268}
5269
5270void __weak arch_perf_update_userpage(
5271        struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5272{
5273}
5274
5275/*
5276 * Callers need to ensure there can be no nesting of this function, otherwise
5277 * the seqlock logic goes bad. We cannot serialize this because the arch
5278 * code calls this from NMI context.
5279 */
5280void perf_event_update_userpage(struct perf_event *event)
5281{
5282        struct perf_event_mmap_page *userpg;
5283        struct ring_buffer *rb;
5284        u64 enabled, running, now;
5285
5286        rcu_read_lock();
5287        rb = rcu_dereference(event->rb);
5288        if (!rb)
5289                goto unlock;
5290
5291        /*
5292         * compute total_time_enabled, total_time_running
5293         * based on snapshot values taken when the event
5294         * was last scheduled in.
5295         *
5296         * we cannot simply call update_context_time()
5297         * because of locking issues, as we can be called in
5298         * NMI context
5299         */
5300        calc_timer_values(event, &now, &enabled, &running);
5301
5302        userpg = rb->user_page;
5303        /*
5304         * Disable preemption to guarantee consistent time stamps are stored to
5305         * the user page.
5306         */
5307        preempt_disable();
5308        ++userpg->lock;
5309        barrier();
5310        userpg->index = perf_event_index(event);
5311        userpg->offset = perf_event_count(event);
5312        if (userpg->index)
5313                userpg->offset -= local64_read(&event->hw.prev_count);
5314
5315        userpg->time_enabled = enabled +
5316                        atomic64_read(&event->child_total_time_enabled);
5317
5318        userpg->time_running = running +
5319                        atomic64_read(&event->child_total_time_running);
5320
5321        arch_perf_update_userpage(event, userpg, now);
5322
5323        barrier();
5324        ++userpg->lock;
5325        preempt_enable();
5326unlock:
5327        rcu_read_unlock();
5328}
5329EXPORT_SYMBOL_GPL(perf_event_update_userpage);
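
/*
 * Illustrative user-space sketch (assumptions, not kernel code): reading the
 * self-monitoring data published above. ->lock is bumped before and after the
 * update, so readers retry if it changed under them (see also the
 * perf_event_mmap_page documentation in include/uapi/linux/perf_event.h).
 *
 *	struct perf_event_mmap_page *pc = mmap_base;
 *	u32 seq;
 *	u64 count, enabled, running;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		count   = pc->offset;	// add rdpmc(pc->index - 1) if
 *					// pc->cap_user_rdpmc && pc->index
 *		enabled = pc->time_enabled;
 *		running = pc->time_running;
 *		barrier();
 *	} while (pc->lock != seq);
 */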
5330
5331static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5332{
5333        struct perf_event *event = vmf->vma->vm_file->private_data;
5334        struct ring_buffer *rb;
5335        vm_fault_t ret = VM_FAULT_SIGBUS;
5336
5337        if (vmf->flags & FAULT_FLAG_MKWRITE) {
5338                if (vmf->pgoff == 0)
5339                        ret = 0;
5340                return ret;
5341        }
5342
5343        rcu_read_lock();
5344        rb = rcu_dereference(event->rb);
5345        if (!rb)
5346                goto unlock;
5347
5348        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5349                goto unlock;
5350
5351        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5352        if (!vmf->page)
5353                goto unlock;
5354
5355        get_page(vmf->page);
5356        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5357        vmf->page->index   = vmf->pgoff;
5358
5359        ret = 0;
5360unlock:
5361        rcu_read_unlock();
5362
5363        return ret;
5364}
5365
5366static void ring_buffer_attach(struct perf_event *event,
5367                               struct ring_buffer *rb)
5368{
5369        struct ring_buffer *old_rb = NULL;
5370        unsigned long flags;
5371
5372        if (event->rb) {
5373                /*
5374                 * Should be impossible; we set this when removing
5375                 * event->rb_entry and wait/clear when adding event->rb_entry.
5376                 */
5377                WARN_ON_ONCE(event->rcu_pending);
5378
5379                old_rb = event->rb;
5380                spin_lock_irqsave(&old_rb->event_lock, flags);
5381                list_del_rcu(&event->rb_entry);
5382                spin_unlock_irqrestore(&old_rb->event_lock, flags);
5383
5384                event->rcu_batches = get_state_synchronize_rcu();
5385                event->rcu_pending = 1;
5386        }
5387
5388        if (rb) {
5389                if (event->rcu_pending) {
5390                        cond_synchronize_rcu(event->rcu_batches);
5391                        event->rcu_pending = 0;
5392                }
5393
5394                spin_lock_irqsave(&rb->event_lock, flags);
5395                list_add_rcu(&event->rb_entry, &rb->event_list);
5396                spin_unlock_irqrestore(&rb->event_lock, flags);
5397        }
5398
5399        /*
5400         * Avoid racing with perf_mmap_close(AUX): stop the event
5401         * before swizzling the event::rb pointer; if it's getting
5402         * unmapped, its aux_mmap_count will be 0 and it won't
5403         * restart. See the comment in __perf_pmu_output_stop().
5404         *
5405         * Data will inevitably be lost when set_output is done in
5406         * mid-air, but then again, whoever does it like this is
5407         * not in for the data anyway.
5408         */
5409        if (has_aux(event))
5410                perf_event_stop(event, 0);
5411
5412        rcu_assign_pointer(event->rb, rb);
5413
5414        if (old_rb) {
5415                ring_buffer_put(old_rb);
5416                /*
5417                 * Since we detached the old rb before attaching the new one,
5418                 * we could have missed a wakeup.
5419                 * Provide it now.
5420                 */
5421                wake_up_all(&event->waitq);
5422        }
5423}
5424
5425static void ring_buffer_wakeup(struct perf_event *event)
5426{
5427        struct ring_buffer *rb;
5428
5429        rcu_read_lock();
5430        rb = rcu_dereference(event->rb);
5431        if (rb) {
5432                list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5433                        wake_up_all(&event->waitq);
5434        }
5435        rcu_read_unlock();
5436}
5437
5438struct ring_buffer *ring_buffer_get(struct perf_event *event)
5439{
5440        struct ring_buffer *rb;
5441
5442        rcu_read_lock();
5443        rb = rcu_dereference(event->rb);
5444        if (rb) {
5445                if (!refcount_inc_not_zero(&rb->refcount))
5446                        rb = NULL;
5447        }
5448        rcu_read_unlock();
5449
5450        return rb;
5451}
5452
5453void ring_buffer_put(struct ring_buffer *rb)
5454{
5455        if (!refcount_dec_and_test(&rb->refcount))
5456                return;
5457
5458        WARN_ON_ONCE(!list_empty(&rb->event_list));
5459
5460        call_rcu(&rb->rcu_head, rb_free_rcu);
5461}
5462
5463static void perf_mmap_open(struct vm_area_struct *vma)
5464{
5465        struct perf_event *event = vma->vm_file->private_data;
5466
5467        atomic_inc(&event->mmap_count);
5468        atomic_inc(&event->rb->mmap_count);
5469
5470        if (vma->vm_pgoff)
5471                atomic_inc(&event->rb->aux_mmap_count);
5472
5473        if (event->pmu->event_mapped)
5474                event->pmu->event_mapped(event, vma->vm_mm);
5475}
5476
5477static void perf_pmu_output_stop(struct perf_event *event);
5478
5479/*
5480 * A buffer can be mmap()ed multiple times; either directly through the same
5481 * event, or through other events by use of perf_event_set_output().
5482 *
5483 * In order to undo the VM accounting done by perf_mmap() we need to destroy
5484 * the buffer here, where we still have a VM context. This means we need
5485 * to detach all events redirecting to us.
5486 */
5487static void perf_mmap_close(struct vm_area_struct *vma)
5488{
5489        struct perf_event *event = vma->vm_file->private_data;
5490
5491        struct ring_buffer *rb = ring_buffer_get(event);
5492        struct user_struct *mmap_user = rb->mmap_user;
5493        int mmap_locked = rb->mmap_locked;
5494        unsigned long size = perf_data_size(rb);
5495
5496        if (event->pmu->event_unmapped)
5497                event->pmu->event_unmapped(event, vma->vm_mm);
5498
5499        /*
5500         * rb->aux_mmap_count will always drop before rb->mmap_count and
5501         * event->mmap_count, so it is ok to use event->mmap_mutex to
5502         * serialize with perf_mmap here.
5503         */
5504        if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5505            atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5506                /*
5507                 * Stop all AUX events that are writing to this buffer,
5508                 * so that we can free its AUX pages and corresponding PMU
5509                 * data. Note that after rb::aux_mmap_count dropped to zero,
5510                 * they won't start any more (see perf_aux_output_begin()).
5511                 */
5512                perf_pmu_output_stop(event);
5513
5514                /* now it's safe to free the pages */
5515                atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5516                atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
5517
5518                /* this has to be the last one */
5519                rb_free_aux(rb);
5520                WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
5521
5522                mutex_unlock(&event->mmap_mutex);
5523        }
5524
5525        atomic_dec(&rb->mmap_count);
5526
5527        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5528                goto out_put;
5529
5530        ring_buffer_attach(event, NULL);
5531        mutex_unlock(&event->mmap_mutex);
5532
5533        /* If there's still other mmap()s of this buffer, we're done. */
5534        if (atomic_read(&rb->mmap_count))
5535                goto out_put;
5536
5537        /*
5538         * No other mmap()s, detach from all other events that might redirect
5539         * into the now unreachable buffer. Somewhat complicated by the
5540         * fact that rb::event_lock otherwise nests inside mmap_mutex.
5541         */
5542again:
5543        rcu_read_lock();
5544        list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5545                if (!atomic_long_inc_not_zero(&event->refcount)) {
5546                        /*
5547                         * This event is en-route to free_event() which will
5548                         * detach it and remove it from the list.
5549                         */
5550                        continue;
5551                }
5552                rcu_read_unlock();
5553
5554                mutex_lock(&event->mmap_mutex);
5555                /*
5556                 * Check we didn't race with perf_event_set_output() which can
5557                 * swizzle the rb from under us while we were waiting to
5558                 * acquire mmap_mutex.
5559                 *
5560                 * If we find a different rb, ignore this event; the next
5561                 * iteration will no longer find it on the list. We still have
5562                 * to restart the iteration to make sure we're not now
5563                 * iterating the wrong list.
5564                 */
5565                if (event->rb == rb)
5566                        ring_buffer_attach(event, NULL);
5567
5568                mutex_unlock(&event->mmap_mutex);
5569                put_event(event);
5570
5571                /*
5572                 * Restart the iteration; either we're on the wrong list or
5573                 * we destroyed its integrity by doing a deletion.
5574                 */
5575                goto again;
5576        }
5577        rcu_read_unlock();
5578
5579        /*
5580         * It could be that there are still a few 0-ref events on the list; they'll
5581         * get cleaned up by free_event() -- they'll also still have their
5582         * ref on the rb and will free it whenever they are done with it.
5583         *
5584         * Aside from that, this buffer is 'fully' detached and unmapped,
5585         * undo the VM accounting.
5586         */
5587
5588        atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5589        atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
5590        free_uid(mmap_user);
5591
5592out_put:
5593        ring_buffer_put(rb); /* could be last */
5594}
5595
5596static const struct vm_operations_struct perf_mmap_vmops = {
5597        .open           = perf_mmap_open,
5598        .close          = perf_mmap_close, /* non mergeable */
5599        .fault          = perf_mmap_fault,
5600        .page_mkwrite   = perf_mmap_fault,
5601};
5602
5603static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5604{
5605        struct perf_event *event = file->private_data;
5606        unsigned long user_locked, user_lock_limit;
5607        struct user_struct *user = current_user();
5608        unsigned long locked, lock_limit;
5609        struct ring_buffer *rb = NULL;
5610        unsigned long vma_size;
5611        unsigned long nr_pages;
5612        long user_extra = 0, extra = 0;
5613        int ret = 0, flags = 0;
5614
5615        /*
5616         * Don't allow mmap() of inherited per-task counters. This would
5617         * create a performance issue due to all children writing to the
5618         * same rb.
5619         */
5620        if (event->cpu == -1 && event->attr.inherit)
5621                return -EINVAL;
5622
5623        if (!(vma->vm_flags & VM_SHARED))
5624                return -EINVAL;
5625
5626        vma_size = vma->vm_end - vma->vm_start;
5627
5628        if (vma->vm_pgoff == 0) {
5629                nr_pages = (vma_size / PAGE_SIZE) - 1;
5630        } else {
5631                /*
5632                 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5633                 * mapped; all subsequent mappings should have the same size
5634                 * and offset. Must be above the normal perf buffer.
5635                 */
5636                u64 aux_offset, aux_size;
5637
5638                if (!event->rb)
5639                        return -EINVAL;
5640
5641                nr_pages = vma_size / PAGE_SIZE;
5642
5643                mutex_lock(&event->mmap_mutex);
5644                ret = -EINVAL;
5645
5646                rb = event->rb;
5647                if (!rb)
5648                        goto aux_unlock;
5649
5650                aux_offset = READ_ONCE(rb->user_page->aux_offset);
5651                aux_size = READ_ONCE(rb->user_page->aux_size);
5652
5653                if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5654                        goto aux_unlock;
5655
5656                if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5657                        goto aux_unlock;
5658
5659                /* already mapped with a different offset */
5660                if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5661                        goto aux_unlock;
5662
5663                if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5664                        goto aux_unlock;
5665
5666                /* already mapped with a different size */
5667                if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5668                        goto aux_unlock;
5669
5670                if (!is_power_of_2(nr_pages))
5671                        goto aux_unlock;
5672
5673                if (!atomic_inc_not_zero(&rb->mmap_count))
5674                        goto aux_unlock;
5675
5676                if (rb_has_aux(rb)) {
5677                        atomic_inc(&rb->aux_mmap_count);
5678                        ret = 0;
5679                        goto unlock;
5680                }
5681
5682                atomic_set(&rb->aux_mmap_count, 1);
5683                user_extra = nr_pages;
5684
5685                goto accounting;
5686        }
5687
5688        /*
5689         * If we have rb pages, ensure they're a power-of-two number, so we
5690         * can do bitmasks instead of modulo.
5691         */
5692        if (nr_pages != 0 && !is_power_of_2(nr_pages))
5693                return -EINVAL;
5694
5695        if (vma_size != PAGE_SIZE * (1 + nr_pages))
5696                return -EINVAL;
5697
5698        WARN_ON_ONCE(event->ctx->parent_ctx);
5699again:
5700        mutex_lock(&event->mmap_mutex);
5701        if (event->rb) {
5702                if (event->rb->nr_pages != nr_pages) {
5703                        ret = -EINVAL;
5704                        goto unlock;
5705                }
5706
5707                if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5708                        /*
5709                         * Raced against perf_mmap_close() through
5710                         * perf_event_set_output(). Try again, hope for better
5711                         * luck.
5712                         */
5713                        mutex_unlock(&event->mmap_mutex);
5714                        goto again;
5715                }
5716
5717                goto unlock;
5718        }
5719
5720        user_extra = nr_pages + 1;
5721
5722accounting:
5723        user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5724
5725        /*
5726         * Increase the limit linearly with more CPUs:
5727         */
5728        user_lock_limit *= num_online_cpus();
5729
5730        user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5731
5732        if (user_locked > user_lock_limit)
5733                extra = user_locked - user_lock_limit;
5734
5735        lock_limit = rlimit(RLIMIT_MEMLOCK);
5736        lock_limit >>= PAGE_SHIFT;
5737        locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
5738
5739        if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5740                !capable(CAP_IPC_LOCK)) {
5741                ret = -EPERM;
5742                goto unlock;
5743        }
5744
5745        WARN_ON(!rb && event->rb);
5746
5747        if (vma->vm_flags & VM_WRITE)
5748                flags |= RING_BUFFER_WRITABLE;
5749
5750        if (!rb) {
5751                rb = rb_alloc(nr_pages,
5752                              event->attr.watermark ? event->attr.wakeup_watermark : 0,
5753                              event->cpu, flags);
5754
5755                if (!rb) {
5756                        ret = -ENOMEM;
5757                        goto unlock;
5758                }
5759
5760                atomic_set(&rb->mmap_count, 1);
5761                rb->mmap_user = get_current_user();
5762                rb->mmap_locked = extra;
5763
5764                ring_buffer_attach(event, rb);
5765
5766                perf_event_init_userpage(event);
5767                perf_event_update_userpage(event);
5768        } else {
5769                ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5770                                   event->attr.aux_watermark, flags);
5771                if (!ret)
5772                        rb->aux_mmap_locked = extra;
5773        }
5774
5775unlock:
5776        if (!ret) {
5777                atomic_long_add(user_extra, &user->locked_vm);
5778                atomic64_add(extra, &vma->vm_mm->pinned_vm);
5779
5780                atomic_inc(&event->mmap_count);
5781        } else if (rb) {
5782                atomic_dec(&rb->mmap_count);
5783        }
5784aux_unlock:
5785        mutex_unlock(&event->mmap_mutex);
5786
5787        /*
5788         * Since pinned accounting is per-vm, we cannot allow fork() to copy our
5789         * vma.
5790         */
5791        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5792        vma->vm_ops = &perf_mmap_vmops;
5793
5794        if (event->pmu->event_mapped)
5795                event->pmu->event_mapped(event, vma->vm_mm);
5796
5797        return ret;
5798}
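
/*
 * Illustrative user-space sketch (assumptions, not kernel code): mapping the
 * buffers this function serves. The data mapping is 1 + 2^n pages (user page
 * plus data pages); an optional AUX area is described via aux_offset/aux_size
 * in the user page and then mmap()ed at that file offset.
 *
 *	size_t psize = sysconf(_SC_PAGESIZE);
 *	void *base = mmap(NULL, (1 + 8) * psize, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, perf_fd, 0);
 *	struct perf_event_mmap_page *pc = base;
 *
 *	pc->aux_offset = (1 + 8) * psize;
 *	pc->aux_size   = 16 * psize;
 *	void *aux = mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, perf_fd, pc->aux_offset);
 */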
5799
5800static int perf_fasync(int fd, struct file *filp, int on)
5801{
5802        struct inode *inode = file_inode(filp);
5803        struct perf_event *event = filp->private_data;
5804        int retval;
5805
5806        inode_lock(inode);
5807        retval = fasync_helper(fd, filp, on, &event->fasync);
5808        inode_unlock(inode);
5809
5810        if (retval < 0)
5811                return retval;
5812
5813        return 0;
5814}
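
/*
 * Illustrative user-space sketch (assumptions, not kernel code): requesting a
 * signal instead of (or in addition to) poll() when the buffer needs reading.
 *
 *	fcntl(perf_fd, F_SETFL, fcntl(perf_fd, F_GETFL) | O_ASYNC);
 *	fcntl(perf_fd, F_SETSIG, SIGIO);
 *	fcntl(perf_fd, F_SETOWN, getpid());
 */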
5815
5816static const struct file_operations perf_fops = {
5817        .llseek                 = no_llseek,
5818        .release                = perf_release,
5819        .read                   = perf_read,
5820        .poll                   = perf_poll,
5821        .unlocked_ioctl         = perf_ioctl,
5822        .compat_ioctl           = perf_compat_ioctl,
5823        .mmap                   = perf_mmap,
5824        .fasync                 = perf_fasync,
5825};
5826
5827/*
5828 * Perf event wakeup
5829 *
5830 * If there's data, ensure we set the poll() state and publish everything
5831 * to user-space before waking everybody up.
5832 */
5833
5834static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5835{
5836        /* only the parent has fasync state */
5837        if (event->parent)
5838                event = event->parent;
5839        return &event->fasync;
5840}
5841
5842void perf_event_wakeup(struct perf_event *event)
5843{
5844        ring_buffer_wakeup(event);
5845
5846        if (event->pending_kill) {
5847                kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5848                event->pending_kill = 0;
5849        }
5850}
5851
5852static void perf_pending_event_disable(struct perf_event *event)
5853{
5854        int cpu = READ_ONCE(event->pending_disable);
5855
5856        if (cpu < 0)
5857                return;
5858
5859        if (cpu == smp_processor_id()) {
5860                WRITE_ONCE(event->pending_disable, -1);
5861                perf_event_disable_local(event);
5862                return;
5863        }
5864
5865        /*
5866         *  CPU-A                       CPU-B
5867         *
5868         *  perf_event_disable_inatomic()
5869         *    @pending_disable = CPU-A;
5870         *    irq_work_queue();
5871         *
5872         *  sched-out
5873         *    @pending_disable = -1;
5874         *
5875         *                              sched-in
5876         *                              perf_event_disable_inatomic()
5877         *                                @pending_disable = CPU-B;
5878         *                                irq_work_queue(); // FAILS
5879         *
5880         *  irq_work_run()
5881         *    perf_pending_event()
5882         *
5883         * But the event runs on CPU-B and wants disabling there.
5884         */
5885        irq_work_queue_on(&event->pending, cpu);
5886}
5887
5888static void perf_pending_event(struct irq_work *entry)
5889{
5890        struct perf_event *event = container_of(entry, struct perf_event, pending);
5891        int rctx;
5892
5893        rctx = perf_swevent_get_recursion_context();
5894        /*
5895         * If we 'fail' here, that's OK; it means recursion is already disabled
5896         * and we won't recurse 'further'.
5897         */
5898
5899        perf_pending_event_disable(event);
5900
5901        if (event->pending_wakeup) {
5902                event->pending_wakeup = 0;
5903                perf_event_wakeup(event);
5904        }
5905
5906        if (rctx >= 0)
5907                perf_swevent_put_recursion_context(rctx);
5908}
5909
5910/*
5911 * We assume KVM is the only user of these callbacks.
5912 * Later on, we might change this to a list if another
5913 * virtualization implementation starts supporting them.
5914 */
5915struct perf_guest_info_callbacks *perf_guest_cbs;
5916
5917int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5918{
5919        perf_guest_cbs = cbs;
5920        return 0;
5921}
5922EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5923
5924int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5925{
5926        perf_guest_cbs = NULL;
5927        return 0;
5928}
5929EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5930
5931static void
5932perf_output_sample_regs(struct perf_output_handle *handle,
5933                        struct pt_regs *regs, u64 mask)
5934{
5935        int bit;
5936        DECLARE_BITMAP(_mask, 64);
5937
5938        bitmap_from_u64(_mask, mask);
5939        for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5940                u64 val;
5941
5942                val = perf_reg_value(regs, bit);
5943                perf_output_put(handle, val);
5944        }
5945}
5946
5947static void perf_sample_regs_user(struct perf_regs *regs_user,
5948                                  struct pt_regs *regs,
5949                                  struct pt_regs *regs_user_copy)
5950{
5951        if (user_mode(regs)) {
5952                regs_user->abi = perf_reg_abi(current);
5953                regs_user->regs = regs;
5954        } else if (!(current->flags & PF_KTHREAD)) {
5955                perf_get_regs_user(regs_user, regs, regs_user_copy);
5956        } else {
5957                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5958                regs_user->regs = NULL;
5959        }
5960}
5961
5962static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5963                                  struct pt_regs *regs)
5964{
5965        regs_intr->regs = regs;
5966        regs_intr->abi  = perf_reg_abi(current);
5967}
5968
5969
5970/*
5971 * Get remaining task size from user stack pointer.
5972 *
5973 * It'd be better to use the stack vma map and limit this more
5974 * precisely, but there's no way to get it safely under interrupt,
5975 * so we use TASK_SIZE as the limit.
5976 */
5977static u64 perf_ustack_task_size(struct pt_regs *regs)
5978{
5979        unsigned long addr = perf_user_stack_pointer(regs);
5980
5981        if (!addr || addr >= TASK_SIZE)
5982                return 0;
5983
5984        return TASK_SIZE - addr;
5985}
5986
5987static u16
5988perf_sample_ustack_size(u16 stack_size, u16 header_size,
5989                        struct pt_regs *regs)
5990{
5991        u64 task_size;
5992
5993        /* No regs, no stack pointer, no dump. */
5994        if (!regs)
5995                return 0;
5996
5997        /*
5998         * Check that the requested stack size fits into:
5999         * - TASK_SIZE
6000         *   If it doesn't, we limit the size to TASK_SIZE.
6001         *
6002         * - the remaining sample size
6003         *   If it doesn't, we shrink the stack size to
6004         *   fit into the remaining sample size.
6005         */
6006
6007        task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6008        stack_size = min(stack_size, (u16) task_size);
6009
6010        /* Current header size plus static size and dynamic size. */
6011        header_size += 2 * sizeof(u64);
6012
6013        /* Do we fit in with the current stack dump size? */
6014        if ((u16) (header_size + stack_size) < header_size) {
6015                /*
6016                 * If we overflow the maximum size for the sample,
6017                 * we customize the stack dump size to fit in.
6018                 */
6019                stack_size = USHRT_MAX - header_size - sizeof(u64);
6020                stack_size = round_up(stack_size, sizeof(u64));
6021        }
6022
6023        return stack_size;
6024}
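
/*
 * Worked example (illustrative): by this point header_size already includes
 * the two u64 size fields. With header_size == 0x210 and a requested
 * stack_size of 0xff00, the u16 sum wraps to 0x110 (< header_size), so the
 * dump is clamped to USHRT_MAX - 0x210 - 8 = 0xfde7 and rounded up to 0xfde8
 * bytes, keeping the whole sample within the u16 header->size.
 */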
6025
6026static void
6027perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6028                          struct pt_regs *regs)
6029{
6030        /* Case of a kernel thread, nothing to dump */
6031        if (!regs) {
6032                u64 size = 0;
6033                perf_output_put(handle, size);
6034        } else {
6035                unsigned long sp;
6036                unsigned int rem;
6037                u64 dyn_size;
6038                mm_segment_t fs;
6039
6040                /*
6041                 * We dump:
6042                 * static size
6043                 *   - the size requested by the user, or the best one we can
6044                 *     fit into the sample max size
6045                 * data
6046                 *   - user stack dump data
6047                 * dynamic size
6048                 *   - the actual dumped size
6049                 */
6050
6051                /* Static size. */
6052                perf_output_put(handle, dump_size);
6053
6054                /* Data. */
6055                sp = perf_user_stack_pointer(regs);
6056                fs = get_fs();
6057                set_fs(USER_DS);
6058                rem = __output_copy_user(handle, (void *) sp, dump_size);
6059                set_fs(fs);
6060                dyn_size = dump_size - rem;
6061
6062                perf_output_skip(handle, rem);
6063
6064                /* Dynamic size. */
6065                perf_output_put(handle, dyn_size);
6066        }
6067}
6068
6069static void __perf_event_header__init_id(struct perf_event_header *header,
6070                                         struct perf_sample_data *data,
6071                                         struct perf_event *event)
6072{
6073        u64 sample_type = event->attr.sample_type;
6074
6075        data->type = sample_type;
6076        header->size += event->id_header_size;
6077
6078        if (sample_type & PERF_SAMPLE_TID) {
6079                /* namespace issues */
6080                data->tid_entry.pid = perf_event_pid(event, current);
6081                data->tid_entry.tid = perf_event_tid(event, current);
6082        }
6083
6084        if (sample_type & PERF_SAMPLE_TIME)
6085                data->time = perf_event_clock(event);
6086
6087        if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6088                data->id = primary_event_id(event);
6089
6090        if (sample_type & PERF_SAMPLE_STREAM_ID)
6091                data->stream_id = event->id;
6092
6093        if (sample_type & PERF_SAMPLE_CPU) {
6094                data->cpu_entry.cpu      = raw_smp_processor_id();
6095                data->cpu_entry.reserved = 0;
6096        }
6097}
6098
6099void perf_event_header__init_id(struct perf_event_header *header,
6100                                struct perf_sample_data *data,
6101                                struct perf_event *event)
6102{
6103        if (event->attr.sample_id_all)
6104                __perf_event_header__init_id(header, data, event);
6105}
6106
6107static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6108                                           struct perf_sample_data *data)
6109{
6110        u64 sample_type = data->type;
6111
6112        if (sample_type & PERF_SAMPLE_TID)
6113                perf_output_put(handle, data->tid_entry);
6114
6115        if (sample_type & PERF_SAMPLE_TIME)
6116                perf_output_put(handle, data->time);
6117
6118        if (sample_type & PERF_SAMPLE_ID)
6119                perf_output_put(handle, data->id);
6120
6121        if (sample_type & PERF_SAMPLE_STREAM_ID)
6122                perf_output_put(handle, data->stream_id);
6123
6124        if (sample_type & PERF_SAMPLE_CPU)
6125                perf_output_put(handle, data->cpu_entry);
6126
6127        if (sample_type & PERF_SAMPLE_IDENTIFIER)
6128                perf_output_put(handle, data->id);
6129}
6130
6131void perf_event__output_id_sample(struct perf_event *event,
6132                                  struct perf_output_handle *handle,
6133                                  struct perf_sample_data *sample)
6134{
6135        if (event->attr.sample_id_all)
6136                __perf_event__output_id_sample(handle, sample);
6137}
6138
6139static void perf_output_read_one(struct perf_output_handle *handle,
6140                                 struct perf_event *event,
6141                                 u64 enabled, u64 running)
6142{
6143        u64 read_format = event->attr.read_format;
6144        u64 values[4];
6145        int n = 0;
6146
6147        values[n++] = perf_event_count(event);
6148        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6149                values[n++] = enabled +
6150                        atomic64_read(&event->child_total_time_enabled);
6151        }
6152        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6153                values[n++] = running +
6154                        atomic64_read(&event->child_total_time_running);
6155        }
6156        if (read_format & PERF_FORMAT_ID)
6157                values[n++] = primary_event_id(event);
6158
6159        __output_copy(handle, values, n * sizeof(u64));
6160}
6161
6162static void perf_output_read_group(struct perf_output_handle *handle,
6163                            struct perf_event *event,
6164                            u64 enabled, u64 running)
6165{
6166        struct perf_event *leader = event->group_leader, *sub;
6167        u64 read_format = event->attr.read_format;
6168        u64 values[5];
6169        int n = 0;
6170
6171        values[n++] = 1 + leader->nr_siblings;
6172
6173        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6174                values[n++] = enabled;
6175
6176        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6177                values[n++] = running;
6178
6179        if ((leader != event) &&
6180            (leader->state == PERF_EVENT_STATE_ACTIVE))
6181                leader->pmu->read(leader);
6182
6183        values[n++] = perf_event_count(leader);
6184        if (read_format & PERF_FORMAT_ID)
6185                values[n++] = primary_event_id(leader);
6186
6187        __output_copy(handle, values, n * sizeof(u64));
6188
6189        for_each_sibling_event(sub, leader) {
6190                n = 0;
6191
6192                if ((sub != event) &&
6193                    (sub->state == PERF_EVENT_STATE_ACTIVE))
6194                        sub->pmu->read(sub);
6195
6196                values[n++] = perf_event_count(sub);
6197                if (read_format & PERF_FORMAT_ID)
6198                        values[n++] = primary_event_id(sub);
6199
6200                __output_copy(handle, values, n * sizeof(u64));
6201        }
6202}
6203
6204#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6205                                 PERF_FORMAT_TOTAL_TIME_RUNNING)
6206
6207/*
6208 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
6209 *
6210 * The problem is that it's both hard and excessively expensive to iterate the
6211 * child list, not to mention that it's impossible to IPI the children running
6212 * on another CPU, from interrupt/NMI context.
6213 */
6214static void perf_output_read(struct perf_output_handle *handle,
6215                             struct perf_event *event)
6216{
6217        u64 enabled = 0, running = 0, now;
6218        u64 read_format = event->attr.read_format;
6219
6220        /*
6221         * compute total_time_enabled, total_time_running
6222         * based on snapshot values taken when the event
6223         * was last scheduled in.
6224         *
6225         * we cannot simply call update_context_time()
6226         * because of locking issues, as we are called in
6227         * NMI context
6228         */
6229        if (read_format & PERF_FORMAT_TOTAL_TIMES)
6230                calc_timer_values(event, &now, &enabled, &running);
6231
6232        if (event->attr.read_format & PERF_FORMAT_GROUP)
6233                perf_output_read_group(handle, event, enabled, running);
6234        else
6235                perf_output_read_one(handle, event, enabled, running);
6236}
6237
6238void perf_output_sample(struct perf_output_handle *handle,
6239                        struct perf_event_header *header,
6240                        struct perf_sample_data *data,
6241                        struct perf_event *event)
6242{
6243        u64 sample_type = data->type;
6244
6245        perf_output_put(handle, *header);
6246
6247        if (sample_type & PERF_SAMPLE_IDENTIFIER)
6248                perf_output_put(handle, data->id);
6249
6250        if (sample_type & PERF_SAMPLE_IP)
6251                perf_output_put(handle, data->ip);
6252
6253        if (sample_type & PERF_SAMPLE_TID)
6254                perf_output_put(handle, data->tid_entry);
6255
6256        if (sample_type & PERF_SAMPLE_TIME)
6257                perf_output_put(handle, data->time);
6258
6259        if (sample_type & PERF_SAMPLE_ADDR)
6260                perf_output_put(handle, data->addr);
6261
6262        if (sample_type & PERF_SAMPLE_ID)
6263                perf_output_put(handle, data->id);
6264
6265        if (sample_type & PERF_SAMPLE_STREAM_ID)
6266                perf_output_put(handle, data->stream_id);
6267
6268        if (sample_type & PERF_SAMPLE_CPU)
6269                perf_output_put(handle, data->cpu_entry);
6270
6271        if (sample_type & PERF_SAMPLE_PERIOD)
6272                perf_output_put(handle, data->period);
6273
6274        if (sample_type & PERF_SAMPLE_READ)
6275                perf_output_read(handle, event);
6276
6277        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6278                int size = 1;
6279
6280                size += data->callchain->nr;
6281                size *= sizeof(u64);
6282                __output_copy(handle, data->callchain, size);
6283        }
6284
6285        if (sample_type & PERF_SAMPLE_RAW) {
6286                struct perf_raw_record *raw = data->raw;
6287
6288                if (raw) {
6289                        struct perf_raw_frag *frag = &raw->frag;
6290
6291                        perf_output_put(handle, raw->size);
6292                        do {
6293                                if (frag->copy) {
6294                                        __output_custom(handle, frag->copy,
6295                                                        frag->data, frag->size);
6296                                } else {
6297                                        __output_copy(handle, frag->data,
6298                                                      frag->size);
6299                                }
6300                                if (perf_raw_frag_last(frag))
6301                                        break;
6302                                frag = frag->next;
6303                        } while (1);
6304                        if (frag->pad)
6305                                __output_skip(handle, NULL, frag->pad);
6306                } else {
6307                        struct {
6308                                u32     size;
6309                                u32     data;
6310                        } raw = {
6311                                .size = sizeof(u32),
6312                                .data = 0,
6313                        };
6314                        perf_output_put(handle, raw);
6315                }
6316        }
6317
6318        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6319                if (data->br_stack) {
6320                        size_t size;
6321
6322                        size = data->br_stack->nr
6323                             * sizeof(struct perf_branch_entry);
6324
6325                        perf_output_put(handle, data->br_stack->nr);
6326                        perf_output_copy(handle, data->br_stack->entries, size);
6327                } else {
6328                        /*
6329                         * we always store at least the value of nr
6330                         */
6331                        u64 nr = 0;
6332                        perf_output_put(handle, nr);
6333                }
6334        }
6335
6336        if (sample_type & PERF_SAMPLE_REGS_USER) {
6337                u64 abi = data->regs_user.abi;
6338
6339                /*
6340                 * If there are no regs to dump, notice it through
6341                 * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6342                 */
6343                perf_output_put(handle, abi);
6344
6345                if (abi) {
6346                        u64 mask = event->attr.sample_regs_user;
6347                        perf_output_sample_regs(handle,
6348                                                data->regs_user.regs,
6349                                                mask);
6350                }
6351        }
6352
6353        if (sample_type & PERF_SAMPLE_STACK_USER) {
6354                perf_output_sample_ustack(handle,
6355                                          data->stack_user_size,
6356                                          data->regs_user.regs);
6357        }
6358
6359        if (sample_type & PERF_SAMPLE_WEIGHT)
6360                perf_output_put(handle, data->weight);
6361
6362        if (sample_type & PERF_SAMPLE_DATA_SRC)
6363                perf_output_put(handle, data->data_src.val);
6364
6365        if (sample_type & PERF_SAMPLE_TRANSACTION)
6366                perf_output_put(handle, data->txn);
6367
6368        if (sample_type & PERF_SAMPLE_REGS_INTR) {
6369                u64 abi = data->regs_intr.abi;
6370                /*
6371                 * If there are no regs to dump, notice it through
6372                 * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6373                 */
6374                perf_output_put(handle, abi);
6375
6376                if (abi) {
6377                        u64 mask = event->attr.sample_regs_intr;
6378
6379                        perf_output_sample_regs(handle,
6380                                                data->regs_intr.regs,
6381                                                mask);
6382                }
6383        }
6384
6385        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6386                perf_output_put(handle, data->phys_addr);
6387
6388        if (!event->attr.watermark) {
6389                int wakeup_events = event->attr.wakeup_events;
6390
6391                if (wakeup_events) {
6392                        struct ring_buffer *rb = handle->rb;
6393                        int events = local_inc_return(&rb->events);
6394
6395                        if (events >= wakeup_events) {
6396                                local_sub(wakeup_events, &rb->events);
6397                                local_inc(&rb->wakeup);
6398                        }
6399                }
6400        }
6401}
6402
6403static u64 perf_virt_to_phys(u64 virt)
6404{
6405        u64 phys_addr = 0;
6406        struct page *p = NULL;
6407
6408        if (!virt)
6409                return 0;
6410
6411        if (virt >= TASK_SIZE) {
6412                /* If it's vmalloc()d memory, leave phys_addr as 0 */
6413                if (virt_addr_valid((void *)(uintptr_t)virt) &&
6414                    !(virt >= VMALLOC_START && virt < VMALLOC_END))
6415                        phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6416        } else {
6417                /*
6418                 * Walk the page tables for a user address.
6419                 * Interrupts are disabled, which prevents any teardown of
6420                 * the page tables.
6421                 * Try the IRQ-safe __get_user_pages_fast() first;
6422                 * if that fails, leave phys_addr as 0.
6423                 */
6424                if ((current->mm != NULL) &&
6425                    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
6426                        phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6427
6428                if (p)
6429                        put_page(p);
6430        }
6431
6432        return phys_addr;
6433}
6434
6435static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
6436
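/*
 * Capture the kernel and/or user callchain for @event, honouring the
 * exclude_callchain_* attributes and sample_max_stack; falls back to
 * an empty callchain on failure.
 */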
6437struct perf_callchain_entry *
6438perf_callchain(struct perf_event *event, struct pt_regs *regs)
6439{
6440        bool kernel = !event->attr.exclude_callchain_kernel;
6441        bool user   = !event->attr.exclude_callchain_user;
6442        /* Disallow cross-task user callchains. */
6443        bool crosstask = event->ctx->task && event->ctx->task != current;
6444        const u32 max_stack = event->attr.sample_max_stack;
6445        struct perf_callchain_entry *callchain;
6446
6447        if (!kernel && !user)
6448                return &__empty_callchain;
6449
6450        callchain = get_perf_callchain(regs, 0, kernel, user,
6451                                       max_stack, crosstask, true);
6452        return callchain ?: &__empty_callchain;
6453}
6454
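/*
 * Prepare the sample ahead of output: initialize the record header and
 * grow header->size for every field requested in attr.sample_type.
 */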
6455void perf_prepare_sample(struct perf_event_header *header,
6456                         struct perf_sample_data *data,
6457                         struct perf_event *event,
6458                         struct pt_regs *regs)
6459{
6460        u64 sample_type = event->attr.sample_type;
6461
6462        header->type = PERF_RECORD_SAMPLE;
6463        header->size = sizeof(*header) + event->header_size;
6464
6465        header->misc = 0;
6466        header->misc |= perf_misc_flags(regs);
6467
6468        __perf_event_header__init_id(header, data, event);
6469
6470        if (sample_type & PERF_SAMPLE_IP)
6471                data->ip = perf_instruction_pointer(regs);
6472
6473        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6474                int size = 1;
6475
6476                if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
6477                        data->callchain = perf_callchain(event, regs);
6478
6479                size += data->callchain->nr;
6480
6481                header->size += size * sizeof(u64);
6482        }
6483
6484        if (sample_type & PERF_SAMPLE_RAW) {
6485                struct perf_raw_record *raw = data->raw;
6486                int size;
6487
6488                if (raw) {
6489                        struct perf_raw_frag *frag = &raw->frag;
6490                        u32 sum = 0;
6491
6492                        do {
6493                                sum += frag->size;
6494                                if (perf_raw_frag_last(frag))
6495                                        break;
6496                                frag = frag->next;
6497                        } while (1);
6498
6499                        size = round_up(sum + sizeof(u32), sizeof(u64));
6500                        raw->size = size - sizeof(u32);
6501                        frag->pad = raw->size - sum;
6502                } else {
6503                        size = sizeof(u64);
6504                }
6505
6506                header->size += size;
6507        }
6508
6509        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6510                int size = sizeof(u64); /* nr */
6511                if (data->br_stack) {
6512                        size += data->br_stack->nr
6513                              * sizeof(struct perf_branch_entry);
6514                }
6515                header->size += size;
6516        }
6517
6518        if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6519                perf_sample_regs_user(&data->regs_user, regs,
6520                                      &data->regs_user_copy);
6521
6522        if (sample_type & PERF_SAMPLE_REGS_USER) {
6523                /* regs dump ABI info */
6524                int size = sizeof(u64);
6525
6526                if (data->regs_user.regs) {
6527                        u64 mask = event->attr.sample_regs_user;
6528                        size += hweight64(mask) * sizeof(u64);
6529                }
6530
6531                header->size += size;
6532        }
6533
6534        if (sample_type & PERF_SAMPLE_STACK_USER) {
6535                /*
6536                 * Either the PERF_SAMPLE_STACK_USER bit needs to always be
6537                 * processed last, or an additional check needs to be added
6538                 * whenever a new sample type is introduced, because we could
6539                 * otherwise eat up the rest of the sample size.
6540                 */
6541                u16 stack_size = event->attr.sample_stack_user;
6542                u16 size = sizeof(u64);
6543
6544                stack_size = perf_sample_ustack_size(stack_size, header->size,
6545                                                     data->regs_user.regs);
6546
6547                /*
6548                 * If there is something to dump, add space for the dump
6549                 * itself and for the field that tells the dynamic size,
6550                 * which is how many bytes were actually dumped.
6551                 */
6552                if (stack_size)
6553                        size += sizeof(u64) + stack_size;
6554
6555                data->stack_user_size = stack_size;
6556                header->size += size;
6557        }
6558
6559        if (sample_type & PERF_SAMPLE_REGS_INTR) {
6560                /* regs dump ABI info */
6561                int size = sizeof(u64);
6562
6563                perf_sample_regs_intr(&data->regs_intr, regs);
6564
6565                if (data->regs_intr.regs) {
6566                        u64 mask = event->attr.sample_regs_intr;
6567
6568                        size += hweight64(mask) * sizeof(u64);
6569                }
6570
6571                header->size += size;
6572        }
6573
6574        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6575                data->phys_addr = perf_virt_to_phys(data->addr);
6576}
6577
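/*
 * Common PERF_RECORD_SAMPLE output path; @output_begin selects the
 * forward, backward or default ring-buffer variant.
 */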
6578static __always_inline int
6579__perf_event_output(struct perf_event *event,
6580                    struct perf_sample_data *data,
6581                    struct pt_regs *regs,
6582                    int (*output_begin)(struct perf_output_handle *,
6583                                        struct perf_event *,
6584                                        unsigned int))
6585{
6586        struct perf_output_handle handle;
6587        struct perf_event_header header;
6588        int err;
6589
6590        /* protect the callchain buffers */
6591        rcu_read_lock();
6592
6593        perf_prepare_sample(&header, data, event, regs);
6594
6595        err = output_begin(&handle, event, header.size);
6596        if (err)
6597                goto exit;
6598
6599        perf_output_sample(&handle, &header, data, event);
6600
6601        perf_output_end(&handle);
6602
6603exit:
6604        rcu_read_unlock();
6605        return err;
6606}
6607
6608void
6609perf_event_output_forward(struct perf_event *event,
6610                         struct perf_sample_data *data,
6611                         struct pt_regs *regs)
6612{
6613        __perf_event_output(event, data, regs, perf_output_begin_forward);
6614}
6615
6616void
6617perf_event_output_backward(struct perf_event *event,
6618                           struct perf_sample_data *data,
6619                           struct pt_regs *regs)
6620{
6621        __perf_event_output(event, data, regs, perf_output_begin_backward);
6622}
6623
6624int
6625perf_event_output(struct perf_event *event,
6626                  struct perf_sample_data *data,
6627                  struct pt_regs *regs)
6628{
6629        return __perf_event_output(event, data, regs, perf_output_begin);
6630}
6631
6632/*
6633 * read event_id
6634 */
6635
6636struct perf_read_event {
6637        struct perf_event_header        header;
6638
6639        u32                             pid;
6640        u32                             tid;
6641};
6642
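/*
 * Emit a PERF_RECORD_READ carrying @event's counter value(s),
 * attributed to @task.
 */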
6643static void
6644perf_event_read_event(struct perf_event *event,
6645                        struct task_struct *task)
6646{
6647        struct perf_output_handle handle;
6648        struct perf_sample_data sample;
6649        struct perf_read_event read_event = {
6650                .header = {
6651                        .type = PERF_RECORD_READ,
6652                        .misc = 0,
6653                        .size = sizeof(read_event) + event->read_size,
6654                },
6655                .pid = perf_event_pid(event, task),
6656                .tid = perf_event_tid(event, task),
6657        };
6658        int ret;
6659
6660        perf_event_header__init_id(&read_event.header, &sample, event);
6661        ret = perf_output_begin(&handle, event, read_event.header.size);
6662        if (ret)
6663                return;
6664
6665        perf_output_put(&handle, read_event);
6666        perf_output_read(&handle, event);
6667        perf_event__output_id_sample(event, &handle, &sample);
6668
6669        perf_output_end(&handle);
6670}
6671
6672typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6673
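/*
 * Invoke @output for each event on @ctx's event list; unless @all is
 * set, skip events that are not at least INACTIVE or that fail
 * event_filter_match().
 */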
6674static void
6675perf_iterate_ctx(struct perf_event_context *ctx,
6676                   perf_iterate_f output,
6677                   void *data, bool all)
6678{
6679        struct perf_event *event;
6680
6681        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6682                if (!all) {
6683                        if (event->state < PERF_EVENT_STATE_INACTIVE)
6684                                continue;
6685                        if (!event_filter_match(event))
6686                                continue;
6687                }
6688
6689                output(event, data);
6690        }
6691}
6692
6693static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6694{
6695        struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6696        struct perf_event *event;
6697
6698        list_for_each_entry_rcu(event, &pel->list, sb_list) {
6699                /*
6700                 * Skip events that are not fully formed yet; ensure that
6701                 * if we observe event->ctx, both event and ctx will be
6702                 * complete enough. See perf_install_in_context().
6703                 */
6704                if (!smp_load_acquire(&event->ctx))
6705                        continue;
6706
6707                if (event->state < PERF_EVENT_STATE_INACTIVE)
6708                        continue;
6709                if (!event_filter_match(event))
6710                        continue;
6711                output(event, data);
6712        }
6713}
6714
6715/*
6716 * Iterate all events that need to receive side-band events.
6717 *
6718 * For new callers: ensure that account_pmu_sb_event() includes
6719 * your event, otherwise it might not get delivered.
6720 */
6721static void
6722perf_iterate_sb(perf_iterate_f output, void *data,
6723               struct perf_event_context *task_ctx)
6724{
6725        struct perf_event_context *ctx;
6726        int ctxn;
6727
6728        rcu_read_lock();
6729        preempt_disable();
6730
6731        /*
6732         * If we have task_ctx != NULL we only notify the task context itself.
6733         * The task_ctx is set only for EXIT events before releasing task
6734         * context.
6735         */
6736        if (task_ctx) {
6737                perf_iterate_ctx(task_ctx, output, data, false);
6738                goto done;
6739        }
6740
6741        perf_iterate_sb_cpu(output, data);
6742
6743        for_each_task_context_nr(ctxn) {
6744                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6745                if (ctx)
6746                        perf_iterate_ctx(ctx, output, data, false);
6747        }
6748done:
6749        preempt_enable();
6750        rcu_read_unlock();
6751}
6752
6753/*
6754 * Clear all file-based filters at exec; they'll have to be
6755 * reinstated when/if these objects are mmapped again.
6756 */
6757static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6758{
6759        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6760        struct perf_addr_filter *filter;
6761        unsigned int restart = 0, count = 0;
6762        unsigned long flags;
6763
6764        if (!has_addr_filter(event))
6765                return;
6766
6767        raw_spin_lock_irqsave(&ifh->lock, flags);
6768        list_for_each_entry(filter, &ifh->list, entry) {
6769                if (filter->path.dentry) {
6770                        event->addr_filter_ranges[count].start = 0;
6771                        event->addr_filter_ranges[count].size = 0;
6772                        restart++;
6773                }
6774
6775                count++;
6776        }
6777
6778        if (restart)
6779                event->addr_filters_gen++;
6780        raw_spin_unlock_irqrestore(&ifh->lock, flags);
6781
6782        if (restart)
6783                perf_event_stop(event, 1);
6784}
6785
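/*
 * Called when a task execs: re-enable enable_on_exec events and clear
 * file-based address filters in the task's contexts.
 */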
6786void perf_event_exec(void)
6787{
6788        struct perf_event_context *ctx;
6789        int ctxn;
6790
6791        rcu_read_lock();
6792        for_each_task_context_nr(ctxn) {
6793                ctx = current->perf_event_ctxp[ctxn];
6794                if (!ctx)
6795                        continue;
6796
6797                perf_event_enable_on_exec(ctxn);
6798
6799                perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6800                                   true);
6801        }
6802        rcu_read_unlock();
6803}
6804
6805struct remote_output {
6806        struct ring_buffer      *rb;
6807        int                     err;
6808};
6809
6810static void __perf_event_output_stop(struct perf_event *event, void *data)
6811{
6812        struct perf_event *parent = event->parent;
6813        struct remote_output *ro = data;
6814        struct ring_buffer *rb = ro->rb;
6815        struct stop_event_data sd = {
6816                .event  = event,
6817        };
6818
6819        if (!has_aux(event))
6820                return;
6821
6822        if (!parent)
6823                parent = event;
6824
6825        /*
6826         * In case of inheritance, it will be the parent that links to the
6827         * ring-buffer, but it will be the child that's actually using it.
6828         *
6829         * We are using event::rb to determine if the event should be stopped,
6830         * however this may race with ring_buffer_attach() (through set_output),
6831         * which will make us skip the event that actually needs to be stopped.
6832         * So ring_buffer_attach() has to stop an aux event before re-assigning
6833         * its rb pointer.
6834         */
6835        if (rcu_dereference(parent->rb) == rb)
6836                ro->err = __perf_event_stop(&sd);
6837}
6838
6839static int __perf_pmu_output_stop(void *info)
6840{
6841        struct perf_event *event = info;
6842        struct pmu *pmu = event->pmu;
6843        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6844        struct remote_output ro = {
6845                .rb     = event->rb,
6846        };
6847
6848        rcu_read_lock();
6849        perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6850        if (cpuctx->task_ctx)
6851                perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6852                                   &ro, false);
6853        rcu_read_unlock();
6854
6855        return ro.err;
6856}
6857
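/*
 * Stop every AUX event writing into @event's ring buffer, sending IPIs
 * to the CPUs where such events (or their children) may be running.
 */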
6858static void perf_pmu_output_stop(struct perf_event *event)
6859{
6860        struct perf_event *iter;
6861        int err, cpu;
6862
6863restart:
6864        rcu_read_lock();
6865        list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6866                /*
6867                 * For per-CPU events, we need to make sure that neither they
6868                 * nor their children are running; for cpu==-1 events it's
6869                 * sufficient to stop the event itself if it's active, since
6870                 * it can't have children.
6871                 */
6872                cpu = iter->cpu;
6873                if (cpu == -1)
6874                        cpu = READ_ONCE(iter->oncpu);
6875
6876                if (cpu == -1)
6877                        continue;
6878
6879                err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6880                if (err == -EAGAIN) {
6881                        rcu_read_unlock();
6882                        goto restart;
6883                }
6884        }
6885        rcu_read_unlock();
6886}
6887
6888/*
6889 * task tracking -- fork/exit
6890 *
6891 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
6892 */
6893
6894struct perf_task_event {
6895        struct task_struct              *task;
6896        struct perf_event_context       *task_ctx;
6897
6898        struct {
6899                struct perf_event_header        header;
6900
6901                u32                             pid;
6902                u32                             ppid;
6903                u32                             tid;
6904                u32                             ptid;
6905                u64                             time;
6906        } event_id;
6907};
6908
6909static int perf_event_task_match(struct perf_event *event)
6910{
6911        return event->attr.comm  || event->attr.mmap ||
6912               event->attr.mmap2 || event->attr.mmap_data ||
6913               event->attr.task;
6914}
6915
6916static void perf_event_task_output(struct perf_event *event,
6917                                   void *data)
6918{
6919        struct perf_task_event *task_event = data;
6920        struct perf_output_handle handle;
6921        struct perf_sample_data sample;
6922        struct task_struct *task = task_event->task;
6923        int ret, size = task_event->event_id.header.size;
6924
6925        if (!perf_event_task_match(event))
6926                return;
6927
6928        perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6929
6930        ret = perf_output_begin(&handle, event,
6931                                task_event->event_id.header.size);
6932        if (ret)
6933                goto out;
6934
6935        task_event->event_id.pid = perf_event_pid(event, task);
6936        task_event->event_id.ppid = perf_event_pid(event, current);
6937
6938        task_event->event_id.tid = perf_event_tid(event, task);
6939        task_event->event_id.ptid = perf_event_tid(event, current);
6940
6941        task_event->event_id.time = perf_event_clock(event);
6942
6943        perf_output_put(&handle, task_event->event_id);
6944
6945        perf_event__output_id_sample(event, &handle, &sample);
6946
6947        perf_output_end(&handle);
6948out:
6949        task_event->event_id.header.size = size;
6950}
6951
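/*
 * Emit a PERF_RECORD_FORK or PERF_RECORD_EXIT side-band record for
 * @task, depending on @new.
 */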
6952static void perf_event_task(struct task_struct *task,
6953                              struct perf_event_context *task_ctx,
6954                              int new)
6955{
6956        struct perf_task_event task_event;
6957
6958        if (!atomic_read(&nr_comm_events) &&
6959            !atomic_read(&nr_mmap_events) &&
6960            !atomic_read(&nr_task_events))
6961                return;
6962
6963        task_event = (struct perf_task_event){
6964                .task     = task,
6965                .task_ctx = task_ctx,
6966                .event_id    = {
6967                        .header = {
6968                                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6969                                .misc = 0,
6970                                .size = sizeof(task_event.event_id),
6971                        },
6972                        /* .pid  */
6973                        /* .ppid */
6974                        /* .tid  */
6975                        /* .ptid */
6976                        /* .time */
6977                },
6978        };
6979
6980        perf_iterate_sb(perf_event_task_output,
6981                       &task_event,
6982                       task_ctx);
6983}
6984
6985void perf_event_fork(struct task_struct *task)
6986{
6987        perf_event_task(task, NULL, 1);
6988        perf_event_namespaces(task);
6989}
6990
6991/*
6992 * comm tracking
6993 */
6994
6995struct perf_comm_event {
6996        struct task_struct      *task;
6997        char                    *comm;
6998        int                     comm_size;
6999
7000        struct {
7001                struct perf_event_header        header;
7002
7003                u32                             pid;
7004                u32                             tid;
7005        } event_id;
7006};
7007
7008static int perf_event_comm_match(struct perf_event *event)
7009{
7010        return event->attr.comm;
7011}
7012
7013static void perf_event_comm_output(struct perf_event *event,
7014                                   void *data)
7015{
7016        struct perf_comm_event *comm_event = data;
7017        struct perf_output_handle handle;
7018        struct perf_sample_data sample;
7019        int size = comm_event->event_id.header.size;
7020        int ret;
7021
7022        if (!perf_event_comm_match(event))
7023                return;
7024
7025        perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7026        ret = perf_output_begin(&handle, event,
7027                                comm_event->event_id.header.size);
7028
7029        if (ret)
7030                goto out;
7031
7032        comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
7033        comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
7034
7035        perf_output_put(&handle, comm_event->event_id);
7036        __output_copy(&handle, comm_event->comm,
7037                                   comm_event->comm_size);
7038
7039        perf_event__output_id_sample(event, &handle, &sample);
7040
7041        perf_output_end(&handle);
7042out:
7043        comm_event->event_id.header.size = size;
7044}
7045
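/*
 * Build the PERF_RECORD_COMM payload: copy the task's comm, pad it to
 * a u64 boundary and hand it to all interested side-band events.
 */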
7046static void perf_event_comm_event(struct perf_comm_event *comm_event)
7047{
7048        char comm[TASK_COMM_LEN];
7049        unsigned int size;
7050
7051        memset(comm, 0, sizeof(comm));
7052        strlcpy(comm, comm_event->task->comm, sizeof(comm));
7053        size = ALIGN(strlen(comm)+1, sizeof(u64));
7054
7055        comm_event->comm = comm;
7056        comm_event->comm_size = size;
7057
7058        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
7059
7060        perf_iterate_sb(perf_event_comm_output,
7061                       comm_event,
7062                       NULL);
7063}
7064
7065void perf_event_comm(struct task_struct *task, bool exec)
7066{
7067        struct perf_comm_event comm_event;
7068
7069        if (!atomic_read(&nr_comm_events))
7070                return;
7071
7072        comm_event = (struct perf_comm_event){
7073                .task   = task,
7074                /* .comm      */
7075                /* .comm_size */
7076                .event_id  = {
7077                        .header = {
7078                                .type = PERF_RECORD_COMM,
7079                                .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
7080                                /* .size */
7081                        },
7082                        /* .pid */
7083                        /* .tid */
7084                },
7085        };
7086
7087        perf_event_comm_event(&comm_event);
7088}
7089
7090/*
7091 * namespaces tracking
7092 */
7093
7094struct perf_namespaces_event {
7095        struct task_struct              *task;
7096
7097        struct {
7098                struct perf_event_header        header;
7099
7100                u32                             pid;
7101                u32                             tid;
7102                u64                             nr_namespaces;
7103                struct perf_ns_link_info        link_info[NR_NAMESPACES];
7104        } event_id;
7105};
7106
7107static int perf_event_namespaces_match(struct perf_event *event)
7108{
7109        return event->attr.namespaces;
7110}
7111
7112static void perf_event_namespaces_output(struct perf_event *event,
7113                                         void *data)
7114{
7115        struct perf_namespaces_event *namespaces_event = data;
7116        struct perf_output_handle handle;
7117        struct perf_sample_data sample;
7118        u16 header_size = namespaces_event->event_id.header.size;
7119        int ret;
7120
7121        if (!perf_event_namespaces_match(event))
7122                return;
7123
7124        perf_event_header__init_id(&namespaces_event->event_id.header,
7125                                   &sample, event);
7126        ret = perf_output_begin(&handle, event,
7127                                namespaces_event->event_id.header.size);
7128        if (ret)
7129                goto out;
7130
7131        namespaces_event->event_id.pid = perf_event_pid(event,
7132                                                        namespaces_event->task);
7133        namespaces_event->event_id.tid = perf_event_tid(event,
7134                                                        namespaces_event->task);
7135
7136        perf_output_put(&handle, namespaces_event->event_id);
7137
7138        perf_event__output_id_sample(event, &handle, &sample);
7139
7140        perf_output_end(&handle);
7141out:
7142        namespaces_event->event_id.header.size = header_size;
7143}
7144
7145static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
7146                                   struct task_struct *task,
7147                                   const struct proc_ns_operations *ns_ops)
7148{
7149        struct path ns_path;
7150        struct inode *ns_inode;
7151        void *error;
7152
7153        error = ns_get_path(&ns_path, task, ns_ops);
7154        if (!error) {
7155                ns_inode = ns_path.dentry->d_inode;
7156                ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
7157                ns_link_info->ino = ns_inode->i_ino;
7158                path_put(&ns_path);
7159        }
7160}
7161
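/*
 * Emit a PERF_RECORD_NAMESPACES record with the device/inode pair of
 * each configured-in namespace of @task.
 */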
7162void perf_event_namespaces(struct task_struct *task)
7163{
7164        struct perf_namespaces_event namespaces_event;
7165        struct perf_ns_link_info *ns_link_info;
7166
7167        if (!atomic_read(&nr_namespaces_events))
7168                return;
7169
7170        namespaces_event = (struct perf_namespaces_event){
7171                .task   = task,
7172                .event_id  = {
7173                        .header = {
7174                                .type = PERF_RECORD_NAMESPACES,
7175                                .misc = 0,
7176                                .size = sizeof(namespaces_event.event_id),
7177                        },
7178                        /* .pid */
7179                        /* .tid */
7180                        .nr_namespaces = NR_NAMESPACES,
7181                        /* .link_info[NR_NAMESPACES] */
7182                },
7183        };
7184
7185        ns_link_info = namespaces_event.event_id.link_info;
7186
7187        perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
7188                               task, &mntns_operations);
7189
7190#ifdef CONFIG_USER_NS
7191        perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
7192                               task, &userns_operations);
7193#endif
7194#ifdef CONFIG_NET_NS
7195        perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
7196                               task, &netns_operations);
7197#endif
7198#ifdef CONFIG_UTS_NS
7199        perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
7200                               task, &utsns_operations);
7201#endif
7202#ifdef CONFIG_IPC_NS
7203        perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
7204                               task, &ipcns_operations);
7205#endif
7206#ifdef CONFIG_PID_NS
7207        perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
7208                               task, &pidns_operations);
7209#endif
7210#ifdef CONFIG_CGROUPS
7211        perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
7212                               task, &cgroupns_operations);
7213#endif
7214
7215        perf_iterate_sb(perf_event_namespaces_output,
7216                        &namespaces_event,
7217                        NULL);
7218}
7219
7220/*
7221 * mmap tracking
7222 */
7223
7224struct perf_mmap_event {
7225        struct vm_area_struct   *vma;
7226
7227        const char              *file_name;
7228        int                     file_size;
7229        int                     maj, min;
7230        u64                     ino;
7231        u64                     ino_generation;
7232        u32                     prot, flags;
7233
7234        struct {
7235                struct perf_event_header        header;
7236
7237                u32                             pid;
7238                u32                             tid;
7239                u64                             start;
7240                u64                             len;
7241                u64                             pgoff;
7242        } event_id;
7243};
7244
7245static int perf_event_mmap_match(struct perf_event *event,
7246                                 void *data)
7247{
7248        struct perf_mmap_event *mmap_event = data;
7249        struct vm_area_struct *vma = mmap_event->vma;
7250        int executable = vma->vm_flags & VM_EXEC;
7251
7252        return (!executable && event->attr.mmap_data) ||
7253               (executable && (event->attr.mmap || event->attr.mmap2));
7254}
7255
7256static void perf_event_mmap_output(struct perf_event *event,
7257                                   void *data)
7258{
7259        struct perf_mmap_event *mmap_event = data;
7260        struct perf_output_handle handle;
7261        struct perf_sample_data sample;
7262        int size = mmap_event->event_id.header.size;
7263        u32 type = mmap_event->event_id.header.type;
7264        int ret;
7265
7266        if (!perf_event_mmap_match(event, data))
7267                return;
7268
7269        if (event->attr.mmap2) {
7270                mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
7271                mmap_event->event_id.header.size += sizeof(mmap_event->maj);
7272                mmap_event->event_id.header.size += sizeof(mmap_event->min);
7273                mmap_event->event_id.header.size += sizeof(mmap_event->ino);
7274                mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
7275                mmap_event->event_id.header.size += sizeof(mmap_event->prot);
7276                mmap_event->event_id.header.size += sizeof(mmap_event->flags);
7277        }
7278
7279        perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7280        ret = perf_output_begin(&handle, event,
7281                                mmap_event->event_id.header.size);
7282        if (ret)
7283                goto out;
7284
7285        mmap_event->event_id.pid = perf_event_pid(event, current);
7286        mmap_event->event_id.tid = perf_event_tid(event, current);
7287
7288        perf_output_put(&handle, mmap_event->event_id);
7289
7290        if (event->attr.mmap2) {
7291                perf_output_put(&handle, mmap_event->maj);
7292                perf_output_put(&handle, mmap_event->min);
7293                perf_output_put(&handle, mmap_event->ino);
7294                perf_output_put(&handle, mmap_event->ino_generation);
7295                perf_output_put(&handle, mmap_event->prot);
7296                perf_output_put(&handle, mmap_event->flags);
7297        }
7298
7299        __output_copy(&handle, mmap_event->file_name,
7300                                   mmap_event->file_size);
7301
7302        perf_event__output_id_sample(event, &handle, &sample);
7303
7304        perf_output_end(&handle);
7305out:
7306        mmap_event->event_id.header.size = size;
7307        mmap_event->event_id.header.type = type;
7308}
7309
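/*
 * Resolve the vma's name (file path, vm_ops/arch-provided name,
 * [heap], [stack] or //anon), protection and flags, then emit the
 * MMAP/MMAP2 record to all interested side-band events.
 */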
7310static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
7311{
7312        struct vm_area_struct *vma = mmap_event->vma;
7313        struct file *file = vma->vm_file;
7314        int maj = 0, min = 0;
7315        u64 ino = 0, gen = 0;
7316        u32 prot = 0, flags = 0;
7317        unsigned int size;
7318        char tmp[16];
7319        char *buf = NULL;
7320        char *name;
7321
7322        if (vma->vm_flags & VM_READ)
7323                prot |= PROT_READ;
7324        if (vma->vm_flags & VM_WRITE)
7325                prot |= PROT_WRITE;
7326        if (vma->vm_flags & VM_EXEC)
7327                prot |= PROT_EXEC;
7328
7329        if (vma->vm_flags & VM_MAYSHARE)
7330                flags = MAP_SHARED;
7331        else
7332                flags = MAP_PRIVATE;
7333
7334        if (vma->vm_flags & VM_DENYWRITE)
7335                flags |= MAP_DENYWRITE;
7336        if (vma->vm_flags & VM_MAYEXEC)
7337                flags |= MAP_EXECUTABLE;
7338        if (vma->vm_flags & VM_LOCKED)
7339                flags |= MAP_LOCKED;
7340        if (vma->vm_flags & VM_HUGETLB)
7341                flags |= MAP_HUGETLB;
7342
7343        if (file) {
7344                struct inode *inode;
7345                dev_t dev;
7346
7347                buf = kmalloc(PATH_MAX, GFP_KERNEL);
7348                if (!buf) {
7349                        name = "//enomem";
7350                        goto cpy_name;
7351                }
7352                /*
7353                 * d_path() works from the end of the buffer backwards, so we
7354                 * need to add enough zero bytes after the string to handle
7355                 * the 64bit alignment we do later.
7356                 */
7357                name = file_path(file, buf, PATH_MAX - sizeof(u64));
7358                if (IS_ERR(name)) {
7359                        name = "//toolong";
7360                        goto cpy_name;
7361                }
7362                inode = file_inode(vma->vm_file);
7363                dev = inode->i_sb->s_dev;
7364                ino = inode->i_ino;
7365                gen = inode->i_generation;
7366                maj = MAJOR(dev);
7367                min = MINOR(dev);
7368
7369                goto got_name;
7370        } else {
7371                if (vma->vm_ops && vma->vm_ops->name) {
7372                        name = (char *) vma->vm_ops->name(vma);
7373                        if (name)
7374                                goto cpy_name;
7375                }
7376
7377                name = (char *)arch_vma_name(vma);
7378                if (name)
7379                        goto cpy_name;
7380
7381                if (vma->vm_start <= vma->vm_mm->start_brk &&
7382                                vma->vm_end >= vma->vm_mm->brk) {
7383                        name = "[heap]";
7384                        goto cpy_name;
7385                }
7386                if (vma->vm_start <= vma->vm_mm->start_stack &&
7387                                vma->vm_end >= vma->vm_mm->start_stack) {
7388                        name = "[stack]";
7389                        goto cpy_name;
7390                }
7391
7392                name = "//anon";
7393                goto cpy_name;
7394        }
7395
7396cpy_name:
7397        strlcpy(tmp, name, sizeof(tmp));
7398        name = tmp;
7399got_name:
7400        /*
7401         * Since our buffer works in 8 byte units we need to align our string
7402         * size to a multiple of 8. However, we must guarantee the tail end is
7403         * zeroed out to avoid leaking random bits to userspace.
7404         */
7405        size = strlen(name)+1;
7406        while (!IS_ALIGNED(size, sizeof(u64)))
7407                name[size++] = '\0';
7408
7409        mmap_event->file_name = name;
7410        mmap_event->file_size = size;
7411        mmap_event->maj = maj;
7412        mmap_event->min = min;
7413        mmap_event->ino = ino;
7414        mmap_event->ino_generation = gen;
7415        mmap_event->prot = prot;
7416        mmap_event->flags = flags;
7417
7418        if (!(vma->vm_flags & VM_EXEC))
7419                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
7420
7421        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
7422
7423        perf_iterate_sb(perf_event_mmap_output,
7424                       mmap_event,
7425                       NULL);
7426
7427        kfree(buf);
7428}
7429
7430/*
7431 * Check whether inode and address range match filter criteria.
7432 */
7433static bool perf_addr_filter_match(struct perf_addr_filter *filter,
7434                                     struct file *file, unsigned long offset,
7435                                     unsigned long size)
7436{
7437        /* d_inode(NULL) won't be equal to any mapped user-space file */
7438        if (!filter->path.dentry)
7439                return false;
7440
7441        if (d_inode(filter->path.dentry) != file_inode(file))
7442                return false;
7443
7444        if (filter->offset > offset + size)
7445                return false;
7446
7447        if (filter->offset + filter->size < offset)
7448                return false;
7449
7450        return true;
7451}
7452
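/*
 * If @filter matches @vma's backing file, compute the virtual address
 * range (@fr) that the filter covers within this mapping.
 */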
7453static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
7454                                        struct vm_area_struct *vma,
7455                                        struct perf_addr_filter_range *fr)
7456{
7457        unsigned long vma_size = vma->vm_end - vma->vm_start;
7458        unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
7459        struct file *file = vma->vm_file;
7460
7461        if (!perf_addr_filter_match(filter, file, off, vma_size))
7462                return false;
7463
7464        if (filter->offset < off) {
7465                fr->start = vma->vm_start;
7466                fr->size = min(vma_size, filter->size - (off - filter->offset));
7467        } else {
7468                fr->start = vma->vm_start + filter->offset - off;
7469                fr->size = min(vma->vm_end - fr->start, filter->size);
7470        }
7471
7472        return true;
7473}
7474
7475static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
7476{
7477        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7478        struct vm_area_struct *vma = data;
7479        struct perf_addr_filter *filter;
7480        unsigned int restart = 0, count = 0;
7481        unsigned long flags;
7482
7483        if (!has_addr_filter(event))
7484                return;
7485
7486        if (!vma->vm_file)
7487                return;
7488
7489        raw_spin_lock_irqsave(&ifh->lock, flags);
7490        list_for_each_entry(filter, &ifh->list, entry) {
7491                if (perf_addr_filter_vma_adjust(filter, vma,
7492                                                &event->addr_filter_ranges[count]))
7493                        restart++;
7494
7495                count++;
7496        }
7497
7498        if (restart)
7499                event->addr_filters_gen++;
7500        raw_spin_unlock_irqrestore(&ifh->lock, flags);
7501
7502        if (restart)
7503                perf_event_stop(event, 1);
7504}
7505
7506/*
7507 * Adjust all of the task's events' address filters to the new vma.
7508 */
7509static void perf_addr_filters_adjust(struct vm_area_struct *vma)
7510{
7511        struct perf_event_context *ctx;
7512        int ctxn;
7513
7514        /*
7515         * Data tracing isn't supported yet and as such there is no need
7516         * to keep track of anything that isn't related to executable code:
7517         */
7518        if (!(vma->vm_flags & VM_EXEC))
7519                return;
7520
7521        rcu_read_lock();
7522        for_each_task_context_nr(ctxn) {
7523                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7524                if (!ctx)
7525                        continue;
7526
7527                perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
7528        }
7529        rcu_read_unlock();
7530}
7531
7532void perf_event_mmap(struct vm_area_struct *vma)
7533{
7534        struct perf_mmap_event mmap_event;
7535
7536        if (!atomic_read(&nr_mmap_events))
7537                return;
7538
7539        mmap_event = (struct perf_mmap_event){
7540                .vma    = vma,
7541                /* .file_name */
7542                /* .file_size */
7543                .event_id  = {
7544                        .header = {
7545                                .type = PERF_RECORD_MMAP,
7546                                .misc = PERF_RECORD_MISC_USER,
7547                                /* .size */
7548                        },
7549                        /* .pid */
7550                        /* .tid */
7551                        .start  = vma->vm_start,
7552                        .len    = vma->vm_end - vma->vm_start,
7553                        .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
7554                },
7555                /* .maj (attr_mmap2 only) */
7556                /* .min (attr_mmap2 only) */
7557                /* .ino (attr_mmap2 only) */
7558                /* .ino_generation (attr_mmap2 only) */
7559                /* .prot (attr_mmap2 only) */
7560                /* .flags (attr_mmap2 only) */
7561        };
7562
7563        perf_addr_filters_adjust(vma);
7564        perf_event_mmap_event(&mmap_event);
7565}
7566
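/*
 * Emit a PERF_RECORD_AUX record noting that @size bytes of AUX data
 * were written at offset @head, together with @flags.
 */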
7567void perf_event_aux_event(struct perf_event *event, unsigned long head,
7568                          unsigned long size, u64 flags)
7569{
7570        struct perf_output_handle handle;
7571        struct perf_sample_data sample;
7572        struct perf_aux_event {
7573                struct perf_event_header        header;
7574                u64                             offset;
7575                u64                             size;
7576                u64                             flags;
7577        } rec = {
7578                .header = {
7579                        .type = PERF_RECORD_AUX,
7580                        .misc = 0,
7581                        .size = sizeof(rec),
7582                },
7583                .offset         = head,
7584                .size           = size,
7585                .flags          = flags,
7586        };
7587        int ret;
7588
7589        perf_event_header__init_id(&rec.header, &sample, event);
7590        ret = perf_output_begin(&handle, event, rec.header.size);
7591
7592        if (ret)
7593                return;
7594
7595        perf_output_put(&handle, rec);
7596        perf_event__output_id_sample(event, &handle, &sample);
7597
7598        perf_output_end(&handle);
7599}
7600
7601/*
7602 * Lost/dropped samples logging
7603 */
7604void perf_log_lost_samples(struct perf_event *event, u64 lost)
7605{
7606        struct perf_output_handle handle;
7607        struct perf_sample_data sample;
7608        int ret;
7609
7610        struct {
7611                struct perf_event_header        header;
7612                u64                             lost;
7613        } lost_samples_event = {
7614                .header = {
7615                        .type = PERF_RECORD_LOST_SAMPLES,
7616                        .misc = 0,
7617                        .size = sizeof(lost_samples_event),
7618                },
7619                .lost           = lost,
7620        };
7621
7622        perf_event_header__init_id(&lost_samples_event.header, &sample, event);
7623
7624        ret = perf_output_begin(&handle, event,
7625                                lost_samples_event.header.size);
7626        if (ret)
7627                return;
7628
7629        perf_output_put(&handle, lost_samples_event);
7630        perf_event__output_id_sample(event, &handle, &sample);
7631        perf_output_end(&handle);
7632}
7633
7634/*
7635 * context_switch tracking
7636 */
7637
7638struct perf_switch_event {
7639        struct task_struct      *task;
7640        struct task_struct      *next_prev;
7641
7642        struct {
7643                struct perf_event_header        header;
7644                u32                             next_prev_pid;
7645                u32                             next_prev_tid;
7646        } event_id;
7647};
7648
7649static int perf_event_switch_match(struct perf_event *event)
7650{
7651        return event->attr.context_switch;
7652}
7653
7654static void perf_event_switch_output(struct perf_event *event, void *data)
7655{
7656        struct perf_switch_event *se = data;
7657        struct perf_output_handle handle;
7658        struct perf_sample_data sample;
7659        int ret;
7660
7661        if (!perf_event_switch_match(event))
7662                return;
7663
7664        /* Only CPU-wide events are allowed to see next/prev pid/tid */
7665        if (event->ctx->task) {
7666                se->event_id.header.type = PERF_RECORD_SWITCH;
7667                se->event_id.header.size = sizeof(se->event_id.header);
7668        } else {
7669                se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
7670                se->event_id.header.size = sizeof(se->event_id);
7671                se->event_id.next_prev_pid =
7672                                        perf_event_pid(event, se->next_prev);
7673                se->event_id.next_prev_tid =
7674                                        perf_event_tid(event, se->next_prev);
7675        }
7676
7677        perf_event_header__init_id(&se->event_id.header, &sample, event);
7678
7679        ret = perf_output_begin(&handle, event, se->event_id.header.size);
7680        if (ret)
7681                return;
7682
7683        if (event->ctx->task)
7684                perf_output_put(&handle, se->event_id.header);
7685        else
7686                perf_output_put(&handle, se->event_id);
7687
7688        perf_event__output_id_sample(event, &handle, &sample);
7689
7690        perf_output_end(&handle);
7691}
7692
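/*
 * Emit PERF_RECORD_SWITCH (for per-task events) or
 * PERF_RECORD_SWITCH_CPU_WIDE (for CPU-wide events) describing a
 * context switch in or out of @task.
 */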
7693static void perf_event_switch(struct task_struct *task,
7694                              struct task_struct *next_prev, bool sched_in)
7695{
7696        struct perf_switch_event switch_event;
7697
7698        /* N.B. caller checks nr_switch_events != 0 */
7699
7700        switch_event = (struct perf_switch_event){
7701                .task           = task,
7702                .next_prev      = next_prev,
7703                .event_id       = {
7704                        .header = {
7705                                /* .type */
7706                                .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7707                                /* .size */
7708                        },
7709                        /* .next_prev_pid */
7710                        /* .next_prev_tid */
7711                },
7712        };
7713
7714        if (!sched_in && task->state == TASK_RUNNING)
7715                switch_event.event_id.header.misc |=
7716                                PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
7717
7718        perf_iterate_sb(perf_event_switch_output,
7719                       &switch_event,
7720                       NULL);
7721}
7722
7723/*
7724 * IRQ throttle logging
7725 */
7726
7727static void perf_log_throttle(struct perf_event *event, int enable)
7728{
7729        struct perf_output_handle handle;
7730        struct perf_sample_data sample;
7731        int ret;
7732
7733        struct {
7734                struct perf_event_header        header;
7735                u64                             time;
7736                u64                             id;
7737                u64                             stream_id;
7738        } throttle_event = {
7739                .header = {
7740                        .type = PERF_RECORD_THROTTLE,
7741                        .misc = 0,
7742                        .size = sizeof(throttle_event),
7743                },
7744                .time           = perf_event_clock(event),
7745                .id             = primary_event_id(event),
7746                .stream_id      = event->id,
7747        };
7748
7749        if (enable)
7750                throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7751
7752        perf_event_header__init_id(&throttle_event.header, &sample, event);
7753
7754        ret = perf_output_begin(&handle, event,
7755                                throttle_event.header.size);
7756        if (ret)
7757                return;
7758
7759        perf_output_put(&handle, throttle_event);
7760        perf_event__output_id_sample(event, &handle, &sample);
7761        perf_output_end(&handle);
7762}
7763
7764/*
7765 * ksymbol register/unregister tracking
7766 */
7767
7768struct perf_ksymbol_event {
7769        const char      *name;
7770        int             name_len;
7771        struct {
7772                struct perf_event_header        header;
7773                u64                             addr;
7774                u32                             len;
7775                u16                             ksym_type;
7776                u16                             flags;
7777        } event_id;
7778};
7779
7780static int perf_event_ksymbol_match(struct perf_event *event)
7781{
7782        return event->attr.ksymbol;
7783}
7784
7785static void perf_event_ksymbol_output(struct perf_event *event, void *data)
7786{
7787        struct perf_ksymbol_event *ksymbol_event = data;
7788        struct perf_output_handle handle;
7789        struct perf_sample_data sample;
7790        int ret;
7791
7792        if (!perf_event_ksymbol_match(event))
7793                return;
7794
7795        perf_event_header__init_id(&ksymbol_event->event_id.header,
7796                                   &sample, event);
7797        ret = perf_output_begin(&handle, event,
7798                                ksymbol_event->event_id.header.size);
7799        if (ret)
7800                return;
7801
7802        perf_output_put(&handle, ksymbol_event->event_id);
7803        __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
7804        perf_event__output_id_sample(event, &handle, &sample);
7805
7806        perf_output_end(&handle);
7807}
7808
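/*
 * Emit a PERF_RECORD_KSYMBOL record for a kernel symbol of @ksym_type
 * being registered or unregistered at @addr.
 */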
7809void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
7810                        const char *sym)
7811{
7812        struct perf_ksymbol_event ksymbol_event;
7813        char name[KSYM_NAME_LEN];
7814        u16 flags = 0;
7815        int name_len;
7816
7817        if (!atomic_read(&nr_ksymbol_events))
7818                return;
7819
7820        if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
7821            ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
7822                goto err;
7823
7824        strlcpy(name, sym, KSYM_NAME_LEN);
7825        name_len = strlen(name) + 1;
7826        while (!IS_ALIGNED(name_len, sizeof(u64)))
7827                name[name_len++] = '\0';
7828        BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
7829
7830        if (unregister)
7831                flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
7832
7833        ksymbol_event = (struct perf_ksymbol_event){
7834                .name = name,
7835                .name_len = name_len,
7836                .event_id = {
7837                        .header = {
7838                                .type = PERF_RECORD_KSYMBOL,
7839                                .size = sizeof(ksymbol_event.event_id) +
7840                                        name_len,
7841                        },
7842                        .addr = addr,
7843                        .len = len,
7844                        .ksym_type = ksym_type,
7845                        .flags = flags,
7846                },
7847        };
7848
7849        perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
7850        return;
7851err:
7852        WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
7853}
7854
7855/*
7856 * bpf program load/unload tracking
7857 */
7858
7859struct perf_bpf_event {
7860        struct bpf_prog *prog;
7861        struct {
7862                struct perf_event_header        header;
7863                u16                             type;
7864                u16                             flags;
7865                u32                             id;
7866                u8                              tag[BPF_TAG_SIZE];
7867        } event_id;
7868};
7869
7870static int perf_event_bpf_match(struct perf_event *event)
7871{
7872        return event->attr.bpf_event;
7873}
7874
7875static void perf_event_bpf_output(struct perf_event *event, void *data)
7876{
7877        struct perf_bpf_event *bpf_event = data;
7878        struct perf_output_handle handle;
7879        struct perf_sample_data sample;
7880        int ret;
7881
7882        if (!perf_event_bpf_match(event))
7883                return;
7884
7885        perf_event_header__init_id(&bpf_event->event_id.header,
7886                                   &sample, event);
7887        ret = perf_output_begin(&handle, event,
7888                                bpf_event->event_id.header.size);
7889        if (ret)
7890                return;
7891
7892        perf_output_put(&handle, bpf_event->event_id);
7893        perf_event__output_id_sample(event, &handle, &sample);
7894
7895        perf_output_end(&handle);
7896}
7897
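/*
 * Emit ksymbol records for the program, or for each sub-program of a
 * multi-function program, on load/unload.
 */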
7898static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
7899                                         enum perf_bpf_event_type type)
7900{
7901        bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
7902        char sym[KSYM_NAME_LEN];
7903        int i;
7904
7905        if (prog->aux->func_cnt == 0) {
7906                bpf_get_prog_name(prog, sym);
7907                perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
7908                                   (u64)(unsigned long)prog->bpf_func,
7909                                   prog->jited_len, unregister, sym);
7910        } else {
7911                for (i = 0; i < prog->aux->func_cnt; i++) {
7912                        struct bpf_prog *subprog = prog->aux->func[i];
7913
7914                        bpf_get_prog_name(subprog, sym);
7915                        perf_event_ksymbol(
7916                                PERF_RECORD_KSYMBOL_TYPE_BPF,
7917                                (u64)(unsigned long)subprog->bpf_func,
7918                                subprog->jited_len, unregister, sym);
7919                }
7920        }
7921}
7922
7923void perf_event_bpf_event(struct bpf_prog *prog,
7924                          enum perf_bpf_event_type type,
7925                          u16 flags)
7926{
7927        struct perf_bpf_event bpf_event;
7928
7929        if (type <= PERF_BPF_EVENT_UNKNOWN ||
7930            type >= PERF_BPF_EVENT_MAX)
7931                return;
7932
7933        switch (type) {
7934        case PERF_BPF_EVENT_PROG_LOAD:
7935        case PERF_BPF_EVENT_PROG_UNLOAD:
7936                if (atomic_read(&nr_ksymbol_events))
7937                        perf_event_bpf_emit_ksymbols(prog, type);
7938                break;
7939        default:
7940                break;
7941        }
7942
7943        if (!atomic_read(&nr_bpf_events))
7944                return;
7945
7946        bpf_event = (struct perf_bpf_event){
7947                .prog = prog,
7948                .event_id = {
7949                        .header = {
7950                                .type = PERF_RECORD_BPF_EVENT,
7951                                .size = sizeof(bpf_event.event_id),
7952                        },
7953                        .type = type,
7954                        .flags = flags,
7955                        .id = prog->aux->id,
7956                },
7957        };
7958
7959        BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
7960
7961        memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
7962        perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
7963}
7964
7965void perf_event_itrace_started(struct perf_event *event)
7966{
7967        event->attach_state |= PERF_ATTACH_ITRACE;
7968}
7969
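/*
 * Emit PERF_RECORD_ITRACE_START when an instruction-tracing
 * (PERF_PMU_CAP_ITRACE) event starts, until the PMU marks it with
 * PERF_ATTACH_ITRACE.
 */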
7970static void perf_log_itrace_start(struct perf_event *event)
7971{
7972        struct perf_output_handle handle;
7973        struct perf_sample_data sample;
7974        struct perf_aux_event {
7975                struct perf_event_header        header;
7976                u32                             pid;
7977                u32                             tid;
7978        } rec;
7979        int ret;
7980
7981        if (event->parent)
7982                event = event->parent;
7983
7984        if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7985            event->attach_state & PERF_ATTACH_ITRACE)
7986                return;
7987
7988        rec.header.type = PERF_RECORD_ITRACE_START;
7989        rec.header.misc = 0;
7990        rec.header.size = sizeof(rec);
7991        rec.pid = perf_event_pid(event, current);
7992        rec.tid = perf_event_tid(event, current);
7993
7994        perf_event_header__init_id(&rec.header, &sample, event);
7995        ret = perf_output_begin(&handle, event, rec.header.size);
7996
7997        if (ret)
7998                return;
7999
8000        perf_output_put(&handle, rec);
8001        perf_event__output_id_sample(event, &handle, &sample);
8002
8003        perf_output_end(&handle);
8004}
8005
8006static int
8007__perf_event_account_interrupt(struct perf_event *event, int throttle)
8008{
8009        struct hw_perf_event *hwc = &event->hw;
8010        int ret = 0;
8011        u64 seq;
8012
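            /*
             * perf_throttled_seq is advanced once per timer tick; seeing a
             * new value means a fresh tick window, so restart the interrupt
             * count at 1.  Otherwise keep counting and throttle the event
             * once it reaches max_samples_per_tick within this window.
             */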
8013        seq = __this_cpu_read(perf_throttled_seq);
8014        if (seq != hwc->interrupts_seq) {
8015                hwc->interrupts_seq = seq;
8016                hwc->interrupts = 1;
8017        } else {
8018                hwc->interrupts++;
8019                if (unlikely(throttle
8020                             && hwc->interrupts >= max_samples_per_tick)) {
8021                        __this_cpu_inc(perf_throttled_count);
8022                        tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
8023                        hwc->interrupts = MAX_INTERRUPTS;
8024                        perf_log_throttle(event, 0);
8025                        ret = 1;
8026                }
8027        }
8028
8029        if (event->attr.freq) {
8030                u64 now = perf_clock();
8031                s64 delta = now - hwc->freq_time_stamp;
8032
8033                hwc->freq_time_stamp = now;
8034
8035                if (delta > 0 && delta < 2*TICK_NSEC)
8036                        perf_adjust_period(event, delta, hwc->last_period, true);
8037        }
8038
8039        return ret;
8040}
8041
8042int perf_event_account_interrupt(struct perf_event *event)
8043{
8044        return __perf_event_account_interrupt(event, 1);
8045}
8046
8047/*
8048 * Generic event overflow handling, sampling.
8049 */
8050
8051static int __perf_event_overflow(struct perf_event *event,
8052                                   int throttle, struct perf_sample_data *data,
8053                                   struct pt_regs *regs)
8054{
8055        int events = atomic_read(&event->event_limit);
8056        int ret = 0;
8057
8058        /*
8059         * Non-sampling counters might still use the PMI to fold short
8060         * hardware counters; ignore those.
8061         */
8062        if (unlikely(!is_sampling_event(event)))
8063                return 0;
8064
8065        ret = __perf_event_account_interrupt(event, throttle);
8066
8067        /*
8068         * XXX event_limit might not quite work as expected on inherited
8069         * events
8070         */
8071
8072        event->pending_kill = POLL_IN;
8073        if (events && atomic_dec_and_test(&event->event_limit)) {
8074                ret = 1;
8075                event->pending_kill = POLL_HUP;
8076
8077                perf_event_disable_inatomic(event);
8078        }
8079
8080        READ_ONCE(event->overflow_handler)(event, data, regs);
8081
8082        if (*perf_event_fasync(event) && event->pending_kill) {
8083                event->pending_wakeup = 1;
8084                irq_work_queue(&event->pending);
8085        }
8086
8087        return ret;
8088}
8089
8090int perf_event_overflow(struct perf_event *event,
8091                          struct perf_sample_data *data,
8092                          struct pt_regs *regs)
8093{
8094        return __perf_event_overflow(event, 1, data, regs);
8095}
8096
8097/*
8098 * Generic software event infrastructure
8099 */
8100
8101struct swevent_htable {
8102        struct swevent_hlist            *swevent_hlist;
8103        struct mutex                    hlist_mutex;
8104        int                             hlist_refcount;
8105
8106        /* Recursion avoidance in each context */
8107        int                             recursion[PERF_NR_CONTEXTS];
8108};
8109
8110static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
8111
8112/*
8113 * We directly increment event->count and keep a second value in
8114 * event->hw.period_left to count intervals. This period counter
8115 * is kept in the range [-sample_period, 0] so that we can use the
8116 * sign as trigger.
8117 */
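    /*
     * Worked example of the above (illustrative numbers): with a
     * sample_period of 100 and period_left having climbed to +250 by the
     * time the overflow is processed, perf_swevent_set_period() computes
     * nr = (100 + 250) / 100 = 3 elapsed periods and rewinds period_left to
     * 250 - 300 = -50, back inside the [-sample_period, 0] window.
     */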
8118
8119u64 perf_swevent_set_period(struct perf_event *event)
8120{
8121        struct hw_perf_event *hwc = &event->hw;
8122        u64 period = hwc->last_period;
8123        u64 nr, offset;
8124        s64 old, val;
8125
8126        hwc->last_period = hwc->sample_period;
8127
8128again:
8129        old = val = local64_read(&hwc->period_left);
8130        if (val < 0)
8131                return 0;
8132
8133        nr = div64_u64(period + val, period);
8134        offset = nr * period;
8135        val -= offset;
8136        if (local64_cmpxchg(&hwc->period_left, old, val) != old)
8137                goto again;
8138
8139        return nr;
8140}
8141
8142static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
8143                                    struct perf_sample_data *data,
8144                                    struct pt_regs *regs)
8145{
8146        struct hw_perf_event *hwc = &event->hw;
8147        int throttle = 0;
8148
8149        if (!overflow)
8150                overflow = perf_swevent_set_period(event);
8151
8152        if (hwc->interrupts == MAX_INTERRUPTS)
8153                return;
8154
8155        for (; overflow; overflow--) {
8156                if (__perf_event_overflow(event, throttle,
8157                                            data, regs)) {
8158                        /*
8159                         * We inhibit the overflow from happening when
8160                         * hwc->interrupts == MAX_INTERRUPTS.
8161                         */
8162                        break;
8163                }
8164                throttle = 1;
8165        }
8166}
8167
8168static void perf_swevent_event(struct perf_event *event, u64 nr,
8169                               struct perf_sample_data *data,
8170                               struct pt_regs *regs)
8171{
8172        struct hw_perf_event *hwc = &event->hw;
8173
8174        local64_add(nr, &event->count);
8175
8176        if (!regs)
8177                return;
8178
8179        if (!is_sampling_event(event))
8180                return;
8181
8182        if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
8183                data->period = nr;
8184                return perf_swevent_overflow(event, 1, data, regs);
8185        } else
8186                data->period = event->hw.last_period;
8187
8188        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
8189                return perf_swevent_overflow(event, 1, data, regs);
8190
8191        if (local64_add_negative(nr, &hwc->period_left))
8192                return;
8193
8194        perf_swevent_overflow(event, 0, data, regs);
8195}
8196
8197static int perf_exclude_event(struct perf_event *event,
8198                              struct pt_regs *regs)
8199{
8200        if (event->hw.state & PERF_HES_STOPPED)
8201                return 1;
8202
8203        if (regs) {
8204                if (event->attr.exclude_user && user_mode(regs))
8205                        return 1;
8206
8207                if (event->attr.exclude_kernel && !user_mode(regs))
8208                        return 1;
8209        }
8210
8211        return 0;
8212}
8213
8214static int perf_swevent_match(struct perf_event *event,
8215                                enum perf_type_id type,
8216                                u32 event_id,
8217                                struct perf_sample_data *data,
8218                                struct pt_regs *regs)
8219{
8220        if (event->attr.type != type)
8221                return 0;
8222
8223        if (event->attr.config != event_id)
8224                return 0;
8225
8226        if (perf_exclude_event(event, regs))
8227                return 0;
8228
8229        return 1;
8230}
8231
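    /* Fold the (type, event_id) pair into a single 64-bit key for hashing. */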
8232static inline u64 swevent_hash(u64 type, u32 event_id)
8233{
8234        u64 val = event_id | (type << 32);
8235
8236        return hash_64(val, SWEVENT_HLIST_BITS);
8237}
8238
8239static inline struct hlist_head *
8240__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
8241{
8242        u64 hash = swevent_hash(type, event_id);
8243
8244        return &hlist->heads[hash];
8245}
8246
8247/* For the read side: events when they trigger */
8248static inline struct hlist_head *
8249find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
8250{
8251        struct swevent_hlist *hlist;
8252
8253        hlist = rcu_dereference(swhash->swevent_hlist);
8254        if (!hlist)
8255                return NULL;
8256
8257        return __find_swevent_head(hlist, type, event_id);
8258}
8259
8260/* For the event head insertion and removal in the hlist */
8261static inline struct hlist_head *
8262find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
8263{
8264        struct swevent_hlist *hlist;
8265        u32 event_id = event->attr.config;
8266        u64 type = event->attr.type;
8267
8268        /*
8269         * Event scheduling is always serialized against hlist allocation
8270         * and release, which makes the protected version suitable here.
8271         * The context lock guarantees that.
8272         */
8273        hlist = rcu_dereference_protected(swhash->swevent_hlist,
8274                                          lockdep_is_held(&event->ctx->lock));
8275        if (!hlist)
8276                return NULL;
8277
8278        return __find_swevent_head(hlist, type, event_id);
8279}
8280
8281static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
8282                                    u64 nr,
8283                                    struct perf_sample_data *data,
8284                                    struct pt_regs *regs)
8285{
8286        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8287        struct perf_event *event;
8288        struct hlist_head *head;
8289
8290        rcu_read_lock();
8291        head = find_swevent_head_rcu(swhash, type, event_id);
8292        if (!head)
8293                goto end;
8294
8295        hlist_for_each_entry_rcu(event, head, hlist_entry) {
8296                if (perf_swevent_match(event, type, event_id, data, regs))
8297                        perf_swevent_event(event, nr, data, regs);
8298        }
8299end:
8300        rcu_read_unlock();
8301}
8302
8303DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
8304
8305int perf_swevent_get_recursion_context(void)
8306{
8307        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8308
8309        return get_recursion_context(swhash->recursion);
8310}
8311EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
8312
8313void perf_swevent_put_recursion_context(int rctx)
8314{
8315        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8316
8317        put_recursion_context(swhash->recursion, rctx);
8318}
8319
8320void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8321{
8322        struct perf_sample_data data;
8323
8324        if (WARN_ON_ONCE(!regs))
8325                return;
8326
8327        perf_sample_data_init(&data, addr, 0);
8328        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
8329}
8330
8331void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8332{
8333        int rctx;
8334
8335        preempt_disable_notrace();
8336        rctx = perf_swevent_get_recursion_context();
8337        if (unlikely(rctx < 0))
8338                goto fail;
8339
8340        ___perf_sw_event(event_id, nr, regs, addr);
8341
8342        perf_swevent_put_recursion_context(rctx);
8343fail:
8344        preempt_enable_notrace();
8345}
8346
8347static void perf_swevent_read(struct perf_event *event)
8348{
8349}
8350
8351static int perf_swevent_add(struct perf_event *event, int flags)
8352{
8353        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8354        struct hw_perf_event *hwc = &event->hw;
8355        struct hlist_head *head;
8356
8357        if (is_sampling_event(event)) {
8358                hwc->last_period = hwc->sample_period;
8359                perf_swevent_set_period(event);
8360        }
8361
8362        hwc->state = !(flags & PERF_EF_START);
8363
8364        head = find_swevent_head(swhash, event);
8365        if (WARN_ON_ONCE(!head))
8366                return -EINVAL;
8367
8368        hlist_add_head_rcu(&event->hlist_entry, head);
8369        perf_event_update_userpage(event);
8370
8371        return 0;
8372}
8373
8374static void perf_swevent_del(struct perf_event *event, int flags)
8375{
8376        hlist_del_rcu(&event->hlist_entry);
8377}
8378
8379static void perf_swevent_start(struct perf_event *event, int flags)
8380{
8381        event->hw.state = 0;
8382}
8383
8384static void perf_swevent_stop(struct perf_event *event, int flags)
8385{
8386        event->hw.state = PERF_HES_STOPPED;
8387}
8388
8389/* Deref the hlist from the update side */
8390static inline struct swevent_hlist *
8391swevent_hlist_deref(struct swevent_htable *swhash)
8392{
8393        return rcu_dereference_protected(swhash->swevent_hlist,
8394                                         lockdep_is_held(&swhash->hlist_mutex));
8395}
8396
8397static void swevent_hlist_release(struct swevent_htable *swhash)
8398{
8399        struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
8400
8401        if (!hlist)
8402                return;
8403
8404        RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
8405        kfree_rcu(hlist, rcu_head);
8406}
8407
8408static void swevent_hlist_put_cpu(int cpu)
8409{
8410        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8411
8412        mutex_lock(&swhash->hlist_mutex);
8413
8414        if (!--swhash->hlist_refcount)
8415                swevent_hlist_release(swhash);
8416
8417        mutex_unlock(&swhash->hlist_mutex);
8418}
8419
8420static void swevent_hlist_put(void)
8421{
8422        int cpu;
8423
8424        for_each_possible_cpu(cpu)
8425                swevent_hlist_put_cpu(cpu);
8426}
8427
8428static int swevent_hlist_get_cpu(int cpu)
8429{
8430        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8431        int err = 0;
8432
8433        mutex_lock(&swhash->hlist_mutex);
8434        if (!swevent_hlist_deref(swhash) &&
8435            cpumask_test_cpu(cpu, perf_online_mask)) {
8436                struct swevent_hlist *hlist;
8437
8438                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
8439                if (!hlist) {
8440                        err = -ENOMEM;
8441                        goto exit;
8442                }
8443                rcu_assign_pointer(swhash->swevent_hlist, hlist);
8444        }
8445        swhash->hlist_refcount++;
8446exit:
8447        mutex_unlock(&swhash->hlist_mutex);
8448
8449        return err;
8450}
8451
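    /*
     * Take a reference on the hlist of every possible CPU; on failure, drop
     * the references already taken on the CPUs processed so far.
     */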
8452static int swevent_hlist_get(void)
8453{
8454        int err, cpu, failed_cpu;
8455
8456        mutex_lock(&pmus_lock);
8457        for_each_possible_cpu(cpu) {
8458                err = swevent_hlist_get_cpu(cpu);
8459                if (err) {
8460                        failed_cpu = cpu;
8461                        goto fail;
8462                }
8463        }
8464        mutex_unlock(&pmus_lock);
8465        return 0;
8466fail:
8467        for_each_possible_cpu(cpu) {
8468                if (cpu == failed_cpu)
8469                        break;
8470                swevent_hlist_put_cpu(cpu);
8471        }
8472        mutex_unlock(&pmus_lock);
8473        return err;
8474}
8475
8476struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
8477
8478static void sw_perf_event_destroy(struct perf_event *event)
8479{
8480        u64 event_id = event->attr.config;
8481
8482        WARN_ON(event->parent);
8483
8484        static_key_slow_dec(&perf_swevent_enabled[event_id]);
8485        swevent_hlist_put();
8486}
8487
8488static int perf_swevent_init(struct perf_event *event)
8489{
8490        u64 event_id = event->attr.config;
8491
8492        if (event->attr.type != PERF_TYPE_SOFTWARE)
8493                return -ENOENT;
8494
8495        /*
8496         * no branch sampling for software events
8497         */
8498        if (has_branch_stack(event))
8499                return -EOPNOTSUPP;
8500
8501        switch (event_id) {
8502        case PERF_COUNT_SW_CPU_CLOCK:
8503        case PERF_COUNT_SW_TASK_CLOCK:
8504                return -ENOENT;
8505
8506        default:
8507                break;
8508        }
8509
8510        if (event_id >= PERF_COUNT_SW_MAX)
8511                return -ENOENT;
8512
8513        if (!event->parent) {
8514                int err;
8515
8516                err = swevent_hlist_get();
8517                if (err)
8518                        return err;
8519
8520                static_key_slow_inc(&perf_swevent_enabled[event_id]);
8521                event->destroy = sw_perf_event_destroy;
8522        }
8523
8524        return 0;
8525}
8526
8527static struct pmu perf_swevent = {
8528        .task_ctx_nr    = perf_sw_context,
8529
8530        .capabilities   = PERF_PMU_CAP_NO_NMI,
8531
8532        .event_init     = perf_swevent_init,
8533        .add            = perf_swevent_add,
8534        .del            = perf_swevent_del,
8535        .start          = perf_swevent_start,
8536        .stop           = perf_swevent_stop,
8537        .read           = perf_swevent_read,
8538};
8539
8540#ifdef CONFIG_EVENT_TRACING
8541
8542static int perf_tp_filter_match(struct perf_event *event,
8543                                struct perf_sample_data *data)
8544{
8545        void *record = data->raw->frag.data;
8546
8547        /* only top level events have filters set */
8548        if (event->parent)
8549                event = event->parent;
8550
8551        if (likely(!event->filter) || filter_match_preds(event->filter, record))
8552                return 1;
8553        return 0;
8554}
8555
8556static int perf_tp_event_match(struct perf_event *event,
8557                                struct perf_sample_data *data,
8558                                struct pt_regs *regs)
8559{
8560        if (event->hw.state & PERF_HES_STOPPED)
8561                return 0;
8562        /*
8563         * If exclude_kernel, only trace user-space tracepoints (uprobes)
8564         */
8565        if (event->attr.exclude_kernel && !user_mode(regs))
8566                return 0;
8567
8568        if (!perf_tp_filter_match(event, data))
8569                return 0;
8570
8571        return 1;
8572}
8573
8574void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
8575                               struct trace_event_call *call, u64 count,
8576                               struct pt_regs *regs, struct hlist_head *head,
8577                               struct task_struct *task)
8578{
8579        if (bpf_prog_array_valid(call)) {
8580                *(struct pt_regs **)raw_data = regs;
8581                if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
8582                        perf_swevent_put_recursion_context(rctx);
8583                        return;
8584                }
8585        }
8586        perf_tp_event(call->event.type, count, raw_data, size, regs, head,
8587                      rctx, task);
8588}
8589EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
8590
8591void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
8592                   struct pt_regs *regs, struct hlist_head *head, int rctx,
8593                   struct task_struct *task)
8594{
8595        struct perf_sample_data data;
8596        struct perf_event *event;
8597
8598        struct perf_raw_record raw = {
8599                .frag = {
8600                        .size = entry_size,
8601                        .data = record,
8602                },
8603        };
8604
8605        perf_sample_data_init(&data, 0, 0);
8606        data.raw = &raw;
8607
8608        perf_trace_buf_update(record, event_type);
8609
8610        hlist_for_each_entry_rcu(event, head, hlist_entry) {
8611                if (perf_tp_event_match(event, &data, regs))
8612                        perf_swevent_event(event, count, &data, regs);
8613        }
8614
8615        /*
8616         * If we were given a target task, also iterate its context and
8617         * deliver this event there too.
8618         */
8619        if (task && task != current) {
8620                struct perf_event_context *ctx;
8621                struct trace_entry *entry = record;
8622
8623                rcu_read_lock();
8624                ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
8625                if (!ctx)
8626                        goto unlock;
8627
8628                list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
8629                        if (event->cpu != smp_processor_id())
8630                                continue;
8631                        if (event->attr.type != PERF_TYPE_TRACEPOINT)
8632                                continue;
8633                        if (event->attr.config != entry->type)
8634                                continue;
8635                        if (perf_tp_event_match(event, &data, regs))
8636                                perf_swevent_event(event, count, &data, regs);
8637                }
8638unlock:
8639                rcu_read_unlock();
8640        }
8641
8642        perf_swevent_put_recursion_context(rctx);
8643}
8644EXPORT_SYMBOL_GPL(perf_tp_event);
8645
8646static void tp_perf_event_destroy(struct perf_event *event)
8647{
8648        perf_trace_destroy(event);
8649}
8650
8651static int perf_tp_event_init(struct perf_event *event)
8652{
8653        int err;
8654
8655        if (event->attr.type != PERF_TYPE_TRACEPOINT)
8656                return -ENOENT;
8657
8658        /*
8659         * no branch sampling for tracepoint events
8660         */
8661        if (has_branch_stack(event))
8662                return -EOPNOTSUPP;
8663
8664        err = perf_trace_init(event);
8665        if (err)
8666                return err;
8667
8668        event->destroy = tp_perf_event_destroy;
8669
8670        return 0;
8671}
8672
8673static struct pmu perf_tracepoint = {
8674        .task_ctx_nr    = perf_sw_context,
8675
8676        .event_init     = perf_tp_event_init,
8677        .add            = perf_trace_add,
8678        .del            = perf_trace_del,
8679        .start          = perf_swevent_start,
8680        .stop           = perf_swevent_stop,
8681        .read           = perf_swevent_read,
8682};
8683
8684#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
8685/*
8686 * Flags in config, used by the dynamic kprobe and uprobe PMUs.
8687 * The flags should match the following PMU_FORMAT_ATTR().
8688 *
8689 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
8690 *                               if not set, create kprobe/uprobe
8691 *
8692 * The following values specify a reference counter (or semaphore, in the
8693 * terminology of tools like dtrace, systemtap, etc.) for Userspace Statically
8694 * Defined Tracepoints (USDT). Currently, 32 bits of config hold the offset.
8695 *
8696 * PERF_UPROBE_REF_CTR_OFFSET_BITS      # of bits in config as the offset
8697 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT     # of bits to shift left
8698 */
8699enum perf_probe_config {
8700        PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
8701        PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
8702        PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
8703};
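    /*
     * Illustrative config encoding (hypothetical offset value): to request a
     * uretprobe with a USDT reference counter at file offset 0x1234,
     * userspace would set
     *
     *   attr.config = ((__u64)0x1234 << PERF_UPROBE_REF_CTR_OFFSET_SHIFT) |
     *                 PERF_PROBE_CONFIG_IS_RETPROBE;
     *
     * which perf_uprobe_event_init() below decodes with the same shift and
     * flag bit.
     */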
8704
8705PMU_FORMAT_ATTR(retprobe, "config:0");
8706#endif
8707
8708#ifdef CONFIG_KPROBE_EVENTS
8709static struct attribute *kprobe_attrs[] = {
8710        &format_attr_retprobe.attr,
8711        NULL,
8712};
8713
8714static struct attribute_group kprobe_format_group = {
8715        .name = "format",
8716        .attrs = kprobe_attrs,
8717};
8718
8719static const struct attribute_group *kprobe_attr_groups[] = {
8720        &kprobe_format_group,
8721        NULL,
8722};
8723
8724static int perf_kprobe_event_init(struct perf_event *event);
8725static struct pmu perf_kprobe = {
8726        .task_ctx_nr    = perf_sw_context,
8727        .event_init     = perf_kprobe_event_init,
8728        .add            = perf_trace_add,
8729        .del            = perf_trace_del,
8730        .start          = perf_swevent_start,
8731        .stop           = perf_swevent_stop,
8732        .read           = perf_swevent_read,
8733        .attr_groups    = kprobe_attr_groups,
8734};
8735
8736static int perf_kprobe_event_init(struct perf_event *event)
8737{
8738        int err;
8739        bool is_retprobe;
8740
8741        if (event->attr.type != perf_kprobe.type)
8742                return -ENOENT;
8743
8744        if (!capable(CAP_SYS_ADMIN))
8745                return -EACCES;
8746
8747        /*
8748         * no branch sampling for probe events
8749         */
8750        if (has_branch_stack(event))
8751                return -EOPNOTSUPP;
8752
8753        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8754        err = perf_kprobe_init(event, is_retprobe);
8755        if (err)
8756                return err;
8757
8758        event->destroy = perf_kprobe_destroy;
8759
8760        return 0;
8761}
8762#endif /* CONFIG_KPROBE_EVENTS */
8763
8764#ifdef CONFIG_UPROBE_EVENTS
8765PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
8766
8767static struct attribute *uprobe_attrs[] = {
8768        &format_attr_retprobe.attr,
8769        &format_attr_ref_ctr_offset.attr,
8770        NULL,
8771};
8772
8773static struct attribute_group uprobe_format_group = {
8774        .name = "format",
8775        .attrs = uprobe_attrs,
8776};
8777
8778static const struct attribute_group *uprobe_attr_groups[] = {
8779        &uprobe_format_group,
8780        NULL,
8781};
8782
8783static int perf_uprobe_event_init(struct perf_event *event);
8784static struct pmu perf_uprobe = {
8785        .task_ctx_nr    = perf_sw_context,
8786        .event_init     = perf_uprobe_event_init,
8787        .add            = perf_trace_add,
8788        .del            = perf_trace_del,
8789        .start          = perf_swevent_start,
8790        .stop           = perf_swevent_stop,
8791        .read           = perf_swevent_read,
8792        .attr_groups    = uprobe_attr_groups,
8793};
8794
8795static int perf_uprobe_event_init(struct perf_event *event)
8796{
8797        int err;
8798        unsigned long ref_ctr_offset;
8799        bool is_retprobe;
8800
8801        if (event->attr.type != perf_uprobe.type)
8802                return -ENOENT;
8803
8804        if (!capable(CAP_SYS_ADMIN))
8805                return -EACCES;
8806
8807        /*
8808         * no branch sampling for probe events
8809         */
8810        if (has_branch_stack(event))
8811                return -EOPNOTSUPP;
8812
8813        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8814        ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
8815        err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
8816        if (err)
8817                return err;
8818
8819        event->destroy = perf_uprobe_destroy;
8820
8821        return 0;
8822}
8823#endif /* CONFIG_UPROBE_EVENTS */
8824
8825static inline void perf_tp_register(void)
8826{
8827        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
8828#ifdef CONFIG_KPROBE_EVENTS
8829        perf_pmu_register(&perf_kprobe, "kprobe", -1);
8830#endif
8831#ifdef CONFIG_UPROBE_EVENTS
8832        perf_pmu_register(&perf_uprobe, "uprobe", -1);
8833#endif
8834}
8835
8836static void perf_event_free_filter(struct perf_event *event)
8837{
8838        ftrace_profile_free_filter(event);
8839}
8840
8841#ifdef CONFIG_BPF_SYSCALL
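    /*
     * Called via event->overflow_handler when a BPF program is attached to a
     * non-tracing event: a zero return from the program suppresses the
     * original overflow handler, a non-zero return chains to it.
     */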
8842static void bpf_overflow_handler(struct perf_event *event,
8843                                 struct perf_sample_data *data,
8844                                 struct pt_regs *regs)
8845{
8846        struct bpf_perf_event_data_kern ctx = {
8847                .data = data,
8848                .event = event,
8849        };
8850        int ret = 0;
8851
8852        ctx.regs = perf_arch_bpf_user_pt_regs(regs);
8853        preempt_disable();
8854        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
8855                goto out;
8856        rcu_read_lock();
8857        ret = BPF_PROG_RUN(event->prog, &ctx);
8858        rcu_read_unlock();
8859out:
8860        __this_cpu_dec(bpf_prog_active);
8861        preempt_enable();
8862        if (!ret)
8863                return;
8864
8865        event->orig_overflow_handler(event, data, regs);
8866}
8867
8868static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8869{
8870        struct bpf_prog *prog;
8871
8872        if (event->overflow_handler_context)
8873                /* hw breakpoint or kernel counter */
8874                return -EINVAL;
8875
8876        if (event->prog)
8877                return -EEXIST;
8878
8879        prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
8880        if (IS_ERR(prog))
8881                return PTR_ERR(prog);
8882
8883        event->prog = prog;
8884        event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
8885        WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
8886        return 0;
8887}
8888
8889static void perf_event_free_bpf_handler(struct perf_event *event)
8890{
8891        struct bpf_prog *prog = event->prog;
8892
8893        if (!prog)
8894                return;
8895
8896        WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
8897        event->prog = NULL;
8898        bpf_prog_put(prog);
8899}
8900#else
8901static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8902{
8903        return -EOPNOTSUPP;
8904}
8905static void perf_event_free_bpf_handler(struct perf_event *event)
8906{
8907}
8908#endif
8909
8910/*
8911 * returns true if the event is a tracepoint, or a kprobe/uprobe created
8912 * with perf_event_open()
8913 */
8914static inline bool perf_event_is_tracing(struct perf_event *event)
8915{
8916        if (event->pmu == &perf_tracepoint)
8917                return true;
8918#ifdef CONFIG_KPROBE_EVENTS
8919        if (event->pmu == &perf_kprobe)
8920                return true;
8921#endif
8922#ifdef CONFIG_UPROBE_EVENTS
8923        if (event->pmu == &perf_uprobe)
8924                return true;
8925#endif
8926        return false;
8927}
8928
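    /*
     * For tracing events (tracepoint, kprobe, uprobe) the program is
     * attached to the trace event itself; for any other event type it is
     * installed as an overflow handler via perf_event_set_bpf_handler().
     */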
8929static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8930{
8931        bool is_kprobe, is_tracepoint, is_syscall_tp;
8932        struct bpf_prog *prog;
8933        int ret;
8934
8935        if (!perf_event_is_tracing(event))
8936                return perf_event_set_bpf_handler(event, prog_fd);
8937
8938        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
8939        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
8940        is_syscall_tp = is_syscall_trace_event(event->tp_event);
8941        if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
8942                /* bpf programs can only be attached to u/kprobe or tracepoint */
8943                return -EINVAL;
8944
8945        prog = bpf_prog_get(prog_fd);
8946        if (IS_ERR(prog))
8947                return PTR_ERR(prog);
8948
8949        if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
8950            (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
8951            (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
8952                /* valid fd, but invalid bpf program type */
8953                bpf_prog_put(prog);
8954                return -EINVAL;
8955        }
8956
8957        /* Kprobe override only works for kprobes, not uprobes. */
8958        if (prog->kprobe_override &&
8959            !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
8960                bpf_prog_put(prog);
8961                return -EINVAL;
8962        }
8963
8964        if (is_tracepoint || is_syscall_tp) {
8965                int off = trace_event_get_offsets(event->tp_event);
8966
8967                if (prog->aux->max_ctx_offset > off) {
8968                        bpf_prog_put(prog);
8969                        return -EACCES;
8970                }
8971        }
8972
8973        ret = perf_event_attach_bpf_prog(event, prog);
8974        if (ret)
8975                bpf_prog_put(prog);
8976        return ret;
8977}
8978
8979static void perf_event_free_bpf_prog(struct perf_event *event)
8980{
8981        if (!perf_event_is_tracing(event)) {
8982                perf_event_free_bpf_handler(event);
8983                return;
8984        }
8985        perf_event_detach_bpf_prog(event);
8986}
8987
8988#else
8989
8990static inline void perf_tp_register(void)
8991{
8992}
8993
8994static void perf_event_free_filter(struct perf_event *event)
8995{
8996}
8997
8998static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8999{
9000        return -ENOENT;
9001}
9002
9003static void perf_event_free_bpf_prog(struct perf_event *event)
9004{
9005}
9006#endif /* CONFIG_EVENT_TRACING */
9007
9008#ifdef CONFIG_HAVE_HW_BREAKPOINT
9009void perf_bp_event(struct perf_event *bp, void *data)
9010{
9011        struct perf_sample_data sample;
9012        struct pt_regs *regs = data;
9013
9014        perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
9015
9016        if (!bp->hw.state && !perf_exclude_event(bp, regs))
9017                perf_swevent_event(bp, 1, &sample, regs);
9018}
9019#endif
9020
9021/*
9022 * Allocate a new address filter
9023 */
9024static struct perf_addr_filter *
9025perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
9026{
9027        int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
9028        struct perf_addr_filter *filter;
9029
9030        filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
9031        if (!filter)
9032                return NULL;
9033
9034        INIT_LIST_HEAD(&filter->entry);
9035        list_add_tail(&filter->entry, filters);
9036
9037        return filter;
9038}
9039
9040static void free_filters_list(struct list_head *filters)
9041{
9042        struct perf_addr_filter *filter, *iter;
9043
9044        list_for_each_entry_safe(filter, iter, filters, entry) {
9045                path_put(&filter->path);
9046                list_del(&filter->entry);
9047                kfree(filter);
9048        }
9049}
9050
9051/*
9052 * Free existing address filters and optionally install new ones
9053 */
9054static void perf_addr_filters_splice(struct perf_event *event,
9055                                     struct list_head *head)
9056{
9057        unsigned long flags;
9058        LIST_HEAD(list);
9059
9060        if (!has_addr_filter(event))
9061                return;
9062
9063        /* don't bother with children, they don't have their own filters */
9064        if (event->parent)
9065                return;
9066
9067        raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
9068
9069        list_splice_init(&event->addr_filters.list, &list);
9070        if (head)
9071                list_splice(head, &event->addr_filters.list);
9072
9073        raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
9074
9075        free_filters_list(&list);
9076}
9077
9078/*
9079 * Scan through mm's vmas and see if one of them matches the
9080 * @filter; if so, adjust filter's address range.
9081 * Called with mm::mmap_sem down for reading.
9082 */
9083static void perf_addr_filter_apply(struct perf_addr_filter *filter,
9084                                   struct mm_struct *mm,
9085                                   struct perf_addr_filter_range *fr)
9086{
9087        struct vm_area_struct *vma;
9088
9089        for (vma = mm->mmap; vma; vma = vma->vm_next) {
9090                if (!vma->vm_file)
9091                        continue;
9092
9093                if (perf_addr_filter_vma_adjust(filter, vma, fr))
9094                        return;
9095        }
9096}
9097
9098/*
9099 * Update event's address range filters based on the
9100 * task's existing mappings, if any.
9101 */
9102static void perf_event_addr_filters_apply(struct perf_event *event)
9103{
9104        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
9105        struct task_struct *task = READ_ONCE(event->ctx->task);
9106        struct perf_addr_filter *filter;
9107        struct mm_struct *mm = NULL;
9108        unsigned int count = 0;
9109        unsigned long flags;
9110
9111        /*
9112         * We may observe TASK_TOMBSTONE, which means that the event tear-down
9113         * will stop on the parent's child_mutex, which our caller is also holding.
9114         */
9115        if (task == TASK_TOMBSTONE)
9116                return;
9117
9118        if (ifh->nr_file_filters) {
9119                mm = get_task_mm(event->ctx->task);
9120                if (!mm)
9121                        goto restart;
9122
9123                down_read(&mm->mmap_sem);
9124        }
9125
9126        raw_spin_lock_irqsave(&ifh->lock, flags);
9127        list_for_each_entry(filter, &ifh->list, entry) {
9128                if (filter->path.dentry) {
9129                        /*
9130                         * Adjust base offset if the filter is associated with a
9131                         * binary that needs to be mapped:
9132                         */
9133                        event->addr_filter_ranges[count].start = 0;
9134                        event->addr_filter_ranges[count].size = 0;
9135
9136                        perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
9137                } else {
9138                        event->addr_filter_ranges[count].start = filter->offset;
9139                        event->addr_filter_ranges[count].size  = filter->size;
9140                }
9141
9142                count++;
9143        }
9144
9145        event->addr_filters_gen++;
9146        raw_spin_unlock_irqrestore(&ifh->lock, flags);
9147
9148        if (ifh->nr_file_filters) {
9149                up_read(&mm->mmap_sem);
9150
9151                mmput(mm);
9152        }
9153
9154restart:
9155        perf_event_stop(event, 1);
9156}
9157
9158/*
9159 * Address range filtering: limiting the data to certain
9160 * instruction address ranges. Filters are ioctl()ed to us from
9161 * userspace as ASCII strings.
9162 *
9163 * Filter string format:
9164 *
9165 * ACTION RANGE_SPEC
9166 * where ACTION is one of the
9167 *  * "filter": limit the trace to this region
9168 *  * "start": start tracing from this address
9169 *  * "stop": stop tracing at this address/region;
9170 * RANGE_SPEC is
9171 *  * for kernel addresses: <start address>[/<size>]
9172 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
9173 *
9174 * if <size> is not specified or is zero, the range is treated as a single
9175 * address; not valid for ACTION=="filter".
9176 */
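    /*
     * Illustrative filter strings (hypothetical addresses and path):
     *
     *   filter 0x400000/0x1000@/usr/bin/foo  - trace this range of the object
     *   start 0xffff800010000000             - start tracing at a kernel address
     *   stop 0xffff800010020000/0x100        - stop tracing in this kernel region
     *
     * Several filters may be supplied at once; the parser below splits the
     * string on spaces, commas and newlines.
     */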
9177enum {
9178        IF_ACT_NONE = -1,
9179        IF_ACT_FILTER,
9180        IF_ACT_START,
9181        IF_ACT_STOP,
9182        IF_SRC_FILE,
9183        IF_SRC_KERNEL,
9184        IF_SRC_FILEADDR,
9185        IF_SRC_KERNELADDR,
9186};
9187
9188enum {
9189        IF_STATE_ACTION = 0,
9190        IF_STATE_SOURCE,
9191        IF_STATE_END,
9192};
9193
9194static const match_table_t if_tokens = {
9195        { IF_ACT_FILTER,        "filter" },
9196        { IF_ACT_START,         "start" },
9197        { IF_ACT_STOP,          "stop" },
9198        { IF_SRC_FILE,          "%u/%u@%s" },
9199        { IF_SRC_KERNEL,        "%u/%u" },
9200        { IF_SRC_FILEADDR,      "%u@%s" },
9201        { IF_SRC_KERNELADDR,    "%u" },
9202        { IF_ACT_NONE,          NULL },
9203};
9204
9205/*
9206 * Address filter string parser
9207 */
9208static int
9209perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
9210                             struct list_head *filters)
9211{
9212        struct perf_addr_filter *filter = NULL;
9213        char *start, *orig, *filename = NULL;
9214        substring_t args[MAX_OPT_ARGS];
9215        int state = IF_STATE_ACTION, token;
9216        unsigned int kernel = 0;
9217        int ret = -EINVAL;
9218
9219        orig = fstr = kstrdup(fstr, GFP_KERNEL);
9220        if (!fstr)
9221                return -ENOMEM;
9222
9223        while ((start = strsep(&fstr, " ,\n")) != NULL) {
9224                static const enum perf_addr_filter_action_t actions[] = {
9225                        [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
9226                        [IF_ACT_START]  = PERF_ADDR_FILTER_ACTION_START,
9227                        [IF_ACT_STOP]   = PERF_ADDR_FILTER_ACTION_STOP,
9228                };
9229                ret = -EINVAL;
9230
9231                if (!*start)
9232                        continue;
9233
9234                /* filter definition begins */
9235                if (state == IF_STATE_ACTION) {
9236                        filter = perf_addr_filter_new(event, filters);
9237                        if (!filter)
9238                                goto fail;
9239                }
9240
9241                token = match_token(start, if_tokens, args);
9242                switch (token) {
9243                case IF_ACT_FILTER:
9244                case IF_ACT_START:
9245                case IF_ACT_STOP:
9246                        if (state != IF_STATE_ACTION)
9247                                goto fail;
9248
9249                        filter->action = actions[token];
9250                        state = IF_STATE_SOURCE;
9251                        break;
9252
9253                case IF_SRC_KERNELADDR:
9254                case IF_SRC_KERNEL:
9255                        kernel = 1;
9256                        /* fall through */
9257
9258                case IF_SRC_FILEADDR:
9259                case IF_SRC_FILE:
9260                        if (state != IF_STATE_SOURCE)
9261                                goto fail;
9262
9263                        *args[0].to = 0;
9264                        ret = kstrtoul(args[0].from, 0, &filter->offset);
9265                        if (ret)
9266                                goto fail;
9267
9268                        if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
9269                                *args[1].to = 0;
9270                                ret = kstrtoul(args[1].from, 0, &filter->size);
9271                                if (ret)
9272                                        goto fail;
9273                        }
9274
9275                        if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
9276                                int fpos = token == IF_SRC_FILE ? 2 : 1;
9277
9278                                filename = match_strdup(&args[fpos]);
9279                                if (!filename) {
9280                                        ret = -ENOMEM;
9281                                        goto fail;
9282                                }
9283                        }
9284
9285                        state = IF_STATE_END;
9286                        break;
9287
9288                default:
9289                        goto fail;
9290                }
9291
9292                /*
9293                 * Filter definition is fully parsed; validate and install it.
9294                 * Make sure that it doesn't contradict itself or the event's
9295                 * attribute.
9296                 */
9297                if (state == IF_STATE_END) {
9298                        ret = -EINVAL;
9299                        if (kernel && event->attr.exclude_kernel)
9300                                goto fail;
9301
9302                        /*
9303                         * ACTION "filter" must have a non-zero length region
9304                         * specified.
9305                         */
9306                        if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
9307                            !filter->size)
9308                                goto fail;
9309
9310                        if (!kernel) {
9311                                if (!filename)
9312                                        goto fail;
9313
9314                                /*
9315                                 * For now, we only support file-based filters
9316                                 * in per-task events; doing so for CPU-wide
9317                                 * events requires additional context switching
9318                                 * trickery, since same object code will be
9319                                 * mapped at different virtual addresses in
9320                                 * different processes.
9321                                 */
9322                                ret = -EOPNOTSUPP;
9323                                if (!event->ctx->task)
9324                                        goto fail_free_name;
9325
9326                                /* look up the path and grab its inode */
9327                                ret = kern_path(filename, LOOKUP_FOLLOW,
9328                                                &filter->path);
9329                                if (ret)
9330                                        goto fail_free_name;
9331
9332                                kfree(filename);
9333                                filename = NULL;
9334
9335                                ret = -EINVAL;
9336                                if (!filter->path.dentry ||
9337                                    !S_ISREG(d_inode(filter->path.dentry)
9338                                             ->i_mode))
9339                                        goto fail;
9340
9341                                event->addr_filters.nr_file_filters++;
9342                        }
9343
9344                        /* ready to consume more filters */
9345                        state = IF_STATE_ACTION;
9346                        filter = NULL;
9347                }
9348        }
9349
9350        if (state != IF_STATE_ACTION)
9351                goto fail;
9352
9353        kfree(orig);
9354
9355        return 0;
9356
9357fail_free_name:
9358        kfree(filename);
9359fail:
9360        free_filters_list(filters);
9361        kfree(orig);
9362
9363        return ret;
9364}
9365
9366static int
9367perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
9368{
9369        LIST_HEAD(filters);
9370        int ret;
9371
9372        /*
9373         * Since this is called in perf_ioctl() path, we're already holding
9374         * ctx::mutex.
9375         */
9376        lockdep_assert_held(&event->ctx->mutex);
9377
9378        if (WARN_ON_ONCE(event->parent))
9379                return -EINVAL;
9380
9381        ret = perf_event_parse_addr_filter(event, filter_str, &filters);
9382        if (ret)
9383                goto fail_clear_files;
9384
9385        ret = event->pmu->addr_filters_validate(&filters);
9386        if (ret)
9387                goto fail_free_filters;
9388
9389        /* remove existing filters, if any */
9390        perf_addr_filters_splice(event, &filters);
9391
9392        /* install new filters */
9393        perf_event_for_each_child(event, perf_event_addr_filters_apply);
9394
9395        return ret;
9396
9397fail_free_filters:
9398        free_filters_list(&filters);
9399
9400fail_clear_files:
9401        event->addr_filters.nr_file_filters = 0;
9402
9403        return ret;
9404}
9405
9406static int perf_event_set_filter(struct perf_event *event, void __user *arg)
9407{
9408        int ret = -EINVAL;
9409        char *filter_str;
9410
9411        filter_str = strndup_user(arg, PAGE_SIZE);
9412        if (IS_ERR(filter_str))
9413                return PTR_ERR(filter_str);
9414
9415#ifdef CONFIG_EVENT_TRACING
9416        if (perf_event_is_tracing(event)) {
9417                struct perf_event_context *ctx = event->ctx;
9418
9419                /*
9420                 * Beware, here be dragons!!
9421                 *
9422                 * the tracepoint muck will deadlock against ctx->mutex, but
9423                 * the tracepoint stuff does not actually need it. So
9424                 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
9425                 * already have a reference on ctx.
9426                 *
9427                 * This can result in event getting moved to a different ctx,
9428                 * but that does not affect the tracepoint state.
9429                 */
9430                mutex_unlock(&ctx->mutex);
9431                ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
9432                mutex_lock(&ctx->mutex);
9433        } else
9434#endif
9435        if (has_addr_filter(event))
9436                ret = perf_event_set_addr_filter(event, filter_str);
9437
9438        kfree(filter_str);
9439        return ret;
9440}
9441
9442/*
9443 * hrtimer based swevent callback
9444 */
9445
9446static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
9447{
9448        enum hrtimer_restart ret = HRTIMER_RESTART;
9449        struct perf_sample_data data;
9450        struct pt_regs *regs;
9451        struct perf_event *event;
9452        u64 period;
9453
9454        event = container_of(hrtimer, struct perf_event, hw.hrtimer);
9455
9456        if (event->state != PERF_EVENT_STATE_ACTIVE)
9457                return HRTIMER_NORESTART;
9458
9459        event->pmu->read(event);
9460
9461        perf_sample_data_init(&data, 0, event->hw.last_period);
9462        regs = get_irq_regs();
9463
9464        if (regs && !perf_exclude_event(event, regs)) {
9465                if (!(event->attr.exclude_idle && is_idle_task(current)))
9466                        if (__perf_event_overflow(event, 1, &data, regs))
9467                                ret = HRTIMER_NORESTART;
9468        }
9469
9470        period = max_t(u64, 10000, event->hw.sample_period);
9471        hrtimer_forward_now(hrtimer, ns_to_ktime(period));
9472
9473        return ret;
9474}
9475
9476static void perf_swevent_start_hrtimer(struct perf_event *event)
9477{
9478        struct hw_perf_event *hwc = &event->hw;
9479        s64 period;
9480
9481        if (!is_sampling_event(event))
9482                return;
9483
9484        period = local64_read(&hwc->period_left);
9485        if (period) {
9486                if (period < 0)
9487                        period = 10000;
9488
9489                local64_set(&hwc->period_left, 0);
9490        } else {
9491                period = max_t(u64, 10000, hwc->sample_period);
9492        }
9493        hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
9494                      HRTIMER_MODE_REL_PINNED);
9495}
9496
9497static void perf_swevent_cancel_hrtimer(struct perf_event *event)
9498{
9499        struct hw_perf_event *hwc = &event->hw;
9500
9501        if (is_sampling_event(event)) {
9502                ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
9503                local64_set(&hwc->period_left, ktime_to_ns(remaining));
9504
9505                hrtimer_cancel(&hwc->hrtimer);
9506        }
9507}
9508
9509static void perf_swevent_init_hrtimer(struct perf_event *event)
9510{
9511        struct hw_perf_event *hwc = &event->hw;
9512
9513        if (!is_sampling_event(event))
9514                return;
9515
9516        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
9517        hwc->hrtimer.function = perf_swevent_hrtimer;
9518
9519        /*
9520         * Since hrtimers have a fixed rate, we can do a static freq->period
9521         * mapping and avoid the whole period adjust feedback stuff.
9522         */
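            /*
             * e.g. sample_freq = 1000 Hz becomes a fixed period of
             * NSEC_PER_SEC / 1000 = 1,000,000 ns between samples.
             */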
9523        if (event->attr.freq) {
9524                long freq = event->attr.sample_freq;
9525
9526                event->attr.sample_period = NSEC_PER_SEC / freq;
9527                hwc->sample_period = event->attr.sample_period;
9528                local64_set(&hwc->period_left, hwc->sample_period);
9529                hwc->last_period = hwc->sample_period;
9530                event->attr.freq = 0;
9531        }
9532}
9533
9534/*
9535 * Software event: cpu wall time clock
9536 */
9537
9538static void cpu_clock_event_update(struct perf_event *event)
9539{
9540        s64 prev;
9541        u64 now;
9542
9543        now = local_clock();
9544        prev = local64_xchg(&event->hw.prev_count, now);
9545        local64_add(now - prev, &event->count);
9546}
9547
9548static void cpu_clock_event_start(struct perf_event *event, int flags)
9549{
9550        local64_set(&event->hw.prev_count, local_clock());
9551        perf_swevent_start_hrtimer(event);
9552}
9553
9554static void cpu_clock_event_stop(struct perf_event *event, int flags)
9555{
9556        perf_swevent_cancel_hrtimer(event);
9557        cpu_clock_event_update(event);
9558}
9559
9560static int cpu_clock_event_add(struct perf_event *event, int flags)
9561{
9562        if (flags & PERF_EF_START)
9563                cpu_clock_event_start(event, flags);
9564        perf_event_update_userpage(event);
9565
9566        return 0;
9567}
9568
9569static void cpu_clock_event_del(struct perf_event *event, int flags)
9570{
9571        cpu_clock_event_stop(event, flags);
9572}
9573
9574static void cpu_clock_event_read(struct perf_event *event)
9575{
9576        cpu_clock_event_update(event);
9577}
9578
9579static int cpu_clock_event_init(struct perf_event *event)
9580{
9581        if (event->attr.type != PERF_TYPE_SOFTWARE)
9582                return -ENOENT;
9583
9584        if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
9585                return -ENOENT;
9586
9587        /*
9588         * no branch sampling for software events
9589         */
9590        if (has_branch_stack(event))
9591                return -EOPNOTSUPP;
9592
9593        perf_swevent_init_hrtimer(event);
9594
9595        return 0;
9596}
9597
9598static struct pmu perf_cpu_clock = {
9599        .task_ctx_nr    = perf_sw_context,
9600
9601        .capabilities   = PERF_PMU_CAP_NO_NMI,
9602
9603        .event_init     = cpu_clock_event_init,
9604        .add            = cpu_clock_event_add,
9605        .del            = cpu_clock_event_del,
9606        .start          = cpu_clock_event_start,
9607        .stop           = cpu_clock_event_stop,
9608        .read           = cpu_clock_event_read,
9609};
9610
9611/*
9612 * Software event: task time clock
9613 */
9614
9615static void task_clock_event_update(struct perf_event *event, u64 now)
9616{
9617        u64 prev;
9618        s64 delta;
9619
9620        prev = local64_xchg(&event->hw.prev_count, now);
9621        delta = now - prev;
9622        local64_add(delta, &event->count);
9623}
9624
9625static void task_clock_event_start(struct perf_event *event, int flags)
9626{
9627        local64_set(&event->hw.prev_count, event->ctx->time);
9628        perf_swevent_start_hrtimer(event);
9629}
9630
9631static void task_clock_event_stop(struct perf_event *event, int flags)
9632{
9633        perf_swevent_cancel_hrtimer(event);
9634        task_clock_event_update(event, event->ctx->time);
9635}
9636
9637static int task_clock_event_add(struct perf_event *event, int flags)
9638{
9639        if (flags & PERF_EF_START)
9640                task_clock_event_start(event, flags);
9641        perf_event_update_userpage(event);
9642
9643        return 0;
9644}
9645
9646static void task_clock_event_del(struct perf_event *event, int flags)
9647{
9648        task_clock_event_stop(event, PERF_EF_UPDATE);
9649}
9650
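    /*
     * ctx->time was last advanced at ctx->timestamp; extrapolate it forward
     * by the clock delta since then to obtain the current task clock value.
     */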
9651static void task_clock_event_read(struct perf_event *event)
9652{
9653        u64 now = perf_clock();
9654        u64 delta = now - event->ctx->timestamp;
9655        u64 time = event->ctx->time + delta;
9656
9657        task_clock_event_update(event, time);
9658}
9659
9660static int task_clock_event_init(struct perf_event *event)
9661{
9662        if (event->attr.type != PERF_TYPE_SOFTWARE)
9663                return -ENOENT;
9664
9665        if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
9666                return -ENOENT;
9667
9668        /*
9669         * no branch sampling for software events
9670         */
9671        if (has_branch_stack(event))
9672                return -EOPNOTSUPP;
9673
9674        perf_swevent_init_hrtimer(event);
9675
9676        return 0;
9677}
9678
9679static struct pmu perf_task_clock = {
9680        .task_ctx_nr    = perf_sw_context,
9681
9682        .capabilities   = PERF_PMU_CAP_NO_NMI,
9683
9684        .event_init     = task_clock_event_init,
9685        .add            = task_clock_event_add,
9686        .del            = task_clock_event_del,
9687        .start          = task_clock_event_start,
9688        .stop           = task_clock_event_stop,
9689        .read           = task_clock_event_read,
9690};
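/*
 * Editor's note: relative to the cpu-clock sketch above, only attr.config
 * changes to get a task-clock counter; the count then reflects time
 * accumulated while this task's context was scheduled in (ctx->time based)
 * rather than raw local_clock() deltas. Illustrative fragment only:
 */
#if 0	/* illustrative example, not compiled */
	attr.type   = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
#endif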
9691
9692static void perf_pmu_nop_void(struct pmu *pmu)
9693{
9694}
9695
9696static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
9697{
9698}
9699
9700static int perf_pmu_nop_int(struct pmu *pmu)
9701{
9702        return 0;
9703}
9704
9705static int perf_event_nop_int(struct perf_event *event, u64 value)
9706{
9707        return 0;
9708}
9709
9710static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
9711
9712static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
9713{
9714        __this_cpu_write(nop_txn_flags, flags);
9715
9716        if (flags & ~PERF_PMU_TXN_ADD)
9717                return;
9718
9719        perf_pmu_disable(pmu);
9720}
9721
9722static int perf_pmu_commit_txn(struct pmu *pmu)
9723{
9724        unsigned int flags = __this_cpu_read(nop_txn_flags);
9725
9726        __this_cpu_write(nop_txn_flags, 0);
9727
9728        if (flags & ~PERF_PMU_TXN_ADD)
9729                return 0;
9730
9731        perf_pmu_enable(pmu);
9732        return 0;
9733}
9734
9735static void perf_pmu_cancel_txn(struct pmu *pmu)
9736{
9737        unsigned int flags =  __this_cpu_read(nop_txn_flags);
9738
9739        __this_cpu_write(nop_txn_flags, 0);
9740
9741        if (flags & ~PERF_PMU_TXN_ADD)
9742                return;
9743
9744        perf_pmu_enable(pmu);
9745}
9746
9747static int perf_event_idx_default(struct perf_event *event)
9748{
9749        return 0;
9750}
9751
9752/*
9753 * Ensures all contexts with the same task_ctx_nr have the same
9754 * pmu_cpu_context too.
9755 */
9756static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
9757{
9758        struct pmu *pmu;
9759
9760        if (ctxn < 0)
9761                return NULL;
9762
9763        list_for_each_entry(pmu, &pmus, entry) {
9764                if (pmu->task_ctx_nr == ctxn)
9765                        return pmu->pmu_cpu_context;
9766        }
9767
9768        return NULL;
9769}
9770
9771static void free_pmu_context(struct pmu *pmu)
9772{
9773        /*
9774         * Static contexts such as perf_sw_context have a global lifetime
9775         * and may be shared between different PMUs. Avoid freeing them
9776         * when a single PMU is going away.
9777         */
9778        if (pmu->task_ctx_nr > perf_invalid_context)
9779                return;
9780
9781        free_percpu(pmu->pmu_cpu_context);
9782}
9783
9784/*
9785 * Let userspace know that this PMU supports address range filtering:
9786 */
9787static ssize_t nr_addr_filters_show(struct device *dev,
9788                                    struct device_attribute *attr,
9789                                    char *page)
9790{
9791        struct pmu *pmu = dev_get_drvdata(dev);
9792
9793        return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
9794}
9795DEVICE_ATTR_RO(nr_addr_filters);
9796
9797static struct idr pmu_idr;
9798
9799static ssize_t
9800type_show(struct device *dev, struct device_attribute *attr, char *page)
9801{
9802        struct pmu *pmu = dev_get_drvdata(dev);
9803
9804        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
9805}
9806static DEVICE_ATTR_RO(type);
9807
9808static ssize_t
9809perf_event_mux_interval_ms_show(struct device *dev,
9810                                struct device_attribute *attr,
9811                                char *page)
9812{
9813        struct pmu *pmu = dev_get_drvdata(dev);
9814
9815        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
9816}
9817
9818static DEFINE_MUTEX(mux_interval_mutex);
9819
9820static ssize_t
9821perf_event_mux_interval_ms_store(struct device *dev,
9822                                 struct device_attribute *attr,
9823                                 const char *buf, size_t count)
9824{
9825        struct pmu *pmu = dev_get_drvdata(dev);
9826        int timer, cpu, ret;
9827
9828        ret = kstrtoint(buf, 0, &timer);
9829        if (ret)
9830                return ret;
9831
9832        if (timer < 1)
9833                return -EINVAL;
9834
9835        /* same value, nothing to do */
9836        if (timer == pmu->hrtimer_interval_ms)
9837                return count;
9838
9839        mutex_lock(&mux_interval_mutex);
9840        pmu->hrtimer_interval_ms = timer;
9841
9842        /* update all cpuctx for this PMU */
9843        cpus_read_lock();
9844        for_each_online_cpu(cpu) {
9845                struct perf_cpu_context *cpuctx;
9846                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9847                cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
9848
9849                cpu_function_call(cpu,
9850                        (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
9851        }
9852        cpus_read_unlock();
9853        mutex_unlock(&mux_interval_mutex);
9854
9855        return count;
9856}
9857static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
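/*
 * Editor's note: the attribute above is visible to userspace as
 * /sys/bus/event_source/devices/<pmu>/perf_event_mux_interval_ms (the bus
 * and device are set up by pmu_bus/pmu_dev_alloc below). An illustrative
 * sketch of changing the multiplexing interval; the "cpu" PMU name is an
 * assumption, substitute any directory present on the system.
 */
#if 0	/* illustrative example, not compiled */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms";
	FILE *f = fopen(path, "w");

	if (!f)
		return 1;
	fprintf(f, "2\n");	/* rotate multiplexed events every 2ms */
	return fclose(f) ? 1 : 0;
}
#endif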
9858
9859static struct attribute *pmu_dev_attrs[] = {
9860        &dev_attr_type.attr,
9861        &dev_attr_perf_event_mux_interval_ms.attr,
9862        NULL,
9863};
9864ATTRIBUTE_GROUPS(pmu_dev);
9865
9866static int pmu_bus_running;
9867static struct bus_type pmu_bus = {
9868        .name           = "event_source",
9869        .dev_groups     = pmu_dev_groups,
9870};
9871
9872static void pmu_dev_release(struct device *dev)
9873{
9874        kfree(dev);
9875}
9876
9877static int pmu_dev_alloc(struct pmu *pmu)
9878{
9879        int ret = -ENOMEM;
9880
9881        pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
9882        if (!pmu->dev)
9883                goto out;
9884
9885        pmu->dev->groups = pmu->attr_groups;
9886        device_initialize(pmu->dev);
9887        ret = dev_set_name(pmu->dev, "%s", pmu->name);
9888        if (ret)
9889                goto free_dev;
9890
9891        dev_set_drvdata(pmu->dev, pmu);
9892        pmu->dev->bus = &pmu_bus;
9893        pmu->dev->release = pmu_dev_release;
9894        ret = device_add(pmu->dev);
9895        if (ret)
9896                goto free_dev;
9897
9898        /* For PMUs with address filters, throw in an extra attribute: */
9899        if (pmu->nr_addr_filters)
9900                ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
9901
9902        if (ret)
9903                goto del_dev;
9904
9905        if (pmu->attr_update)
9906                ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
9907
9908        if (ret)
9909                goto del_dev;
9910
9911out:
9912        return ret;
9913
9914del_dev:
9915        device_del(pmu->dev);
9916
9917free_dev:
9918        put_device(pmu->dev);
9919        goto out;
9920}
9921
9922static struct lock_class_key cpuctx_mutex;
9923static struct lock_class_key cpuctx_lock;
9924
9925int perf_pmu_register(struct pmu *pmu, const char *name, int type)
9926{
9927        int cpu, ret;
9928
9929        mutex_lock(&pmus_lock);
9930        ret = -ENOMEM;
9931        pmu->pmu_disable_count = alloc_percpu(int);
9932        if (!pmu->pmu_disable_count)
9933                goto unlock;
9934
9935        pmu->type = -1;
9936        if (!name)
9937                goto skip_type;
9938        pmu->name = name;
9939
9940        if (type < 0) {
9941                type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9942                if (type < 0) {
9943                        ret = type;
9944                        goto free_pdc;
9945                }
9946        }
9947        pmu->type = type;
9948
9949        if (pmu_bus_running) {
9950                ret = pmu_dev_alloc(pmu);
9951                if (ret)
9952                        goto free_idr;
9953        }
9954
9955skip_type:
9956        if (pmu->task_ctx_nr == perf_hw_context) {
9957                static int hw_context_taken = 0;
9958
9959                /*
9960                 * Other than systems with heterogeneous CPUs, it never makes
9961                 * sense for two PMUs to share perf_hw_context. PMUs which are
9962                 * uncore must use perf_invalid_context.
9963                 */
9964                if (WARN_ON_ONCE(hw_context_taken &&
9965                    !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
9966                        pmu->task_ctx_nr = perf_invalid_context;
9967
9968                hw_context_taken = 1;
9969        }
9970
9971        pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
9972        if (pmu->pmu_cpu_context)
9973                goto got_cpu_context;
9974
9975        ret = -ENOMEM;
9976        pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
9977        if (!pmu->pmu_cpu_context)
9978                goto free_dev;
9979
9980        for_each_possible_cpu(cpu) {
9981                struct perf_cpu_context *cpuctx;
9982
9983                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9984                __perf_event_init_context(&cpuctx->ctx);
9985                lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9986                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9987                cpuctx->ctx.pmu = pmu;
9988                cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9989
9990                __perf_mux_hrtimer_init(cpuctx, cpu);
9991        }
9992
9993got_cpu_context:
9994        if (!pmu->start_txn) {
9995                if (pmu->pmu_enable) {
9996                        /*
9997                         * If we have pmu_enable/pmu_disable calls, install
9998                         * transaction stubs that use that to try and batch
9999                         * hardware accesses.
10000                         */
10001                        pmu->start_txn  = perf_pmu_start_txn;
10002                        pmu->commit_txn = perf_pmu_commit_txn;
10003                        pmu->cancel_txn = perf_pmu_cancel_txn;
10004                } else {
10005                        pmu->start_txn  = perf_pmu_nop_txn;
10006                        pmu->commit_txn = perf_pmu_nop_int;
10007                        pmu->cancel_txn = perf_pmu_nop_void;
10008                }
10009        }
10010
10011        if (!pmu->pmu_enable) {
10012                pmu->pmu_enable  = perf_pmu_nop_void;
10013                pmu->pmu_disable = perf_pmu_nop_void;
10014        }
10015
10016        if (!pmu->check_period)
10017                pmu->check_period = perf_event_nop_int;
10018
10019        if (!pmu->event_idx)
10020                pmu->event_idx = perf_event_idx_default;
10021
10022        list_add_rcu(&pmu->entry, &pmus);
10023        atomic_set(&pmu->exclusive_cnt, 0);
10024        ret = 0;
10025unlock:
10026        mutex_unlock(&pmus_lock);
10027
10028        return ret;
10029
10030free_dev:
10031        device_del(pmu->dev);
10032        put_device(pmu->dev);
10033
10034free_idr:
10035        if (pmu->type >= PERF_TYPE_MAX)
10036                idr_remove(&pmu_idr, pmu->type);
10037
10038free_pdc:
10039        free_percpu(pmu->pmu_disable_count);
10040        goto unlock;
10041}
10042EXPORT_SYMBOL_GPL(perf_pmu_register);
10043
10044void perf_pmu_unregister(struct pmu *pmu)
10045{
10046        mutex_lock(&pmus_lock);
10047        list_del_rcu(&pmu->entry);
10048
10049        /*
10050         * We dereference the pmu list under both SRCU and regular RCU, so
10051         * synchronize against both of those.
10052         */
10053        synchronize_srcu(&pmus_srcu);
10054        synchronize_rcu();
10055
10056        free_percpu(pmu->pmu_disable_count);
10057        if (pmu->type >= PERF_TYPE_MAX)
10058                idr_remove(&pmu_idr, pmu->type);
10059        if (pmu_bus_running) {
10060                if (pmu->nr_addr_filters)
10061                        device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
10062                device_del(pmu->dev);
10063                put_device(pmu->dev);
10064        }
10065        free_pmu_context(pmu);
10066        mutex_unlock(&pmus_lock);
10067}
10068EXPORT_SYMBOL_GPL(perf_pmu_unregister);
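/*
 * Editor's note: an illustrative sketch (assumptions: out-of-tree module;
 * the "dummy" name and all callbacks are invented for the example) of the
 * minimal driver-side usage of perf_pmu_register()/perf_pmu_unregister().
 * Passing type == -1 makes the core allocate a dynamic type from pmu_idr,
 * which userspace can read back from the sysfs "type" attribute above.
 */
#if 0	/* illustrative example, not compiled */
#include <linux/module.h>
#include <linux/perf_event.h>

static int dummy_event_init(struct perf_event *event)
{
	/* Only claim events opened with our dynamically allocated type. */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;
	return 0;
}

static int  dummy_add(struct perf_event *event, int flags)	{ return 0; }
static void dummy_del(struct perf_event *event, int flags)	{ }
static void dummy_start(struct perf_event *event, int flags)	{ }
static void dummy_stop(struct perf_event *event, int flags)	{ }
static void dummy_read(struct perf_event *event)		{ }

static struct pmu dummy_pmu = {
	.module		= THIS_MODULE,
	.task_ctx_nr	= perf_invalid_context,	/* CPU-bound, uncore-style */
	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,	/* counting only */
	.event_init	= dummy_event_init,
	.add		= dummy_add,
	.del		= dummy_del,
	.start		= dummy_start,
	.stop		= dummy_stop,
	.read		= dummy_read,
};

static int __init dummy_pmu_init(void)
{
	return perf_pmu_register(&dummy_pmu, "dummy", -1);
}

static void __exit dummy_pmu_exit(void)
{
	perf_pmu_unregister(&dummy_pmu);
}

module_init(dummy_pmu_init);
module_exit(dummy_pmu_exit);
MODULE_LICENSE("GPL");
#endif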
10069
10070static inline bool has_extended_regs(struct perf_event *event)
10071{
10072        return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10073               (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10074}
10075
10076static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
10077{
10078        struct perf_event_context *ctx = NULL;
10079        int ret;
10080
10081        if (!try_module_get(pmu->module))
10082                return -ENODEV;
10083
10084        /*
10085         * A number of pmu->event_init() methods iterate the sibling_list to,
10086         * for example, validate if the group fits on the PMU. Therefore,
10087         * if this is a sibling event, acquire the ctx->mutex to protect
10088         * the sibling_list.
10089         */
10090        if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
10091                /*
10092                 * This ctx->mutex can nest when we're called through
10093                 * inheritance. See the perf_event_ctx_lock_nested() comment.
10094                 */
10095                ctx = perf_event_ctx_lock_nested(event->group_leader,
10096                                                 SINGLE_DEPTH_NESTING);
10097                BUG_ON(!ctx);
10098        }
10099
10100        event->pmu = pmu;
10101        ret = pmu->event_init(event);
10102
10103        if (ctx)
10104                perf_event_ctx_unlock(event->group_leader, ctx);
10105
10106        if (!ret) {
10107                if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
10108                    has_extended_regs(event))
10109                        ret = -EOPNOTSUPP;
10110
10111                if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
10112                    event_has_any_exclude_flag(event))
10113                        ret = -EINVAL;
10114
10115                if (ret && event->destroy)
10116                        event->destroy(event);
10117        }
10118
10119        if (ret)
10120                module_put(pmu->module);
10121
10122        return ret;
10123}
10124
10125static struct pmu *perf_init_event(struct perf_event *event)
10126{
10127        struct pmu *pmu;
10128        int idx;
10129        int ret;
10130
10131        idx = srcu_read_lock(&pmus_srcu);
10132
10133        /* Try parent's PMU first: */
10134        if (event->parent && event->parent->pmu) {
10135                pmu = event->parent->pmu;
10136                ret = perf_try_init_event(pmu, event);
10137                if (!ret)
10138                        goto unlock;
10139        }
10140
10141        rcu_read_lock();
10142        pmu = idr_find(&pmu_idr, event->attr.type);
10143        rcu_read_unlock();
10144        if (pmu) {
10145                ret = perf_try_init_event(pmu, event);
10146                if (ret)
10147                        pmu = ERR_PTR(ret);
10148                goto unlock;
10149        }
10150
10151        list_for_each_entry_rcu(pmu, &pmus, entry) {
10152                ret = perf_try_init_event(pmu, event);
10153                if (!ret)
10154                        goto unlock;
10155
10156                if (ret != -ENOENT) {
10157                        pmu = ERR_PTR(ret);
10158                        goto unlock;
10159                }
10160        }
10161        pmu = ERR_PTR(-ENOENT);
10162unlock:
10163        srcu_read_unlock(&pmus_srcu, idx);
10164
10165        return pmu;
10166}
10167
10168static void attach_sb_event(struct perf_event *event)
10169{
10170        struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
10171
10172        raw_spin_lock(&pel->lock);
10173        list_add_rcu(&event->sb_list, &pel->list);
10174        raw_spin_unlock(&pel->lock);
10175}
10176
10177/*
10178 * We keep a list of all !task (and therefore per-cpu) events
10179 * that need to receive side-band records.
10180 *
10181 * This avoids having to scan all the various PMU per-cpu contexts
10182 * looking for them.
10183 */
10184static void account_pmu_sb_event(struct perf_event *event)
10185{
10186        if (is_sb_event(event))
10187                attach_sb_event(event);
10188}
10189
10190static void account_event_cpu(struct perf_event *event, int cpu)
10191{
10192        if (event->parent)
10193                return;
10194
10195        if (is_cgroup_event(event))
10196                atomic_inc(&per_cpu(perf_cgroup_events, cpu));
10197}
10198
10199/* Freq events need the tick to stay alive (see perf_event_task_tick). */
10200static void account_freq_event_nohz(void)
10201{
10202#ifdef CONFIG_NO_HZ_FULL
10203        /* Lock so we don't race with concurrent unaccount */
10204        spin_lock(&nr_freq_lock);
10205        if (atomic_inc_return(&nr_freq_events) == 1)
10206                tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
10207        spin_unlock(&nr_freq_lock);
10208#endif
10209}
10210
10211static void account_freq_event(void)
10212{
10213        if (tick_nohz_full_enabled())
10214                account_freq_event_nohz();
10215        else
10216                atomic_inc(&nr_freq_events);
10217}
10218
10219
10220static void account_event(struct perf_event *event)
10221{
10222        bool inc = false;
10223
10224        if (event->parent)
10225                return;
10226
10227        if (event->attach_state & PERF_ATTACH_TASK)
10228                inc = true;
10229        if (event->attr.mmap || event->attr.mmap_data)
10230                atomic_inc(&nr_mmap_events);
10231        if (event->attr.comm)
10232                atomic_inc(&nr_comm_events);
10233        if (event->attr.namespaces)
10234                atomic_inc(&nr_namespaces_events);
10235        if (event->attr.task)
10236                atomic_inc(&nr_task_events);
10237        if (event->attr.freq)
10238                account_freq_event();
10239        if (event->attr.context_switch) {
10240                atomic_inc(&nr_switch_events);
10241                inc = true;
10242        }
10243        if (has_branch_stack(event))
10244                inc = true;
10245        if (is_cgroup_event(event))
10246                inc = true;
10247        if (event->attr.ksymbol)
10248                atomic_inc(&nr_ksymbol_events);
10249        if (event->attr.bpf_event)
10250                atomic_inc(&nr_bpf_events);
10251
10252        if (inc) {
10253                /*
10254                 * We need the mutex here because static_branch_enable()
10255                 * must complete *before* the perf_sched_count increment
10256                 * becomes visible.
10257                 */
10258                if (atomic_inc_not_zero(&perf_sched_count))
10259                        goto enabled;
10260
10261                mutex_lock(&perf_sched_mutex);
10262                if (!atomic_read(&perf_sched_count)) {
10263                        static_branch_enable(&perf_sched_events);
10264                        /*
10265                 * Guarantee that all CPUs observe the key change and
10266                         * call the perf scheduling hooks before proceeding to
10267                         * install events that need them.
10268                         */
10269                        synchronize_rcu();
10270                }
10271                /*
10272                 * Now that we have waited for the synchronize_rcu(), allow further
10273                 * increments to bypass the mutex.
10274                 */
10275                atomic_inc(&perf_sched_count);
10276                mutex_unlock(&perf_sched_mutex);
10277        }
10278enabled:
10279
10280        account_event_cpu(event, event->cpu);
10281
10282        account_pmu_sb_event(event);
10283}
10284
10285/*
10286 * Allocate and initialize an event structure
10287 */
10288static struct perf_event *
10289perf_event_alloc(struct perf_event_attr *attr, int cpu,
10290                 struct task_struct *task,
10291                 struct perf_event *group_leader,
10292                 struct perf_event *parent_event,
10293                 perf_overflow_handler_t overflow_handler,
10294                 void *context, int cgroup_fd)
10295{
10296        struct pmu *pmu;
10297        struct perf_event *event;
10298        struct hw_perf_event *hwc;
10299        long err = -EINVAL;
10300
10301        if ((unsigned)cpu >= nr_cpu_ids) {
10302                if (!task || cpu != -1)
10303                        return ERR_PTR(-EINVAL);
10304        }
10305
10306        event = kzalloc(sizeof(*event), GFP_KERNEL);
10307        if (!event)
10308                return ERR_PTR(-ENOMEM);
10309
10310        /*
10311         * Single events are their own group leaders, with an
10312         * empty sibling list:
10313         */
10314        if (!group_leader)
10315                group_leader = event;
10316
10317        mutex_init(&event->child_mutex);
10318        INIT_LIST_HEAD(&event->child_list);
10319
10320        INIT_LIST_HEAD(&event->event_entry);
10321        INIT_LIST_HEAD(&event->sibling_list);
10322        INIT_LIST_HEAD(&event->active_list);
10323        init_event_group(event);
10324        INIT_LIST_HEAD(&event->rb_entry);
10325        INIT_LIST_HEAD(&event->active_entry);
10326        INIT_LIST_HEAD(&event->addr_filters.list);
10327        INIT_HLIST_NODE(&event->hlist_entry);
10328
10329
10330        init_waitqueue_head(&event->waitq);
10331        event->pending_disable = -1;
10332        init_irq_work(&event->pending, perf_pending_event);
10333
10334        mutex_init(&event->mmap_mutex);
10335        raw_spin_lock_init(&event->addr_filters.lock);
10336
10337        atomic_long_set(&event->refcount, 1);
10338        event->cpu              = cpu;
10339        event->attr             = *attr;
10340        event->group_leader     = group_leader;
10341        event->pmu              = NULL;
10342        event->oncpu            = -1;
10343
10344        event->parent           = parent_event;
10345
10346        event->ns               = get_pid_ns(task_active_pid_ns(current));
10347        event->id               = atomic64_inc_return(&perf_event_id);
10348
10349        event->state            = PERF_EVENT_STATE_INACTIVE;
10350
10351        if (task) {
10352                event->attach_state = PERF_ATTACH_TASK;
10353                /*
10354                 * XXX pmu::event_init needs to know what task to account to
10355                 * and we cannot use the ctx information because we need the
10356                 * pmu before we get a ctx.
10357                 */
10358                get_task_struct(task);
10359                event->hw.target = task;
10360        }
10361
10362        event->clock = &local_clock;
10363        if (parent_event)
10364                event->clock = parent_event->clock;
10365
10366        if (!overflow_handler && parent_event) {
10367                overflow_handler = parent_event->overflow_handler;
10368                context = parent_event->overflow_handler_context;
10369#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
10370                if (overflow_handler == bpf_overflow_handler) {
10371                        struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
10372
10373                        if (IS_ERR(prog)) {
10374                                err = PTR_ERR(prog);
10375                                goto err_ns;
10376                        }
10377                        event->prog = prog;
10378                        event->orig_overflow_handler =
10379                                parent_event->orig_overflow_handler;
10380                }
10381#endif
10382        }
10383
10384        if (overflow_handler) {
10385                event->overflow_handler = overflow_handler;
10386                event->overflow_handler_context = context;
10387        } else if (is_write_backward(event)){
10388                event->overflow_handler = perf_event_output_backward;
10389                event->overflow_handler_context = NULL;
10390        } else {
10391                event->overflow_handler = perf_event_output_forward;
10392                event->overflow_handler_context = NULL;
10393        }
10394
10395        perf_event__state_init(event);
10396
10397        pmu = NULL;
10398
10399        hwc = &event->hw;
10400        hwc->sample_period = attr->sample_period;
10401        if (attr->freq && attr->sample_freq)
10402                hwc->sample_period = 1;
10403        hwc->last_period = hwc->sample_period;
10404
10405        local64_set(&hwc->period_left, hwc->sample_period);
10406
10407        /*
10408         * We currently do not support PERF_SAMPLE_READ on inherited events.
10409         * See perf_output_read().
10410         */
10411        if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
10412                goto err_ns;
10413
10414        if (!has_branch_stack(event))
10415                event->attr.branch_sample_type = 0;
10416
10417        if (cgroup_fd != -1) {
10418                err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
10419                if (err)
10420                        goto err_ns;
10421        }
10422
10423        pmu = perf_init_event(event);
10424        if (IS_ERR(pmu)) {
10425                err = PTR_ERR(pmu);
10426                goto err_ns;
10427        }
10428
10429        err = exclusive_event_init(event);
10430        if (err)
10431                goto err_pmu;
10432
10433        if (has_addr_filter(event)) {
10434                event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
10435                                                    sizeof(struct perf_addr_filter_range),
10436                                                    GFP_KERNEL);
10437                if (!event->addr_filter_ranges) {
10438                        err = -ENOMEM;
10439                        goto err_per_task;
10440                }
10441
10442                /*
10443                 * Clone the parent's vma offsets: they are valid until exec()
10444                 * even if the mm is not shared with the parent.
10445                 */
10446                if (event->parent) {
10447                        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
10448
10449                        raw_spin_lock_irq(&ifh->lock);
10450                        memcpy(event->addr_filter_ranges,
10451                               event->parent->addr_filter_ranges,
10452                               pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
10453                        raw_spin_unlock_irq(&ifh->lock);
10454                }
10455
10456                /* force hw sync on the address filters */
10457                event->addr_filters_gen = 1;
10458        }
10459
10460        if (!event->parent) {
10461                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
10462                        err = get_callchain_buffers(attr->sample_max_stack);
10463                        if (err)
10464                                goto err_addr_filters;
10465                }
10466        }
10467
10468        /* symmetric to unaccount_event() in _free_event() */
10469        account_event(event);
10470
10471        return event;
10472
10473err_addr_filters:
10474        kfree(event->addr_filter_ranges);
10475
10476err_per_task:
10477        exclusive_event_destroy(event);
10478
10479err_pmu:
10480        if (event->destroy)
10481                event->destroy(event);
10482        module_put(pmu->module);
10483err_ns:
10484        if (is_cgroup_event(event))
10485                perf_detach_cgroup(event);
10486        if (event->ns)
10487                put_pid_ns(event->ns);
10488        if (event->hw.target)
10489                put_task_struct(event->hw.target);
10490        kfree(event);
10491
10492        return ERR_PTR(err);
10493}
10494
10495static int perf_copy_attr(struct perf_event_attr __user *uattr,
10496                          struct perf_event_attr *attr)
10497{
10498        u32 size;
10499        int ret;
10500
10501        if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
10502                return -EFAULT;
10503
10504        /*
10505         * zero the full structure, so that a short copy leaves the rest zeroed.
10506         */
10507        memset(attr, 0, sizeof(*attr));
10508
10509        ret = get_user(size, &uattr->size);
10510        if (ret)
10511                return ret;
10512
10513        if (size > PAGE_SIZE)   /* silly large */
10514                goto err_size;
10515
10516        if (!size)              /* abi compat */
10517                size = PERF_ATTR_SIZE_VER0;
10518
10519        if (size < PERF_ATTR_SIZE_VER0)
10520                goto err_size;
10521
10522        /*
10523         * If we're handed a bigger struct than we know of,
10524         * ensure all the unknown bits are 0 - i.e. new
10525         * user-space does not rely on any kernel feature
10526         * extensions we don't know about yet.
10527         */
10528        if (size > sizeof(*attr)) {
10529                unsigned char __user *addr;
10530                unsigned char __user *end;
10531                unsigned char val;
10532
10533                addr = (void __user *)uattr + sizeof(*attr);
10534                end  = (void __user *)uattr + size;
10535
10536                for (; addr < end; addr++) {
10537                        ret = get_user(val, addr);
10538                        if (ret)
10539                                return ret;
10540                        if (val)
10541                                goto err_size;
10542                }
10543                size = sizeof(*attr);
10544        }
10545
10546        ret = copy_from_user(attr, uattr, size);
10547        if (ret)
10548                return -EFAULT;
10549
10550        attr->size = size;
10551
10552        if (attr->__reserved_1)
10553                return -EINVAL;
10554
10555        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
10556                return -EINVAL;
10557
10558        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
10559                return -EINVAL;
10560
10561        if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
10562                u64 mask = attr->branch_sample_type;
10563
10564                /* only using defined bits */
10565                if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
10566                        return -EINVAL;
10567
10568                /* at least one branch bit must be set */
10569                if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
10570                        return -EINVAL;
10571
10572                /* propagate priv level, when not set for branch */
10573                if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
10574
10575                        /* exclude_kernel checked on syscall entry */
10576                        if (!attr->exclude_kernel)
10577                                mask |= PERF_SAMPLE_BRANCH_KERNEL;
10578
10579                        if (!attr->exclude_user)
10580                                mask |= PERF_SAMPLE_BRANCH_USER;
10581
10582                        if (!attr->exclude_hv)
10583                                mask |= PERF_SAMPLE_BRANCH_HV;
10584                        /*
10585                         * adjust user setting (for HW filter setup)
10586                         */
10587                        attr->branch_sample_type = mask;
10588                }
10589                /* privileged levels capture (kernel, hv): check permissions */
10590                if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
10591                    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10592                        return -EACCES;
10593        }
10594
10595        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
10596                ret = perf_reg_validate(attr->sample_regs_user);
10597                if (ret)
10598                        return ret;
10599        }
10600
10601        if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
10602                if (!arch_perf_have_user_stack_dump())
10603                        return -ENOSYS;
10604
10605                /*
10606                 * We have __u32 type for the size, but so far
10607                 * we can only use __u16 as maximum due to the
10608                 * __u16 sample size limit.
10609                 */
10610                if (attr->sample_stack_user >= USHRT_MAX)
10611                        return -EINVAL;
10612                else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
10613                        return -EINVAL;
10614        }
10615
10616        if (!attr->sample_max_stack)
10617                attr->sample_max_stack = sysctl_perf_event_max_stack;
10618
10619        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
10620                ret = perf_reg_validate(attr->sample_regs_intr);
10621out:
10622        return ret;
10623
10624err_size:
10625        put_user(sizeof(*attr), &uattr->size);
10626        ret = -E2BIG;
10627        goto out;
10628}
10629
10630static int
10631perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
10632{
10633        struct ring_buffer *rb = NULL;
10634        int ret = -EINVAL;
10635
10636        if (!output_event)
10637                goto set;
10638
10639        /* don't allow circular references */
10640        if (event == output_event)
10641                goto out;
10642
10643        /*
10644         * Don't allow cross-cpu buffers
10645         */
10646        if (output_event->cpu != event->cpu)
10647                goto out;
10648
10649        /*
10650         * If it's not a per-cpu rb, it must be the same task.
10651         */
10652        if (output_event->cpu == -1 && output_event->ctx != event->ctx)
10653                goto out;
10654
10655        /*
10656         * Mixing clocks in the same buffer is trouble you don't need.
10657         */
10658        if (output_event->clock != event->clock)
10659                goto out;
10660
10661        /*
10662         * Either writing ring buffer from beginning or from end.
10663         * Mixing is not allowed.
10664         */
10665        if (is_write_backward(output_event) != is_write_backward(event))
10666                goto out;
10667
10668        /*
10669         * If both events generate aux data, they must be on the same PMU
10670         */
10671        if (has_aux(event) && has_aux(output_event) &&
10672            event->pmu != output_event->pmu)
10673                goto out;
10674
10675set:
10676        mutex_lock(&event->mmap_mutex);
10677        /* Can't redirect output if we've got an active mmap() */
10678        if (atomic_read(&event->mmap_count))
10679                goto unlock;
10680
10681        if (output_event) {
10682                /* get the rb we want to redirect to */
10683                rb = ring_buffer_get(output_event);
10684                if (!rb)
10685                        goto unlock;
10686        }
10687
10688        ring_buffer_attach(event, rb);
10689
10690        ret = 0;
10691unlock:
10692        mutex_unlock(&event->mmap_mutex);
10693
10694out:
10695        return ret;
10696}
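/*
 * Editor's note: perf_event_set_output() above backs both the
 * PERF_FLAG_FD_OUTPUT open flag and the PERF_EVENT_IOC_SET_OUTPUT ioctl.
 * An illustrative userspace sketch routing one event's records into
 * another event's ring buffer; share_buffer() and its arguments are
 * invented for the example. Both fds must refer to events on the same
 * CPU/task and clock per the checks above, and n_pages must be a power
 * of two.
 */
#if 0	/* illustrative example, not compiled */
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/perf_event.h>

static int share_buffer(int fd_a, int fd_b, size_t n_pages)
{
	long psize = sysconf(_SC_PAGESIZE);
	void *rb;

	/* Only fd_a owns a buffer; fd_b will write into it. */
	rb = mmap(NULL, (n_pages + 1) * psize, PROT_READ | PROT_WRITE,
		  MAP_SHARED, fd_a, 0);
	if (rb == MAP_FAILED)
		return -1;

	return ioctl(fd_b, PERF_EVENT_IOC_SET_OUTPUT, fd_a);
}
#endif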
10697
10698static void mutex_lock_double(struct mutex *a, struct mutex *b)
10699{
10700        if (b < a)
10701                swap(a, b);
10702
10703        mutex_lock(a);
10704        mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
10705}
10706
10707static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
10708{
10709        bool nmi_safe = false;
10710
10711        switch (clk_id) {
10712        case CLOCK_MONOTONIC:
10713                event->clock = &ktime_get_mono_fast_ns;
10714                nmi_safe = true;
10715                break;
10716
10717        case CLOCK_MONOTONIC_RAW:
10718                event->clock = &ktime_get_raw_fast_ns;
10719                nmi_safe = true;
10720                break;
10721
10722        case CLOCK_REALTIME:
10723                event->clock = &ktime_get_real_ns;
10724                break;
10725
10726        case CLOCK_BOOTTIME:
10727                event->clock = &ktime_get_boottime_ns;
10728                break;
10729
10730        case CLOCK_TAI:
10731                event->clock = &ktime_get_clocktai_ns;
10732                break;
10733
10734        default:
10735                return -EINVAL;
10736        }
10737
10738        if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
10739                return -EINVAL;
10740
10741        return 0;
10742}
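/*
 * Editor's note: perf_event_set_clock() is driven by attr.use_clockid and
 * attr.clockid from sys_perf_event_open() below. An illustrative sketch
 * (open_monotonic_raw_counter() is invented for the example) selecting
 * CLOCK_MONOTONIC_RAW so any sample timestamps can be compared directly
 * with userspace clock_gettime(CLOCK_MONOTONIC_RAW) readings.
 */
#if 0	/* illustrative example, not compiled */
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long open_monotonic_raw_counter(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size        = sizeof(attr);
	attr.type        = PERF_TYPE_SOFTWARE;
	attr.config      = PERF_COUNT_SW_TASK_CLOCK;
	attr.use_clockid = 1;
	attr.clockid     = CLOCK_MONOTONIC_RAW;	/* NMI-safe, accepted by any PMU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
#endif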
10743
10744/*
10745 * Variation on perf_event_ctx_lock_nested(), except we take two context
10746 * mutexes.
10747 */
10748static struct perf_event_context *
10749__perf_event_ctx_lock_double(struct perf_event *group_leader,
10750                             struct perf_event_context *ctx)
10751{
10752        struct perf_event_context *gctx;
10753
10754again:
10755        rcu_read_lock();
10756        gctx = READ_ONCE(group_leader->ctx);
10757        if (!refcount_inc_not_zero(&gctx->refcount)) {
10758                rcu_read_unlock();
10759                goto again;
10760        }
10761        rcu_read_unlock();
10762
10763        mutex_lock_double(&gctx->mutex, &ctx->mutex);
10764
10765        if (group_leader->ctx != gctx) {
10766                mutex_unlock(&ctx->mutex);
10767                mutex_unlock(&gctx->mutex);
10768                put_ctx(gctx);
10769                goto again;
10770        }
10771
10772        return gctx;
10773}
10774
10775/**
10776 * sys_perf_event_open - open a performance event, associate it to a task/cpu
10777 *
10778 * @attr_uptr:  event_id type attributes for monitoring/sampling
10779 * @pid:                target pid
10780 * @cpu:                target cpu
10781 * @group_fd:           group leader event fd
10782 */
10783SYSCALL_DEFINE5(perf_event_open,
10784                struct perf_event_attr __user *, attr_uptr,
10785                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
10786{
10787        struct perf_event *group_leader = NULL, *output_event = NULL;
10788        struct perf_event *event, *sibling;
10789        struct perf_event_attr attr;
10790        struct perf_event_context *ctx, *uninitialized_var(gctx);
10791        struct file *event_file = NULL;
10792        struct fd group = {NULL, 0};
10793        struct task_struct *task = NULL;
10794        struct pmu *pmu;
10795        int event_fd;
10796        int move_group = 0;
10797        int err;
10798        int f_flags = O_RDWR;
10799        int cgroup_fd = -1;
10800
10801        /* for future expandability... */
10802        if (flags & ~PERF_FLAG_ALL)
10803                return -EINVAL;
10804
10805        err = perf_copy_attr(attr_uptr, &attr);
10806        if (err)
10807                return err;
10808
10809        if (!attr.exclude_kernel) {
10810                if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10811                        return -EACCES;
10812        }
10813
10814        if (attr.namespaces) {
10815                if (!capable(CAP_SYS_ADMIN))
10816                        return -EACCES;
10817        }
10818
10819        if (attr.freq) {
10820                if (attr.sample_freq > sysctl_perf_event_sample_rate)
10821                        return -EINVAL;
10822        } else {
10823                if (attr.sample_period & (1ULL << 63))
10824                        return -EINVAL;
10825        }
10826
10827        /* Only privileged users can get physical addresses */
10828        if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
10829            perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10830                return -EACCES;
10831
10832        /*
10833         * In cgroup mode, the pid argument is used to pass the fd
10834         * opened to the cgroup directory in cgroupfs. The cpu argument
10835         * designates the cpu on which to monitor threads from that
10836         * cgroup.
10837         */
10838        if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
10839                return -EINVAL;
10840
10841        if (flags & PERF_FLAG_FD_CLOEXEC)
10842                f_flags |= O_CLOEXEC;
10843
10844        event_fd = get_unused_fd_flags(f_flags);
10845        if (event_fd < 0)
10846                return event_fd;
10847
10848        if (group_fd != -1) {
10849                err = perf_fget_light(group_fd, &group);
10850                if (err)
10851                        goto err_fd;
10852                group_leader = group.file->private_data;
10853                if (flags & PERF_FLAG_FD_OUTPUT)
10854                        output_event = group_leader;
10855                if (flags & PERF_FLAG_FD_NO_GROUP)
10856                        group_leader = NULL;
10857        }
10858
10859        if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
10860                task = find_lively_task_by_vpid(pid);
10861                if (IS_ERR(task)) {
10862                        err = PTR_ERR(task);
10863                        goto err_group_fd;
10864                }
10865        }
10866
10867        if (task && group_leader &&
10868            group_leader->attr.inherit != attr.inherit) {
10869                err = -EINVAL;
10870                goto err_task;
10871        }
10872
10873        if (task) {
10874                err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10875                if (err)
10876                        goto err_task;
10877
10878                /*
10879                 * Reuse ptrace permission checks for now.
10880                 *
10881                 * We must hold cred_guard_mutex across this and any potential
10882                 * perf_install_in_context() call for this new event to
10883                 * serialize against exec() altering our credentials (and the
10884                 * perf_event_exit_task() that could imply).
10885                 */
10886                err = -EACCES;
10887                if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10888                        goto err_cred;
10889        }
10890
10891        if (flags & PERF_FLAG_PID_CGROUP)
10892                cgroup_fd = pid;
10893
10894        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
10895                                 NULL, NULL, cgroup_fd);
10896        if (IS_ERR(event)) {
10897                err = PTR_ERR(event);
10898                goto err_cred;
10899        }
10900
10901        if (is_sampling_event(event)) {
10902                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
10903                        err = -EOPNOTSUPP;
10904                        goto err_alloc;
10905                }
10906        }
10907
10908        /*
10909         * Special case software events and allow them to be part of
10910         * any hardware group.
10911         */
10912        pmu = event->pmu;
10913
10914        if (attr.use_clockid) {
10915                err = perf_event_set_clock(event, attr.clockid);
10916                if (err)
10917                        goto err_alloc;
10918        }
10919
10920        if (pmu->task_ctx_nr == perf_sw_context)
10921                event->event_caps |= PERF_EV_CAP_SOFTWARE;
10922
10923        if (group_leader) {
10924                if (is_software_event(event) &&
10925                    !in_software_context(group_leader)) {
10926                        /*
10927                         * If the event is a sw event, but the group_leader
10928                         * is on hw context.
10929                         *
10930                         * Allow the addition of software events to hw
10931                         * groups, this is safe because software events
10932                         * never fail to schedule.
10933                         */
10934                        pmu = group_leader->ctx->pmu;
10935                } else if (!is_software_event(event) &&
10936                           is_software_event(group_leader) &&
10937                           (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10938                        /*
10939                         * In case the group is a pure software group, and we
10940                         * try to add a hardware event, move the whole group to
10941                         * the hardware context.
10942                         */
10943                        move_group = 1;
10944                }
10945        }
10946
10947        /*
10948         * Get the target context (task or percpu):
10949         */
10950        ctx = find_get_context(pmu, task, event);
10951        if (IS_ERR(ctx)) {
10952                err = PTR_ERR(ctx);
10953                goto err_alloc;
10954        }
10955
10956        /*
10957         * Look up the group leader (we will attach this event to it):
10958         */
10959        if (group_leader) {
10960                err = -EINVAL;
10961
10962                /*
10963                 * Do not allow a recursive hierarchy (this new sibling
10964                 * becoming part of another group-sibling):
10965                 */
10966                if (group_leader->group_leader != group_leader)
10967                        goto err_context;
10968
10969                /* All events in a group should have the same clock */
10970                if (group_leader->clock != event->clock)
10971                        goto err_context;
10972
10973                /*
10974                 * Make sure both events are for the same CPU;
10975                 * grouping events for different CPUs is broken, since
10976                 * you can never concurrently schedule them anyhow.
10977                 */
10978                if (group_leader->cpu != event->cpu)
10979                        goto err_context;
10980
10981                /*
10982                 * Make sure we're both on the same task, or both
10983                 * per-CPU events.
10984                 */
10985                if (group_leader->ctx->task != ctx->task)
10986                        goto err_context;
10987
10988                /*
10989                 * Do not allow to attach to a group in a different task
10990                 * or CPU context. If we're moving SW events, we'll fix
10991                 * this up later, so allow that.
10992                 */
10993                if (!move_group && group_leader->ctx != ctx)
10994                        goto err_context;
10995
10996                /*
10997                 * Only a group leader can be exclusive or pinned
10998                 */
10999                if (attr.exclusive || attr.pinned)
11000                        goto err_context;
11001        }
11002
11003        if (output_event) {
11004                err = perf_event_set_output(event, output_event);
11005                if (err)
11006                        goto err_context;
11007        }
11008
11009        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
11010                                        f_flags);
11011        if (IS_ERR(event_file)) {
11012                err = PTR_ERR(event_file);
11013                event_file = NULL;
11014                goto err_context;
11015        }
11016
11017        if (move_group) {
11018                gctx = __perf_event_ctx_lock_double(group_leader, ctx);
11019
11020                if (gctx->task == TASK_TOMBSTONE) {
11021                        err = -ESRCH;
11022                        goto err_locked;
11023                }
11024
11025                /*
11026                 * Check if we raced against another sys_perf_event_open() call
11027                 * moving the software group underneath us.
11028                 */
11029                if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
11030                        /*
11031                         * If someone moved the group out from under us, check
11032                         * if this new event wound up on the same ctx; if so,
11033                         * it's the regular !move_group case, otherwise fail.
11034                         */
11035                        if (gctx != ctx) {
11036                                err = -EINVAL;
11037                                goto err_locked;
11038                        } else {
11039                                perf_event_ctx_unlock(group_leader, gctx);
11040                                move_group = 0;
11041                        }
11042                }
11043
11044                /*
11045                 * Failure to create exclusive events returns -EBUSY.
11046                 */
11047                err = -EBUSY;
11048                if (!exclusive_event_installable(group_leader, ctx))
11049                        goto err_locked;
11050
11051                for_each_sibling_event(sibling, group_leader) {
11052                        if (!exclusive_event_installable(sibling, ctx))
11053                                goto err_locked;
11054                }
11055        } else {
11056                mutex_lock(&ctx->mutex);
11057        }
11058
11059        if (ctx->task == TASK_TOMBSTONE) {
11060                err = -ESRCH;
11061                goto err_locked;
11062        }
11063
11064        if (!perf_event_validate_size(event)) {
11065                err = -E2BIG;
11066                goto err_locked;
11067        }
11068
11069        if (!task) {
11070                /*
11071                 * Check if the @cpu we're creating an event for is online.
11072                 *
11073                 * We use the perf_cpu_context::ctx::mutex to serialize against
11074                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
11075                 */
11076                struct perf_cpu_context *cpuctx =
11077                        container_of(ctx, struct perf_cpu_context, ctx);
11078
11079                if (!cpuctx->online) {
11080                        err = -ENODEV;
11081                        goto err_locked;
11082                }
11083        }
11084
11085
11086        /*
11087         * Must be under the same ctx::mutex as perf_install_in_context(),
11088         * because we need to serialize with concurrent event creation.
11089         */
11090        if (!exclusive_event_installable(event, ctx)) {
11091                err = -EBUSY;
11092                goto err_locked;
11093        }
11094
11095        WARN_ON_ONCE(ctx->parent_ctx);
11096
11097        /*
11098         * This is the point of no return; we cannot fail hereafter. This is
11099         * where we start modifying current state.
11100         */
11101
11102        if (move_group) {
11103                /*
11104                 * See perf_event_ctx_lock() for comments on the details
11105                 * of swizzling perf_event::ctx.
11106                 */
11107                perf_remove_from_context(group_leader, 0);
11108                put_ctx(gctx);
11109
11110                for_each_sibling_event(sibling, group_leader) {
11111                        perf_remove_from_context(sibling, 0);
11112                        put_ctx(gctx);
11113                }
11114
11115                /*
11116                 * Wait for everybody to stop referencing the events through
11117                 * the old lists, before installing them on the new lists.
11118                 */
11119                synchronize_rcu();
11120
11121                /*
11122                 * Install the group siblings before the group leader.
11123                 *
11124                 * Because a group leader will try and install the entire group
11125                 * (through the sibling list, which is still intact), we can
11126                 * end up with siblings installed in the wrong context.
11127                 *
11128                 * By installing siblings first we NO-OP because they're not
11129                 * reachable through the group lists.
11130                 */
11131                for_each_sibling_event(sibling, group_leader) {
11132                        perf_event__state_init(sibling);
11133                        perf_install_in_context(ctx, sibling, sibling->cpu);
11134                        get_ctx(ctx);
11135                }
11136
11137                /*
11138                 * Removing from the context ends up with a disabled
11139                 * event. What we want here is an event in its initial
11140                 * startup state, ready to be added into the new context.
11141                 */
11142                perf_event__state_init(group_leader);
11143                perf_install_in_context(ctx, group_leader, group_leader->cpu);
11144                get_ctx(ctx);
11145        }
11146
11147        /*
11148         * Precalculate sample_data sizes; do while holding ctx::mutex such
11149         * that we're serialized against further additions and before
11150         * perf_install_in_context() which is the point the event is active and
11151         * can use these values.
11152         */
11153        perf_event__header_size(event);
11154        perf_event__id_header_size(event);
11155
11156        event->owner = current;
11157
11158        perf_install_in_context(ctx, event, event->cpu);
11159        perf_unpin_context(ctx);
11160
11161        if (move_group)
11162                perf_event_ctx_unlock(group_leader, gctx);
11163        mutex_unlock(&ctx->mutex);
11164
11165        if (task) {
11166                mutex_unlock(&task->signal->cred_guard_mutex);
11167                put_task_struct(task);
11168        }
11169
11170        mutex_lock(&current->perf_event_mutex);
11171        list_add_tail(&event->owner_entry, &current->perf_event_list);
11172        mutex_unlock(&current->perf_event_mutex);
11173
11174        /*
11175         * Drop the reference on the group_event after placing the
11176         * new event on the sibling_list. This ensures destruction
11177         * of the group leader will find the pointer to itself in
11178         * perf_group_detach().
11179         */
11180        fdput(group);
11181        fd_install(event_fd, event_file);
11182        return event_fd;
11183
11184err_locked:
11185        if (move_group)
11186                perf_event_ctx_unlock(group_leader, gctx);
11187        mutex_unlock(&ctx->mutex);
11188/* err_file: */
11189        fput(event_file);
11190err_context:
11191        perf_unpin_context(ctx);
11192        put_ctx(ctx);
11193err_alloc:
11194        /*
11195         * If event_file is set, the fput() above will have called ->release()
11196         * and that will take care of freeing the event.
11197         */
11198        if (!event_file)
11199                free_event(event);
11200err_cred:
11201        if (task)
11202                mutex_unlock(&task->signal->cred_guard_mutex);
11203err_task:
11204        if (task)
11205                put_task_struct(task);
11206err_group_fd:
11207        fdput(group);
11208err_fd:
11209        put_unused_fd(event_fd);
11210        return err;
11211}
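/*
 * Editor's note: an illustrative userspace sketch of the group_fd plumbing
 * handled above: a task-clock leader with a context-switch sibling, read
 * in one go via PERF_FORMAT_GROUP. open_sw() is invented for the example;
 * error handling is minimal on purpose.
 */
#if 0	/* illustrative example, not compiled */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long open_sw(__u64 config, int group_fd)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size        = sizeof(attr);
	attr.type        = PERF_TYPE_SOFTWARE;
	attr.config      = config;
	attr.read_format = PERF_FORMAT_GROUP;
	attr.disabled    = (group_fd == -1);	/* only the leader starts disabled */
	return syscall(__NR_perf_event_open, &attr, 0, -1, group_fd,
		       PERF_FLAG_FD_CLOEXEC);
}

int main(void)
{
	struct { __u64 nr, values[2]; } buf;
	long leader, sibling;

	leader  = open_sw(PERF_COUNT_SW_TASK_CLOCK, -1);
	sibling = open_sw(PERF_COUNT_SW_CONTEXT_SWITCHES, leader);
	if (leader < 0 || sibling < 0)
		return 1;

	ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
	/* ... workload ... */
	ioctl(leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

	read(leader, &buf, sizeof(buf));
	printf("task-clock=%llu ctx-switches=%llu\n",
	       (unsigned long long)buf.values[0],
	       (unsigned long long)buf.values[1]);
	return 0;
}
#endif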
11212
11213/**
11214 * perf_event_create_kernel_counter
11215 *
11216 * @attr: attributes of the counter to create
11217 * @cpu: cpu in which the counter is bound
11218 * @task: task to profile (NULL for percpu)
11219 */
11220struct perf_event *
11221perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
11222                                 struct task_struct *task,
11223                                 perf_overflow_handler_t overflow_handler,
11224                                 void *context)
11225{
11226        struct perf_event_context *ctx;
11227        struct perf_event *event;
11228        int err;
11229
11230        /*
11231         * Get the target context (task or percpu):
11232         */
11233
11234        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
11235                                 overflow_handler, context, -1);
11236        if (IS_ERR(event)) {
11237                err = PTR_ERR(event);
11238                goto err;
11239        }
11240
11241        /* Mark owner so we could distinguish it from user events. */
11242        event->owner = TASK_TOMBSTONE;
11243
11244        ctx = find_get_context(event->pmu, task, event);
11245        if (IS_ERR(ctx)) {
11246                err = PTR_ERR(ctx);
11247                goto err_free;
11248        }
11249
11250        WARN_ON_ONCE(ctx->parent_ctx);
11251        mutex_lock(&ctx->mutex);
11252        if (ctx->task == TASK_TOMBSTONE) {
11253                err = -ESRCH;
11254                goto err_unlock;
11255        }
11256
11257        if (!task) {
11258                /*
11259                 * Check if the @cpu we're creating an event for is online.
11260                 *
11261                 * We use the perf_cpu_context::ctx::mutex to serialize against
11262                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
11263                 */
11264                struct perf_cpu_context *cpuctx =
11265                        container_of(ctx, struct perf_cpu_context, ctx);
11266                if (!cpuctx->online) {
11267                        err = -ENODEV;
11268                        goto err_unlock;
11269                }
11270        }
11271
11272        if (!exclusive_event_installable(event, ctx)) {
11273                err = -EBUSY;
11274                goto err_unlock;
11275        }
11276
11277        perf_install_in_context(ctx, event, event->cpu);
11278        perf_unpin_context(ctx);
11279        mutex_unlock(&ctx->mutex);
11280
11281        return event;
11282
11283err_unlock:
11284        mutex_unlock(&ctx->mutex);
11285        perf_unpin_context(ctx);
11286        put_ctx(ctx);
11287err_free:
11288        free_event(event);
11289err:
11290        return ERR_PTR(err);
11291}
11292EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
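
/*
 * A minimal usage sketch (illustrative only, not part of the original file):
 * how an in-kernel user might create and tear down a counter with
 * perf_event_create_kernel_counter(), loosely modelled on users such as the
 * hardlockup detector. All my_* names are hypothetical.
 */
static struct perf_event *my_event;

static void my_overflow(struct perf_event *event,
                        struct perf_sample_data *data,
                        struct pt_regs *regs)
{
        /* Runs from IRQ/NMI context each time the sample period elapses. */
}

static int my_counter_create(int cpu)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_HARDWARE,
                .config         = PERF_COUNT_HW_CPU_CYCLES,
                .size           = sizeof(attr),
                .sample_period  = 1000000,
                .pinned         = 1,
        };

        /* task == NULL: a per-CPU counter bound to @cpu. */
        my_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
                                                    my_overflow, NULL);
        if (IS_ERR(my_event))
                return PTR_ERR(my_event);
        return 0;
}

static void my_counter_destroy(void)
{
        /* perf_event_release_kernel() is the counterpart for kernel counters. */
        perf_event_release_kernel(my_event);
}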
11293
11294void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
11295{
11296        struct perf_event_context *src_ctx;
11297        struct perf_event_context *dst_ctx;
11298        struct perf_event *event, *tmp;
11299        LIST_HEAD(events);
11300
11301        src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
11302        dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
11303
11304        /*
11305         * See perf_event_ctx_lock() for comments on the details
11306         * of swizzling perf_event::ctx.
11307         */
11308        mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
11309        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
11310                                 event_entry) {
11311                perf_remove_from_context(event, 0);
11312                unaccount_event_cpu(event, src_cpu);
11313                put_ctx(src_ctx);
11314                list_add(&event->migrate_entry, &events);
11315        }
11316
11317        /*
11318         * Wait for the events to quiesce before re-instating them.
11319         */
11320        synchronize_rcu();
11321
11322        /*
11323         * Re-instate events in 2 passes.
11324         *
11325         * Skip over group leaders and only install siblings on this first
11326         * pass; siblings will not get enabled without a leader, but a
11327         * leader will enable its siblings, even if those are still on the
11328         * old context.
11329         */
11330        list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
11331                if (event->group_leader == event)
11332                        continue;
11333
11334                list_del(&event->migrate_entry);
11335                if (event->state >= PERF_EVENT_STATE_OFF)
11336                        event->state = PERF_EVENT_STATE_INACTIVE;
11337                account_event_cpu(event, dst_cpu);
11338                perf_install_in_context(dst_ctx, event, dst_cpu);
11339                get_ctx(dst_ctx);
11340        }
11341
11342        /*
11343         * Once all the siblings are set up properly, install the group leaders
11344         * to make it go.
11345         */
11346        list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
11347                list_del(&event->migrate_entry);
11348                if (event->state >= PERF_EVENT_STATE_OFF)
11349                        event->state = PERF_EVENT_STATE_INACTIVE;
11350                account_event_cpu(event, dst_cpu);
11351                perf_install_in_context(dst_ctx, event, dst_cpu);
11352                get_ctx(dst_ctx);
11353        }
11354        mutex_unlock(&dst_ctx->mutex);
11355        mutex_unlock(&src_ctx->mutex);
11356}
11357EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
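
/*
 * A minimal usage sketch (illustrative only, not part of the original file):
 * a package-scope PMU driver migrating its events away from a CPU that is
 * going offline, the way the Intel uncore drivers use
 * perf_pmu_migrate_context(). The my_pmu and my_pmu_cpu_mask names are
 * hypothetical; the callback would typically be registered via
 * cpuhp_setup_state().
 */
static struct pmu my_pmu;
static struct cpumask my_pmu_cpu_mask;

static int my_pmu_offline_cpu(unsigned int cpu)
{
        unsigned int target;

        /* Only act if @cpu was the one servicing this PMU. */
        if (!cpumask_test_and_clear_cpu(cpu, &my_pmu_cpu_mask))
                return 0;

        /* Pick any other CPU in the same package to take over. */
        target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
        if (target >= nr_cpu_ids)
                return 0;

        cpumask_set_cpu(target, &my_pmu_cpu_mask);
        perf_pmu_migrate_context(&my_pmu, cpu, target);
        return 0;
}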
11358
11359static void sync_child_event(struct perf_event *child_event,
11360                               struct task_struct *child)
11361{
11362        struct perf_event *parent_event = child_event->parent;
11363        u64 child_val;
11364
11365        if (child_event->attr.inherit_stat)
11366                perf_event_read_event(child_event, child);
11367
11368        child_val = perf_event_count(child_event);
11369
11370        /*
11371         * Add back the child's count to the parent's count:
11372         */
11373        atomic64_add(child_val, &parent_event->child_count);
11374        atomic64_add(child_event->total_time_enabled,
11375                     &parent_event->child_total_time_enabled);
11376        atomic64_add(child_event->total_time_running,
11377                     &parent_event->child_total_time_running);
11378}
11379
11380static void
11381perf_event_exit_event(struct perf_event *child_event,
11382                      struct perf_event_context *child_ctx,
11383                      struct task_struct *child)
11384{
11385        struct perf_event *parent_event = child_event->parent;
11386
11387        /*
11388         * Do not destroy the 'original' grouping; because of the context
11389         * switch optimization the original events could've ended up in a
11390         * random child task.
11391         *
11392         * If we were to destroy the original group, all group related
11393         * operations would cease to function properly after this random
11394         * child dies.
11395         *
11396         * Do destroy all inherited groups; we don't care about those,
11397         * and being thorough is better.
11398         */
11399        raw_spin_lock_irq(&child_ctx->lock);
11400        WARN_ON_ONCE(child_ctx->is_active);
11401
11402        if (parent_event)
11403                perf_group_detach(child_event);
11404        list_del_event(child_event, child_ctx);
11405        perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
11406        raw_spin_unlock_irq(&child_ctx->lock);
11407
11408        /*
11409         * Parent events are governed by their file descriptor; retain them.
11410         */
11411        if (!parent_event) {
11412                perf_event_wakeup(child_event);
11413                return;
11414        }
11415        /*
11416         * Child events can be cleaned up.
11417         */
11418
11419        sync_child_event(child_event, child);
11420
11421        /*
11422         * Remove this event from the parent's list
11423         */
11424        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
11425        mutex_lock(&parent_event->child_mutex);
11426        list_del_init(&child_event->child_list);
11427        mutex_unlock(&parent_event->child_mutex);
11428
11429        /*
11430         * Kick perf_poll() for is_event_hup().
11431         */
11432        perf_event_wakeup(parent_event);
11433        free_event(child_event);
11434        put_event(parent_event);
11435}
11436
11437static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
11438{
11439        struct perf_event_context *child_ctx, *clone_ctx = NULL;
11440        struct perf_event *child_event, *next;
11441
11442        WARN_ON_ONCE(child != current);
11443
11444        child_ctx = perf_pin_task_context(child, ctxn);
11445        if (!child_ctx)
11446                return;
11447
11448        /*
11449         * In order to reduce the amount of trickiness in ctx tear-down, we hold
11450         * ctx::mutex over the entire thing. This serializes against almost
11451         * everything that wants to access the ctx.
11452         *
11453         * The exception is sys_perf_event_open() /
11454         * perf_event_create_kernel_counter() which does find_get_context()
11455         * without ctx::mutex (it cannot because of the move_group double mutex
11456         * lock thing). See the comments in perf_install_in_context().
11457         */
11458        mutex_lock(&child_ctx->mutex);
11459
11460        /*
11461         * In a single ctx::lock section, de-schedule the events and detach the
11462         * context from the task such that we cannot ever get it scheduled back
11463         * in.
11464         */
11465        raw_spin_lock_irq(&child_ctx->lock);
11466        task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
11467
11468        /*
11469         * Now that the context is inactive, destroy the task <-> ctx relation
11470         * and mark the context dead.
11471         */
11472        RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
11473        put_ctx(child_ctx); /* cannot be last */
11474        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
11475        put_task_struct(current); /* cannot be last */
11476
11477        clone_ctx = unclone_ctx(child_ctx);
11478        raw_spin_unlock_irq(&child_ctx->lock);
11479
11480        if (clone_ctx)
11481                put_ctx(clone_ctx);
11482
11483        /*
11484         * Report the task dead after unscheduling the events so that we
11485         * won't get any samples after PERF_RECORD_EXIT. We can however still
11486         * get a few PERF_RECORD_READ events.
11487         */
11488        perf_event_task(child, child_ctx, 0);
11489
11490        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
11491                perf_event_exit_event(child_event, child_ctx, child);
11492
11493        mutex_unlock(&child_ctx->mutex);
11494
11495        put_ctx(child_ctx);
11496}
11497
11498/*
11499 * When a child task exits, feed back event values to parent events.
11500 *
11501 * Can be called with cred_guard_mutex held when called from
11502 * install_exec_creds().
11503 */
11504void perf_event_exit_task(struct task_struct *child)
11505{
11506        struct perf_event *event, *tmp;
11507        int ctxn;
11508
11509        mutex_lock(&child->perf_event_mutex);
11510        list_for_each_entry_safe(event, tmp, &child->perf_event_list,
11511                                 owner_entry) {
11512                list_del_init(&event->owner_entry);
11513
11514                /*
11515                 * Ensure the list deletion is visible before we clear
11516                 * the owner, closes a race against perf_release() where
11517                 * we need to serialize on the owner->perf_event_mutex.
11518                 */
11519                smp_store_release(&event->owner, NULL);
11520        }
11521        mutex_unlock(&child->perf_event_mutex);
11522
11523        for_each_task_context_nr(ctxn)
11524                perf_event_exit_task_context(child, ctxn);
11525
11526        /*
11527         * perf_event_exit_task_context() calls perf_event_task()
11528         * with the child's task_ctx, which generates EXIT events for
11529         * child contexts and sets child->perf_event_ctxp[] to NULL.
11530         * At this point we need to send EXIT events to cpu contexts.
11531         */
11532        perf_event_task(child, NULL, 0);
11533}
11534
11535static void perf_free_event(struct perf_event *event,
11536                            struct perf_event_context *ctx)
11537{
11538        struct perf_event *parent = event->parent;
11539
11540        if (WARN_ON_ONCE(!parent))
11541                return;
11542
11543        mutex_lock(&parent->child_mutex);
11544        list_del_init(&event->child_list);
11545        mutex_unlock(&parent->child_mutex);
11546
11547        put_event(parent);
11548
11549        raw_spin_lock_irq(&ctx->lock);
11550        perf_group_detach(event);
11551        list_del_event(event, ctx);
11552        raw_spin_unlock_irq(&ctx->lock);
11553        free_event(event);
11554}
11555
11556/*
11557 * Free a context as created by inheritance by perf_event_init_task() below,
11558 * used by fork() in case of failure.
11559 *
11560 * Even though the task has never lived, the context and events have been
11561 * exposed through the child_list, so we must take care when tearing it all down.
11562 */
11563void perf_event_free_task(struct task_struct *task)
11564{
11565        struct perf_event_context *ctx;
11566        struct perf_event *event, *tmp;
11567        int ctxn;
11568
11569        for_each_task_context_nr(ctxn) {
11570                ctx = task->perf_event_ctxp[ctxn];
11571                if (!ctx)
11572                        continue;
11573
11574                mutex_lock(&ctx->mutex);
11575                raw_spin_lock_irq(&ctx->lock);
11576                /*
11577                 * Destroy the task <-> ctx relation and mark the context dead.
11578                 *
11579                 * This is important because even though the task hasn't been
11580                 * exposed yet the context has been (through child_list).
11581                 */
11582                RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
11583                WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
11584                put_task_struct(task); /* cannot be last */
11585                raw_spin_unlock_irq(&ctx->lock);
11586
11587                list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
11588                        perf_free_event(event, ctx);
11589
11590                mutex_unlock(&ctx->mutex);
11591
11592                /*
11593                 * perf_event_release_kernel() could've stolen some of our
11594                 * child events and still have them on its free_list. In that
11595                 * case we must wait for these events to have been freed (in
11596                 * particular all their references to this task must've been
11597                 * dropped).
11598                 *
11599                 * Without this, copy_process() will unconditionally free this
11600                 * task (irrespective of its reference count) and
11601                 * _free_event()'s put_task_struct(event->hw.target) will be a
11602                 * use-after-free.
11603                 *
11604                 * Wait for all events to drop their context reference.
11605                 */
11606                wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
11607                put_ctx(ctx); /* must be last */
11608        }
11609}
11610
11611void perf_event_delayed_put(struct task_struct *task)
11612{
11613        int ctxn;
11614
11615        for_each_task_context_nr(ctxn)
11616                WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
11617}
11618
11619struct file *perf_event_get(unsigned int fd)
11620{
11621        struct file *file = fget(fd);
11622        if (!file)
11623                return ERR_PTR(-EBADF);
11624
11625        if (file->f_op != &perf_fops) {
11626                fput(file);
11627                return ERR_PTR(-EBADF);
11628        }
11629
11630        return file;
11631}
11632
11633const struct perf_event *perf_get_event(struct file *file)
11634{
11635        if (file->f_op != &perf_fops)
11636                return ERR_PTR(-EINVAL);
11637
11638        return file->private_data;
11639}
11640
11641const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
11642{
11643        if (!event)
11644                return ERR_PTR(-EINVAL);
11645
11646        return &event->attr;
11647}
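
/*
 * A minimal usage sketch (illustrative only, not part of the original file):
 * how an in-kernel consumer, e.g. the BPF syscall code, can resolve a perf
 * event fd using the helpers above. The my_inspect_perf_fd() name is
 * hypothetical.
 */
static int my_inspect_perf_fd(unsigned int fd)
{
        const struct perf_event *event;
        struct file *file;
        int ret = 0;

        file = perf_event_get(fd);              /* takes a file reference */
        if (IS_ERR(file))
                return PTR_ERR(file);

        event = perf_get_event(file);
        if (IS_ERR(event)) {
                ret = PTR_ERR(event);
                goto out;
        }

        /* perf_event_attrs() offers the same for a non-const event pointer. */
        pr_info("perf fd %u: type %u, config 0x%llx\n",
                fd, event->attr.type, event->attr.config);
out:
        fput(file);                             /* drop the reference from perf_event_get() */
        return ret;
}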
11648
11649/*
11650 * Inherit an event from parent task to child task.
11651 *
11652 * Returns:
11653 *  - valid pointer on success
11654 *  - NULL for orphaned events
11655 *  - IS_ERR() on error
11656 */
11657static struct perf_event *
11658inherit_event(struct perf_event *parent_event,
11659              struct task_struct *parent,
11660              struct perf_event_context *parent_ctx,
11661              struct task_struct *child,
11662              struct perf_event *group_leader,
11663              struct perf_event_context *child_ctx)
11664{
11665        enum perf_event_state parent_state = parent_event->state;
11666        struct perf_event *child_event;
11667        unsigned long flags;
11668
11669        /*
11670         * Instead of creating recursive hierarchies of events,
11671         * we link inherited events back to the original parent,
11672         * which is guaranteed to have a filp that we use as the
11673         * reference count:
11674         */
11675        if (parent_event->parent)
11676                parent_event = parent_event->parent;
11677
11678        child_event = perf_event_alloc(&parent_event->attr,
11679                                           parent_event->cpu,
11680                                           child,
11681                                           group_leader, parent_event,
11682                                           NULL, NULL, -1);
11683        if (IS_ERR(child_event))
11684                return child_event;
11685
11686
11687        if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
11688            !child_ctx->task_ctx_data) {
11689                struct pmu *pmu = child_event->pmu;
11690
11691                child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
11692                                                   GFP_KERNEL);
11693                if (!child_ctx->task_ctx_data) {
11694                        free_event(child_event);
11695                        return NULL;
11696                }
11697        }
11698
11699        /*
11700         * is_orphaned_event() and list_add_tail(&parent_event->child_list)
11701         * must be under the same lock in order to serialize against
11702         * perf_event_release_kernel(), such that either we must observe
11703         * is_orphaned_event() or they will observe us on the child_list.
11704         */
11705        mutex_lock(&parent_event->child_mutex);
11706        if (is_orphaned_event(parent_event) ||
11707            !atomic_long_inc_not_zero(&parent_event->refcount)) {
11708                mutex_unlock(&parent_event->child_mutex);
11709                /* task_ctx_data is freed with child_ctx */
11710                free_event(child_event);
11711                return NULL;
11712        }
11713
11714        get_ctx(child_ctx);
11715
11716        /*
11717         * Make the child state follow the state of the parent event,
11718         * not its attr.disabled bit.  We hold the parent's mutex,
11719         * so we won't race with perf_event_{en, dis}able_family.
11720         */
11721        if (parent_state >= PERF_EVENT_STATE_INACTIVE)
11722                child_event->state = PERF_EVENT_STATE_INACTIVE;
11723        else
11724                child_event->state = PERF_EVENT_STATE_OFF;
11725
11726        if (parent_event->attr.freq) {
11727                u64 sample_period = parent_event->hw.sample_period;
11728                struct hw_perf_event *hwc = &child_event->hw;
11729
11730                hwc->sample_period = sample_period;
11731                hwc->last_period   = sample_period;
11732
11733                local64_set(&hwc->period_left, sample_period);
11734        }
11735
11736        child_event->ctx = child_ctx;
11737        child_event->overflow_handler = parent_event->overflow_handler;
11738        child_event->overflow_handler_context
11739                = parent_event->overflow_handler_context;
11740
11741        /*
11742         * Precalculate sample_data sizes
11743         */
11744        perf_event__header_size(child_event);
11745        perf_event__id_header_size(child_event);
11746
11747        /*
11748         * Link it up in the child's context:
11749         */
11750        raw_spin_lock_irqsave(&child_ctx->lock, flags);
11751        add_event_to_ctx(child_event, child_ctx);
11752        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
11753
11754        /*
11755         * Link this into the parent event's child list
11756         */
11757        list_add_tail(&child_event->child_list, &parent_event->child_list);
11758        mutex_unlock(&parent_event->child_mutex);
11759
11760        return child_event;
11761}
11762
11763/*
11764 * Inherits an event group.
11765 *
11766 * This will quietly suppress orphaned events; !inherit_event() is not an error.
11767 * This matches with perf_event_release_kernel() removing all child events.
11768 *
11769 * Returns:
11770 *  - 0 on success
11771 *  - <0 on error
11772 */
11773static int inherit_group(struct perf_event *parent_event,
11774              struct task_struct *parent,
11775              struct perf_event_context *parent_ctx,
11776              struct task_struct *child,
11777              struct perf_event_context *child_ctx)
11778{
11779        struct perf_event *leader;
11780        struct perf_event *sub;
11781        struct perf_event *child_ctr;
11782
11783        leader = inherit_event(parent_event, parent, parent_ctx,
11784                                 child, NULL, child_ctx);
11785        if (IS_ERR(leader))
11786                return PTR_ERR(leader);
11787        /*
11788         * @leader can be NULL here because of is_orphaned_event(). In this
11789         * case inherit_event() will create individual events, similar to what
11790         * perf_group_detach() would do anyway.
11791         */
11792        for_each_sibling_event(sub, parent_event) {
11793                child_ctr = inherit_event(sub, parent, parent_ctx,
11794                                            child, leader, child_ctx);
11795                if (IS_ERR(child_ctr))
11796                        return PTR_ERR(child_ctr);
11797        }
11798        return 0;
11799}
11800
11801/*
11802 * Creates the child task context and tries to inherit the event-group.
11803 *
11804 * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
11805 * inherited_all set when we 'fail' to inherit an orphaned event; this is
11806 * consistent with perf_event_release_kernel() removing all child events.
11807 *
11808 * Returns:
11809 *  - 0 on success
11810 *  - <0 on error
11811 */
11812static int
11813inherit_task_group(struct perf_event *event, struct task_struct *parent,
11814                   struct perf_event_context *parent_ctx,
11815                   struct task_struct *child, int ctxn,
11816                   int *inherited_all)
11817{
11818        int ret;
11819        struct perf_event_context *child_ctx;
11820
11821        if (!event->attr.inherit) {
11822                *inherited_all = 0;
11823                return 0;
11824        }
11825
11826        child_ctx = child->perf_event_ctxp[ctxn];
11827        if (!child_ctx) {
11828                /*
11829                 * This is executed from the parent task context, so
11830                 * inherit events that have been marked for cloning.
11831                 * First allocate and initialize a context for the
11832                 * child.
11833                 */
11834                child_ctx = alloc_perf_context(parent_ctx->pmu, child);
11835                if (!child_ctx)
11836                        return -ENOMEM;
11837
11838                child->perf_event_ctxp[ctxn] = child_ctx;
11839        }
11840
11841        ret = inherit_group(event, parent, parent_ctx,
11842                            child, child_ctx);
11843
11844        if (ret)
11845                *inherited_all = 0;
11846
11847        return ret;
11848}
11849
11850/*
11851 * Initialize the perf_event context in task_struct
11852 */
11853static int perf_event_init_context(struct task_struct *child, int ctxn)
11854{
11855        struct perf_event_context *child_ctx, *parent_ctx;
11856        struct perf_event_context *cloned_ctx;
11857        struct perf_event *event;
11858        struct task_struct *parent = current;
11859        int inherited_all = 1;
11860        unsigned long flags;
11861        int ret = 0;
11862
11863        if (likely(!parent->perf_event_ctxp[ctxn]))
11864                return 0;
11865
11866        /*
11867         * If the parent's context is a clone, pin it so it won't get
11868         * swapped under us.
11869         */
11870        parent_ctx = perf_pin_task_context(parent, ctxn);
11871        if (!parent_ctx)
11872                return 0;
11873
11874        /*
11875         * No need to check if parent_ctx != NULL here; since we saw
11876         * it non-NULL earlier, the only reason for it to become NULL
11877         * is if we exit, and since we're currently in the middle of
11878         * a fork we can't be exiting at the same time.
11879         */
11880
11881        /*
11882         * Lock the parent list. No need to lock the child - not PID
11883         * hashed yet and not running, so nobody can access it.
11884         */
11885        mutex_lock(&parent_ctx->mutex);
11886
11887        /*
11888         * We don't have to disable NMIs - we are only looking at
11889         * the list, not manipulating it:
11890         */
11891        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
11892                ret = inherit_task_group(event, parent, parent_ctx,
11893                                         child, ctxn, &inherited_all);
11894                if (ret)
11895                        goto out_unlock;
11896        }
11897
11898        /*
11899         * We can't hold ctx->lock when iterating the ->flexible_groups list due
11900         * to allocations, but we need to prevent rotation because
11901         * rotate_ctx() will change the list from interrupt context.
11902         */
11903        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11904        parent_ctx->rotate_disable = 1;
11905        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11906
11907        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
11908                ret = inherit_task_group(event, parent, parent_ctx,
11909                                         child, ctxn, &inherited_all);
11910                if (ret)
11911                        goto out_unlock;
11912        }
11913
11914        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11915        parent_ctx->rotate_disable = 0;
11916
11917        child_ctx = child->perf_event_ctxp[ctxn];
11918
11919        if (child_ctx && inherited_all) {
11920                /*
11921                 * Mark the child context as a clone of the parent
11922                 * context, or of whatever the parent is a clone of.
11923                 *
11924                 * Note that if the parent is a clone, holding
11925                 * parent_ctx->lock prevents it from being uncloned.
11926                 */
11927                cloned_ctx = parent_ctx->parent_ctx;
11928                if (cloned_ctx) {
11929                        child_ctx->parent_ctx = cloned_ctx;
11930                        child_ctx->parent_gen = parent_ctx->parent_gen;
11931                } else {
11932                        child_ctx->parent_ctx = parent_ctx;
11933                        child_ctx->parent_gen = parent_ctx->generation;
11934                }
11935                get_ctx(child_ctx->parent_ctx);
11936        }
11937
11938        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11939out_unlock:
11940        mutex_unlock(&parent_ctx->mutex);
11941
11942        perf_unpin_context(parent_ctx);
11943        put_ctx(parent_ctx);
11944
11945        return ret;
11946}
11947
11948/*
11949 * Initialize the perf_event context in task_struct
11950 */
11951int perf_event_init_task(struct task_struct *child)
11952{
11953        int ctxn, ret;
11954
11955        memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
11956        mutex_init(&child->perf_event_mutex);
11957        INIT_LIST_HEAD(&child->perf_event_list);
11958
11959        for_each_task_context_nr(ctxn) {
11960                ret = perf_event_init_context(child, ctxn);
11961                if (ret) {
11962                        perf_event_free_task(child);
11963                        return ret;
11964                }
11965        }
11966
11967        return 0;
11968}
11969
11970static void __init perf_event_init_all_cpus(void)
11971{
11972        struct swevent_htable *swhash;
11973        int cpu;
11974
11975        zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
11976
11977        for_each_possible_cpu(cpu) {
11978                swhash = &per_cpu(swevent_htable, cpu);
11979                mutex_init(&swhash->hlist_mutex);
11980                INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
11981
11982                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
11983                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
11984
11985#ifdef CONFIG_CGROUP_PERF
11986                INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
11987#endif
11988                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
11989        }
11990}
11991
11992static void perf_swevent_init_cpu(unsigned int cpu)
11993{
11994        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11995
11996        mutex_lock(&swhash->hlist_mutex);
11997        if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
11998                struct swevent_hlist *hlist;
11999
12000                hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
12001                WARN_ON(!hlist);
12002                rcu_assign_pointer(swhash->swevent_hlist, hlist);
12003        }
12004        mutex_unlock(&swhash->hlist_mutex);
12005}
12006
12007#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
12008static void __perf_event_exit_context(void *__info)
12009{
12010        struct perf_event_context *ctx = __info;
12011        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
12012        struct perf_event *event;
12013
12014        raw_spin_lock(&ctx->lock);
12015        ctx_sched_out(ctx, cpuctx, EVENT_TIME);
12016        list_for_each_entry(event, &ctx->event_list, event_entry)
12017                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
12018        raw_spin_unlock(&ctx->lock);
12019}
12020
12021static void perf_event_exit_cpu_context(int cpu)
12022{
12023        struct perf_cpu_context *cpuctx;
12024        struct perf_event_context *ctx;
12025        struct pmu *pmu;
12026
12027        mutex_lock(&pmus_lock);
12028        list_for_each_entry(pmu, &pmus, entry) {
12029                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
12030                ctx = &cpuctx->ctx;
12031
12032                mutex_lock(&ctx->mutex);
12033                smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
12034                cpuctx->online = 0;
12035                mutex_unlock(&ctx->mutex);
12036        }
12037        cpumask_clear_cpu(cpu, perf_online_mask);
12038        mutex_unlock(&pmus_lock);
12039}
12040#else
12041
12042static void perf_event_exit_cpu_context(int cpu) { }
12043
12044#endif
12045
12046int perf_event_init_cpu(unsigned int cpu)
12047{
12048        struct perf_cpu_context *cpuctx;
12049        struct perf_event_context *ctx;
12050        struct pmu *pmu;
12051
12052        perf_swevent_init_cpu(cpu);
12053
12054        mutex_lock(&pmus_lock);
12055        cpumask_set_cpu(cpu, perf_online_mask);
12056        list_for_each_entry(pmu, &pmus, entry) {
12057                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
12058                ctx = &cpuctx->ctx;
12059
12060                mutex_lock(&ctx->mutex);
12061                cpuctx->online = 1;
12062                mutex_unlock(&ctx->mutex);
12063        }
12064        mutex_unlock(&pmus_lock);
12065
12066        return 0;
12067}
12068
12069int perf_event_exit_cpu(unsigned int cpu)
12070{
12071        perf_event_exit_cpu_context(cpu);
12072        return 0;
12073}
12074
12075static int
12076perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
12077{
12078        int cpu;
12079
12080        for_each_online_cpu(cpu)
12081                perf_event_exit_cpu(cpu);
12082
12083        return NOTIFY_OK;
12084}
12085
12086/*
12087 * Run the perf reboot notifier at the very last possible moment so that
12088 * the generic watchdog code runs as long as possible.
12089 */
12090static struct notifier_block perf_reboot_notifier = {
12091        .notifier_call = perf_reboot,
12092        .priority = INT_MIN,
12093};
12094
12095void __init perf_event_init(void)
12096{
12097        int ret;
12098
12099        idr_init(&pmu_idr);
12100
12101        perf_event_init_all_cpus();
12102        init_srcu_struct(&pmus_srcu);
12103        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
12104        perf_pmu_register(&perf_cpu_clock, NULL, -1);
12105        perf_pmu_register(&perf_task_clock, NULL, -1);
12106        perf_tp_register();
12107        perf_event_init_cpu(smp_processor_id());
12108        register_reboot_notifier(&perf_reboot_notifier);
12109
12110        ret = init_hw_breakpoint();
12111        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
12112
12113        /*
12114         * Build time assertion that we keep the data_head at the intended
12115         * location.  IOW, validation that we got the __reserved[] size right.
12116         */
12117        BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
12118                     != 1024);
12119}
12120
12121ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
12122                              char *page)
12123{
12124        struct perf_pmu_events_attr *pmu_attr =
12125                container_of(attr, struct perf_pmu_events_attr, attr);
12126
12127        if (pmu_attr->event_str)
12128                return sprintf(page, "%s\n", pmu_attr->event_str);
12129
12130        return 0;
12131}
12132EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
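
/*
 * A minimal usage sketch (illustrative only, not part of the original file):
 * a PMU driver exposing a named event via sysfs with perf_event_sysfs_show()
 * as the ->show routine. The my_* names and the event string are
 * hypothetical; PMU_EVENT_ATTR_STRING() in <linux/perf_event.h> wraps this
 * pattern, and my_events_group would be listed in the PMU's ->attr_groups[].
 */
static struct perf_pmu_events_attr my_cycles_attr = {
        .attr           = __ATTR(cycles, 0444, perf_event_sysfs_show, NULL),
        .event_str      = "event=0x3c",
};

static struct attribute *my_events_attrs[] = {
        &my_cycles_attr.attr.attr,
        NULL,
};

static const struct attribute_group my_events_group = {
        .name   = "events",
        .attrs  = my_events_attrs,
};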
12133
12134static int __init perf_event_sysfs_init(void)
12135{
12136        struct pmu *pmu;
12137        int ret;
12138
12139        mutex_lock(&pmus_lock);
12140
12141        ret = bus_register(&pmu_bus);
12142        if (ret)
12143                goto unlock;
12144
12145        list_for_each_entry(pmu, &pmus, entry) {
12146                if (!pmu->name || pmu->type < 0)
12147                        continue;
12148
12149                ret = pmu_dev_alloc(pmu);
12150                WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
12151        }
12152        pmu_bus_running = 1;
12153        ret = 0;
12154
12155unlock:
12156        mutex_unlock(&pmus_lock);
12157
12158        return ret;
12159}
12160device_initcall(perf_event_sysfs_init);
12161
12162#ifdef CONFIG_CGROUP_PERF
12163static struct cgroup_subsys_state *
12164perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
12165{
12166        struct perf_cgroup *jc;
12167
12168        jc = kzalloc(sizeof(*jc), GFP_KERNEL);
12169        if (!jc)
12170                return ERR_PTR(-ENOMEM);
12171
12172        jc->info = alloc_percpu(struct perf_cgroup_info);
12173        if (!jc->info) {
12174                kfree(jc);
12175                return ERR_PTR(-ENOMEM);
12176        }
12177
12178        return &jc->css;
12179}
12180
12181static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
12182{
12183        struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
12184
12185        free_percpu(jc->info);
12186        kfree(jc);
12187}
12188
12189static int __perf_cgroup_move(void *info)
12190{
12191        struct task_struct *task = info;
12192        rcu_read_lock();
12193        perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
12194        rcu_read_unlock();
12195        return 0;
12196}
12197
12198static void perf_cgroup_attach(struct cgroup_taskset *tset)
12199{
12200        struct task_struct *task;
12201        struct cgroup_subsys_state *css;
12202
12203        cgroup_taskset_for_each(task, css, tset)
12204                task_function_call(task, __perf_cgroup_move, task);
12205}
12206
12207struct cgroup_subsys perf_event_cgrp_subsys = {
12208        .css_alloc      = perf_cgroup_css_alloc,
12209        .css_free       = perf_cgroup_css_free,
12210        .attach         = perf_cgroup_attach,
12211        /*
12212         * Implicitly enable on dfl hierarchy so that perf events can
12213         * always be filtered by cgroup2 path as long as perf_event
12214         * controller is not mounted on a legacy hierarchy.
12215         */
12216        .implicit_on_dfl = true,
12217        .threaded       = true,
12218};
12219#endif /* CONFIG_CGROUP_PERF */
12220